/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionPlan"

#include "ExecutionPlan.h"

#include "BurstBuilder.h"
#include "Callbacks.h"
#include "CompilationBuilder.h"
#include "ExecutionBuilder.h"
#include "ExecutionBurstController.h"
#include "GraphDump.h"
#include "Manager.h"
#include "ModelBuilder.h"
#include "OperationsUtils.h"
#include "TokenHasher.h"
#include "Tracing.h"
#include "TypeManager.h"
#include "Utils.h"

#include <cutils/native_handle.h>
#include <fcntl.h>
#include <openssl/sha.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <functional>
#include <map>
#include <mutex>
#include <queue>
#include <sstream>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <vector>

using HidlToken = hidl_array<uint8_t, ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN>;

namespace android {
namespace nn {

namespace {

// Opens the cache file specified by filename and sets the handle to the opened fd.
// Returns false on failure. The handle is expected to come in empty, and is only set
// to a fd when the function returns true. The file descriptor is always opened with
// both read and write permission.
bool createCacheHandle(const std::string& cache, bool createIfNotExist, hidl_handle* handle) {
    CHECK(handle->getNativeHandle() == nullptr);
    int fd = open(cache.c_str(), createIfNotExist ? (O_RDWR | O_CREAT) : O_RDWR, S_IRUSR | S_IWUSR);
    NN_RET_CHECK_GE(fd, 0);
    native_handle_t* cacheNativeHandle = native_handle_create(1, 0);
    if (cacheNativeHandle == nullptr) {
        close(fd);
        return false;
    }
    cacheNativeHandle->data[0] = fd;
    handle->setTo(cacheNativeHandle, /*shouldOwn=*/true);
    return true;
}

// Opens a list of cache files and returns the handle vector. Returns an empty vector
// on failure. The file descriptors are always opened with both read and write permission.
hidl_vec<hidl_handle> createCacheHandleVec(uint32_t numCacheFiles, const std::string& baseFileName,
                                           bool createIfNotExist) {
    CHECK(numCacheFiles <= static_cast<uint32_t>(Constant::MAX_NUMBER_OF_CACHE_FILES));
    hidl_vec<hidl_handle> handles(numCacheFiles);
    for (uint32_t i = 0; i < numCacheFiles; i++) {
        std::string filename = baseFileName + std::to_string(i);
        VLOG(COMPILATION) << "Cache " << i << ": " << filename;
        if (!createCacheHandle(filename, createIfNotExist, &handles[i])) {
            return hidl_vec<hidl_handle>();
        }
    }
    return handles;
}

// Maps the token to cache file names and sets the handle vectors to the opened fds.
// Returns false on failure and leaves the vectors empty. Each vector is expected to
// come in empty.
bool getCacheHandles(const std::string& cacheDir, const uint8_t* token,
                     const std::pair<uint32_t, uint32_t>& numCacheFiles, bool createIfNotExist,
                     hidl_vec<hidl_handle>* modelCache, hidl_vec<hidl_handle>* dataCache) {
    // The filename includes ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN * 2 characters for token,
    // and 1 character for model/data cache identifier.
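    // For example (a worked illustration of the encoding below): a token byte of 0x3A
    // is encoded low nibble first, so it becomes "KD" ('A' + 10, then 'A' + 3).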
    std::string filename(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN * 2 + 1, '0');
    for (uint32_t i = 0; i < ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN; i++) {
        filename[i * 2] = 'A' + (token[i] & 0x0F);
        filename[i * 2 + 1] = 'A' + (token[i] >> 4);
    }
    CHECK(cacheDir.empty() || cacheDir.back() == '/');
    std::string cacheFileName = cacheDir + filename;

    cacheFileName[ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN * 2] = '1';
    *modelCache = createCacheHandleVec(numCacheFiles.first, cacheFileName, createIfNotExist);
    if (modelCache->size() != numCacheFiles.first) {
        return false;
    }
    cacheFileName[ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN * 2] = '2';
    *dataCache = createCacheHandleVec(numCacheFiles.second, cacheFileName, createIfNotExist);
    if (dataCache->size() != numCacheFiles.second) {
        modelCache->resize(0);
        return false;
    }
    return true;
}
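
// Illustration (assuming ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN == 32 and a
// hypothetical cacheDir of "/data/cache/"): with numCacheFiles == {2, 1},
// getCacheHandles() opens
//   /data/cache/<64 encoded chars>10 and /data/cache/<64 encoded chars>11  (model cache)
//   /data/cache/<64 encoded chars>20                                       (data cache)
// where the trailing '1'/'2' selects model vs. data cache and the final digit is the
// file index appended by createCacheHandleVec().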

// Tries to compile directly from cache; returns false on failure.
bool compileFromCache(const std::shared_ptr<Device>& device, const std::string& cacheDir,
                      const uint8_t* token,
                      std::shared_ptr<VersionedIPreparedModel>* preparedModel) {
    CHECK(token != nullptr && device != nullptr);
    VLOG(COMPILATION) << "compileFromCache";
    *preparedModel = nullptr;
    HidlToken cacheToken(token);
    hidl_vec<hidl_handle> modelCache, dataCache;
    NN_RET_CHECK(getCacheHandles(cacheDir, token, device->getNumberOfCacheFilesNeeded(),
                                 /*createIfNotExist=*/false, &modelCache, &dataCache));
    int ret = device->prepareModelFromCache(modelCache, dataCache, cacheToken, preparedModel);
    return ret == ANEURALNETWORKS_NO_ERROR;
}

int compileModelAndCache(const std::shared_ptr<Device>& device, const ModelBuilder* model,
                         int32_t executionPreference, const std::string& cacheDir,
                         const uint8_t* token,
                         std::shared_ptr<VersionedIPreparedModel>* preparedModel) {
    CHECK(device != nullptr);
    *preparedModel = nullptr;
    uint8_t dummyToken[ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN] = {0};
    HidlToken cacheToken(token == nullptr ? dummyToken : token);
    hidl_vec<hidl_handle> modelCache, dataCache;
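    // If there is no token, or the cache files cannot be created/opened, fall back to
    // compiling without caching: the driver is handed empty cache vectors.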
    if (token == nullptr || !getCacheHandles(cacheDir, token, device->getNumberOfCacheFilesNeeded(),
                                             /*createIfNotExist=*/true, &modelCache, &dataCache)) {
        modelCache.resize(0);
        dataCache.resize(0);
    }
    Model hidlModel;
    model->setHidlModel(&hidlModel);
    return device->prepareModel(hidlModel, static_cast<ExecutionPreference>(executionPreference),
                                modelCache, dataCache, cacheToken, preparedModel);
}

// Compiles the model on device.
// If compilation caching is available, then depending on ExecutionPlan::mState the token may
// have been initialized only from the user-provided token (SIMPLE body), or may already have
// been re-hashed with the indices of the operations to be executed (COMPOUND body). This
// function re-hashes the token further with the device name, the device version string, and
// the execution preference.
int compile(std::shared_ptr<Device> device, const ModelBuilder* model, int32_t executionPreference,
            const std::string& cacheDir, TokenHasher* token,
            std::shared_ptr<VersionedIPreparedModel>* preparedModel) {
    CHECK(device != nullptr);
    const uint8_t* tokenData = nullptr;
    if (device->isCachingSupported() && token->ok() && token->updateFromString(device->getName()) &&
        token->updateFromString(device->getVersionString()) &&
        token->update(&executionPreference, sizeof(executionPreference)) && token->finish()) {
        tokenData = token->getCacheToken();
    }
    if (tokenData != nullptr && compileFromCache(device, cacheDir, tokenData, preparedModel)) {
        return ANEURALNETWORKS_NO_ERROR;
    }
    return compileModelAndCache(device, model, executionPreference, cacheDir, tokenData,
                                preparedModel);
}
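
// Illustrative token derivation for the SIMPLE case (a sketch; TokenHasher encapsulates
// the actual hashing, seeded from the user-provided token):
//   token = H(user token)
//   token = H(token, device name)              // compile(), above
//   token = H(token, device version string)    // compile(), above
//   token = H(token, execution preference)     // compile(), above
// For a COMPOUND body, the indices of the operations in the step are folded in first
// (see ExecutionStep::addOperation() below).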

typedef std::function<void(uint32_t)> OperationReadyCallback;

int copyOperandExtraParams(ModelBuilder& model, uint32_t toOperandIndex,
                           const Operand& fromOperand) {
    if (fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL &&
        fromOperand.extraParams.getDiscriminator() ==
                Operand::ExtraParams::hidl_discriminator::channelQuant) {
        auto& fromChannelQuant = fromOperand.extraParams.channelQuant();
        ANeuralNetworksSymmPerChannelQuantParams toChannelQuant = {
                .channelDim = fromChannelQuant.channelDim,
                .scaleCount = static_cast<uint32_t>(fromChannelQuant.scales.size()),
                .scales = fromChannelQuant.scales.data(),
        };
        return model.setOperandSymmPerChannelQuantParams(toOperandIndex, toChannelQuant);
    } else if (isExtensionOperandType(fromOperand.type) &&
               fromOperand.extraParams.getDiscriminator() ==
                       Operand::ExtraParams::hidl_discriminator::extension) {
        hidl_vec<uint8_t> extensionData = fromOperand.extraParams.extension();
        return model.setOperandExtensionData(toOperandIndex, extensionData.data(),
                                             extensionData.size());
    } else if (fromOperand.extraParams.getDiscriminator() !=
                       Operand::ExtraParams::hidl_discriminator::none ||
               fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
        LOG(ERROR) << "Type " << toString(fromOperand.type)
                   << " has an unexpected extraParams discriminator: "
                   << static_cast<int>(fromOperand.extraParams.getDiscriminator());
        return ANEURALNETWORKS_BAD_DATA;
    } else {
        return ANEURALNETWORKS_NO_ERROR;
    }
}
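
// For example (hypothetical values): a TENSOR_QUANT8_SYMM_PER_CHANNEL operand quantized
// along channel dimension 0 with per-channel scales {0.5f, 0.25f} is forwarded to the
// submodel as ANeuralNetworksSymmPerChannelQuantParams{.channelDim = 0, .scaleCount = 2,
// .scales = <pointer to the two scales>}.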

// This class tracks whether we know the value of an operand as operations
// are processed.
class OperandTracker {
public:
    // Creates the tracker for this model. Figures out which operations can be
    // executed right away, and calls cb for each of them.
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Marks the specified operation as having been processed. The outputs of the
    // operation now being known, new operations may become ready to run. Calls cb
    // for each of them.
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

private:
    const ModelBuilder* mModel;
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    std::vector<uint32_t> mUnknownInputCount;  // For each operation
};
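
// Typical use (a sketch mirroring ModelBuilder::partitionTheWork() below):
//   OperandTracker tracker(model, enqueue);     // enqueue each initially ready operation
//   while (/* some queue is non-empty */) {
//       uint32_t op = /* dequeue */;
//       ... assign op to an execution step ...
//       tracker.markProcessed(op, enqueue);     // may make further operations ready
//   }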

OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb) :
        mModel(model) {
    const auto& operations = mModel->getOperations();
    mUnknownInputCount.resize(operations.size());
    for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
        const Operation& operation = operations[operationIndex];
        uint32_t count = 0;
        for (uint32_t operandIndex : operation.inputs) {
            auto lifetime = mModel->getOperand(operandIndex).lifetime;
            if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
                lifetime == OperandLifeTime::MODEL_OUTPUT) {
                count++;
                mOperandToOperations.insert(
                        std::pair<uint32_t, uint32_t>(operandIndex, operationIndex));
            }
        }
        if (count == 0) {
            cb(operationIndex);
        }
        mUnknownInputCount[operationIndex] = count;
    }
}

void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
    // Mark all its outputs as known.
    const Operation& operation = mModel->getOperations()[operationIndex];
    for (uint32_t operandIndex : operation.outputs) {
        auto range = mOperandToOperations.equal_range(operandIndex);
        for (auto i = range.first; i != range.second; i++) {
            uint32_t& count = mUnknownInputCount[i->second];
            if (--count == 0) {
                cb(i->second);
            }
        }
    }
}

}  // namespace

ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex,
                             std::shared_ptr<Device> device)
    : mPlan(plan), mIndex(stepIndex), mSubModel(), mDevice(device), mToken(plan->getCacheToken()) {}

// Adds an operand if it has not been added already.
// Sets the index in the submodel for the corresponding operand.
int ExecutionStep::addOperand(uint32_t fromOperandIndex, uint32_t* toOperandIndex,
                              const ModelBuilder& fromModel, OperandKind kind) {
    // Have we added this operand already?
    auto i = mOperandMap.find(fromOperandIndex);
    if (i != mOperandMap.end()) {
        nnAssert(kind == INPUT);
        *toOperandIndex = i->second;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // First time we add this operand.
    *toOperandIndex = mSubModel.operandCount();
    mOperandMap.insert(std::pair<uint32_t, uint32_t>(fromOperandIndex, *toOperandIndex));

    // Add the operand to the submodel.
    const Operand& operand = fromModel.getOperand(fromOperandIndex);
    ANeuralNetworksOperandType type = {
            .type = static_cast<int32_t>(operand.type),
            .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
            .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr,
            .scale = operand.scale,
            .zeroPoint = operand.zeroPoint,
    };

    int n = mSubModel.addOperand(type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
        return n;
    }

    n = copyOperandExtraParams(mSubModel, *toOperandIndex, operand);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Error when copying extra parameters to the operand";
        return n;
    }

    // Sets its value.
    switch (operand.lifetime) {
        case OperandLifeTime::CONSTANT_COPY: {
            const uint8_t* data = fromModel.getPointerToOperandValue(operand.location.offset);
            n = mSubModel.setOperandValue(*toOperandIndex, data, operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::CONSTANT_REFERENCE: {
            const Memory* memory = fromModel.getMemories()[operand.location.poolIndex];
            n = mSubModel.setOperandValueFromMemory(*toOperandIndex, memory,
                                                    operand.location.offset,
                                                    operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::NO_VALUE: {
            n = mSubModel.setOperandValue(*toOperandIndex, nullptr, 0);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::TEMPORARY_VARIABLE:  // handled similarly to MODEL_OUTPUT
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mTempsAsSubModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            } else {
                // The first time we've seen this operand is as an
                // output.  It may be an input to a different
                // partition, so keep track of it.
                mPlan->recordTemporaryDef(fromOperandIndex, mIndex);
            }
            break;
        case OperandLifeTime::MODEL_INPUT:
            mModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            break;
        case OperandLifeTime::MODEL_OUTPUT:  // handled similarly to TEMPORARY_VARIABLE
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mOutputsAsSubModelInputs.push_back(
                        std::make_pair(fromOperandIndex, *toOperandIndex));
            } else {
                // The first time we've seen this operand is as an
                // output.
                mModelOutputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            }
            break;
        default:
            nnAssert(false);
            break;
    }

    return ANEURALNETWORKS_NO_ERROR;
}
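
// Summary of the bookkeeping performed by addOperand() above:
//   CONSTANT_COPY / CONSTANT_REFERENCE / NO_VALUE -> value set on the submodel operand
//   TEMPORARY_VARIABLE seen first as INPUT        -> mTempsAsSubModelInputs
//   TEMPORARY_VARIABLE seen first as OUTPUT       -> recorded via recordTemporaryDef()
//   MODEL_INPUT                                   -> mModelInputs
//   MODEL_OUTPUT seen first as INPUT              -> mOutputsAsSubModelInputs
//   MODEL_OUTPUT seen first as OUTPUT             -> mModelOutputs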

int ExecutionStep::addOperation(int operationIndex, const ModelBuilder& fromModel) {
    const Operation& operation = fromModel.getOperation(operationIndex);
    if (mToken.ok()) {
        mToken.update(&operationIndex, sizeof(operationIndex));
    }

    // Convert the input and output operand indexes.
    //
    // We expect operations to be added in topological order.  Therefore:
    //
    // - We may not have seen an input if it is a model input, a
    //   constant, or an operand written by a different partition.
    //
    // - We should not have seen any outputs.
    const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
    const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
    std::vector<uint32_t> inputs(inputCount);
    std::vector<uint32_t> outputs(outputCount);

    auto addOperands = [this, &fromModel](const hidl_vec<uint32_t>& globalOperands,
                                          std::vector<uint32_t>& localOperands,
                                          OperandKind kind) -> int {
        const uint32_t operandCount = static_cast<uint32_t>(globalOperands.size());
        for (uint32_t i = 0; i < operandCount; i++) {
            uint32_t localOperand = ~0U;
            int n = addOperand(globalOperands[i], &localOperand, fromModel, kind);
            if (n != ANEURALNETWORKS_NO_ERROR)
                return n;
            localOperands[i] = localOperand;
        }
        return ANEURALNETWORKS_NO_ERROR;
    };

    int n;
    if ((n = addOperands(operation.inputs, inputs, INPUT)) != ANEURALNETWORKS_NO_ERROR ||
        (n = addOperands(operation.outputs, outputs, OUTPUT)) != ANEURALNETWORKS_NO_ERROR) {
        return n;
    }

    return mSubModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
                                  outputCount, outputs.data());
}

void ExecutionStep::mapInputsAndOutputs(std::shared_ptr<StepExecutor> stepExecutor) const {
    for (uint32_t i = 0, e = mInputIndexSubModelToFromModel.size(); i < e; i++) {
        stepExecutor->mapInput(mInputIndexSubModelToFromModel[i], i);
    }
    for (uint32_t i = 0, e = mOutputIndexSubModelToFromModel.size(); i < e; i++) {
        stepExecutor->mapOutput(mOutputIndexSubModelToFromModel[i], i);
    }
}

void ExecutionPlan::CompoundBody::findTempsAsSubModelOutputs() {
    for (const auto& step : mSteps) {
        for (const auto& input : step->getTempsAsSubModelInputs()) {
            const uint32_t fromModelIndex = input.first;
            const auto it = mTemporaryToDefiningStep.find(fromModelIndex);
            nnAssert(it != mTemporaryToDefiningStep.end());
            const uint32_t stepIndex = it->second;
            nnAssert(stepIndex < mSteps.size());
            mSteps[stepIndex]->recordTempAsSubModelOutput(fromModelIndex);
        }
    }
}

void ExecutionStep::logSubModel() const {
    VLOG(COMPILATION) << "ExecutionStep::finishSubModel, step " << mIndex;

    auto logRemapEntry = [](std::string& toLog, const std::pair<uint32_t, uint32_t>& e) {
        if (!toLog.empty()) {
            toLog += ", ";
        }
        toLog += "(";
        toLog += std::to_string(e.first);
        toLog += "->";
        toLog += std::to_string(e.second);
        toLog += ")";
    };

    auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) {
        std::string toLog;
        for (const auto& e : map) {
            logRemapEntry(toLog, e);
        }
        VLOG(COMPILATION) << name << ": " << toLog;
    };
    auto logRemapSet = [&logRemapEntry](const char* name, const SubModelOutputSetType& set) {
        std::string toLog;
        for (const auto& e : set) {
            logRemapEntry(toLog, e);
        }
        VLOG(COMPILATION) << name << ": " << toLog;
    };

    logRemapVector("model inputs", mModelInputs);
    logRemapVector("model outputs", mModelOutputs);
    logRemapVector("temps as submodel inputs", mTempsAsSubModelInputs);
    logRemapSet("temps as submodel outputs", mTempsAsSubModelOutputs);
    logRemapVector("outputs as submodel inputs", mOutputsAsSubModelInputs);
}

static void convertModelInputsOrOutputs(
        // IN: mModel{Inputs|Outputs}
        const ExecutionStep::RemapVectorType& myModelInputsOrOutputs,
        // IN: fromModel->{input|output}Count()
        uint32_t                              fromModelInputOrOutputCount,
        // IN: fromModel->get{Input|Output}OperandIndex
        std::function<uint32_t(uint32_t)>     fromModelGetInputOrOutputOperandIndex,
        // OUT: for v : mModel{Inputs|Outputs} : v.second
        std::vector<uint32_t>*                inputsOrOutputs,
        // OUT: submodel input-or-output index to original model input-or-output index
        std::vector<uint32_t>*                inputOrOutputIndexSubModelToFromModel) {
    std::map<uint32_t, uint32_t> fromModelIndexMap;  // operand index to input-or-output index
    for (uint32_t i = 0; i < fromModelInputOrOutputCount; i++) {
        fromModelIndexMap[fromModelGetInputOrOutputOperandIndex(i)] = i;
    }
    for (const auto& myInputOrOutput : myModelInputsOrOutputs) {
        inputsOrOutputs->push_back(myInputOrOutput.second);
        const uint32_t fromModelInputOrOutputIndex = fromModelIndexMap[myInputOrOutput.first];
        inputOrOutputIndexSubModelToFromModel->push_back(fromModelInputOrOutputIndex);
    }
}
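
// Worked example (hypothetical indices): if the original model's inputs are operands
// {5, 9}, and this step consumes operand 9 as submodel operand 2, then for the entry
// (9, 2) in mModelInputs this function appends 2 to *inputsOrOutputs and appends 1
// (operand 9 is original input #1) to *inputOrOutputIndexSubModelToFromModel.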

int ExecutionStep::finishSubModel(const ModelBuilder* fromModel, bool* hasOutputOfUnknownSize,
                                  int32_t executionPreference) {
    nnAssert(mDevice != nullptr);
    if (VLOG_IS_ON(COMPILATION)) {
        logSubModel();
    }

    mSubModel.relaxComputationFloat32toFloat16(fromModel->isComputationFloat32RelaxedToFloat16());

    // Input order: mModelInputs, mTempsAsSubModelInputs, mOutputsAsSubModelInputs
    // Output order: mModelOutputs, mTempsAsSubModelOutputs
    //
    // ExecutionPlan::next() depends on these orderings.

    std::vector<uint32_t> inputs;
    convertModelInputsOrOutputs(mModelInputs,
                                fromModel->inputCount(),
                                [=](uint32_t i) { return fromModel->getInputOperandIndex(i); },
                                &inputs,
                                &mInputIndexSubModelToFromModel);
    for (const auto& subModelInput : mTempsAsSubModelInputs) {
        inputs.push_back(subModelInput.second);
    }
    for (const auto& subModelInput : mOutputsAsSubModelInputs) {
        inputs.push_back(subModelInput.second);
    }

    std::vector<uint32_t> outputs;
    convertModelInputsOrOutputs(mModelOutputs,
                                fromModel->outputCount(),
                                [=](uint32_t i) { return fromModel->getOutputOperandIndex(i); },
                                &outputs,
                                &mOutputIndexSubModelToFromModel);
    for (const auto& subModelOutput : mTempsAsSubModelOutputs) {
        outputs.push_back(subModelOutput.second);
        const Operand& operand = mSubModel.getOperand(subModelOutput.second);
        if (operand.dimensions.size() == 0) {
            *hasOutputOfUnknownSize = true;
        } else {
            for (uint32_t dimension : operand.dimensions) {
                if (dimension == 0) {
                    *hasOutputOfUnknownSize = true;
                    break;
                }
            }
        }
        if (*hasOutputOfUnknownSize) {
            VLOG(COMPILATION) << "SubModelOutput (operand#" << subModelOutput.first
                              << " of original graph) has unknown size: " << toString(operand);
        }
    }

    {
        int n = mSubModel.identifyInputsAndOutputs(inputs.size(), &inputs[0], outputs.size(),
                                                   &outputs[0]);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            return n;
        }
        n = mSubModel.finish();
        if (n != ANEURALNETWORKS_NO_ERROR) {
            return n;
        }
    }

    {
        // Compute mOutputsAsSubModelInputsIndexToFromModel.

        std::map<uint32_t, uint32_t> fromModelOperandIndexToOutputIndex;
        for (unsigned i = 0, e = fromModel->outputCount(); i < e; ++i) {
            fromModelOperandIndexToOutputIndex[fromModel->getOutputOperandIndex(i)] = i;
        }

        for (unsigned i = 0, e = mOutputsAsSubModelInputs.size(); i < e; i++) {
            const uint32_t fromModelOperandIndex = mOutputsAsSubModelInputs[i].first;
            const auto it = fromModelOperandIndexToOutputIndex.find(fromModelOperandIndex);
            if (it == fromModelOperandIndexToOutputIndex.end()) {
                LOG(ERROR) << "Could not find main model output operand " << fromModelOperandIndex
                           << " in main model output operand list";
                return ANEURALNETWORKS_BAD_STATE;
            }
            mOutputsAsSubModelInputsIndexToFromModel.push_back(it->second);
        }
    }

    // TODO: Move compilation elsewhere?
    VLOG(COMPILATION) << "ExecutionStep::finishSubModel, compilation on " << mDevice->getName();
    return compile(mDevice, &mSubModel, executionPreference, *mPlan->getCacheDir(), &mToken,
                   &mPreparedSubModel);
}

void ExecutionStep::dump() const {
    Model model;
    mSubModel.setHidlModel(&model);
    if (VLOG_IS_ON(COMPILATION)) {
        VLOG(COMPILATION) << "ExecutionStep#" << mIndex << " for " << mDevice->getName();
        logModelToInfo(model);
    }
}

int ExecutionPlan::CompoundBody::finish(const ModelBuilder* fromModel,
                                        int32_t executionPreference) {
    findTempsAsSubModelOutputs();
    for (const auto& step : mSteps) {
        int n = step->finishSubModel(fromModel, &mHasSubModelOutputOfUnknownSize,
                                     executionPreference);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- finishSubModel failed";
            return n;
        }
    }
    if (mHasSubModelOutputOfUnknownSize) {
        VLOG(COMPILATION)
                << "ExecutionPlan::CompoundBody::finish -- mHasSubModelOutputOfUnknownSize";
        return ANEURALNETWORKS_OP_FAILED;
    }

    mSuccessfulFinish = true;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionPlan::SimpleBody::finish([[maybe_unused]] const ModelBuilder* fromModel,
                                      int32_t executionPreference) {
    nnAssert(mDevice != nullptr);
    VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
    const int n =
            compile(mDevice, mModel, executionPreference, *mCacheDir, &mToken, &mPreparedModel);
    mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
    return n;
}

int ExecutionPlan::finish(const ModelBuilder* fromModel, int32_t executionPreference) {
    nnAssert(mBody != nullptr);
    return mBody->finish(fromModel, executionPreference);
}

ExecutionPlan::Controller::Controller(
        const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
        const BurstBuilder* burstBuilder,
        std::shared_ptr<const SubModelInputsAndOutputsType> subModelInputsAndOutputs,
        uint32_t totalSizeOfTemporaries)
    : mPlan(plan),
      mExecutionBuilder(executionBuilder),
      mBurstBuilder(burstBuilder),
      mSubModelInputsAndOutputs(subModelInputsAndOutputs),
      mNextStepIndex(0) {
    if (totalSizeOfTemporaries) {
        if (mTemporaries.create(totalSizeOfTemporaries) != ANEURALNETWORKS_NO_ERROR) {
            LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
            mNextStepIndex = kBadStepIndex;
        }
    }
}

// Attempt to create a burst object for each PreparedModel/Partition. If the
// burst controller object cannot be made, return nullptr in its place to
// indicate that the regular execution path should be used. This can occur
// either because the PreparedModel was nullptr (the CPU was the best choice),
// or because the IPreparedModel was of insufficient version or failed to
// configure the burst.
std::vector<std::shared_ptr<ExecutionBurstController>> ExecutionPlan::makeBursts() const {
    switch (mState) {
        // burst object for each partition in the compound case
        case COMPOUND: {
            std::vector<std::shared_ptr<ExecutionBurstController>> bursts;
            bursts.reserve(compound()->mSteps.size());
            for (const auto& step : compound()->mSteps) {
                if (const auto preparedModel = step->getPreparedSubModel()) {
                    bursts.push_back(preparedModel->configureExecutionBurst(/*blocking=*/true));
                } else {
                    bursts.push_back(nullptr);
                }
            }
            return bursts;
        }
        // single burst object for the simple case
        case SIMPLE: {
            std::vector<std::shared_ptr<ExecutionBurstController>> burst;
            auto simpleBody = static_cast<const SimpleBody*>(mBody);
            if (const auto preparedModel = simpleBody->mPreparedModel) {
                burst.push_back(preparedModel->configureExecutionBurst(/*blocking=*/true));
            } else {
                burst.push_back(nullptr);
            }
            return burst;
        }
        // no burst objects made
        default:
            return {};
    }
}

std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
        ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const {
    nnAssert(isValid());

    // Create the layout for a Memory object big enough to hold
    // every TEMPORARY in the original model that is live across
    // partition boundaries.
    //
    // TODO: Rethink this approach for managing temporaries.  Some
    // alternatives:
    //
    // 1) Adopt a memory layout scheme analogous to stack allocation,
    // where objects of non-overlapping lifetime can occupy the same
    // storage.  We would still have a single Memory object in this
    // case.
    //
    // 2) Do something like what CpuExecutor does, and do allocations
    // and deallocations on the fly (during execution) before first
    // reference and after last reference, respectively.  This would
    // mean having one Memory object per TEMPORARY; or, in a more
    // complicated implementation, one Memory object per set of
    // temporaries that have the same lifetime.  Note that the Android
    // system limits the number of shared memory objects, which are
    // what our Memory objects represent.
    //
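    // The loop below packs the temporaries end to end, inserting alignment padding
    // as needed.  For example (assuming alignBytesNeeded(offset, size) returns the
    // padding required to align `offset` for an object of `size` bytes): a 6-byte
    // temporary at offset 0 followed by a 4-byte one gives offsets 0 and 8, for a
    // total of 12 bytes.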
    uint32_t totalSizeOfTemporaries = 0;
    std::shared_ptr<Controller::SubModelInputsAndOutputsType> subModelInputsAndOutputs;
    if (mState == COMPOUND) {
        const ModelBuilder* fromModel = executionBuilder->getModel();
        for (const auto& step : compound()->mSteps) {
            for (const auto& output : step->getTempsAsSubModelOutputs()) {
                const uint32_t fromModelOperandIndex = output.first;
                const Operand& fromModelOperand = fromModel->getOperand(fromModelOperandIndex);
                if (subModelInputsAndOutputs == nullptr) {
                    subModelInputsAndOutputs =
                            std::make_shared<Controller::SubModelInputsAndOutputsType>();
                }
                const uint32_t size = TypeManager::get()->getSizeOfData(fromModelOperand);
                totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
                subModelInputsAndOutputs->insert(
                        std::make_pair(fromModelOperandIndex, totalSizeOfTemporaries));
                totalSizeOfTemporaries += size;
            }
        }
        if (VLOG_IS_ON(EXECUTION) && (subModelInputsAndOutputs != nullptr)) {
            for (const auto& io : *subModelInputsAndOutputs) {
                VLOG(EXECUTION) << "temp: origOpndIdx = " << io.first
                                << ", offset = " << io.second;
            }
        }
    }

    return std::shared_ptr<Controller>(new Controller(this, executionBuilder, burstBuilder,
                                                      subModelInputsAndOutputs,
                                                      totalSizeOfTemporaries));
}


// TODO: Find a better way to provide this functionality.
int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
                            std::shared_ptr<StepExecutor>* executor) const {
    *executor = nullptr;

    VLOG(EXECUTION) << "ExecutionPlan::fallback(" << controller << ", " << executor
                    << "): mNextStepIndex = " << controller->mNextStepIndex;

    if (controller->mNextStepIndex == 0) {
        // We haven't called next().
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        // The last call to next() did not produce an executor.
        return ANEURALNETWORKS_OP_FAILED;
    }

    --controller->mNextStepIndex;
    return next(controller, executor);
}

int ExecutionPlan::next(std::shared_ptr<Controller> controller,
                        std::shared_ptr<StepExecutor>* executor,
                        std::shared_ptr<ExecutionBurstController>* burstController) const {
    *executor = nullptr;
    if (burstController != nullptr) {
        *burstController = nullptr;
    }

    VLOG(EXECUTION) << "ExecutionPlan::next("
                    << SHOW_IF_DEBUG(controller << ", " << executor)
                    << "): mNextStepIndex = " << controller->mNextStepIndex;

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (mState == EMPTY) {
        nnAssert(controller->mNextStepIndex == 0);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    if (mState == SIMPLE) {
        if (controller->mNextStepIndex == 0) {
            // First (and only) step.
            auto simpleBody = static_cast<const SimpleBody*>(mBody);
            *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder,
                                                       simpleBody->mModel, simpleBody->mDevice,
                                                       simpleBody->mPreparedModel);
            (*executor)->mapInputsAndOutputsTrivially();
            if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
                *burstController = controller->mBurstBuilder->getControllerAt(0);
            }
            controller->mNextStepIndex = 1;
            return ANEURALNETWORKS_NO_ERROR;
        }

        nnAssert(controller->mNextStepIndex == 1);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    auto compoundBody = compound();

    if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
        // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // Input order: model inputs, temps as submodel inputs, outputs as submodel inputs
    // Output order: model outputs, temps as submodel outputs
    //
    // ExecutionStep::finishSubModel() establishes these orderings.

    const auto step = compoundBody->mSteps[controller->mNextStepIndex];
    *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getSubModel(),
                                               step->getDevice(), step->getPreparedSubModel());
    (*executor)->setExecutionStep(step);
    step->mapInputsAndOutputs(*executor);
    if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
        *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
    }
    if (controller->mSubModelInputsAndOutputs != nullptr) {
        {
            // Tell executor about temps as submodel outputs.

            const size_t firstSubModelOutputIndex = step->getModelOutputs().size();
            const auto& subModelOutputs = step->getTempsAsSubModelOutputs();

            uint32_t idx = 0;
            for (auto I = subModelOutputs.begin(), E = subModelOutputs.end(); I != E; I++, idx++) {
                const uint32_t fromModelOperandIndex = I->first;
                const uint32_t offsetOfTemporary =
                    controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
                int n = (*executor)->setOutputFromTemporaryMemory(
                    firstSubModelOutputIndex + idx,
                    &controller->mTemporaries,
                    offsetOfTemporary);
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    controller->mNextStepIndex = Controller::kBadStepIndex;
                    return n;
                }
            }
        }
        {
            // Tell executor about temps as submodel inputs.

            const size_t firstSubModelInputIndex = step->getModelInputs().size();
            const auto& subModelInputs = step->getTempsAsSubModelInputs();

            uint32_t idx = 0;
            for (auto I = subModelInputs.begin(), E = subModelInputs.end(); I != E; I++, idx++) {
                const uint32_t fromModelOperandIndex = I->first;
                const uint32_t offsetOfTemporary =
                    controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
                int n = (*executor)->setInputFromTemporaryMemory(
                    firstSubModelInputIndex + idx,
                    &controller->mTemporaries,
                    offsetOfTemporary);
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    controller->mNextStepIndex = Controller::kBadStepIndex;
                    return n;
                }
            }
        }
    }
    {
        // Tell executor about outputs as submodel inputs.

        const size_t firstOutputsAsSubModelInputIndex =
                step->getModelInputs().size() + step->getTempsAsSubModelInputs().size();
        const auto& outputsAsSubModelInputsIndexToFromModel =
                step->getOutputsAsSubModelInputsIndexToFromModel();
        for (uint32_t i = 0, e = outputsAsSubModelInputsIndexToFromModel.size(); i < e; i++) {
            uint32_t o = outputsAsSubModelInputsIndexToFromModel[i];
            (*executor)->mapOutputToInput(o, firstOutputsAsSubModelInputIndex + i);
        }
    }

    controller->mNextStepIndex++;
    return ANEURALNETWORKS_NO_ERROR;
}
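
// Typical driver loop (a sketch; ExecutionBuilder implements the real one, with error
// handling and fallback):
//   std::shared_ptr<StepExecutor> executor;
//   while (plan->next(controller, &executor) == ANEURALNETWORKS_NO_ERROR &&
//          executor != nullptr) {
//       ... execute *executor ...
//   }
// next() sets *executor to nullptr and returns ANEURALNETWORKS_NO_ERROR once all steps
// have been consumed.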

std::shared_ptr<ExecutionStep> ExecutionPlan::createNewStep(const std::shared_ptr<Device> device) {
    nnAssert(mState != SIMPLE);
    if (mState == EMPTY) {
        mBody = new CompoundBody();
        mState = COMPOUND;
    }
    auto& steps = compound()->mSteps;
    auto step = std::make_shared<ExecutionStep>(this, steps.size(), device);
    steps.push_back(step);
    return step;
}

void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
                                     const ModelBuilder* model) {
    nnAssert(mState == EMPTY);
    mBody = new SimpleBody(device, model, mCacheDir, mToken);
    mState = SIMPLE;
}

void ExecutionPlan::dump() const {
    if (mBody) {
        mBody->dump();
    } else {
        VLOG(COMPILATION) << "EMPTY";
    }
}

void ExecutionPlan::reset() {
    if (mBody) {
        delete mBody;
        mBody = nullptr;
    }
    mState = EMPTY;
}

ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
    switch (mState) {
        case EMPTY:
            return Kind::EMPTY;
        case SIMPLE:
            nnAssert(mBody);
            return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
        case COMPOUND:
            nnAssert(mBody);
            return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
        default:
            nnAssert(!"unexpected state");
            return Kind::ERROR;
    }
}

std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
    nnAssert(mState == SIMPLE);
    return static_cast<const SimpleBody*>(mBody)->mDevice;
}

const std::vector<std::shared_ptr<ExecutionStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
    return compound()->mSteps;
}

bool ExecutionPlan::forTest_hasSubModelOutputsOfUnknownSize() const {
    return mBody->hasSubModelOutputsOfUnknownSize();
}

const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
    CHECK(mState == SIMPLE)
            << "Calling forTest_simpleGetCacheToken from execution plan with a non-SIMPLE body";
    return static_cast<const SimpleBody*>(mBody)->mToken.getCacheToken();
}

void ExecutionPlan::SimpleBody::dump() const {
    VLOG(COMPILATION) << "SIMPLE for " << mDevice->getName();
}

void ExecutionPlan::CompoundBody::dump() const {
    for (const auto& step : mSteps) {
        step->dump();
    }
}

int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
                                   uint32_t preference, ExecutionPlan* plan) const {
    // This function uses a heuristic approach to partitioning the graph.
    // It should be good enough for the first release.

    const size_t deviceCount = devices.size();
    const size_t operationCount = mOperations.size();

    VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: deviceCount = " << deviceCount
                      << ", operationCount = " << operationCount;

    // Figure out where each operation will best execute.
    // The value of the vector is the index in the devices vector.
    std::vector<int> bestDeviceForOperation(operationCount);
    NN_RETURN_IF_ERROR(
            findBestDeviceForEachOperation(preference, devices, &bestDeviceForOperation));

    // If one device will run all the operations, we don't need to split the work.
    if (std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
                           std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
        const int bestDeviceIndex = bestDeviceForOperation[0];
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
                          << bestDeviceIndex << " = " << devices[bestDeviceIndex]->getName();
        plan->becomeSingleStep(devices[bestDeviceIndex], this);
        return plan->finish(this, preference);
    }

    // No easy solution, we need to split the work.

    // We keep track of the operations that are ready to run for each device.
    std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount);

    // This helper function enqueues the operation on the appropriate queue.
    auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
        int deviceIndex = bestDeviceForOperation[operationIndex];
        perDeviceQueue[deviceIndex].push(operationIndex);
        VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
                          << deviceIndex;
    };

    // This helper function finds a device that has operations ready to process.
    // We start by looking at the CPU. We do this to try to maximize the
    // size of the graph we'll send to non-CPU devices. If the CPU runs first,
    // it will have the chance to prepare more of the inputs required by the
    // other devices. This function returns -1 if all queues are empty.
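    // (Iterating in reverse order below works because the CPU is expected to be the
    // last entry in the devices vector.)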
    auto findNextDeviceToProcess = [&]() -> int {
        for (int i = deviceCount - 1; i >= 0; i--) {
            if (!perDeviceQueue[i].empty()) {
                return i;
            }
        }
        return -1;
    };

    OperandTracker tracker(this, enqueueOnAppropriateDevice);
    // For each iteration of this loop, we'll create an execution step.
    while (true) {
        // Find the device we'll do this step for.
        int deviceIndex = findNextDeviceToProcess();
        VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
        if (deviceIndex < 0) {
            break;
        }

        // Assign as much as possible to this device.
        std::shared_ptr<ExecutionStep> step = plan->createNewStep(devices[deviceIndex]);
        auto& queue = perDeviceQueue[deviceIndex];
        while (!queue.empty()) {
            uint32_t operationIndex = queue.front();
            queue.pop();
            int n = step->addOperation(operationIndex, *this);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
                return n;
            }
            tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
        }
    }

    int n = plan->finish(this, preference);
    if (VLOG_IS_ON(COMPILATION)) {
        Model model;
        setHidlModel(&model);
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: original model: ";
        logModelToInfo(model);
        plan->dump();
    }
    return n;
}

PerformanceInfo ModelBuilder::getPerformanceInfo(const std::shared_ptr<Device> device,
                                                 uint32_t operationIndex) const {
    const Operation& operation = getOperation(operationIndex);
    // TODO This assumes that the type is dictated by the first operand. This is
    // currently the case but is not a safe assumption to make in the long term.
    const uint32_t operandIndex = operation.inputs[0];
    const OperandType operandType = mOperands[operandIndex].type;
    switch (operandType) {
        case OperandType::FLOAT32:
            if (mRelaxComputationFloat32toFloat16) {
                return device->getRelaxedFloat32toFloat16PerformanceScalar();
            }
            break;
        case OperandType::TENSOR_FLOAT32:
            if (mRelaxComputationFloat32toFloat16) {
                return device->getRelaxedFloat32toFloat16PerformanceTensor();
            }
            break;
        default:
            break;
    }

    return device->getPerformance(operandType);
}

namespace {

// Add an element to the end of the vector and return a pair consisting of the
// index of the new element and a pointer to the new element.
template <class T>
std::pair<uint32_t, T*> extend(hidl_vec<T>* vec) {
    size_t nextIndex = vec->size();
    vec->resize(nextIndex + 1);
    return {nextIndex, &(*vec)[nextIndex]};
}

// Add an element to the end of the vector, set it to the specified value, and
// return a pair consisting of the index of the new element and a pointer to the
// new element.
template <class T>
std::pair<uint32_t, T*> extend(hidl_vec<T>* vec, const T& val) {
    auto extended = extend(vec);
    *extended.second = val;
    return extended;
}
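
// Example use (hypothetical):
//   hidl_vec<Operand> operands;
//   auto p = extend(&operands);                    // p.first == 0, p.second -> new element
//   p.second->type = OperandType::TENSOR_FLOAT32;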

template <typename T>
bool operator<(const hidl_vec<T>& a, const hidl_vec<T>& b) {
    return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end());
}
1104 
1105 // Compile-time mapping from a particular Model type to a name for that type.
1106 template <class T_Model>
1107 struct ModelVersion;
1108 template <>
1109 struct ModelVersion<V1_0::Model> {
1110     static constexpr char name[] = "V1_0";
1111 };
1112 template <>
1113 struct ModelVersion<V1_1::Model> {
1114     static constexpr char name[] = "V1_1";
1115 };
1116 template <>
1117 struct ModelVersion<V1_2::Model> {
1118     static constexpr char name[] = "V1_2";
1119 };
1120 
1121 // Dispatcher mechanism for calling an appropriate uncheckedConvertToV1_*
1122 // given the desired return type.
1123 template <typename T_ReturnType>
1124 T_ReturnType uncheckedConvertTo(OperationType type);
1125 template <>
uncheckedConvertTo(OperationType type)1126 V1_0::OperationType uncheckedConvertTo<V1_0::OperationType>(OperationType type) {
1127     return uncheckedConvertToV1_0(type);
1128 }
1129 template <>
uncheckedConvertTo(OperationType type)1130 V1_1::OperationType uncheckedConvertTo<V1_1::OperationType>(OperationType type) {
1131     return uncheckedConvertToV1_1(type);
1132 }
1133 
1134 // Dispatcher mechanism for calling an appropriate convertToV1_* given the
1135 // desired return type.  Note that there is no V1_1::Operand type.
1136 template <typename T_ReturnType>
1137 T_ReturnType convertTo(Operand operand);
1138 template <>
convertTo(Operand operand)1139 V1_0::Operand convertTo<V1_0::Operand>(Operand operand) {
1140     return convertToV1_0(operand);
1141 }
1142 
1143 // Dispatcher mechanism for calling an appropriate compliantWithV1_* given the
1144 // desired target model type.
1145 template <typename T_SlicedModel>
1146 void getNoncompliantOperations(const V1_2::Model& model,
1147                                std::set<uint32_t>* noncompliantOperations);
1148 template <>
getNoncompliantOperations(const V1_2::Model & model,std::set<uint32_t> * noncompliantOperations)1149 void getNoncompliantOperations<V1_0::Model>(const V1_2::Model& model,
1150                                             std::set<uint32_t>* noncompliantOperations) {
1151     compliantWithV1_0(model, noncompliantOperations);
1152 }
1153 template <>
getNoncompliantOperations(const V1_2::Model & model,std::set<uint32_t> * noncompliantOperations)1154 void getNoncompliantOperations<V1_1::Model>(const V1_2::Model& model,
1155                                             std::set<uint32_t>* noncompliantOperations) {
1156     compliantWithV1_1(model, noncompliantOperations);
1157 }
1158 
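// Slices mHidlModel into V1_0 and V1_1 models on demand.  A slice contains
// only those operations of the original model that are compliant with the
// sliced model version, along with a mapping from each sliced operation index
// back to the corresponding operation index in the original model.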
class PlanModelSlicer : public IModelSlicer {
   public:
    PlanModelSlicer(const ModelBuilder* model);

    std::optional<std::pair<V1_0::Model, std::function<uint32_t(uint32_t)>>> getSliceV1_0()
            override {
        return getSlice(&mSliceV1_0);
    }
    std::optional<std::pair<V1_1::Model, std::function<uint32_t(uint32_t)>>> getSliceV1_1()
            override {
        return getSlice(&mSliceV1_1);
    }

    const Model& getModel() const { return mHidlModel; }

   private:
    template <class T_SlicedModel>
    static bool invalid(const T_SlicedModel& model);

    enum class SliceState { UNINITIALIZED, INVALID, NORMAL };
    template <class T_SlicedModel>
    struct Slice {
        SliceState mState = SliceState::UNINITIALIZED;
        T_SlicedModel mHidlModel;
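        // Maps an operation index in this slice's mHidlModel to the index of
        // the corresponding operation in the original model.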
        std::vector<uint32_t> mSlicedOperationIndexToOrigIndex;
    };
    Slice<V1_0::Model> mSliceV1_0;
    Slice<V1_1::Model> mSliceV1_1;

    template <class T_SlicedModel>
    void initializeSlice(Slice<T_SlicedModel>* slice);

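    // Lazily initializes *slice on first use.  Returns the sliced model and a
    // function mapping a sliced operation index to the corresponding operation
    // index in the original model, or std::nullopt if the slice is invalid.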
    template <class T_SlicedModel>
    std::optional<std::pair<T_SlicedModel, std::function<uint32_t(uint32_t)>>> getSlice(
            Slice<T_SlicedModel>* slice) {
        CHECK(slice != nullptr);
        if (slice->mState == SliceState::UNINITIALIZED) {
            initializeSlice(slice);
        }
        if (slice->mState == SliceState::INVALID) {
            return {};
        }
        return std::pair<T_SlicedModel, std::function<uint32_t(uint32_t)>>(
                slice->mHidlModel, [slice](uint32_t slicedOperationIndex) {
                    return slice->mSlicedOperationIndexToOrigIndex.at(slicedOperationIndex);
                });
    }

    Model mHidlModel;
};

template <class T_SlicedModel>
bool PlanModelSlicer::invalid(const T_SlicedModel& model) {
    // A model must have at least one operation.  However, it's possible that a
    // slice has no operations (because no operations from the original model
    // are compliant with the sliced model type).  In this case, the sliced
    // model would be invalid.
    const bool looksEmpty = (model.operations.size() == 0);
    if (DeviceManager::get()->strictSlicing()) {
        CHECK_EQ(looksEmpty, (model.operands.size() == 0));
    }
    if (looksEmpty) return true;

    // A model must have at least one output.  However, it's possible for a
    // model to contain dead operations (i.e., outputs on which no model outputs
    // are data dependent).  A slice might contain only dead operations, and
    // hence have no model outputs.  In this case, the sliced model would be
    // invalid.
    if (model.outputIndexes.size() == 0) return true;

    // We shouldn't have to check whether the model is valid.
    // However, it could be invalid if:
    // - there is an error in the slicing algorithm; or
    // - there is an error in compliantWith (see http://b/131845106)
    if (!validateModel(model)) {
        LOG(WARNING) << "Sliced model fails validateModel()";
        CHECK(!DeviceManager::get()->strictSlicing());
        return true;
    }

    return false;
}

PlanModelSlicer::PlanModelSlicer(const ModelBuilder* model) {
    model->setHidlModel(&mHidlModel);
}

template <class T_SlicedModel>
void PlanModelSlicer::initializeSlice(Slice<T_SlicedModel>* slice) {
    using SlicedOperand = std::remove_pointer_t<decltype(slice->mHidlModel.operands.data())>;
    using SlicedOperation = std::remove_pointer_t<decltype(slice->mHidlModel.operations.data())>;
    using SlicedOperationType = decltype(SlicedOperation::type);
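    // The aliases above deduce the operand and operation types of the sliced
    // model, so this one template body can build either a V1_0::Model or a
    // V1_1::Model slice.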

    CHECK(slice->mState == SliceState::UNINITIALIZED);

    const auto& origOperands = mHidlModel.operands;
    const auto& origOperations = mHidlModel.operations;
    auto& slicedOperands = slice->mHidlModel.operands;
    auto& slicedOperations = slice->mHidlModel.operations;

    // Indexes of elements of noncompliant origOperations
    std::set<uint32_t> noncompliantOperations;
    getNoncompliantOperations<T_SlicedModel>(mHidlModel, &noncompliantOperations);

    // Map from an operand index in origOperands to the corresponding operand index in
    // slicedOperands
    std::map<uint32_t, uint32_t> origOperandIndexToSlicedIndex;

    // Collect the operand indexes of every operand that is an input to a
    // compliant operation.  If the operand is a CONSTANT_* or a NO_VALUE, copy
    // it to the sliced model and update origOperandIndexToSlicedIndex
    // accordingly.  Otherwise, we'll deal with the operand in the subsequent
    // "Main loop", where we process operation outputs (intermediates and model
    // outputs).
    std::set<uint32_t> inputOperandIndexesOfCompliantOperations;
    for (uint32_t origOperationIndex = 0; origOperationIndex < origOperations.size();
         ++origOperationIndex) {
        if (noncompliantOperations.count(origOperationIndex)) {
            continue;
        }
        for (uint32_t input : origOperations[origOperationIndex].inputs) {
            if (inputOperandIndexesOfCompliantOperations.insert(input).second) {
                const Operand& origOperand = origOperands[input];
                switch (origOperand.lifetime) {
                    case OperandLifeTime::CONSTANT_COPY:
                    case OperandLifeTime::CONSTANT_REFERENCE:
                    case OperandLifeTime::NO_VALUE: {
                        const uint32_t slicedOperandIndex =
                                extend(&slicedOperands, convertTo<SlicedOperand>(origOperand))
                                        .first;
                        slicedOperands[slicedOperandIndex].numberOfConsumers = 0;
                        origOperandIndexToSlicedIndex[input] = slicedOperandIndex;
                        VLOG(COMPILATION) << "origOperandIndexToSlicedIndex initialization created "
                                          << input << " -> " << slicedOperandIndex << ": "
                                          << toString(slicedOperands[slicedOperandIndex]);
                        break;
                    }
                    default:
                        break;
                }
            }
        }
    }

    // For each output operand of a noncompliant operation that is the input
    // operand of at least one compliant operation, we will ensure that there is
    // a sliced model input whose "type" is that of the output operand.  This is
    // a map from output operand "type" (in the original model) to model input
    // operand index (in the sliced model).  Unfortunately, there is no
    // representation of operand "type" defined in the HAL that we can use
    // naively here -- we want (OperandType, dimensions, scale, zeroPoint,
    // extraParams), but these fields exist in Operand along with other fields
    // that need to be excluded from the map key (numberOfConsumers, lifetime,
    // location).  There are several choices:
    // - Don't have a map -- each output identified above gets its own sliced
    //   model input (no sharing of sliced model inputs).
    // - Create an operand "type" representation solely for use as a map key.
    // - Write a tailored comparison function that ignores the excluded fields.
    // We choose to write a tailored comparison function.  If Treble were to
    // generate a comparison function for us (http://b/130567619) then it might
    // be better to instead reset the excluded fields to canonical values --
    // then we could use the Treble provided comparison function, and the
    // solution would be robust (in a correctness sense, not a sharing sense) if
    // more fields are added and we neglect to canonicalize them.
    //
    // We also use this map for model input operands of the original model that
    // become input operands of the sliced model.  This means that an original
    // model input operand might be coalesced with other original model input
    // operands and/or with original model temporary operands.
    class OrigOperandToSlicedInputOperandIndex {
       public:
        OrigOperandToSlicedInputOperandIndex(hidl_vec<SlicedOperand>* slicedOperands,
                                             hidl_vec<uint32_t>* slicedInputIndexes)
            : mSlicedOperands(*slicedOperands), mSlicedInputIndexes(*slicedInputIndexes) {}

        // Given an operand from the original model, return the index of the
        // corresponding model input operand from the sliced model.  Creates a
        // new operand in the sliced model if necessary.
        uint32_t getIndex(Operand operand) {
            // Lookup
            auto it = mMap.find(operand);
            if (it != mMap.end()) {
                VLOG(COMPILATION) << "OrigOperandToSlicedInputOperandIndex::getIndex looked for "
                                  << toString(operand) << " and found " << it->second << ": "
                                  << toString(it->first);
                return it->second;
            }

            // Create
            operand.numberOfConsumers = 0;
            operand.lifetime = OperandLifeTime::MODEL_INPUT;
            operand.location = {};
            uint32_t slicedOperandIndex =
                    extend(&mSlicedOperands, convertTo<SlicedOperand>(operand)).first;
            mMap[operand] = slicedOperandIndex;
            extend(&mSlicedInputIndexes, slicedOperandIndex);
            VLOG(COMPILATION) << "OrigOperandToSlicedInputOperandIndex::getIndex created "
                              << slicedOperandIndex << ": " << toString(operand);
            return slicedOperandIndex;
        }

       private:
        class Compare {
           public:
            bool operator()(const Operand& a, const Operand& b) const {
                if (a.type != b.type) {
                    return a.type < b.type;
                }
                if (a.dimensions != b.dimensions) {
                    return a.dimensions < b.dimensions;
                }
                if (a.scale != b.scale) {
                    return a.scale < b.scale;
                }
                if (a.zeroPoint != b.zeroPoint) {
                    return a.zeroPoint < b.zeroPoint;
                }
                return compare(a.extraParams, b.extraParams);
            }

           private:
            static bool compare(const SymmPerChannelQuantParams& a,
                                const SymmPerChannelQuantParams& b) {
                if (a.scales != b.scales) {
                    return a.scales < b.scales;
                }
                return a.channelDim < b.channelDim;
            }

            static bool compare(const Operand::ExtraParams& a, const Operand::ExtraParams& b) {
                if (a.getDiscriminator() != b.getDiscriminator()) {
                    return a.getDiscriminator() < b.getDiscriminator();
                }

                switch (a.getDiscriminator()) {
                    default:
                        CHECK(false) << "Unexpected";
                        FALLTHROUGH_INTENDED;
                    case Operand::ExtraParams::hidl_discriminator::none:
                        return false;

                    case Operand::ExtraParams::hidl_discriminator::channelQuant:
                        return compare(a.channelQuant(), b.channelQuant());

                    case Operand::ExtraParams::hidl_discriminator::extension:
                        return a.extension() < b.extension();
                }
            }
        };
        std::map<Operand, uint32_t, Compare> mMap;
        hidl_vec<SlicedOperand>& mSlicedOperands;
        hidl_vec<uint32_t>& mSlicedInputIndexes;
    } origOperandToSlicedInputOperandIndex(&slicedOperands, &slice->mHidlModel.inputIndexes);

    // An input of the original model is an input of the sliced model if and
    // only if it is consumed by at least one compliant operation.  Note that in
    // the sliced model we share all model inputs of the same "type"; and that
    // we may later add model inputs to the sliced model.
    for (uint32_t origInputIndex : mHidlModel.inputIndexes) {
        if (inputOperandIndexesOfCompliantOperations.count(origInputIndex)) {
            const uint32_t slicedIndex =
                    origOperandToSlicedInputOperandIndex.getIndex(origOperands[origInputIndex]);
            origOperandIndexToSlicedIndex[origInputIndex] = slicedIndex;
            VLOG(COMPILATION) << "origOperandIndexToSlicedIndex inputIndexes processing created "
                              << origInputIndex << " -> " << slicedIndex << ": "
                              << toString(slicedOperands[slicedIndex]);
        }
    }

    // Main loop: Process each operation of the original model.
    for (uint32_t origOperationIndex = 0; origOperationIndex < origOperations.size();
         ++origOperationIndex) {
        const Operation& origOperation = origOperations[origOperationIndex];

        if (noncompliantOperations.count(origOperationIndex)) {
            for (uint32_t output : origOperation.outputs) {
                if (!inputOperandIndexesOfCompliantOperations.count(output)) {
                    continue;
                }
                const uint32_t slicedIndex =
                        origOperandToSlicedInputOperandIndex.getIndex(origOperands[output]);
                origOperandIndexToSlicedIndex[output] = slicedIndex;
                VLOG(COMPILATION)
                        << "origOperandIndexToSlicedIndex noncompliant output processing created "
                        << output << " -> " << slicedIndex << ": "
                        << toString(slicedOperands[slicedIndex]);
            }
        } else {
            slice->mSlicedOperationIndexToOrigIndex.push_back(origOperationIndex);
            SlicedOperation& slicedOperation = *extend(&slicedOperations).second;
            CHECK(slice->mSlicedOperationIndexToOrigIndex.size() == slicedOperations.size());

            slicedOperation.type = uncheckedConvertTo<SlicedOperationType>(origOperation.type);

            // Model is topologically sorted, so all inputs must be present in
            // origOperandIndexToSlicedIndex, and no outputs may be.

            // Operation inputs
            // - Fill in slicedOperation.inputs
            // - Update number of consumers for each input operand
            slicedOperation.inputs.resize(origOperation.inputs.size());
            std::transform(
                    origOperation.inputs.begin(), origOperation.inputs.end(),
                    slicedOperation.inputs.begin(),
                    [&origOperandIndexToSlicedIndex, &slicedOperands](uint32_t origOperandIndex) {
                        uint32_t slicedOperandIndex =
                                origOperandIndexToSlicedIndex.at(origOperandIndex);
                        slicedOperands[slicedOperandIndex].numberOfConsumers++;
                        VLOG(COMPILATION) << "origOperandIndexToSlicedIndex compliant input "
                                             "processing created "
                                          << origOperandIndex << " -> " << slicedOperandIndex
                                          << ": " << toString(slicedOperands[slicedOperandIndex]);
                        return slicedOperandIndex;
                    });

            // Operation outputs
            // - Add new operands to slicedOperands
            // - Update origOperandIndexToSlicedIndex
            // - Fill in slicedOperation.outputs
            // - Record as a model output, if necessary
            const uint32_t firstOutputSlicedOperandIndex = slicedOperands.size();
            slicedOperands.resize(firstOutputSlicedOperandIndex + origOperation.outputs.size());
            slicedOperation.outputs.resize(origOperation.outputs.size());
            for (uint32_t outputNum = 0; outputNum < slicedOperation.outputs.size(); ++outputNum) {
                uint32_t origOperandIndex = origOperation.outputs[outputNum];
                uint32_t slicedOperandIndex = firstOutputSlicedOperandIndex + outputNum;
                auto& slicedOperand = slicedOperands[slicedOperandIndex];
                const auto& origOperand = origOperands[origOperandIndex];
                slicedOperand = convertTo<SlicedOperand>(origOperand);
                slicedOperand.numberOfConsumers = 0;

                CHECK(origOperandIndexToSlicedIndex.count(origOperandIndex) == 0);
                origOperandIndexToSlicedIndex[origOperandIndex] = slicedOperandIndex;
                slicedOperation.outputs[outputNum] = slicedOperandIndex;

                if (!inputOperandIndexesOfCompliantOperations.count(origOperandIndex) &&
                    origOperand.numberOfConsumers) {
                    // Was consumed only by noncompliant operations; convert to
                    // an output of the sliced model.
                    slicedOperand.lifetime = OperandLifeTime::MODEL_OUTPUT;
                }

                VLOG(COMPILATION) << "origOperandIndexToSlicedIndex compliant output created "
                                  << origOperandIndex << " -> " << slicedOperandIndex << ": "
                                  << toString(slicedOperand);

                if (slicedOperand.lifetime == OperandLifeTime::MODEL_OUTPUT) {
                    extend(&slice->mHidlModel.outputIndexes, slicedOperandIndex);
                }
            }
        }
    }

    // To keep things simple, we copy over these fields as-is.  We could instead
    // opt to regenerate them based on the operands present in the sliced model:
    // This would be more complex and probably take more computation time, but
    // it would reduce the size of the sliced model, and hence the time spent
    // copying it around and passing it across the HAL interface.
    slice->mHidlModel.operandValues = mHidlModel.operandValues;
    slice->mHidlModel.pools = mHidlModel.pools;

    if (VLOG_IS_ON(COMPILATION)) {
        {
            std::ostrstream fromName;
            fromName << "Slice: From " << ModelVersion<decltype(mHidlModel)>::name << std::ends;
            graphDump(fromName.str(), mHidlModel);
            fromName.freeze(false);
        }
        {
            std::ostrstream toName;
            toName << "Slice: To " << ModelVersion<decltype(slice->mHidlModel)>::name << std::ends;
            graphDump(toName.str(), convertToV1_2(slice->mHidlModel));
            toName.freeze(false);
        }
    }

    slice->mState = invalid(slice->mHidlModel) ? SliceState::INVALID : SliceState::NORMAL;
}

// This class determines whether a given device can execute a given operation.
class CanDo {
   public:
    CanDo() {}

    void initialize(PlanModelSlicer* slicer, std::shared_ptr<Device> device) {
        device->getSupportedOperations(slicer->getModel(), slicer, &mSupportsOperationByIndex);
    }

    bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }

   private:
    hidl_vec<bool> mSupportsOperationByIndex;
};

}  // anonymous namespace

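// For each operation, chooses the device (as an index into "devices") that
// promises the best performance for the given execution preference.  On a
// performance tie the CPU device wins; if no device can perform some
// operation, fails with ANEURALNETWORKS_BAD_DATA.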
int ModelBuilder::findBestDeviceForEachOperation(
        uint32_t preference, const std::vector<std::shared_ptr<Device>>& devices,
        std::vector<int>* bestDeviceForOperation) const {
    PlanModelSlicer slicer(this);
    const size_t deviceCount = devices.size();
    std::vector<CanDo> canDo(deviceCount);
    for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
        canDo[deviceIndex].initialize(&slicer, devices[deviceIndex]);
    }

    // Figure out the best driver for each operation.
    const size_t operationCount = mOperations.size();
    for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
        // Find which device, including CPU fallback, gives the best performance for this operation.
        int bestChoice = -1;
        float bestPerfVal = 0.0;  // Do not check bestPerfVal if bestChoice < 0.
        for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
            const auto& device = devices[deviceIndex];
            if (canDo[deviceIndex].check(operationIndex)) {
                const PerformanceInfo perf = getPerformanceInfo(device, operationIndex);
                const float perfVal =
                            (preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage
                                                                            : perf.execTime);
                if (bestChoice < 0 || perfVal < bestPerfVal ||
                    (perfVal == bestPerfVal && device == DeviceManager::getCpuDevice())) {
                    bestChoice = deviceIndex;
                    bestPerfVal = perfVal;
                }
            } else {
                // Somewhat noisy logging, but this is the only place where the
                // user of NNAPI can get feedback on why an operation was not
                // run on a specific device.
                // Logs O(operationCount * deviceCount) times, but
                // typically deviceCount is very small.
                VLOG(COMPILATION) << "Device " << device->getName()
                                  << " can't do operation "
                                  << toString(getOperation(operationIndex).type);
            }
        }
        if (bestChoice < 0) {
            LOG(ERROR) << "No driver can do the op";
            return ANEURALNETWORKS_BAD_DATA;
        }

        (*bestDeviceForOperation)[operationIndex] = bestChoice;
        VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
                          << toString(getOperation(operationIndex).type) << ") = " << bestChoice
                          << " (" << devices[bestChoice]->getName() << ")";
    }
    return ANEURALNETWORKS_NO_ERROR;
}

}  // namespace nn
}  // namespace android