/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "CompilationBuilder.h"
#include "ExecutionPlan.h"
#include "HalInterfaces.h"
#include "Manager.h"
#include "ModelBuilder.h"
#include "NeuralNetworks.h"
#include "NeuralNetworksOEM.h"
#include "SampleDriver.h"
#include "TestNeuralNetworksWrapper.h"
#include "Utils.h"
#include "ValidateHal.h"

#include <gtest/gtest.h>

#include <filesystem>
#include <functional>
#include <map>
#include <queue>
#include <type_traits>

// Uncomment the following line to generate some debugging output that
// may be useful when analyzing failures:
//
// #define VERBOSE VERBOSE

// These tests do whitebox testing of the graph partitioning
// algorithm. It is "whitebox" in the sense that we're not evaluating
// whether a particular partitioning is legal, or "good enough"
// according to some metric, but whether it exactly matches the
// expected behavior of the current partitioning algorithm.
//
// A key part of the current partitioning algorithm is to determine
// which device among the available devices should be the one to
// execute a particular operation from the graph. This determination
// is made "locally" -- i.e., it does not depend on the graph
// topology, only on the properties of the operation in question.
// IDevice::getSupportedOperations() indicates which operations in a
// graph can be executed on a device, and IDevice::getCapabilities()
// indicates how "good" that device is for executing particular kinds
// of operations. For each operation, the partitioning algorithm
// picks the "best" device that is capable of executing that
// operation; if no device can do so, then the algorithm picks the
// CPU.
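//
// For example, given two devices that both support a particular operation,
// one with execTime performance 0.9 and one with execTime performance 0.5,
// the algorithm assigns that operation to the 0.5 device (lower execTime
// is better); if neither device supported the operation, it would fall
// back to the CPU device.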
//
// As part of this testing approach, we want to make it easy to
// specify which operations in a test graph can be executed on which
// devices. We accomplish this in the following way:
// - A unary OEM operation is available.
// - There is a collection of operations (each of which has two inputs
//   and one output):
//   - Eight kinds of operations available at driver version V1_0 or
//     later. They are represented in the graph as ADD or MUL with a
//     particular activation function -- two opcodes times four
//     activation functions means eight available operation kinds.
//     This is a low-level representation detail -- when we specify the
//     behavior of the device or build a graph, we do so in terms of
//     operation encodings 0..7.
//   - Eight kinds of operations available at driver version V1_1 or
//     later. They are represented in the graph as DIV or SUB with
//     a particular activation function, exactly analogous to ADD
//     and MUL above. We use operation encodings 8..15 for them.
//   - Four kinds of operations available at driver version V1_2 or
//     later. They are represented in the graph as MAXIMUM,
//     MINIMUM, POW, or PRELU. These operations take no activation
//     function, so we only get 4 operation kinds, for which we
//     use operation encodings 16..19.
// When we instantiate a device for testing purposes, we specify what subset of
// those operations the device is able to execute.
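// For example, under this scheme operation encoding 5 denotes MUL with
// fuse code 1: encodings 0..3 are ADD with fuse codes 0..3, and
// encodings 4..7 are MUL with fuse codes 0..3.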
//
// In order to determine whether or not a partitioning matches the
// expected partitioning, we check the number of partitions, check
// which device each partition targets, and compare each partition's
// subgraph, model inputs, model outputs, submodel inputs, and
// submodel outputs against what is expected. In order to perform
// that comparison, we build a model to compare against a partition's
// submodel and run a graph comparison algorithm on it. The graph
// comparison and the inputs and outputs comparisons are syntactic
// rather than semantic comparisons -- they don't allow for
// reorderings of inputs and outputs. Because of this, we need to
// know exactly how the partitioning algorithm orders inputs and
// outputs in order to construct the models and operand lists to
// compare against. Here are some relevant behaviors of the
// partitioning algorithm:
//
// - It builds a subgraph by walking operations in forward topological
//   order, and adding each operation's input operands and output
//   operands in index order (input followed by output) when that
//   operation is added. (It does not add an input that has already
//   been added.)
// - It finds model inputs, model outputs, and submodel inputs in
//   the order the corresponding operands were added to the subgraph
//   (see ExecutionStep methods getModelInputs(), getModelOutputs(),
//   getTempsAsSubModelInputs(), getOutputsAsSubModelInputs()).
// - It finds temps as submodel outputs in numerical order of corresponding
//   operand number in the original model (see ExecutionStep method
//   getTempsAsSubModelOutputs()).
// - When it calls identifyInputsAndOutputs() on the submodel, it
//   passes inputs from getModelInputs() in order, followed by temps as
//   submodel inputs from getTempsAsSubModelInputs() in order,
//   followed by outputs as submodel inputs from
//   getOutputsAsSubModelInputs() in order; and it passes outputs from
//   getModelOutputs() in order followed by submodel outputs from
//   getTempsAsSubModelOutputs() in order (see the worked example just
//   below).
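//
// As a worked example of that last behavior, consider the SimpleModel test
// below: the second step's submodel has one model input (b1Opnd3) and one
// temp as submodel input (b1Opnd2), so identifyInputsAndOutputs() receives
// its inputs in the order { b1Opnd3, b1Opnd2 }.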
//
// TODO: Maybe the logic for comparing a partition to an expected
//       model should be changed to tolerate reorderings of inputs and
//       outputs, so that when we build models and lists to compare
//       against, we don't need to worry about input and output
//       orderings. But is there a way to do this that still lets us
//       verify that we have the correct relationships between
//       an (original) model's inputs and outputs and each submodel's
//       inputs and outputs, as well as the correct relationship
//       between submodel inputs and outputs across partitions?

namespace {

const Timing kBadTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};

using CompilationBuilder = ::android::nn::CompilationBuilder;
using Device = ::android::nn::Device;
using DeviceManager = ::android::nn::DeviceManager;
using ExecutePreference = ::android::nn::test_wrapper::ExecutePreference;
using ExecutionPlan = ::android::nn::ExecutionPlan;
using ExecutionStep = ::android::nn::ExecutionStep;
using HalVersion = ::android::nn::HalVersion;
using HidlModel = ::android::hardware::neuralnetworks::V1_2::Model;
using HidlToken =
        ::android::hardware::hidl_array<uint8_t, ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN>;
using ModelBuilder = ::android::nn::ModelBuilder;
using Result = ::android::nn::test_wrapper::Result;
using SampleDriver = ::android::nn::sample_driver::SampleDriver;
using WrapperSymmPerChannelQuantParams = ::android::nn::test_wrapper::SymmPerChannelQuantParams;
using WrapperCompilation = ::android::nn::test_wrapper::Compilation;
using WrapperModel = ::android::nn::test_wrapper::Model;
using WrapperOperandType = ::android::nn::test_wrapper::OperandType;
using WrapperType = ::android::nn::test_wrapper::Type;

template <typename T>
using sp = ::android::sp<T>;
template <typename T>
using MQDescriptorSync = ::android::hardware::MQDescriptorSync<T>;

Capabilities makeCapabilities(float perf) {
    PerformanceInfo perfInfo = {.execTime = perf, .powerUsage = perf};
    return {.relaxedFloat32toFloat16PerformanceScalar = perfInfo,
            .relaxedFloat32toFloat16PerformanceTensor = perfInfo,
            .operandPerformance = ::android::nn::nonExtensionOperandPerformance(perfInfo)};
}

void update(Capabilities* capabilities, OperandType type, float perf) {
    PerformanceInfo perfInfo = {.execTime = perf, .powerUsage = perf};
    ::android::nn::update(&capabilities->operandPerformance, type, perfInfo);
}

float lookupExecTime(const Capabilities& capabilities, OperandType type) {
    return ::android::nn::lookup(capabilities.operandPerformance, type).execTime;
}

const uint32_t kNumFuseCodes = 4;
const uint32_t kBadOperation = ~0;

// V1_0 operations
const uint32_t kFirstEncodingADD = 0;
const uint32_t kFirstEncodingMUL = kFirstEncodingADD + kNumFuseCodes;
const uint32_t kFirstEncodingV1_0 = kFirstEncodingADD;
const uint32_t kLastEncodingV1_0 = kFirstEncodingMUL + kNumFuseCodes - 1;

// V1_1 operations
const uint32_t kFirstEncodingDIV = kLastEncodingV1_0 + 1;
const uint32_t kFirstEncodingSUB = kFirstEncodingDIV + kNumFuseCodes;
const uint32_t kFirstEncodingV1_1 = kFirstEncodingDIV;
const uint32_t kLastEncodingV1_1 = kFirstEncodingSUB + kNumFuseCodes - 1;

// V1_2 operations
const uint32_t kFirstEncodingMAXIMUM = kLastEncodingV1_1 + 1;
const uint32_t kFirstEncodingMINIMUM = kFirstEncodingMAXIMUM + 1;
const uint32_t kFirstEncodingPOW = kFirstEncodingMINIMUM + 1;
const uint32_t kFirstEncodingPRELU = kFirstEncodingPOW + 1;
const uint32_t kFirstEncodingV1_2 = kFirstEncodingMAXIMUM;
const uint32_t kLastEncodingV1_2 = kFirstEncodingPRELU;
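
// Compile-time checks of the encoding layout described in the comment at
// the top of this file; these equalities follow from kNumFuseCodes == 4.
static_assert(kLastEncodingV1_0 == 7, "ADD/MUL occupy encodings 0..7");
static_assert(kLastEncodingV1_1 == 15, "DIV/SUB occupy encodings 8..15");
static_assert(kLastEncodingV1_2 == 19, "MAXIMUM/MINIMUM/POW/PRELU occupy encodings 16..19");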

const std::map<OperationType, uint32_t> operationToFirstEncoding = {
        {OperationType::ADD, kFirstEncodingADD},
        {OperationType::MUL, kFirstEncodingMUL},
        {OperationType::DIV, kFirstEncodingDIV},
        {OperationType::SUB, kFirstEncodingSUB},
        {OperationType::MAXIMUM, kFirstEncodingMAXIMUM},
        {OperationType::MINIMUM, kFirstEncodingMINIMUM},
        {OperationType::POW, kFirstEncodingPOW},
        {OperationType::PRELU, kFirstEncodingPRELU},
};

// Sorted in reverse order (std::greater) so that we can use map::lower_bound to
// find an entry whose key is numerically less than or equal to a search value.
// mapped_type is (OperandCode, hasFuseCode).
const std::map<uint32_t, std::pair<uint32_t, bool>, std::greater<>> firstEncodingToOperation = {
        {kFirstEncodingADD, {ANEURALNETWORKS_ADD, true}},
        {kFirstEncodingMUL, {ANEURALNETWORKS_MUL, true}},
        {kFirstEncodingDIV, {ANEURALNETWORKS_DIV, true}},
        {kFirstEncodingSUB, {ANEURALNETWORKS_SUB, true}},
        {kFirstEncodingMAXIMUM, {ANEURALNETWORKS_MAXIMUM, false}},
        {kFirstEncodingMINIMUM, {ANEURALNETWORKS_MINIMUM, false}},
        {kFirstEncodingPOW, {ANEURALNETWORKS_POW, false}},
        {kFirstEncodingPRELU, {ANEURALNETWORKS_PRELU, false}},
};
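
// For example, decoding encoding 9 (DIV with fuse code 1) with this map
// works like this:
//
//     auto it = firstEncodingToOperation.lower_bound(9);
//     // Under the reverse (std::greater) ordering, lower_bound finds the
//     // first key <= 9, namely kFirstEncodingDIV (8), so
//     // it->second == {ANEURALNETWORKS_DIV, true}.
//     int32_t fuseCode = 9 - it->first;  // == 1
//
// This is how addOperation2To1() below recovers an opcode and fuse code
// from an operation encoding.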

// Look up the operation with the specified index in a graph, and return the
// operation encoding; or, if for some reason this is not one of the encoded
// operations, then return kBadOperation.
uint32_t lookupOperation(std::function<const Operation&(uint32_t)> getOperation,
                         std::function<const Operand&(uint32_t)> getOperand,
                         std::function<const uint8_t*(uint32_t)> getValue,
                         uint32_t operationIndex) {
    const Operation& operation = getOperation(operationIndex);
    switch (operation.type) {
        case OperationType::ADD:
        case OperationType::MUL:
        case OperationType::DIV:
        case OperationType::SUB: {
            // input2 is the fused activation function
            const Operand& input2 = getOperand(operation.inputs[2]);
            if ((input2.type == OperandType::INT32) &&
                (input2.lifetime == OperandLifeTime::CONSTANT_COPY)) {
                int32_t value;
                CHECK_EQ(sizeof(value), input2.location.length);
                memcpy(&value, getValue(input2.location.offset), input2.location.length);
                return value + operationToFirstEncoding.at(operation.type);
            }
            break;
        }
        default: {
            auto it = operationToFirstEncoding.find(operation.type);
            if (it != operationToFirstEncoding.end()) {
                return it->second;
            }
            break;
        }
    }
    return kBadOperation;
}

uint32_t lookupOperation(const HidlModel& model, uint32_t operationIndex) {
    return lookupOperation(
            [&model](uint32_t index) -> const Operation& { return model.operations[index]; },
            [&model](uint32_t index) -> const Operand& { return model.operands[index]; },
            [&model](uint32_t offset) { return &model.operandValues[offset]; },
            operationIndex);
}

#ifdef VERBOSE
// This is a debugging utility function
void dump(const char* name, const ModelBuilder* model) {
    HidlModel hidlModel;
    model->setHidlModel(&hidlModel);
    std::cout << name << ": " << toString(hidlModel) << std::endl;
    std::cout << "inputs: " << toString(hidlModel.inputIndexes) << std::endl;
    std::cout << "outputs: " << toString(hidlModel.outputIndexes) << std::endl;
    for (size_t i = 0, e = hidlModel.operations.size(); i < e; i++) {
        std::cout << "operation[" << i << "]: " << toString(hidlModel.operations[i]) << std::endl;
    }
}
#endif

// This is an IDevice for testing purposes. It only has a few
// interesting properties, all of which are specified as constructor
// arguments: device capabilities; which subset of operation kinds
// (0..19) the device supports; and whether the device supports the
// OEM operation. The subset is represented with a bitmask, in which
// operation kind K corresponds to the bit (1 << K).
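// For example, a driver constructed with operationMask 0x3 supports only
// operation kinds 0 and 1 (ADD with fuse codes 0 and 1), while a driver
// constructed with operationMask ~0U supports every encoded operation kind.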
class PartitioningDriver : public SampleDriver {
   private:
    // Dummy class -- a prepared model must not be nullptr.
    class PartitioningPreparedModel : public IPreparedModel {
       public:
        Return<ErrorStatus> execute(const Request&, const sp<V1_0::IExecutionCallback>&) override {
            return ErrorStatus::DEVICE_UNAVAILABLE;
        }
        Return<ErrorStatus> execute_1_2(const Request&, MeasureTiming,
                                        const sp<V1_2::IExecutionCallback>&) override {
            return ErrorStatus::DEVICE_UNAVAILABLE;
        }
        Return<void> executeSynchronously(const Request&, MeasureTiming,
                                          executeSynchronously_cb cb) override {
            cb(ErrorStatus::DEVICE_UNAVAILABLE, {}, kBadTiming);
            return Void();
        }
        Return<void> configureExecutionBurst(
                const sp<V1_2::IBurstCallback>& /*callback*/,
                const MQDescriptorSync<V1_2::FmqRequestDatum>& /*requestChannel*/,
                const MQDescriptorSync<V1_2::FmqResultDatum>& /*resultChannel*/,
                configureExecutionBurst_cb cb) override {
            cb(ErrorStatus::DEVICE_UNAVAILABLE, nullptr);
            return Void();
        }
    };

   public:
    enum OEM {
        OEMNo,          // rejected by getSupportedOperations and prepareModel
        OEMIndecisive,  // accepted by getSupportedOperations but not prepareModel
        OEMYes,         // accepted by getSupportedOperations and prepareModel
    };

    PartitioningDriver(const char* name, const char* version, Capabilities capabilities,
                       uint32_t operationMask, OEM oem = OEMNo)
        : SampleDriver(name),
          mVersionString(version),
          mCapabilities(capabilities),
          mOperationMask(operationMask),
          mOEM(oem) {}
    ~PartitioningDriver() override {}

    Return<void> getVersionString(getVersionString_cb cb) override {
        cb(ErrorStatus::NONE, mVersionString);
        return Void();
    }

    Return<ErrorStatus> prepareModel_1_2(const Model& model, ExecutionPreference,
                                         const hidl_vec<hidl_handle>&,
                                         const hidl_vec<hidl_handle>&, const HidlToken&,
                                         const sp<IPreparedModelCallback>& cb) override {
        ErrorStatus status = ErrorStatus::NONE;
        if (mOEM != OEMYes) {
            for (const auto& operation : model.operations) {
                if (operation.type == OperationType::OEM_OPERATION) {
                    status = ErrorStatus::INVALID_ARGUMENT;
                    break;
                }
            }
        }
        cb->notify_1_2(status, new PartitioningPreparedModel);
        return status;
    }

    Return<DeviceStatus> getStatus() override { return DeviceStatus::AVAILABLE; }

    Return<void> getCapabilities_1_2(getCapabilities_1_2_cb cb) override {
        cb(ErrorStatus::NONE, mCapabilities);
        return Void();
    }

    Return<void> getSupportedOperations_1_2(const Model& model,
                                            getSupportedOperations_cb cb) override {
        if (!android::nn::validateModel(model)) {
            cb(ErrorStatus::INVALID_ARGUMENT, std::vector<bool>());
            return Void();
        }

        const size_t count = model.operations.size();
        std::vector<bool> supported(count);
        for (size_t i = 0; i < count; i++) {
            if (model.operations[i].type == OperationType::OEM_OPERATION) {
                supported[i] = (mOEM != OEMNo);
                continue;
            }
            supported[i] = false;
            uint32_t operation = lookupOperation(model, i);
            if ((operation != kBadOperation) && (mOperationMask & (1 << operation))) {
                supported[i] = true;
            }
        }
        cb(ErrorStatus::NONE, supported);
        return Void();
    }

    Return<void> getNumberOfCacheFilesNeeded(getNumberOfCacheFilesNeeded_cb cb) override {
        cb(ErrorStatus::NONE, /*numModelCache=*/1, /*numDataCache=*/1);
        return Void();
    }

    Return<ErrorStatus> prepareModelFromCache(
            const hidl_vec<hidl_handle>&, const hidl_vec<hidl_handle>&, const HidlToken&,
            const sp<V1_2::IPreparedModelCallback>& callback) override {
        callback->notify_1_2(ErrorStatus::NONE, new PartitioningPreparedModel);
        return ErrorStatus::NONE;
    }

   private:
    std::string mVersionString;
    Capabilities mCapabilities;
    uint32_t mOperationMask;
    OEM mOEM;
};

// Like PartitioningDriver, but implementing 1.1
class PartitioningDriverV1_1 : public V1_1::IDevice {
   public:
    PartitioningDriverV1_1(const char* name, const char* version, Capabilities capabilities,
                           uint32_t operationMask,
                           PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
        : mDriverV1_2(new PartitioningDriver(name, version, capabilities, operationMask, oem)) {}
    Return<void> getCapabilities_1_1(getCapabilities_1_1_cb _hidl_cb) override {
        return mDriverV1_2->getCapabilities_1_1(_hidl_cb);
    }
    Return<void> getSupportedOperations_1_1(const V1_1::Model& model,
                                            getSupportedOperations_1_1_cb _hidl_cb) override {
        return mDriverV1_2->getSupportedOperations_1_1(model, _hidl_cb);
    }
    Return<ErrorStatus> prepareModel_1_1(
            const V1_1::Model& model, ExecutionPreference preference,
            const sp<V1_0::IPreparedModelCallback>& actualCallback) override {
        return mDriverV1_2->prepareModel_1_1(model, preference, actualCallback);
    }
    Return<DeviceStatus> getStatus() override { return mDriverV1_2->getStatus(); }
    Return<void> getCapabilities(getCapabilities_cb _hidl_cb) override {
        return mDriverV1_2->getCapabilities(_hidl_cb);
    }
    Return<void> getSupportedOperations(const V1_0::Model& model,
                                        getSupportedOperations_cb _hidl_cb) override {
        return mDriverV1_2->getSupportedOperations(model, _hidl_cb);
    }
    Return<ErrorStatus> prepareModel(
            const V1_0::Model& model,
            const sp<V1_0::IPreparedModelCallback>& actualCallback) override {
        return mDriverV1_2->prepareModel(model, actualCallback);
    }

   private:
    const sp<V1_2::IDevice> mDriverV1_2;
};

// Like PartitioningDriver, but implementing 1.0
class PartitioningDriverV1_0 : public V1_0::IDevice {
   public:
    PartitioningDriverV1_0(const char* name, const char* version, Capabilities capabilities,
                           uint32_t operationMask,
                           PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
        : mDriverV1_2(new PartitioningDriver(name, version, capabilities, operationMask, oem)) {}
    Return<void> getCapabilities(getCapabilities_cb _hidl_cb) override {
        return mDriverV1_2->getCapabilities(_hidl_cb);
    }
    Return<void> getSupportedOperations(const V1_0::Model& model,
                                        getSupportedOperations_cb _hidl_cb) override {
        return mDriverV1_2->getSupportedOperations(model, _hidl_cb);
    }
    Return<ErrorStatus> prepareModel(
            const V1_0::Model& model,
            const sp<V1_0::IPreparedModelCallback>& actualCallback) override {
        return mDriverV1_2->prepareModel(model, actualCallback);
    }
    Return<DeviceStatus> getStatus() override { return mDriverV1_2->getStatus(); }

   private:
    const sp<V1_2::IDevice> mDriverV1_2;
};

// This class adds some simple abstractions and utilities on top of
// WrapperModel. For example, it provides methods that work in terms of
// operation kind (0..19); and because we care about graph topology rather
// than details of operand types and values, it greatly simplifies the
// process of creating operands.
class PartitioningModel : private WrapperModel {
   public:
    using WrapperModel::finish;
    using WrapperModel::getHandle;
    using WrapperModel::identifyInputsAndOutputs;
    using WrapperModel::isValid;
    using WrapperModel::relaxComputationFloat32toFloat16;

    // Create a tensor operand of the specified type, and return the
    // corresponding operand index.
    uint32_t addFloatOperand() { return addOperand(WrapperType::TENSOR_FLOAT32); }
    uint32_t addQuantOperand() { return addOperand(WrapperType::TENSOR_QUANT8_ASYMM); }

    // Create an operand of the specified type, and return the corresponding
    // operand index.
    uint32_t addOperand(WrapperType wrapperType) {
        switch (static_cast<int>(wrapperType)) {
            case ANEURALNETWORKS_BOOL:
            case ANEURALNETWORKS_FLOAT16:
            case ANEURALNETWORKS_FLOAT32:
            case ANEURALNETWORKS_INT32:
            case ANEURALNETWORKS_UINT32:
            case ANEURALNETWORKS_OEM_SCALAR: {
                WrapperOperandType wrapperOperandType(wrapperType, {});
                mWrapperOperandType.push_back(wrapperOperandType);
                return WrapperModel::addOperand(&wrapperOperandType);
            }

            case ANEURALNETWORKS_TENSOR_BOOL8:
            case ANEURALNETWORKS_TENSOR_FLOAT16:
            case ANEURALNETWORKS_TENSOR_FLOAT32:
            case ANEURALNETWORKS_TENSOR_OEM_BYTE: {
                WrapperOperandType wrapperOperandType(wrapperType, {1});
                mWrapperOperandType.push_back(wrapperOperandType);
                return WrapperModel::addOperand(&wrapperOperandType);
            }

            case ANEURALNETWORKS_TENSOR_INT32:
            case ANEURALNETWORKS_TENSOR_QUANT8_ASYMM:
            case ANEURALNETWORKS_TENSOR_QUANT8_SYMM:
            case ANEURALNETWORKS_TENSOR_QUANT16_ASYMM:
            case ANEURALNETWORKS_TENSOR_QUANT16_SYMM: {
                WrapperOperandType wrapperOperandType(wrapperType, {1}, 1.0f);
                mWrapperOperandType.push_back(wrapperOperandType);
                return WrapperModel::addOperand(&wrapperOperandType);
            }

            case ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL: {
                WrapperOperandType wrapperOperandType(wrapperType, {1}, 0.0f, 0,
                                                      WrapperSymmPerChannelQuantParams({1.0f}, 0));
                mWrapperOperandType.push_back(wrapperOperandType);
                return WrapperModel::addOperand(&wrapperOperandType);
            }

            default:
                ADD_FAILURE() << "Unexpected type " << static_cast<uint32_t>(wrapperType);
                return ~uint32_t(0);
        }
    }

    enum class Dimensioned { NO, YES };

    // Create a V1_0 operation with two inputs and one output, specifying the
    // operation kind (where 0 is the first V1_0 operation) and the input
    // operand indexes.
    // Returns the output operand index.
    uint32_t addOperation2To1V1_0(uint32_t operation, const uint32_t input0, const uint32_t input1,
                                  Dimensioned dimensionedOutput = Dimensioned::YES) {
        CHECK_LE(operation, kLastEncodingV1_0 - kFirstEncodingV1_0);
        return addOperation2To1(operation + kFirstEncodingV1_0, input0, input1, dimensionedOutput);
    }

    // Create a V1_1 operation with two inputs and one output, specifying the
    // operation kind (where 0 is the first V1_1 operation) and the input
    // operand indexes.
    // Returns the output operand index.
    uint32_t addOperation2To1V1_1(uint32_t operation, const uint32_t input0, const uint32_t input1,
                                  Dimensioned dimensionedOutput = Dimensioned::YES) {
        CHECK_LE(operation, kLastEncodingV1_1 - kFirstEncodingV1_1);
        return addOperation2To1(operation + kFirstEncodingV1_1, input0, input1, dimensionedOutput);
    }

    // Create a V1_2 operation with two inputs and one output, specifying the
    // operation kind (where 0 is the first V1_2 operation) and the input
    // operand indexes.
    // Returns the output operand index.
    uint32_t addOperation2To1V1_2(uint32_t operation, const uint32_t input0, const uint32_t input1,
                                  Dimensioned dimensionedOutput = Dimensioned::YES) {
        CHECK_LE(operation, kLastEncodingV1_2 - kFirstEncodingV1_2);
        return addOperation2To1(operation + kFirstEncodingV1_2, input0, input1, dimensionedOutput);
    }

    // Create an OEM operation with one input and one output,
    // specifying the input operand index. Returns the output operand
    // index.
    uint32_t addOperationOEM1To1(const uint32_t input,
                                 Dimensioned dimensionedOutput = Dimensioned::YES) {
        uint32_t output = addOperandOfSameType(input, dimensionedOutput);
        addOperation(ANEURALNETWORKS_OEM_OPERATION, {input}, {output});
        return output;
    }

    // Run the partitioning algorithm to create an ExecutionPlan.
    int partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
                         ExecutePreference preference, ExecutionPlan* plan) {
        return reinterpret_cast<ModelBuilder*>(getHandle())
                ->partitionTheWork(devices, static_cast<uint32_t>(preference), plan);
    }

#ifdef VERBOSE
    // This is a debugging utility function.
    void dump(const char* name) const {
        const ModelBuilder* mb = reinterpret_cast<const ModelBuilder*>(getHandle());
        ::dump(name, mb);
    }
#endif

   private:
    // Create an operation with two inputs and one output, specifying
    // the operation kind and the input operand indexes.
    // Returns the output operand index.
    uint32_t addOperation2To1(uint32_t operation, const uint32_t input0, const uint32_t input1,
                              Dimensioned dimensionedOutput = Dimensioned::YES) {
        auto it = firstEncodingToOperation.lower_bound(operation);
        CHECK(it != firstEncodingToOperation.end());
        ANeuralNetworksOperationType type = it->second.first;
        if (it->second.second) {
            int32_t fuseCode = operation - it->first;
            uint32_t input2 = addIntOperand(fuseCode);
            uint32_t output = addOperandOfSameType(input0, dimensionedOutput);
            addOperation(type, {input0, input1, input2}, {output});
            return output;
        } else {
            uint32_t output = addOperandOfSameType(input0, dimensionedOutput);
            addOperation(type, {input0, input1}, {output});
            return output;
        }
    }

    // Create a scalar integer operand of the specified value, and
    // return the corresponding operand index.
    uint32_t addIntOperand(int32_t value) {
        uint32_t operand = addOperand(WrapperType::INT32);
        setOperandValue(operand, &value, sizeof(value));
        return operand;
    }

    // Create an operand of the same type as the specified operand,
    // and return the operand index of the new operand.
    uint32_t addOperandOfSameType(uint32_t operand, Dimensioned dimensioned = Dimensioned::YES) {
        WrapperOperandType type = mWrapperOperandType.at(operand);
        // Each dimension becomes 1 (fully specified) or 0 (unspecified),
        // depending on whether the caller asked for a dimensioned output.
        for (auto& dimension : type.dimensions) {
            dimension = (dimensioned == Dimensioned::YES);
        }
        mWrapperOperandType.push_back(type);
        return WrapperModel::addOperand(&type);
    }

    // operand index to operand type
    std::vector<WrapperOperandType> mWrapperOperandType;
};

// This class adds some utilities on top of WrapperCompilation.
class PartitioningCompilation : public WrapperCompilation {
   public:
    PartitioningCompilation(const PartitioningModel* model,
                            const std::vector<std::shared_ptr<Device>>& devices) {
        ModelBuilder* m = reinterpret_cast<ModelBuilder*>(model->getHandle());
        CompilationBuilder* c = nullptr;
        int result = m->createCompilation(&c, devices);
        EXPECT_EQ(result, 0);
        mCompilation = reinterpret_cast<ANeuralNetworksCompilation*>(c);
    }

    Result setPartitioning(uint32_t partitioning) {
        return static_cast<Result>(builder()->setPartitioning(partitioning));
    }

    using WrapperCompilation::finish;

    const ExecutionPlan& getExecutionPlan() const { return builder()->forTest_getExecutionPlan(); }

   private:
    CompilationBuilder* builder() { return reinterpret_cast<CompilationBuilder*>(getHandle()); }

    const CompilationBuilder* builder() const {
        return reinterpret_cast<const CompilationBuilder*>(getHandle());
    }
};

#ifdef VERBOSE
#define RETURN_TRUE()                                                 \
    {                                                                 \
        std::cerr << "returning true from " << __LINE__ << std::endl; \
        return true;                                                  \
    }
#else
#define RETURN_TRUE() \
    { return true; }
#endif
#ifdef VERBOSE
#define RETURN_FALSE(MESSAGE)                                                  \
    {                                                                          \
        std::cerr << "returning false from " << __LINE__ MESSAGE << std::endl; \
        return false;                                                          \
    }
#else
#define RETURN_FALSE(MESSAGE) \
    { return false; }
#endif

class PartitioningTest : public ::testing::Test {
   protected:
    using RemapVectorType = ExecutionStep::RemapVectorType;
    using SubModelOutputSetType = ExecutionStep::SubModelOutputSetType;

    virtual void SetUp() {}

    // From a vector of DeviceSpecification, create a vector of
    // Devices.
    struct DeviceSpecification {
        DeviceSpecification(const std::string& name, const Capabilities& capabilities,
                            uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : mName(name),
              mVersionString(kVersionString),
              mCapabilities(capabilities),
              mOperationMask(operationMask),
              mOEM(oem) {}
        DeviceSpecification(const std::string& name, float perf, uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : DeviceSpecification(name, perf, perf, operationMask, oem) {}
        DeviceSpecification(const std::string& name, float perf, float perfRelaxed,
                            uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : DeviceSpecification(name, kVersionString, perf, perfRelaxed, operationMask, oem) {}
        DeviceSpecification(const std::string& name, const std::string& version, float perf,
                            uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : DeviceSpecification(name, version, perf, perf, operationMask, oem) {}
        DeviceSpecification(const std::string& name, const std::string& version, float perf,
                            float perfRelaxed, uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : mName(name), mVersionString(version), mOperationMask(operationMask), mOEM(oem) {
            PerformanceInfo perfRelaxedInfo = {.execTime = perfRelaxed, .powerUsage = perfRelaxed};
            mCapabilities = {.relaxedFloat32toFloat16PerformanceScalar = perfRelaxedInfo,
                             .relaxedFloat32toFloat16PerformanceTensor = perfRelaxedInfo,
                             .operandPerformance = ::android::nn::nonExtensionOperandPerformance(
                                     {.execTime = perf, .powerUsage = perf})};
        }
        DeviceSpecification(const std::string& name, float perf, HalVersion halVersion,
                            uint32_t operationMaskV1_0, uint32_t operationMaskV1_1 = 0,
                            uint32_t operationMaskV1_2 = 0)
            : DeviceSpecification(name, perf, perf,
                                  makeOperationMask(halVersion, operationMaskV1_0,
                                                    operationMaskV1_1, operationMaskV1_2)) {
            mHalVersion = halVersion;
        }

        std::string mName;
        std::string mVersionString;
        Capabilities mCapabilities;
        HalVersion mHalVersion = HalVersion::LATEST;
        uint32_t mOperationMask;
        PartitioningDriver::OEM mOEM = PartitioningDriver::OEMNo;

        static constexpr char kVersionString[] = "JUST_AN_EXAMPLE";

       private:
        // This function takes three operation masks aligned at the low-order
        // bit -- one mask each for V1_0, V1_1, and V1_2 -- and produces a single
        // composite operation mask, formed by shifting each of the input
        // operation masks appropriately and ORing the results together.
        //
        // For convenience, any bits of an input mask that are too high order
        // for that mask are discarded -- this allows ~0 to be a legal input
        // mask.
        //
        // For the sake of example, assume that each low order mask is 4 bits
        // wide, and take some artistic license to write literals in binary.
        // Then:
        //
        //     assert(makeOperationMask(HalVersion::V1_2, 0b0110, 0b1001, 0b0101) ==
        //            0b 0101 1001 0110);
        //
        // This is used by a DeviceSpecification constructor to build a mask of
        // operations to be supported by the device.
        static uint32_t makeOperationMask(HalVersion halVersion, uint32_t operationMaskV1_0,
                                          uint32_t operationMaskV1_1, uint32_t operationMaskV1_2) {
            if (halVersion < HalVersion::V1_2) {
                CHECK(!operationMaskV1_2);
            }
            if (halVersion < HalVersion::V1_1) {
                CHECK(!operationMaskV1_1);
            }
            auto maskOfWidth = [](uint32_t width) -> uint32_t { return (1U << width) - 1; };
            static const uint32_t kOperationMaskV1_0 =
                    maskOfWidth(kLastEncodingV1_0 - kFirstEncodingV1_0 + 1);
            static const uint32_t kOperationMaskV1_1 =
                    maskOfWidth(kLastEncodingV1_1 - kFirstEncodingV1_1 + 1);
            static const uint32_t kOperationMaskV1_2 =
                    maskOfWidth(kLastEncodingV1_2 - kFirstEncodingV1_2 + 1);
            return ((operationMaskV1_0 & kOperationMaskV1_0) << kFirstEncodingV1_0) |
                   ((operationMaskV1_1 & kOperationMaskV1_1) << kFirstEncodingV1_1) |
                   ((operationMaskV1_2 & kOperationMaskV1_2) << kFirstEncodingV1_2);
        }
    };
    static std::vector<std::shared_ptr<Device>> makeDevices(
            std::vector<DeviceSpecification> specifications) {
        std::vector<std::shared_ptr<Device>> devices;
        for (const auto& specification : specifications) {
            V1_0::IDevice* halDriver = nullptr;
            switch (specification.mHalVersion) {
                case HalVersion::V1_2:
                    halDriver = new PartitioningDriver(
                            specification.mName.c_str(), specification.mVersionString.c_str(),
                            specification.mCapabilities, specification.mOperationMask,
                            specification.mOEM);
                    break;
                case HalVersion::V1_1:
                    halDriver = new PartitioningDriverV1_1(
                            specification.mName.c_str(), specification.mVersionString.c_str(),
                            specification.mCapabilities, specification.mOperationMask,
                            specification.mOEM);
                    break;
                case HalVersion::V1_0:
                    halDriver = new PartitioningDriverV1_0(
                            specification.mName.c_str(), specification.mVersionString.c_str(),
                            specification.mCapabilities, specification.mOperationMask,
                            specification.mOEM);
                    break;
                default:
                    ADD_FAILURE() << "Unexpected";
            }
            auto device = DeviceManager::forTest_makeDriverDevice(specification.mName, halDriver);
            devices.push_back(device);
        }
        devices.push_back(DeviceManager::getCpuDevice());
        return devices;
    }

    /*-- Graph comparison ----------------------------------------------------------------*/

    // An operand with certain values for its lifetime does not have a
    // defining operation in the graph. For the purposes of the graph
    // comparison algorithm, we encode the "defining operation" index of
    // such an operand as follows:
    // - NO_VALUE       kPseudoDefiningOperationNoValue
    // - MODEL_INPUT    kPseudoDefiningOperationModelInput0 + (position in list of inputs)
    // - CONSTANT_COPY  kPseudoDefiningOperationConstantCopy0 + (constant value)
    //                  Note: For the graphs we build in this test, we
    //                        only expect to see 4-byte constants within
    //                        a very restricted range, so we only make
    //                        room for such constants in our encoding
    //                        space.
    // We do not expect to see CONSTANT_REFERENCE, and so we do not handle
    // it.
    //
    // The encoding is intended to be relatively human readable; it is not
    // designed to represent some optimal balance of ranges for the items
    // within its scope (actual operations, inputs, constants).
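    //
    // For example, under this encoding the second model input (position 1
    // in the list of inputs) is "defined" by pseudo-operation 0x80000001,
    // and a CONSTANT_COPY operand holding the value 3 is "defined" by
    // pseudo-operation 0x90000003.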

    enum PseudoDefiningOperationEncodings : uint32_t {
        kPseudoDefiningOperationModelInput0 = 0x80000000U,
        kPseudoDefiningOperationConstantCopy0 = 0x90000000U,
        kPseudoDefiningOperationNoValue = 0xeeeeeeeeU,

        // lowest value for special encoding
        kPseudoDefiningOperationBase = 0x80000000U,

        // range of encoded input or constant
        kPseudoDefiningOperationRange = 0x10000000U,
    };

    // Build a map from operand to defining operation.
    // TODO: Replace map with vector?
    void buildDefinitionMap(const ModelBuilder* model, std::map<uint32_t, uint32_t>* defMap) {
        // actual definitions
        ASSERT_LT(model->operationCount(), kPseudoDefiningOperationBase);
        for (uint32_t i = 0, e = model->operationCount(); i < e; i++) {
            const Operation& operation = model->getOperation(i);
            for (uint32_t output : operation.outputs) {
                (*defMap)[output] = i;
            }
        }
        // inputs
        ASSERT_LT(model->inputCount(), kPseudoDefiningOperationRange);
        for (uint32_t i = 0, e = model->inputCount(); i < e; i++) {
            (*defMap)[model->getInputOperandIndex(i)] = kPseudoDefiningOperationModelInput0 + i;
        }
        // look for NO_VALUE and CONSTANT_COPY
        for (uint32_t i = 0, e = model->operandCount(); i < e; i++) {
            const Operand& operand = model->getOperand(i);
            switch (operand.lifetime) {
                case OperandLifeTime::NO_VALUE:
                    (*defMap)[i] = kPseudoDefiningOperationNoValue;
                    break;
                case OperandLifeTime::CONSTANT_COPY: {
                    ASSERT_EQ(operand.location.length, sizeof(uint32_t));
                    uint32_t value;
                    memcpy(&value, model->getPointerToOperandValue(operand.location.offset),
                           sizeof(uint32_t));
                    ASSERT_LT(value, kPseudoDefiningOperationNoValue);
                    (*defMap)[i] = kPseudoDefiningOperationConstantCopy0 + value;
                    break;
                }
                case OperandLifeTime::TEMPORARY_VARIABLE:
                case OperandLifeTime::MODEL_INPUT:
                case OperandLifeTime::MODEL_OUTPUT:
                    // already handled
                    break;
                default:
                    FAIL();
                    break;
            }
        }
        // sanity check
        ASSERT_EQ(model->operandCount(), defMap->size());
    }

#ifdef VERBOSE
    void dump(const char* name, const std::map<uint32_t, uint32_t>* aMap) {
        auto writeNum = [](uint32_t num) {
            if (num >= kPseudoDefiningOperationBase) {
                std::cout << "0x" << std::hex << num << std::dec;
            } else {
                std::cout << num;
            }
        };

        std::cout << name << ": { ";
        bool gotOne = false;
        for (const auto& entry : *aMap) {
            if (gotOne) {
                std::cout << ", ";
            } else {
                gotOne = true;
            }
            std::cout << "(";
            writeNum(entry.first);
            std::cout << ", ";
            writeNum(entry.second);
            std::cout << ")";
        }
        std::cout << " }" << std::endl;
    }
#endif

    bool compare(const Operand& operandA, const Operand& operandB) {
        if (operandA.type != operandB.type || operandA.dimensions != operandB.dimensions ||
            operandA.numberOfConsumers != operandB.numberOfConsumers ||
            operandA.scale != operandB.scale || operandA.zeroPoint != operandB.zeroPoint) {
            return false;
        }
        return true;
    }

    // Compare two graphs. We ignore operand and operation indexes (i.e.,
    // two nodes can be the same even if they are numbered differently)
    // but we also ignore semantics (e.g., even if an operation kind is
    // such that the operation is commutative, we still pay attention to the
    // order of its input operands).
    //
    // The comparison algorithm works by walking modelA from outputs
    // towards inputs, along the edge from each operand to its
    // defining operation, and then along the edges to the operation's
    // input operands. At each step along the way, we try to match up
    // operands and operations from modelA with equivalent operands
    // and operations from modelB.
    //
    // We start by assuming that modelA's outputs and modelB's outputs
    // match positionally (e.g., modelA's first output operand is
    // equivalent to modelB's first output operand). Once we've
    // discovered two equivalent operands (such as those outputs), we
    // place them in a work queue. We repeatedly pull operands off
    // the queue and compare their defining operations and those
    // operations' input operands, to discover more pairs of
    // equivalent operands. If we ever find operations that do not
    // match (e.g., because operation kind differs), or operands that
    // do not match (e.g., because operand type differs); or if we
    // ever find a conflict (we've already decided that operand A's
    // equivalent operand is B0, but it looks like we need its
    // equivalent operand to be B1); then the graphs compare unequal.
    // Otherwise, we'll eventually exhaust the work queue, and
    // conclude that the graphs compare equal.
    //
    // As a side effect of the comparison, we produce a map
    // *inputsAndOutputsBToA that maps from each of the model input and output
    // operand numbers of modelB to the corresponding operand numbers of modelA.
    // If the comparison returns false, the contents of the map are undefined.
    bool compare(const ModelBuilder* modelA, const ModelBuilder* modelB,
                 std::map<uint32_t, uint32_t>* inputsAndOutputsBToA) {
        CHECK(inputsAndOutputsBToA != nullptr);
        EXPECT_TRUE(inputsAndOutputsBToA->empty());

#ifdef VERBOSE
        ::dump("compare(A)", modelA);
        ::dump("compare(B)", modelB);
#endif

        if (modelA->operandCount() != modelB->operandCount() ||
            modelA->operationCount() != modelB->operationCount() ||
            modelA->inputCount() != modelB->inputCount() ||
            modelA->outputCount() != modelB->outputCount()) {
            RETURN_FALSE();
        }

        // Maps from operand index to index of defining operation.
        std::map<uint32_t, uint32_t> defsA, defsB;
        buildDefinitionMap(modelA, &defsA);
        buildDefinitionMap(modelB, &defsB);
        if (HasFatalFailure()) return false;

        // Maps from operand index in modelA to equivalent operand index
        // in modelB; and from operation index in modelA to equivalent
        // operation index in modelB.
        std::map<uint32_t, uint32_t> equivalentOperandsAToB;
        std::map<uint32_t, uint32_t> equivalentOperationsAToB;

        // Queue of operand indexes from modelA, each of whose defining
        // operations are to be checked for equivalence with modelB.
        std::queue<uint32_t> workQueueOperandsA;

        // Seed operand equivalence map and work queue from model outputs.
        for (uint32_t i = 0, e = modelA->outputCount(); i < e; i++) {
            uint32_t outputA = modelA->getOutputOperandIndex(i);
            uint32_t outputB = modelB->getOutputOperandIndex(i);
            if (!compare(modelA->getOperand(outputA), modelB->getOperand(outputB))) {
                RETURN_FALSE();
            }
            equivalentOperandsAToB[outputA] = outputB;
            workQueueOperandsA.push(outputA);
        }

#ifdef VERBOSE
        dump("defsA", &defsA);
        dump("defsB", &defsB);
#endif

        // Process the queue.
        uint32_t pseudoDefinitionCount = 0;
        while (!workQueueOperandsA.empty()) {
#ifdef VERBOSE
            dump("equivalentOperandsAToB", &equivalentOperandsAToB);
            dump("equivalentOperationsAToB", &equivalentOperationsAToB);
#endif
            uint32_t operandIndexA = workQueueOperandsA.front();
#ifdef VERBOSE
            std::cout << "operandIndexA: " << operandIndexA << std::endl;
#endif
            workQueueOperandsA.pop();
            uint32_t operandIndexB = equivalentOperandsAToB.at(operandIndexA);

            uint32_t operationIndexA = defsA.at(operandIndexA);
            uint32_t operationIndexB = defsB.at(operandIndexB);
            auto it = equivalentOperationsAToB.find(operationIndexA);
            if (it != equivalentOperationsAToB.end()) {
                if (it->second != operationIndexB) {
                    RETURN_FALSE();
                }
                continue;
            }

            // We haven't identified an equivalent operation for
            // operationIndexA.

            if ((operationIndexA >= kPseudoDefiningOperationBase) !=
                (operationIndexB >= kPseudoDefiningOperationBase)) {
                RETURN_FALSE();
            }
            // Either both operands have pseudo-definitions, or neither
            // does.
            if (operationIndexA >= kPseudoDefiningOperationBase) {
                // Both operands have pseudo-definitions.
                if (operationIndexA != operationIndexB) {
                    RETURN_FALSE();
                }
                equivalentOperationsAToB[operationIndexA] = operationIndexB;
                ++pseudoDefinitionCount;
                continue;
            }

            // If we get here, neither operation A nor operation B is a
            // pseudo-definition.

            const Operation& operationA = modelA->getOperation(operationIndexA);
            const Operation& operationB = modelB->getOperation(operationIndexB);
            if (operationA.type != operationB.type ||
                operationA.inputs.size() != operationB.inputs.size() ||
                operationA.outputs.size() != operationB.outputs.size()) {
                RETURN_FALSE();
            }
            equivalentOperationsAToB[operationIndexA] = operationIndexB;
            for (uint32_t i = 0, e = operationA.inputs.size(); i < e; i++) {
                uint32_t inputA = operationA.inputs[i];
                uint32_t inputB = operationB.inputs[i];
                auto it = equivalentOperandsAToB.find(inputA);
                if (it != equivalentOperandsAToB.end()) {
                    if (it->second != inputB) {
                        RETURN_FALSE();
                    }
                    continue;
                }
                // We haven't identified an equivalent operand for inputA.
                if (!compare(modelA->getOperand(inputA), modelB->getOperand(inputB))) {
                    RETURN_FALSE();
                }
                equivalentOperandsAToB[inputA] = inputB;
                workQueueOperandsA.push(inputA);
            }
        }

        // Sanity check
        if (modelA->operandCount() != defsA.size() || modelA->operandCount() != defsB.size() ||
            modelA->operandCount() != equivalentOperandsAToB.size() ||
            modelA->operationCount() + pseudoDefinitionCount != equivalentOperationsAToB.size()) {
            RETURN_FALSE();
        }

        // Build *inputsAndOutputsBToA
        for (uint32_t aInputIndex : modelA->getInputOperandIndexes()) {
            (*inputsAndOutputsBToA)[equivalentOperandsAToB.at(aInputIndex)] = aInputIndex;
        }
        for (uint32_t aOutputIndex : modelA->getOutputOperandIndexes()) {
            (*inputsAndOutputsBToA)[equivalentOperandsAToB.at(aOutputIndex)] = aOutputIndex;
        }

        RETURN_TRUE();
    }

    /*-------------------------------------------------------------------------------------*/

    // As a side effect of the comparison, we produce a map
    // *inputsAndOutputsModelToStep that maps from each of the model input and
    // output operand numbers of "model" to the corresponding operand numbers of
    // the submodel from "step". If the comparison returns false, the contents
    // of the map are undefined.
    bool compare(std::shared_ptr<const ExecutionStep> step, const PartitioningModel* model,
                 std::shared_ptr<Device> device,
                 std::map<uint32_t, uint32_t>* inputsAndOutputsModelToStep) {
        return (step->getDevice() == device) &&
               compare(step->getSubModel(),
                       reinterpret_cast<const ModelBuilder*>(model->getHandle()),
                       inputsAndOutputsModelToStep);
    }

    void compare(std::shared_ptr<const ExecutionStep> step, const PartitioningModel* model,
                 std::shared_ptr<Device> device, const RemapVectorType& modelInputs,
                 const RemapVectorType& modelOutputs, const RemapVectorType& tempsAsSubModelInputs,
                 const SubModelOutputSetType& tempsAsSubModelOutputs,
                 const RemapVectorType& outputsAsSubModelInputs) {
        std::map<uint32_t, uint32_t> inputsAndOutputsModelToStep;
        ASSERT_NO_FATAL_FAILURE(
                ASSERT_TRUE(compare(step, model, device, &inputsAndOutputsModelToStep)));
        ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep, step->getModelInputs(),
                                        modelInputs));
        ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep, step->getModelOutputs(),
                                        modelOutputs));
        ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep,
                                        step->getTempsAsSubModelInputs(), tempsAsSubModelInputs));
        ASSERT_TRUE(compareSubModelOutputSets(inputsAndOutputsModelToStep,
                                              step->getTempsAsSubModelOutputs(),
                                              tempsAsSubModelOutputs));
        ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep,
                                        step->getOutputsAsSubModelInputs(),
                                        outputsAsSubModelInputs));
    }

   private:
    static bool compareRemapVectors(
            const std::map<uint32_t, uint32_t>& inputsAndOutputsModelToStep,
            const RemapVectorType& step, RemapVectorType model) {
        std::transform(model.begin(), model.end(), model.begin(),
                       [&inputsAndOutputsModelToStep](const RemapVectorType::value_type& val) {
                           return std::make_pair(val.first,
                                                 inputsAndOutputsModelToStep.at(val.second));
                       });
        return step == model;
    }

    static bool compareSubModelOutputSets(
            const std::map<uint32_t, uint32_t>& inputsAndOutputsModelToStep,
            const SubModelOutputSetType& step, const SubModelOutputSetType& model) {
        SubModelOutputSetType modelTransformed;
        std::transform(
                model.begin(), model.end(), std::inserter(modelTransformed, modelTransformed.end()),
                [&inputsAndOutputsModelToStep](const SubModelOutputSetType::value_type& val) {
                    return std::make_pair(val.first, inputsAndOutputsModelToStep.at(val.second));
                });
        return step == modelTransformed;
    }
};

TEST_F(PartitioningTest, SimpleModel) {
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 = model.addOperation2To1V1_0(0, opnd0, opnd1);
    uint32_t opnd3 = model.addFloatOperand();
    uint32_t opnd4 = model.addOperation2To1V1_0(1, opnd2, opnd3);
    model.identifyInputsAndOutputs({ opnd0, opnd1, opnd3 }, { opnd4 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Simple partition (two devices are each capable of everything, one is the best).
    // No need to compare the original model to the model from the plan -- we
    // didn't actually do any partitioning.
    const auto devicesA = makeDevices({{"bad", 0.9, ~0U}, {"good", 0.5, ~0U}});
    ExecutionPlan planA;
    ASSERT_EQ(model.partitionTheWork(devicesA, ExecutePreference::PREFER_LOW_POWER, &planA),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_NE(planA.forTest_simpleGetDevice().get(), nullptr);
    ASSERT_STREQ(planA.forTest_simpleGetDevice()->getName(), "good");

    // Simple partition (two devices are each capable of everything, none better than CPU).
    // No need to compare the original model to the model from the plan -- we
    // didn't actually do any partitioning.
    const auto devicesC = makeDevices({{"bad", 1.1, ~0U}, {"bad2", 1.0, ~0U}});
    ExecutionPlan planC;
    ASSERT_EQ(model.partitionTheWork(devicesC, ExecutePreference::PREFER_LOW_POWER, &planC),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planC.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_EQ(planC.forTest_simpleGetDevice(), DeviceManager::getCpuDevice());

    // Compound partition (two devices, each is capable of one of the
    // two operations). We could do more extensive checking here --
    // for example, verify that each step within the plan has the
    // correct (model and submodel)x(inputs and outputs).
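    // (In the makeDevices() entries below, the last field is a bitmask of
    // supported operation encodings: 1 << 0 means device "0" supports only
    // operation encoding 0, and 1 << 1 means device "1" supports only
    // encoding 1 -- whereas the ~0U used above meant "supports everything".)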
    const auto devicesB = makeDevices({{"0", 0.9, 1 << 0}, {"1", 0.5, 1 << 1}});
    ExecutionPlan planB;
    ASSERT_EQ(model.partitionTheWork(devicesB, ExecutePreference::PREFER_LOW_POWER, &planB),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planB.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& stepsB = planB.forTest_compoundGetSteps();
    ASSERT_EQ(stepsB.size(), size_t(2));
    {
        // Build a model to compare against the submodel from stepsB[0].
        PartitioningModel modelB0;
        uint32_t b0Opnd0 = modelB0.addFloatOperand();
        uint32_t b0Opnd1 = modelB0.addFloatOperand();
        uint32_t b0Opnd2 = modelB0.addOperation2To1V1_0(0, b0Opnd0, b0Opnd1);
        modelB0.identifyInputsAndOutputs({ b0Opnd0, b0Opnd1 }, { b0Opnd2 });
        modelB0.finish();
        ASSERT_TRUE(modelB0.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(stepsB[0], &modelB0, devicesB[0],
                        RemapVectorType{{opnd0, b0Opnd0}, {opnd1, b0Opnd1}},  // modelInputs
                        RemapVectorType{},  // modelOutputs
                        RemapVectorType{},  // tempsAsSubModelInputs
                        SubModelOutputSetType{{opnd2, b0Opnd2}},  // tempsAsSubModelOutputs
                        RemapVectorType{}));  // outputsAsSubModelInputs
    }
    {
        // Build a model to compare against the submodel from stepsB[1].
        PartitioningModel modelB1;
        uint32_t b1Opnd2 = modelB1.addFloatOperand();
        uint32_t b1Opnd3 = modelB1.addFloatOperand();
        uint32_t b1Opnd4 = modelB1.addOperation2To1V1_0(1, b1Opnd2, b1Opnd3);
        // Note: In the partitioning algorithm, submodel inputs follow
        // model inputs. In the original model "model", opnd2 is not
        // an input; so in the submodel "modelB1", the corresponding
        // input b1Opnd2 is a submodel input, and must follow the
        // model input b1Opnd3.
        modelB1.identifyInputsAndOutputs({ b1Opnd3, b1Opnd2 }, { b1Opnd4 });
        modelB1.finish();
        ASSERT_TRUE(modelB1.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(stepsB[1], &modelB1, devicesB[1],
                        RemapVectorType{{opnd3, b1Opnd3}},  // modelInputs
                        RemapVectorType{{opnd4, b1Opnd4}},  // modelOutputs
                        RemapVectorType{{opnd2, b1Opnd2}},  // tempsAsSubModelInputs
                        SubModelOutputSetType{},  // tempsAsSubModelOutputs
                        RemapVectorType{}));  // outputsAsSubModelInputs
    }
}

TEST_F(PartitioningTest, SliceModel) {
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 = model.addOperation2To1V1_0(0, opnd0, opnd1);
    uint32_t opnd3 = model.addOperation2To1V1_0(1, opnd0, opnd1);
    uint32_t opnd4 = model.addOperation2To1V1_1(0, opnd0, opnd1);
    uint32_t opnd5 = model.addOperation2To1V1_2(0, opnd2, opnd3);
    model.identifyInputsAndOutputs({opnd0, opnd1}, {opnd2, opnd4, opnd5});
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Simple partition (V1_0, V1_1, V1_2 devices are available; V1_2 has best perf).
    // No need to compare the original model to the model from the plan -- we
    // didn't actually do any partitioning.
    const auto devicesA = makeDevices({{"V1_0", 0.8, HalVersion::V1_0, ~0U},
                                       {"V1_1", 0.7, HalVersion::V1_1, ~0U, ~0U},
                                       {"V1_2", 0.6, HalVersion::V1_2, ~0U, ~0U, ~0U}});
    ExecutionPlan planA;
    ASSERT_EQ(model.partitionTheWork(devicesA, ExecutePreference::PREFER_LOW_POWER, &planA),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_NE(planA.forTest_simpleGetDevice().get(), nullptr);
    ASSERT_STREQ(planA.forTest_simpleGetDevice()->getName(), "V1_2");

    // Compound partition (V1_0, V1_1, V1_2 devices are available, in decreasing
    // order of performance; model is distributed across all three devices).
    const auto devicesB = makeDevices({{"V1_0", 0.6, HalVersion::V1_0, ~0U},
                                       {"V1_1", 0.7, HalVersion::V1_1, ~0U, ~0U},
                                       {"V1_2", 0.8, HalVersion::V1_2, ~0U, ~0U, ~0U}});
    ExecutionPlan planB;
    ASSERT_EQ(model.partitionTheWork(devicesB, ExecutePreference::PREFER_LOW_POWER, &planB),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planB.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& stepsB = planB.forTest_compoundGetSteps();
    ASSERT_EQ(stepsB.size(), size_t(3));
    {
        // Build a model to compare against the submodel from stepsB[0].
        PartitioningModel modelB0;
        uint32_t b0Opnd0 = modelB0.addFloatOperand();
        uint32_t b0Opnd1 = modelB0.addFloatOperand();
        uint32_t b0Opnd2 = modelB0.addOperation2To1V1_1(0, b0Opnd0, b0Opnd1);
        modelB0.identifyInputsAndOutputs({b0Opnd0, b0Opnd1}, {b0Opnd2});
        modelB0.finish();
        ASSERT_TRUE(modelB0.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(stepsB[0], &modelB0, devicesB[1],
                        RemapVectorType{{opnd0, b0Opnd0}, {opnd1, b0Opnd1}},  // modelInputs
                        RemapVectorType{{opnd4, b0Opnd2}},  // modelOutputs
                        RemapVectorType{},  // tempsAsSubModelInputs
                        SubModelOutputSetType{},  // tempsAsSubModelOutputs
                        RemapVectorType{}));  // outputsAsSubModelInputs
    }
    {
        // Build a model to compare against the submodel from stepsB[1].
        PartitioningModel modelB1;
        uint32_t b1Opnd0 = modelB1.addFloatOperand();
        uint32_t b1Opnd1 = modelB1.addFloatOperand();
        uint32_t b1Opnd2 = modelB1.addOperation2To1V1_0(0, b1Opnd0, b1Opnd1);
        uint32_t b1Opnd3 = modelB1.addOperation2To1V1_0(1, b1Opnd0, b1Opnd1);
        modelB1.identifyInputsAndOutputs({b1Opnd0, b1Opnd1}, {b1Opnd2, b1Opnd3});
        modelB1.finish();
        ASSERT_TRUE(modelB1.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(stepsB[1], &modelB1, devicesB[0],
                        RemapVectorType{{opnd0, b1Opnd0}, {opnd1, b1Opnd1}},  // modelInputs
                        RemapVectorType{{opnd2, b1Opnd2}},  // modelOutputs
                        RemapVectorType{},  // tempsAsSubModelInputs
                        SubModelOutputSetType{{opnd3, b1Opnd3}},  // tempsAsSubModelOutputs
                        RemapVectorType{}));  // outputsAsSubModelInputs
    }
    {
        // Build a model to compare against the submodel from stepsB[2].
        PartitioningModel modelB2;
        uint32_t b2Opnd0 = modelB2.addFloatOperand();
        uint32_t b2Opnd1 = modelB2.addFloatOperand();
        uint32_t b2Opnd2 = modelB2.addOperation2To1V1_2(0, b2Opnd0, b2Opnd1);
        // Note: In the partitioning algorithm, temps that are
        // submodel inputs precede model outputs that are submodel
        // inputs. In the original model "model", opnd3 is a temp and
        // opnd2 is a model output; so in the submodel "modelB2", the
        // corresponding inputs b2Opnd1 and b2Opnd0 must appear in
        // that order.
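        // (Combined with the note in SimpleModel above, the expected overall
        // ordering of submodel inputs is: model inputs first, then temps that
        // are submodel inputs, then model outputs that are submodel inputs.)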
        modelB2.identifyInputsAndOutputs({b2Opnd1, b2Opnd0}, {b2Opnd2});
        modelB2.finish();
        ASSERT_TRUE(modelB2.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(stepsB[2], &modelB2, devicesB[2], RemapVectorType{},  // modelInputs
                        RemapVectorType{{opnd5, b2Opnd2}},  // modelOutputs
                        RemapVectorType{{opnd3, b2Opnd1}},  // tempsAsSubModelInputs
                        SubModelOutputSetType{},  // tempsAsSubModelOutputs
                        RemapVectorType{{opnd2, b2Opnd0}}));  // outputsAsSubModelInputs
    }

    // TODO: Make sure this still works when we have multiple devices
    // of the same version available for slicing. An easy (?) choice
    // would be to route the two different V1_0 operations to different
    // devices.
}

TEST_F(PartitioningTest, SliceModelToEmpty) {
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 = model.addOperation2To1V1_2(0, opnd0, opnd1);
    model.identifyInputsAndOutputs({opnd0, opnd1}, {opnd2});
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Only the V1_2 device can handle any operations in the model.
    // No need to compare the original model to the model from the plan -- we
    // didn't actually do any partitioning.
    const auto devices = makeDevices({{"V1_0", 0.6, HalVersion::V1_0, ~0U},
                                      {"V1_1", 0.7, HalVersion::V1_1, ~0U, ~0U},
                                      {"V1_2", 0.8, HalVersion::V1_2, ~0U, ~0U, ~0U}});
    ExecutionPlan plan;
    ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_NE(plan.forTest_simpleGetDevice().get(), nullptr);
    ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), "V1_2");
}

TEST_F(PartitioningTest, Cpu) {
    // Here's a model where some operations execute only on the CPU.
    // To make things interesting, we produce three partitions --
    // device, cpu, same-device.

    static const uint32_t kCpuOp = 1;
    static const uint32_t kDevOp = 2;

    const auto devices = makeDevices({{"1", 0.5, 1 << kDevOp}});
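    // The single driver device "1" advertises support only for operation
    // encoding kDevOp (mask 1 << kDevOp), so every kCpuOp operation below
    // must instead be picked up by the CPU fallback device.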

    PartitioningModel model;

    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();

    uint32_t opnd2 = model.addOperation2To1V1_0(kDevOp, opnd0, opnd1);
    uint32_t opnd3 = model.addOperation2To1V1_0(kDevOp, opnd0, opnd2);

    uint32_t opnd4 = model.addOperation2To1V1_0(kCpuOp, opnd0, opnd3);
    uint32_t opnd5 = model.addOperation2To1V1_0(kCpuOp, opnd2, opnd4);

    uint32_t opnd6 = model.addFloatOperand();

    uint32_t opnd7 = model.addOperation2To1V1_0(kDevOp, opnd3, opnd5);
    uint32_t opnd8 = model.addOperation2To1V1_0(kDevOp, opnd6, opnd7);

    model.identifyInputsAndOutputs({ opnd0, opnd1, opnd6 }, { opnd4, opnd8 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    ExecutionPlan plan;
    ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& steps = plan.forTest_compoundGetSteps();
    ASSERT_EQ(steps.size(), size_t(3));
    {
        const auto& step0 = steps[0];

        // Build a model to compare against the submodel from steps[0].
        PartitioningModel model0;
        uint32_t m0Opnd0 = model0.addFloatOperand();
        uint32_t m0Opnd1 = model0.addFloatOperand();
        uint32_t m0Opnd2 = model0.addOperation2To1V1_0(kDevOp, m0Opnd0, m0Opnd1);
        uint32_t m0Opnd3 = model0.addOperation2To1V1_0(kDevOp, m0Opnd0, m0Opnd2);
        model0.identifyInputsAndOutputs({ m0Opnd0, m0Opnd1 }, { m0Opnd2, m0Opnd3 });
        model0.finish();
        ASSERT_TRUE(model0.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(step0, &model0, devices[0],
                        RemapVectorType{{opnd0, m0Opnd0}, {opnd1, m0Opnd1}},  // modelInputs
                        RemapVectorType{},  // modelOutputs
                        RemapVectorType{},  // tempsAsSubModelInputs
                        SubModelOutputSetType{{opnd2, m0Opnd2},
                                              {opnd3, m0Opnd3}},  // tempsAsSubModelOutputs
                        RemapVectorType{}));  // outputsAsSubModelInputs
    }
    {
        const auto& step1 = steps[1];

        // Build a model to compare against the submodel from steps[1].
        PartitioningModel model1;
        uint32_t m1Opnd0 = model1.addFloatOperand();
        uint32_t m1Opnd3 = model1.addFloatOperand();
        uint32_t m1Opnd4 = model1.addOperation2To1V1_0(kCpuOp, m1Opnd0, m1Opnd3);
        uint32_t m1Opnd2 = model1.addFloatOperand();
        uint32_t m1Opnd5 = model1.addOperation2To1V1_0(kCpuOp, m1Opnd2, m1Opnd4);
        model1.identifyInputsAndOutputs({ m1Opnd0, m1Opnd3, m1Opnd2 }, { m1Opnd4, m1Opnd5 });
        model1.finish();
        ASSERT_TRUE(model1.isValid());

        ASSERT_NO_FATAL_FAILURE(compare(
                step1, &model1, DeviceManager::getCpuDevice(),
                RemapVectorType{{opnd0, m1Opnd0}},  // modelInputs
                RemapVectorType{{opnd4, m1Opnd4}},  // modelOutputs
                RemapVectorType{{opnd3, m1Opnd3}, {opnd2, m1Opnd2}},  // tempsAsSubModelInputs
                SubModelOutputSetType{{opnd5, m1Opnd5}},  // tempsAsSubModelOutputs
                RemapVectorType{}));  // outputsAsSubModelInputs
    }
    {
        const auto& step2 = steps[2];

        // Build a model to compare against the submodel from steps[2].
        PartitioningModel model2;
        uint32_t m2Opnd3 = model2.addFloatOperand();
        uint32_t m2Opnd5 = model2.addFloatOperand();
        uint32_t m2Opnd7 = model2.addOperation2To1V1_0(kDevOp, m2Opnd3, m2Opnd5);
        uint32_t m2Opnd6 = model2.addFloatOperand();
        uint32_t m2Opnd8 = model2.addOperation2To1V1_0(kDevOp, m2Opnd6, m2Opnd7);
        model2.identifyInputsAndOutputs({ m2Opnd6, m2Opnd3, m2Opnd5 }, { m2Opnd8 });
        model2.finish();
        ASSERT_TRUE(model2.isValid());

        ASSERT_NO_FATAL_FAILURE(compare(
                step2, &model2, devices[0], RemapVectorType{{opnd6, m2Opnd6}},  // modelInputs
                RemapVectorType{{opnd8, m2Opnd8}},  // modelOutputs
                RemapVectorType{{opnd3, m2Opnd3}, {opnd5, m2Opnd5}},  // tempsAsSubModelInputs
                SubModelOutputSetType{},  // tempsAsSubModelOutputs
                RemapVectorType{}));  // outputsAsSubModelInputs
    }
}

TEST_F(PartitioningTest, SetPartitioning) {
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 =
            model.addOperation2To1V1_0(0, opnd0, opnd1, PartitioningModel::Dimensioned::NO);
    uint32_t opnd3 = model.addFloatOperand();
    uint32_t opnd4 = model.addOperation2To1V1_0(1, opnd2, opnd3);
    model.identifyInputsAndOutputs({ opnd0, opnd1, opnd3 }, { opnd4 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    // We expect that we cannot successfully partition, because we
    // have an intermediate operand (opnd2) without dimensions, and
    // this is not currently handled.
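    // (Concretely: if partitioning were to proceed, opnd2 would become a
    // submodel output of unknown size -- which is what
    // forTest_hasSubModelOutputsOfUnknownSize() reports on below.)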

    // One device that can and should execute operation 0.
    const auto devices = makeDevices({{"hw", 0.5, (1 << 0)}});

    // Test kPartitioningNo. We should not even attempt partitioning,
    // so there should be a SIMPLE plan on CPU.
    // No need to compare the original model to the model from the plan -- we
    // didn't actually do any partitioning.
    PartitioningCompilation cPNo(&model, devices);
    ASSERT_EQ(cPNo.setPartitioning(DeviceManager::kPartitioningNo), Result::NO_ERROR);
    ASSERT_EQ(cPNo.finish(), Result::NO_ERROR);
    ASSERT_EQ(cPNo.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_EQ(cPNo.getExecutionPlan().forTest_simpleGetDevice(), DeviceManager::getCpuDevice());

    // Test kPartitioningWithFallback. We should attempt partitioning,
    // reach the end of the partitioning process (so we have an
    // unsuccessful execution plan), discover the dimensionless
    // intermediate operand, then fall back to CPU with a SIMPLE plan, and
    // finally return success.
    // No need to compare the original model to the model from the plan -- we
    // didn't actually do any partitioning.
    PartitioningCompilation cPWithFallback(&model, devices);
    ASSERT_EQ(cPWithFallback.setPartitioning(DeviceManager::kPartitioningWithFallback),
              Result::NO_ERROR);
    ASSERT_EQ(cPWithFallback.finish(), Result::NO_ERROR);
    ASSERT_EQ(cPWithFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_EQ(cPWithFallback.getExecutionPlan().forTest_simpleGetDevice(),
              DeviceManager::getCpuDevice());

    // Test kPartitioningWithoutFallback. We should attempt partitioning, and fail.
    PartitioningCompilation cPWithoutFallback(&model, devices);
    ASSERT_EQ(cPWithoutFallback.setPartitioning(DeviceManager::kPartitioningWithoutFallback),
              Result::NO_ERROR);
    ASSERT_EQ(cPWithoutFallback.finish(), Result::OP_FAILED);
    ASSERT_TRUE(cPWithoutFallback.getExecutionPlan().forTest_hasSubModelOutputsOfUnknownSize());
    ASSERT_EQ(cPWithoutFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::ERROR);
}

// Regression test for http://b/69166603:
// "partitioned compilation and execution yields wrong results when model output is submodel input"
TEST_F(PartitioningTest, ModelOutputAsSubmodelInput) {
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 = model.addOperation2To1V1_0(0, opnd0, opnd1);
    uint32_t opnd3 = model.addOperation2To1V1_0(1, opnd2, opnd2);
    model.identifyInputsAndOutputs({ opnd0, opnd1 }, { opnd2, opnd3 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Compound partition (two devices, each is capable of one of the
    // two operations). We could do more extensive checking here --
    // for example, verify that each step within the plan has the
    // correct (model and submodel)x(inputs and outputs).
    const auto devices = makeDevices({{"0", 0.5, 1 << 0}, {"1", 0.5, 1 << 1}});
    ExecutionPlan plan;
    ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& steps = plan.forTest_compoundGetSteps();
    ASSERT_EQ(steps.size(), size_t(2));
    {
        // Build a model to compare against the submodel from steps[0].
        PartitioningModel model0;
        uint32_t m0Opnd0 = model0.addFloatOperand();
        uint32_t m0Opnd1 = model0.addFloatOperand();
        uint32_t m0Opnd2 = model0.addOperation2To1V1_0(0, m0Opnd0, m0Opnd1);
        model0.identifyInputsAndOutputs({ m0Opnd0, m0Opnd1 }, { m0Opnd2 });
        model0.finish();
        ASSERT_TRUE(model0.isValid());
        ASSERT_NO_FATAL_FAILURE(
                compare(steps[0], &model0, devices[0],
                        RemapVectorType{{opnd0, m0Opnd0}, {opnd1, m0Opnd1}},  // modelInputs
                        RemapVectorType{{opnd2, m0Opnd2}},  // modelOutputs
                        RemapVectorType{},  // tempsAsSubModelInputs
                        SubModelOutputSetType{},  // tempsAsSubModelOutputs
                        RemapVectorType{}));  // outputsAsSubModelInputs
    }
    {
        // Build a model to compare against the submodel from steps[1].
        PartitioningModel model1;
        uint32_t m1Opnd2 = model1.addFloatOperand();
        uint32_t m1Opnd3 = model1.addOperation2To1V1_0(1, m1Opnd2, m1Opnd2);
        model1.identifyInputsAndOutputs({ m1Opnd2 }, { m1Opnd3 });
        model1.finish();
        ASSERT_TRUE(model1.isValid());

        ASSERT_NO_FATAL_FAILURE(
                compare(steps[1], &model1, devices[1], RemapVectorType{},  // modelInputs
                        RemapVectorType{{opnd3, m1Opnd3}},  // modelOutputs
                        RemapVectorType{},  // tempsAsSubModelInputs
                        SubModelOutputSetType{},  // tempsAsSubModelOutputs
                        RemapVectorType{{opnd2, m1Opnd2}}));  // outputsAsSubModelInputs
    }
}

TEST_F(PartitioningTest, OemOperations) {
    // Trivial model consisting solely of an OEM operation.
    PartitioningModel model;
    uint32_t opndIn = model.addFloatOperand();
    uint32_t opndOut = model.addOperationOEM1To1(opndIn);
    model.identifyInputsAndOutputs({ opndIn }, { opndOut });
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Verify that the best driver that can run an OEM operation is
    // used, even if it is not better than the CPU.
    // No need to compare the original model to the model from the plan -- we
    // didn't actually do any partitioning.
    const auto devicesBestOEM = makeDevices({{"badOEM", 1.5, ~0U, PartitioningDriver::OEMYes},
                                             {"noOEM", 0.5, ~0U, PartitioningDriver::OEMNo},
                                             {"goodOEM", 1.2, ~0U, PartitioningDriver::OEMYes}});
    PartitioningCompilation compilationBestOEM(&model, devicesBestOEM);
    ASSERT_EQ(compilationBestOEM.finish(), Result::NO_ERROR);
    const auto& planBestOEM = compilationBestOEM.getExecutionPlan();
    ASSERT_EQ(planBestOEM.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_NE(planBestOEM.forTest_simpleGetDevice().get(), nullptr);
    ASSERT_STREQ(planBestOEM.forTest_simpleGetDevice()->getName(), "goodOEM");

    // Verify that we get an error if no driver can run an OEM operation.
    const auto devicesNoOEM = makeDevices({{"noOEM", 0.5, ~0U, PartitioningDriver::OEMNo}});
    PartitioningCompilation compilationNoOEM(&model, devicesNoOEM);
    ASSERT_EQ(compilationNoOEM.finish(), Result::BAD_DATA);

    // Verify that we get an error if a driver can SUPPORT but not PREPARE an OEM operation.
    const auto devicesIndecisiveOEM =
            makeDevices({{"indecisiveOEM", 0.5, ~0U, PartitioningDriver::OEMIndecisive}});
    PartitioningCompilation compilationIndecisiveOEM(&model, devicesIndecisiveOEM);
    ASSERT_NE(compilationIndecisiveOEM.finish(), Result::NO_ERROR);

    // Verify that we get an error if there are no drivers (only CPU fallback),
    // since the CPU cannot run an OEM operation.
    PartitioningCompilation compilationNoDrivers(&model, makeDevices({}) /* no drivers */);
    ASSERT_EQ(compilationNoDrivers.finish(), Result::BAD_DATA);
}

TEST_F(PartitioningTest, RelaxedFP) {
    const auto devices = makeDevices({// Best choice for non-relaxed model.
                                      {"f32", 0.8, 0.9 /* relaxed */, ~0U},
                                      // Best choice for relaxed model.
                                      {"f16", 0.9, 0.8 /* relaxed */, ~0U}});

    auto TrivialTest = [&devices](bool doRelax, const char* expectDevice) {
        // Trivial model consisting solely of one operation.
        SCOPED_TRACE(expectDevice);
        PartitioningModel model;
        uint32_t opnd0 = model.addFloatOperand();
        uint32_t opnd1 = model.addFloatOperand();
        uint32_t opnd2 = model.addOperation2To1V1_0(0, opnd0, opnd1);
        model.identifyInputsAndOutputs({ opnd0, opnd1 }, { opnd2 });
        model.relaxComputationFloat32toFloat16(doRelax);
        model.finish();
        ASSERT_TRUE(model.isValid());
        // Verify that the model will be executed on the appropriate device.
        // No need to compare the original model to the model from the plan -- we
        // didn't actually do any partitioning.
        ExecutionPlan plan;
        ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
                  ANEURALNETWORKS_NO_ERROR);
        ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
        ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), expectDevice);
    };

    ASSERT_NO_FATAL_FAILURE(TrivialTest(false, "f32"));
    ASSERT_NO_FATAL_FAILURE(TrivialTest(true, "f16"));
}

TEST_F(PartitioningTest, Perf) {
    // The various type names used here are confusing.
    //
    // OperandType (from HAL file), WrapperType (from NeuralNetworksWrapper.h),
    // and OperandCode (from NeuralNetworks.h) are different enums representing
    // the same type kind -- e.g., OperandType::FLOAT32, WrapperType::FLOAT32,
    // ANEURALNETWORKS_FLOAT32. Corresponding enumerators have the same value.
    //
    // WrapperOperandType is the NeuralNetworksWrapper.h representation of a
    // full operand type (WrapperType plus dimensions plus other attributes).

    auto TestType = [](OperandType operandType) {
        SCOPED_TRACE(toString(operandType));
        // Trivial model consisting solely of an OEM operation. We
        // pick an OEM operation because this allows us to use
        // inputs and outputs of any number and type.
        PartitioningModel model;
        uint32_t opndIn = model.addOperand(static_cast<WrapperType>(operandType));
        uint32_t opndOut = model.addOperationOEM1To1(opndIn);
        model.identifyInputsAndOutputs({opndIn}, {opndOut});
        model.finish();
        ASSERT_TRUE(model.isValid());

        const Capabilities baseCapabilities = makeCapabilities(0.5);

        {
            // better than base
            Capabilities goodCapabilities = baseCapabilities;
            update(&goodCapabilities, operandType, 0.25);

            const auto devices =
                    makeDevices({{"base", baseCapabilities, ~0U, PartitioningDriver::OEMYes},
                                 {"good", goodCapabilities, ~0U, PartitioningDriver::OEMYes}});

            // Verify that the model will be executed on "good".
            // No need to compare the original model to the model from the plan -- we
            // didn't actually do any partitioning.
            ExecutionPlan plan;
            ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
                      ANEURALNETWORKS_NO_ERROR);
            ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
            ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), "good");
        }

        {
            // worse than base
            Capabilities badCapabilities = baseCapabilities;
            update(&badCapabilities, operandType, 0.75);
            const auto devices =
                    makeDevices({{"base", baseCapabilities, ~0U, PartitioningDriver::OEMYes},
                                 {"bad", badCapabilities, ~0U, PartitioningDriver::OEMYes}});

            // Verify that the model will be executed on "base".
            // No need to compare the original model to the model from the plan -- we
            // didn't actually do any partitioning.
            ExecutionPlan plan;
            ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
                      ANEURALNETWORKS_NO_ERROR);
            ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
            ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), "base");
        }
    };

    for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MIN);
         type <= static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MAX); ++type) {
        TestType(static_cast<OperandType>(type));
    }
    for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::OEM_MIN);
         type <= static_cast<uint32_t>(OperandTypeRange::OEM_MAX); ++type) {
        TestType(static_cast<OperandType>(type));
    }
}

// Test token rehashing during the compilation step.
class CacheTest : public PartitioningTest {
   protected:
    virtual void SetUp() override {
        PartitioningTest::SetUp();
        char cacheDirTemp[] = "/data/local/tmp/TestCompilationCachingXXXXXX";
        char* cacheDir = mkdtemp(cacheDirTemp);
        ASSERT_NE(cacheDir, nullptr);
        mCacheDir = cacheDir;
    }

    virtual void TearDown() override {
        if (!::testing::Test::HasFailure()) {
            std::filesystem::remove_all(mCacheDir);
        }
        PartitioningTest::TearDown();
    }

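    // Checks that every pair of tokens in the provided list is distinct
    // (a pairwise EXPECT_NE over all combinations).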
    void expectUniqueTokens(const std::vector<std::vector<uint8_t>>& tokens) {
        for (uint32_t i = 0; i < tokens.size(); i++) {
            SCOPED_TRACE(i);
            for (uint32_t j = i + 1; j < tokens.size(); j++) {
                SCOPED_TRACE(j);
                EXPECT_NE(tokens[i], tokens[j]);
            }
        }
    }

    // Launch a single run of the partitioner against the provided model and device list with
    // cache token provided as tokenIn. Find the partition for the device with deviceName.
    // Record the transformed token into tokenOut.
    // If tokenIn is empty, no caching information will be provided to the partitioner.
    void getTransformedCacheTokenSingle(const PartitioningModel& model,
                                        const std::vector<std::shared_ptr<Device>>& devices,
                                        const char* deviceName, const std::vector<uint8_t>& tokenIn,
                                        ExecutePreference preference,
                                        std::vector<uint8_t>* tokenOut) {
        // Compile the model and get the execution plan.
        PartitioningCompilation compilation(&model, devices);
        if (!tokenIn.empty()) {
            compilation.setCaching(mCacheDir.c_str(), tokenIn);
        }
        compilation.setPreference(preference);
        ASSERT_EQ(compilation.finish(), Result::NO_ERROR);
        const ExecutionPlan& plan = compilation.getExecutionPlan();

        // Find the cache info for the device.
        const uint8_t* token = nullptr;
        if (plan.forTest_getKind() == ExecutionPlan::Kind::SIMPLE) {
            ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), deviceName);
            token = plan.forTest_simpleGetCacheToken();
        } else if (plan.forTest_getKind() == ExecutionPlan::Kind::COMPOUND) {
            const auto& steps = plan.forTest_compoundGetSteps();
            bool found = false;
            for (const auto& step : steps) {
                // In general, two or more partitions can be on the same device. However, this
                // will not happen on the test models with only 2 operations.
                if (strcmp(step->getDevice()->getName(), deviceName) == 0) {
                    ASSERT_FALSE(found);
                    token = step->forTest_getCacheToken();
                    found = true;
                }
            }
            ASSERT_TRUE(found);
        } else {
            FAIL();
        }

        // Retrieve the transformed token from the cache info.
        if (token == nullptr) {
            tokenOut->clear();
        } else {
            tokenOut->resize(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN);
            std::copy(token, token + ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, tokenOut->begin());
        }
    }

    // A wrapper of getTransformedCacheTokenSingle, which runs getTransformedCacheTokenSingle
    // multiple times and checks if the transformation provides a consistent result.
    void getTransformedCacheToken(const PartitioningModel& model,
                                  const std::vector<std::shared_ptr<Device>>& devices,
                                  const char* deviceName, const std::vector<uint8_t>& tokenIn,
                                  ExecutePreference preference, std::vector<uint8_t>* tokenOut) {
        getTransformedCacheTokenSingle(model, devices, deviceName, tokenIn, preference, tokenOut);

        // Test if the runtime maps to the same cache token every time for the same compilation
        // setup.
        for (uint32_t i = 0; i < 10; i++) {
            std::vector<uint8_t> token;
            SCOPED_TRACE(i);
            getTransformedCacheTokenSingle(model, devices, deviceName, tokenIn, preference,
                                           &token);
            EXPECT_EQ(*tokenOut, token);
        }
    }

    void CreateModelForCachingTests(PartitioningModel* model) {
        uint32_t opnd0 = model->addFloatOperand();
        uint32_t opnd1 = model->addFloatOperand();
        uint32_t opnd2 = model->addOperation2To1V1_0(0, opnd0, opnd1);
        uint32_t opnd3 = model->addFloatOperand();
        uint32_t opnd4 = model->addOperation2To1V1_0(1, opnd2, opnd3);
        model->identifyInputsAndOutputs({opnd0, opnd1, opnd3}, {opnd4});
        model->finish();
        ASSERT_TRUE(model->isValid());
    }

    std::string mCacheDir;
};
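
// Taken together, the tests below vary each input to the token transformation
// that this fixture can control, expecting the transformed token to change:
// the application-provided token, the device name, the device version string,
// the execution preference, and the partitioning outcome. (This summarizes the
// cases that follow; it is not necessarily an exhaustive list of everything
// the runtime hashes.)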

// Test the case when no token is provided by the application and the execution plan has a
// simple body.
TEST_F(CacheTest, CacheTokenNoneSimpleBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // deviceA can execute the whole model.
    const auto deviceA = makeDevices({
            {"deviceA", 0.5, ~0U},
    });

    std::vector<uint8_t> tokenIn, tokenOut;
    getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut);
    EXPECT_TRUE(tokenOut.empty());
}

// Test if the runtime maps to different cache tokens for devices with different names in
// execution plan with a simple body.
TEST_F(CacheTest, CacheTokenDifferentDeviceNamesSimpleBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // Two devices that can both execute the whole model.
    const auto deviceA = makeDevices({{"deviceA", 0.5, ~0U}});
    const auto deviceB = makeDevices({{"deviceB", 0.5, ~0U}});

    std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    std::vector<uint8_t> deviceAToken, deviceBToken;
    getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceAToken);
    getTransformedCacheToken(model, deviceB, "deviceB", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceBToken);
    expectUniqueTokens({deviceAToken, deviceBToken});
}

// Test if the runtime maps to different cache tokens for devices with different version strings
// in execution plan with a simple body.
TEST_F(CacheTest, CacheTokenDifferentDeviceVersionStringsSimpleBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // Two devices that can both execute the whole model.
    const auto deviceA_1_0 = makeDevices({{"deviceA", "1.0", 0.5, ~0U}});
    const auto deviceA_1_1 = makeDevices({{"deviceA", "1.1", 0.5, ~0U}});

    std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    std::vector<uint8_t> deviceA_1_0_Token, deviceA_1_1_Token;
    getTransformedCacheToken(model, deviceA_1_0, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceA_1_0_Token);
    getTransformedCacheToken(model, deviceA_1_1, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceA_1_1_Token);
    expectUniqueTokens({deviceA_1_0_Token, deviceA_1_1_Token});
}

// Test if the runtime maps to different cache tokens for compilations with different preferences
// in execution plan with a simple body.
TEST_F(CacheTest, CacheTokenDifferentPreferencesSimpleBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // One device that can execute the whole model.
    const auto deviceA = makeDevices({{"deviceA", 0.5, ~0U}});

    std::vector<uint8_t> fastToken, powerToken, sustainedToken;
    std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &fastToken);
    getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
                             ExecutePreference::PREFER_LOW_POWER, &powerToken);
    getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
                             ExecutePreference::PREFER_SUSTAINED_SPEED, &sustainedToken);
    expectUniqueTokens({fastToken, powerToken, sustainedToken});
}

// Test if the runtime maps to different cache tokens for compilations with different tokens
// provided by the application in execution plan with a simple body.
TEST_F(CacheTest, CacheTokenDifferentTokensSimpleBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // One device that can execute the whole model.
    const auto deviceA = makeDevices({{"deviceA", 0.5, ~0U}});

    std::vector<uint8_t> tokenOut1, tokenOut2;
    std::vector<uint8_t> tokenIn1(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    std::vector<uint8_t> tokenIn2(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 1);
    getTransformedCacheToken(model, deviceA, "deviceA", tokenIn1,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut1);
    getTransformedCacheToken(model, deviceA, "deviceA", tokenIn2,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut2);
    expectUniqueTokens({tokenOut1, tokenOut2});
}

// Test the case when no token is provided by the application and the execution plan has a
// compound body.
TEST_F(CacheTest, CacheTokenNoneCompoundBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // DeviceA executes the first operation only.
    const auto devices = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});

    std::vector<uint8_t> tokenIn, tokenOut;
    getTransformedCacheToken(model, devices, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut);
    EXPECT_TRUE(tokenOut.empty());
    getTransformedCacheToken(model, devices, "deviceB", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut);
    EXPECT_TRUE(tokenOut.empty());
}

// Test if the runtime maps to different cache tokens for devices with different names in
// execution plan with a compound body.
TEST_F(CacheTest, CacheTokenDifferentDeviceNamesCompoundBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // DeviceA executes the first operation only.
    const auto devices1 = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceC", 0.5, 1 << 1}});
    // DeviceB executes the first operation only.
    const auto devices2 = makeDevices({{"deviceB", 0.8, ~0U}, {"deviceC", 0.5, 1 << 1}});

    std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    std::vector<uint8_t> deviceAToken, deviceBToken;
    getTransformedCacheToken(model, devices1, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceAToken);
    getTransformedCacheToken(model, devices2, "deviceB", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceBToken);
    expectUniqueTokens({deviceAToken, deviceBToken});
}
// Test if the runtime maps to different cache tokens for devices with different version strings
// in execution plan with a compound body.
TEST_F(CacheTest, CacheTokenDifferentDeviceVersionStringsCompoundBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // DeviceA version 1.0 executes the first operation only.
    const auto devices1 = makeDevices({{"deviceA", "1.0", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});
    // DeviceA version 1.1 executes the first operation only.
    const auto devices2 = makeDevices({{"deviceA", "1.1", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});

    std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    std::vector<uint8_t> deviceA_1_0_Token, deviceA_1_1_Token;
    getTransformedCacheToken(model, devices1, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceA_1_0_Token);
    getTransformedCacheToken(model, devices2, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceA_1_1_Token);
    expectUniqueTokens({deviceA_1_0_Token, deviceA_1_1_Token});
}

// Test if the runtime maps to different cache tokens for compilations with different preferences
// in execution plan with a compound body.
TEST_F(CacheTest, CacheTokenDifferentPreferencesCompoundBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // DeviceA executes the first operation only.
    const auto devices = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});

    std::vector<uint8_t> fastToken, powerToken, sustainedToken;
    std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    getTransformedCacheToken(model, devices, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &fastToken);
    getTransformedCacheToken(model, devices, "deviceA", tokenIn,
                             ExecutePreference::PREFER_LOW_POWER, &powerToken);
    getTransformedCacheToken(model, devices, "deviceA", tokenIn,
                             ExecutePreference::PREFER_SUSTAINED_SPEED, &sustainedToken);
    expectUniqueTokens({fastToken, powerToken, sustainedToken});
}

// Test if the runtime maps to different cache tokens for compilations with different tokens
// provided by the application in execution plan with a compound body.
TEST_F(CacheTest, CacheTokenDifferentTokensCompoundBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // DeviceA executes the first operation only.
    const auto devices = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});

    std::vector<uint8_t> tokenOut1, tokenOut2;
    std::vector<uint8_t> tokenIn1(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    std::vector<uint8_t> tokenIn2(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 1);
    getTransformedCacheToken(model, devices, "deviceA", tokenIn1,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut1);
    getTransformedCacheToken(model, devices, "deviceA", tokenIn2,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut2);
    expectUniqueTokens({tokenOut1, tokenOut2});
}

// Test if the runtime maps to different cache tokens for compilations with different partitioning
// outcomes in execution plan with a compound body.
TEST_F(CacheTest, CacheTokenDifferentPartitionsCompoundBody) {
    PartitioningModel model;
    CreateModelForCachingTests(&model);

    // DeviceA executes the whole model.
    const auto devices1 = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 0U}});
    // DeviceA executes the first operation only.
    const auto devices2 = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});
    // DeviceA executes the second operation only.
    const auto devices3 = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 0}});

    std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
    std::vector<uint8_t> tokenOut1, tokenOut2, tokenOut3;
    getTransformedCacheToken(model, devices1, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut1);
    getTransformedCacheToken(model, devices2, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut2);
    getTransformedCacheToken(model, devices3, "deviceA", tokenIn,
                             ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut3);
    expectUniqueTokens({tokenOut1, tokenOut2, tokenOut3});
}

// Very basic tests of some of the PerformanceInfo functionality.
// Placed in this file because partitioning is the consumer of this functionality.
class PerfTest : public ::testing::Test {};

TEST_F(PerfTest, Lookup) {
    // Derive an arbitrary (but reproducible) performance value from an OperandType.
    // We'll use this to ensure that we can save and then recover a type's performance.
    auto typePerf = [](OperandType type) { return float(static_cast<uint32_t>(type)); };
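    // For example, assuming the usual NNAPI enumerator values (e.g.,
    // OperandType::TENSOR_FLOAT32 == 3), typePerf(OperandType::TENSOR_FLOAT32)
    // would be 3.0f -- a distinct, reproducible value for each type.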

    Capabilities capabilities = makeCapabilities(-1.0f);

    for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MIN);
         type <= static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MAX); ++type) {
        OperandType operandType = static_cast<OperandType>(type);
        update(&capabilities, operandType, typePerf(operandType));
    }
    for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::OEM_MIN);
         type <= static_cast<uint32_t>(OperandTypeRange::OEM_MAX); ++type) {
        OperandType operandType = static_cast<OperandType>(type);
        update(&capabilities, operandType, typePerf(operandType));
    }

    // Make sure lookup retrieves the values stored by update.

    for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MIN);
         type <= static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MAX); ++type) {
        OperandType operandType = static_cast<OperandType>(type);
        SCOPED_TRACE(toString(operandType));
        EXPECT_EQ(lookupExecTime(capabilities, operandType), typePerf(operandType));
    }
    for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::OEM_MIN);
         type <= static_cast<uint32_t>(OperandTypeRange::OEM_MAX); ++type) {
        OperandType operandType = static_cast<OperandType>(type);
        SCOPED_TRACE(toString(operandType));
        EXPECT_EQ(lookupExecTime(capabilities, operandType), typePerf(operandType));
    }

    // Check the behavior of a missing type.

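    // (The expectation below assumes that lookupExecTime() reports FLT_MAX --
    // i.e., worst-possible performance -- for a type that update() never
    // stored.)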
    OperandType operandType =
            static_cast<OperandType>(static_cast<uint32_t>(OperandTypeRange::BASE_MAX) + 1);
    EXPECT_EQ(lookupExecTime(capabilities, operandType), FLT_MAX);
}

}  // namespace