1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CompilationBuilder.h"
18 #include "ExecutionPlan.h"
19 #include "GraphDump.h"
20 #include "HalInterfaces.h"
21 #include "Manager.h"
22 #include "ModelBuilder.h"
23 #include "NeuralNetworks.h"
24 #include "NeuralNetworksOEM.h"
25 #include "NeuralNetworksWrapper.h"
26 #include "SampleDriver.h"
27 #include "Utils.h"
28 #include "ValidateHal.h"
29 
30 #include <gtest/gtest.h>
31 
32 #include <map>
33 #include <queue>
34 
35 // Uncomment the following line to generate some debugging output that
36 // may be useful when analyzing failures:
37 //
38 // #define VERBOSE VERBOSE
39 
40 // Uncomment the following line to generate DOT graphs.
41 //
42 // #define GRAPH GRAPH
43 
44 // These tests do whitebox testing of the graph partitioning
45 // algorithm.  It is "whitebox" in the sense that we're not evaluating
46 // whether a particular partitioning is legal, or "good enough"
47 // according to some metric, but whether it exactly matches the
48 // expected behavior of the current partitioning algorithm.
49 //
50 // A key part of the current partitioning algorithm is to determine
51 // which device among the available devices should be the one to
52 // execute a particular operation from the graph.  This determination
53 // is made "locally" -- i.e., it does not depend on the graph
54 // topology, only on the properties of the operation in question.
55 // IDevice::getSupportedOperations() indicates which operations in a
56 // graph can be executed on a device, and IDevice::getCapabilities()
57 // indicates how "good" that device is for executing particular kinds
58 // of operations.  For each operation, the partitioning algorithm
59 // picks the "best" device that is capable of executing that
60 // operation; if no device can do so, then the algorithm picks the
61 // cpu.
62 //
63 // As part of this testing approach, we want to make it easy to
64 // specify which operations in a test graph can be executed on which
65 // devices.  We accomplish this with an abstraction: There are eight
66 // different kinds of operations (each of which has two inputs and one
67 // output), and when we instantiate a device for testing purposes, we
68 // specify what subset of those eight kinds of operations the device
69 // is able to execute.
70 //
71 // The eight kinds of operations are represented in the graph as ADD
72 // or MUL with a particular activation function -- two opcodes times
73 // four activation functions means eight available operation kinds.
74 // This is a low-level representation detail -- when we specify the
75 // behavior of the device or build a graph, we do so in terms of
76 // operation encodings 0..7.
77 //
78 // In order to determine whether or not a partitioning matches the
79 // expected partitioning, we check the number of partitions, check
80 // which device each partition targets, and compare each partition's
81 // subgraph, model inputs, model outputs, submodel inputs, and
82 // submodel outputs against what is expected.  In order to perform
83 // that comparison, we build a model to compare against a partition's
84 // submodel and run a graph comparison algorithm on it.  The graph
85 // comparison and the inputs and outputs comparisons are syntactic
86 // rather than semantic comparisons -- they don't allow for
87 // reorderings of inputs and outputs.  Because of this, we need to
88 // know exactly how the partitioning algorithm orders inputs and
89 // outputs in order to construct the models and operand lists to
90 // compare against.  Here are some relevant behaviors of the
91 // partitioning algorithm:
92 //
93 // - It builds a subgraph by walking operations in forward topological
94 //   order, and adding each operation's input operands and output
95 //   operands in index order (input followed by output) when that
96 //   operation is added.  (It does not add an input that has already
97 //   been added.)
98 // - It finds model inputs, model outputs, and submodel inputs in
99 //   the order the corresponding operands were added to the subgraph
100 //   (see ExecutionStep methods getModelInputs(), getModelOutputs(),
101 //   getTempsAsSubModelInputs(), getOutputsAsSubModelInputs()).
102 // - It finds temps as submodel outputs in numerical order of corresponding
103 //   operand number in the original model (see ExecutionStep method
104 //   getTempsAsSubModelOutputs()).
105 // - When it calls identifyInputsAndOutputs() on the submodel, it
106 //   passes inputs from getModelInputs() in order, followed by temps as
107 //   submodel inputs from getTempsAsSubModelInputs() in order,
108 //   followed by outputs as submodel inputs from
109 //   getOutputsAsSubModelInputs() in order; and it passes outputs from
110 //   getModelOutputs() in order followed by submodel outputs from
111 //   getTempsAsSubModelOutputs() in order.
112 //
113 // TODO: Maybe the logic for comparing a partition to an expected
114 //       model should be changed to tolerate reorderings of inputs and
115 //       outputs, so that when we build models and lists to compare
116 //       against, we don't need to worry about input and output
117 //       orderings.  But is there a way to do this that still lets us
118 //       verify that we have the correct relationships between
119 //       an (original) model's inputs and outputs and each submodel's
120 //       inputs and outputs, as well as the correct relationship
121 //       between submodel inputs and outputs across partitions?
122 
123 namespace {
124 
125 using CompilationBuilder = ::android::nn::CompilationBuilder;
126 using Device = ::android::nn::Device;
127 using DeviceManager = ::android::nn::DeviceManager;
128 using ExecutePreference = ::android::nn::wrapper::ExecutePreference;
129 using ExecutionPlan = ::android::nn::ExecutionPlan;
130 using ExecutionStep = ::android::nn::ExecutionStep;
131 using HidlModel = ::android::hardware::neuralnetworks::V1_1::Model;
132 using ModelBuilder = ::android::nn::ModelBuilder;
133 using Result = ::android::nn::wrapper::Result;
134 using SampleDriver = ::android::nn::sample_driver::SampleDriver;
135 using WrapperCompilation = ::android::nn::wrapper::Compilation;
136 using WrapperModel = ::android::nn::wrapper::Model;
137 using WrapperOperandType = ::android::nn::wrapper::OperandType;
138 using WrapperType = ::android::nn::wrapper::Type;
139 
140 template <typename T> using sp = ::android::sp<T>;
141 
// We employ an operation numbering scheme:
// - 0..FuseCode-1 = ADD with the appropriate activation function
// - FuseCode..2*FuseCode-1 = MUL with the appropriate activation function
const uint32_t kNumFuseCodes = 4;
// Sentinel returned by lookupOperation() when an operation is not one of
// the eight encoded ADD/MUL kinds (~0 == 0xFFFFFFFF as uint32_t).
const uint32_t kBadOperation = ~0;
147 
148 // Look up the operation with the specified index in a graph, and
149 // return the operation encoding -- 0..7; or, if for some reason this
150 // is not one of the encoded operations, then return kBadOperation.
lookupOperation(std::function<const Operation & (uint32_t)> getOperation,std::function<const Operand & (uint32_t)> getOperand,std::function<const uint8_t * (uint32_t)> getValue,uint32_t operationIndex)151 uint32_t lookupOperation(std::function<const Operation&(uint32_t)> getOperation,
152                          std::function<const Operand&(uint32_t)> getOperand,
153                          std::function<const uint8_t*(uint32_t)> getValue,
154                          uint32_t operationIndex) {
155     const Operation& operation = getOperation(operationIndex);
156     switch (operation.type) {
157         case OperationType::ADD:
158         case OperationType::MUL: {
159             // input2 is the fused activation function
160             const Operand& input2 = getOperand(operation.inputs[2]);
161             if ((input2.type == OperandType::INT32) &&
162                 (input2.lifetime == OperandLifeTime::CONSTANT_COPY)) {
163                 int32_t value;
164                 memcpy(&value,
165                        getValue(input2.location.offset),
166                        input2.location.length);
167                 if (operation.type == OperationType::MUL) {
168                     value += kNumFuseCodes;
169                 }
170                 return value;
171             }
172             break;
173         }
174         default:
175             break;
176     }
177     return kBadOperation;
178 }
179 
lookupOperation(const HidlModel & model,uint32_t operationIndex)180 uint32_t lookupOperation(const HidlModel& model, uint32_t operationIndex) {
181     return lookupOperation(
182         [&model](uint32_t index) -> const Operation& {
183             return model.operations[index];
184         },
185         [&model](uint32_t index) -> const Operand& {
186             return model.operands[index];
187         },
188         [&model](uint32_t offset) {return &model.operandValues[offset];},
189         operationIndex);
190 }
191 
192 #ifdef VERBOSE
// This is a debugging utility function: prints the HIDL rendering of the
// given model, followed by its input indexes, output indexes, and each
// operation, one per line.
void dump(const char* name, const ModelBuilder* model) {
    HidlModel hidlModel;
    model->setHidlModel(&hidlModel);
    std::cout << name << ": " << toString(hidlModel) << std::endl;
    std::cout << "inputs: " << toString(hidlModel.inputIndexes) << std::endl;
    std::cout << "outputs: " << toString(hidlModel.outputIndexes) << std::endl;
    for (size_t i = 0, e = hidlModel.operations.size(); i < e; i++) {
        std::cout << "operation[" << i << "]: " << toString(hidlModel.operations[i]) << std::endl;
    }
}
204 #endif
205 
206 #ifdef GRAPH
// Emit a DOT graph of the given HIDL model (only compiled in when GRAPH
// is defined).
inline void hidlGraphDump(const char* name, const HidlModel& model) {
    ::android::nn::graphDump(name, model);
}
210 #endif
211 
// Dump a DOT graph of the given wrapper model when GRAPH is defined;
// otherwise this is a no-op (hence the [[maybe_unused]] parameters).
void graphDump([[maybe_unused]] const char* name, [[maybe_unused]] const WrapperModel& model) {
#ifdef GRAPH
    HidlModel hidlModel;
    reinterpret_cast<const ModelBuilder*>(model.getHandle())->setHidlModel(&hidlModel);
    hidlGraphDump(name, hidlModel);
#endif
}
219 
220 // This is an IDevice for testing purposes.  It only has a few
221 // interesting properties, all of which are specified as constructor
222 // arguments: device capabilities; which subset of operation kinds
223 // (0..7) does the device support; does the device support the OEM
224 // operation.  The subset is represented with a bitmask, in which
225 // operation kind K corresponds to the bit (1 << K).
226 class PartitioningDriver : public SampleDriver {
227 private:
228     // Dummy class -- a prepared model must not be nullptr.
229     class PartitioningPreparedModel : public IPreparedModel {
230     public:
execute(const Request &,const sp<IExecutionCallback> &)231         Return<ErrorStatus> execute(const Request&,
232                                     const sp<IExecutionCallback>&) override {
233             return ErrorStatus::DEVICE_UNAVAILABLE;
234         }
235     };
236 public:
237     enum OEM { OEMNo, OEMYes };
238 
PartitioningDriver(const char * name,Capabilities capabilities,uint32_t operationMask,OEM oem=OEMNo)239     PartitioningDriver(const char *name, Capabilities capabilities,
240                        uint32_t operationMask, OEM oem = OEMNo) :
241             SampleDriver(name), mCapabilities(capabilities),
242             mOperationMask(operationMask), mOEM(oem) {}
~PartitioningDriver()243     ~PartitioningDriver() override {}
244 
prepareModel_1_1(const Model &,ExecutionPreference,const sp<IPreparedModelCallback> & cb)245     Return<ErrorStatus> prepareModel_1_1(const Model&, ExecutionPreference,
246                                          const sp<IPreparedModelCallback>& cb) override {
247         cb->notify(ErrorStatus::NONE, new PartitioningPreparedModel);
248         return ErrorStatus::NONE;
249     }
250 
getStatus()251     Return<DeviceStatus> getStatus() override {
252         return DeviceStatus::AVAILABLE;
253     }
254 
getCapabilities_1_1(getCapabilities_1_1_cb cb)255     Return<void> getCapabilities_1_1(getCapabilities_1_1_cb cb) override {
256         cb(ErrorStatus::NONE, mCapabilities);
257         return Void();
258     }
259 
getSupportedOperations_1_1(const Model & model,getSupportedOperations_cb cb)260     Return<void> getSupportedOperations_1_1(const Model& model,
261                                             getSupportedOperations_cb cb) override {
262         if (!android::nn::validateModel(model)) {
263             cb(ErrorStatus::INVALID_ARGUMENT, std::vector<bool>());
264             return Void();
265         }
266 
267         const size_t count = model.operations.size();
268         std::vector<bool> supported(count);
269         for (size_t i = 0; i < count; i++) {
270             if (model.operations[i].type == OperationType::OEM_OPERATION) {
271                 supported[i] = (mOEM == OEMYes);
272                 continue;
273             }
274             supported[i] = false;
275             uint32_t operation = lookupOperation(model, i);
276             if ((operation != kBadOperation) && (mOperationMask & (1 << operation))) {
277                 supported[i] = true;
278             }
279         }
280         cb(ErrorStatus::NONE, supported);
281         return Void();
282     }
283 
284 private:
285     Capabilities mCapabilities;
286     uint32_t mOperationMask;
287     OEM mOEM;
288 };
289 
290 // This class adds some simple abstractions and utilities on top of
291 // ::android::nn::wrapper::Model.  For example, it provides methods
292 // that work in terms of operation kind (0..7); and because we care
293 // about graph topology rather than details of operand types and
294 // values, it greatly simplifies the process of creating operands.
295 class PartitioningModel : public WrapperModel {
296 public:
297     // Create a tensor operand of the specified type, and return the
298     // corresponding operand index.
addFloatOperand()299     uint32_t addFloatOperand() {
300         static const WrapperOperandType type(WrapperType::TENSOR_FLOAT32, { 1 });
301         return addOperand(&type);
302     }
addQuantOperand()303     uint32_t addQuantOperand() {
304         static const WrapperOperandType type(WrapperType::TENSOR_QUANT8_ASYMM, { 1 });
305         return addOperand(&type);
306     }
307 
308     // Create an operation with two inputs and one output, specifying
309     // the operation kind (0..7) and the input operand indexes.
310     // Returns the output operand index.
311     enum class Dimensioned { NO, YES };
addOperation2To1(uint32_t operation,const uint32_t input0,const uint32_t input1,Dimensioned dimensionedOutput=Dimensioned::YES)312     uint32_t addOperation2To1(uint32_t operation, const uint32_t input0, const uint32_t input1,
313                               Dimensioned dimensionedOutput = Dimensioned::YES) {
314         ANeuralNetworksOperationType type =
315                 (operation < kNumFuseCodes ? ANEURALNETWORKS_ADD : ANEURALNETWORKS_MUL);
316         int32_t fuseCode = (operation < kNumFuseCodes ? operation : operation - kNumFuseCodes);
317         uint32_t input2 = addIntOperand(fuseCode);
318         uint32_t output = addOperandOfSameType(input0, dimensionedOutput);
319         addOperation(type, { input0, input1, input2 }, { output });
320         return output;
321     }
322 
323     // Create an OEM operation with one input and one output,
324     // specifying the input operand index.  Returns the output operand
325     // index.
addOperationOEM1To1(const uint32_t input,Dimensioned dimensionedOutput=Dimensioned::YES)326     uint32_t addOperationOEM1To1(const uint32_t input,
327                                  Dimensioned dimensionedOutput = Dimensioned::YES) {
328         uint32_t output = addOperandOfSameType(input, dimensionedOutput);
329         addOperation(ANEURALNETWORKS_OEM_OPERATION, { input }, { output });
330         return output;
331     }
332 
333     // Run the partitioning algorithm to create an ExecutionPlan.
partitionTheWork(const std::vector<std::shared_ptr<Device>> & devices,ExecutePreference preference,ExecutionPlan * plan)334     int partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
335                          ExecutePreference preference, ExecutionPlan* plan) {
336         return reinterpret_cast<ModelBuilder*>(getHandle())->partitionTheWork(
337             devices, static_cast<uint32_t>(preference), plan);
338     }
339 
340 #ifdef VERBOSE
341     // This is a debugging utility function.
dump(const char * name) const342     void dump(const char* name) const {
343         const ModelBuilder* mb = reinterpret_cast<const ModelBuilder*>(getHandle());
344         ::dump(name, mb);
345     }
346 #endif
347 
348 private:
349 
350     // Create a scalar integer operand of the specified value, and
351     // return the corresponding operand index.
addIntOperand(int32_t value)352     uint32_t addIntOperand(int32_t value) {
353         static const WrapperOperandType type(WrapperType::INT32, { });
354         uint32_t operand = addOperand(&type);
355         setOperandValue(operand, &value, sizeof(value));
356         return operand;
357     }
358 
359     // Create an operand of the same type as the specified operand,
360     // and return the operand index of the new operand.
addOperandOfSameType(uint32_t operand,Dimensioned dimensioned=Dimensioned::YES)361     uint32_t addOperandOfSameType(uint32_t operand, Dimensioned dimensioned = Dimensioned::YES) {
362         const Operand& operandStruct =
363                 reinterpret_cast<const ModelBuilder*>(getHandle())->getOperand(operand);
364         WrapperOperandType type(static_cast<WrapperType>(operandStruct.type), { 1 });
365         if (dimensioned == Dimensioned::NO) {
366             for (auto& dimension : type.dimensions) {
367                 dimension = 0;
368             }
369         }
370         return addOperand(&type);
371     }
372 };
373 
374 // This class adds some utilities on top of ::android::nn::wrapper::Compilation.
375 class PartitioningCompilation : public WrapperCompilation {
376 public:
PartitioningCompilation(const WrapperModel * model)377     PartitioningCompilation(const WrapperModel* model) : WrapperCompilation(model) { }
378 
setPartitioning(uint32_t partitioning)379     Result setPartitioning(uint32_t partitioning) {
380         return static_cast<Result>(builder()->setPartitioning(partitioning));
381     }
382 
383     using WrapperCompilation::finish;
finish(const std::vector<std::shared_ptr<Device>> & devices)384     Result finish(const std::vector<std::shared_ptr<Device>>& devices) {
385         return static_cast<Result>(builder()->finish(devices));
386     }
387 
getExecutionPlan() const388     const ExecutionPlan& getExecutionPlan() const {
389         return builder()->forTest_getExecutionPlan();
390     }
391 
392 private:
builder()393     CompilationBuilder* builder() {
394         return reinterpret_cast<CompilationBuilder*>(getHandle());
395     }
396 
builder() const397     const CompilationBuilder* builder() const {
398         return reinterpret_cast<const CompilationBuilder*>(getHandle());
399     }
400 };
401 
// RETURN_TRUE()/RETURN_FALSE(MESSAGE) are used by the graph comparison code
// below.  When VERBOSE is defined they additionally log the source line of
// the return; RETURN_FALSE's optional MESSAGE is a stream-style suffix
// (e.g. RETURN_FALSE(<< " mismatch")) that is dropped in the quiet build.
#ifdef VERBOSE
#define RETURN_TRUE()                                                          \
    {                                                                          \
        std::cerr << "returning true from " << __LINE__ << std::endl;          \
        return true;                                                           \
    }
#else
#define RETURN_TRUE()                                                          \
    {                                                                          \
        return true;                                                           \
    }
#endif
#ifdef VERBOSE
#define RETURN_FALSE(MESSAGE)                                                  \
    {                                                                          \
        std::cerr << "returning false from " << __LINE__ MESSAGE << std::endl; \
        return false;                                                          \
    }
#else
#define RETURN_FALSE(MESSAGE)                                                  \
    {                                                                          \
        return false;                                                          \
    }
#endif
426 
427 class PartitioningTest : public ::testing::Test {
428 protected:
429     using RemapVectorType = ExecutionStep::RemapVectorType;
430     using SubModelOutputSetType = ExecutionStep::SubModelOutputSetType;
431 
    // No per-test setup is needed; present to satisfy ::testing::Test.
    virtual void SetUp() {
    }
434 
    // DeviceSpecification describes one test device; makeDevices() (below)
    // converts a vector of specifications into a vector of Devices.
    // Constructor arguments mirror those of PartitioningDriver: device
    // name, capabilities, supported-operation bitmask (kinds 0..7), and
    // whether the OEM operation is supported.
    struct DeviceSpecification {
        DeviceSpecification(const std::string &name, Capabilities capabilities,
                            uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo) :
                mName(name), mCapabilities(capabilities),
                mOperationMask(operationMask), mOEM(oem) { }
        std::string mName;
        Capabilities mCapabilities;
        uint32_t mOperationMask;
        PartitioningDriver::OEM mOEM;
    };
448     static std::vector<std::shared_ptr<Device>>
makeDevices(std::vector<DeviceSpecification> specifications)449     makeDevices(std::vector<DeviceSpecification> specifications) {
450         std::vector<std::shared_ptr<Device>> devices;
451         for (const auto& specification : specifications) {
452             devices.push_back(std::make_shared<Device>(
453                 specification.mName,
454                 new PartitioningDriver(specification.mName.c_str(),
455                                        specification.mCapabilities,
456                                        specification.mOperationMask,
457                                        specification.mOEM)));
458             if (!devices.back()->initialize()) {
459                 EXPECT_NE("failed to initialize device", nullptr);
460                 return {};
461             }
462         }
463         return devices;
464     }
465 
    /*-- Graph comparison -----------------------------------------------------------------*/
467 
468     // An operand with certain values for its lifetime does not have a
469     // defining operation in the graph.  For the purposes of the graph
470     // comparison algorithm, we encode the "defining operation" index of
471     // such an operand as follows:
472     // - NO_VALUE       kPseudoDefiningOperationNoValue
473     // - MODEL_INPUT    kPseudoDefiningOperationModelInput0 + (position in list of inputs)
474     // - CONSTANT_COPY  kPseudoDefiningOperationConstantCopy0 + (constant value)
475     //                    Note: For the graphs we build in this test, we
476     //                          only expect to see 4-byte constants within
477     //                          a very restricted range, so we only make
478     //                          room for such constants in our encoding
479     //                          space.
480     // We do not expect to see CONSTANT_REFERENCE, and so we do not handle
481     // it.
482     //
483     // The encoding is intended to be relatively human readable; it is not
484     // designed to represent some optimal balance of ranges for the items
485     // within its scope (actual operations, inputs, constants).
486 
    // Pseudo-"defining operation" encodings; see the block comment above
    // for how these values are assigned to operands that have no real
    // defining operation in the graph.
    enum PseudoDefiningOperationEncodings : uint32_t {
        kPseudoDefiningOperationModelInput0   = 0x80000000U,
        kPseudoDefiningOperationConstantCopy0 = 0x90000000U,
        kPseudoDefiningOperationNoValue       = 0xeeeeeeeeU,

        // lowest value for special encoding
        kPseudoDefiningOperationBase          = 0x80000000U,

        // range of encoded input or constant
        kPseudoDefiningOperationRange         = 0x10000000U,
    };
498 
499     // Build a map from operand to defining operation.
500     // TODO: Replace map with vector?
    // Populates *defMap so that every operand index in the model maps to
    // its defining operation index, or to a pseudo-operation encoding for
    // operands with no real defining operation (model inputs, NO_VALUE,
    // CONSTANT_COPY).  Uses gtest ASSERTs, so callers must check
    // HasFatalFailure() afterwards.
    void buildDefinitionMap(const ModelBuilder* model,
                            std::map<uint32_t, uint32_t>* defMap) {
        // actual definitions: each operation output is defined by that operation
        ASSERT_LT(model->operationCount(), kPseudoDefiningOperationBase);
        for (uint32_t i = 0, e = model->operationCount(); i < e; i++) {
            const Operation& operation = model->getOperation(i);
            for (uint32_t output : operation.outputs) {
                (*defMap)[output] = i;
            }
        }
        // inputs: encoded by position in the model's input list
        ASSERT_LT(model->inputCount(), kPseudoDefiningOperationRange);
        for (uint32_t i = 0, e = model->inputCount(); i < e; i++) {
            (*defMap)[model->getInputOperandIndex(i)] = kPseudoDefiningOperationModelInput0 + i;
        }
        // look for NO_VALUE and CONSTANT_COPY
        for (uint32_t i = 0, e = model->operandCount(); i < e; i++) {
            const Operand& operand = model->getOperand(i);
            switch (operand.lifetime) {
                case OperandLifeTime::NO_VALUE:
                    (*defMap)[i] = kPseudoDefiningOperationNoValue;
                    break;
                case OperandLifeTime::CONSTANT_COPY: {
                    // Only 4-byte constants in a restricted range are
                    // expected; the constant's value becomes part of the
                    // encoding.
                    ASSERT_EQ(operand.location.length, sizeof(uint32_t));
                    uint32_t value;
                    memcpy(&value, model->getPointerToOperandValue(operand.location.offset), sizeof(uint32_t));
                    ASSERT_LT(value, kPseudoDefiningOperationNoValue);
                    (*defMap)[i] = kPseudoDefiningOperationConstantCopy0 + value;
                    break;
                }
                case OperandLifeTime::TEMPORARY_VARIABLE:
                case OperandLifeTime::MODEL_INPUT:
                case OperandLifeTime::MODEL_OUTPUT:
                    // already handled
                    break;
                default:
                    FAIL();
                    break;
            }
        }
        // sanity check: every operand must have received a definition
        ASSERT_EQ(model->operandCount(), defMap->size());
    }
544 
545 #ifdef VERBOSE
    // Debugging utility: print an operand-to-definition map, rendering
    // pseudo-operation encodings (>= kPseudoDefiningOperationBase) in hex.
    void dump(const char* name, const std::map<uint32_t, uint32_t>* aMap) {
        auto writeNum = [](uint32_t num) {
            if (num >= kPseudoDefiningOperationBase) {
                std::cout << "0x" << std::hex << num << std::dec;
            } else {
                std::cout << num;
            }
        };

        std::cout << name << ": { ";
        bool gotOne = false;
        for (const auto& entry : *aMap) {
            if (gotOne) {
                std::cout << ", ";
            } else {
                gotOne = true;
            }
            std::cout << "(";
            writeNum(entry.first);
            std::cout << ", ";
            writeNum(entry.second);
            std::cout << ")";
        }
        std::cout << " }" << std::endl;
    }
571 #endif
572 
compare(const Operand & operandA,const Operand & operandB)573     bool compare(const Operand& operandA, const Operand& operandB) {
574         if (operandA.type != operandB.type ||
575             operandA.dimensions != operandB.dimensions ||
576             operandA.numberOfConsumers != operandB.numberOfConsumers ||
577             operandA.scale != operandB.scale ||
578             operandA.zeroPoint != operandB.zeroPoint) {
579             return false;
580         }
581         return true;
582     }
583 
584     // Compare two graphs.  We ignore operand and operation indexes (i.e.,
585     // two nodes can be the same even if they are numbered differently)
586     // but we also ignore semantics (e.g., even if an operation kind is
587     // such that the operand is commutative, we still pay attention to the
588     // order of its input operands).
589     //
590     // The comparison algorithm works by walking modelA from outputs
591     // towards inputs, along the edge from each operand to its
592     // defining operation, and then along the edges to the operation's
593     // input operands.  At each step along the way, we try to match up
594     // operands and operations from modelA with equivalent operands
595     // and operations from modelB.
596     //
597     // We start by assuming that modelA's outputs and modelB's outputs
598     // match positionally (e.g., modelA's first output operand is
599     // equivalent to modelB's first output operand).  Once we've
600     // discovered two equivalent operands (such as those outputs), we
601     // place them in a work queue.  We repeatedly pull operands off
602     // the queue and compare their defining operations and those
603     // operations' input operands, to discover more pairs of
604     // equivalent operands.  If we ever find operations that do not
605     // match (e.g., because operation kind differs), or operands that
606     // do not match (e.g., because operand type differs); or if we
607     // ever find a conflict (we've already decided that operand A's
608     // equivalent operand is B0, but it looks like we need its
609     // equivalent operand to be B1); then the graphs compare unequal.
610     // Otherwise, we'll eventually exhaust the work queue, and
611     // conclude that the graphs compare equal.
    // Compares modelA and modelB for structural equivalence (graph
    // isomorphism), walking backward from the model outputs per the
    // algorithm described in the comment above.  Returns true iff the
    // graphs compare equal.  May register a fatal gtest failure via
    // buildDefinitionMap(); callers should check HasFatalFailure().
    bool compare(const ModelBuilder* modelA, const ModelBuilder* modelB) {
#ifdef VERBOSE
        ::dump("compare(A)", modelA);
        ::dump("compare(B)", modelB);
#endif

        // Cheap rejection: equivalent graphs must agree on all counts.
        if (modelA->operandCount()   != modelB->operandCount()   ||
            modelA->operationCount() != modelB->operationCount() ||
            modelA->inputCount()     != modelB->inputCount()     ||
            modelA->outputCount()    != modelB->outputCount()) {
            RETURN_FALSE();
        }

        // Maps from operand index to index of defining operation.
        std::map<uint32_t, uint32_t> defsA, defsB;
        buildDefinitionMap(modelA, &defsA);
        buildDefinitionMap(modelB, &defsB);
        if (HasFatalFailure()) return false;

        // Maps from operand index in modelA to equivalent operand index
        // in modelB; and from operation index in modelA to equivalent
        // operation index in modelB.
        std::map<uint32_t, uint32_t> equivalentOperandsAToB;
        std::map<uint32_t, uint32_t> equivalentOperationsAToB;

        // Queue of operand indexes from modelA, each of whose defining
        // operations are to be checked for equivalence with modelB.
        std::queue<uint32_t> workQueueOperandsA;

        // Seed operand equivalence map and work queue from model outputs.
        for (uint32_t i = 0, e = modelA->outputCount(); i < e; i++) {
            uint32_t outputA = modelA->getOutputOperandIndex(i);
            uint32_t outputB = modelB->getOutputOperandIndex(i);
            if (!compare(modelA->getOperand(outputA), modelB->getOperand(outputB))) {
                RETURN_FALSE();
            }
            equivalentOperandsAToB[outputA] = outputB;
            workQueueOperandsA.push(outputA);
        }

#ifdef VERBOSE
        dump("defsA", &defsA);
        dump("defsB", &defsB);
#endif

        // Process the queue.
        uint32_t pseudoDefinitionCount = 0;
        while (!workQueueOperandsA.empty()) {
#ifdef VERBOSE
            dump("equivalentOperandsAToB", &equivalentOperandsAToB);
            dump("equivalentOperationsAToB", &equivalentOperationsAToB);
#endif
            uint32_t operandIndexA = workQueueOperandsA.front();
#ifdef VERBOSE
            std::cout << "operandIndexA: " << operandIndexA << std::endl;
#endif
            workQueueOperandsA.pop();
            // Operands are only pushed after being entered into the
            // equivalence map (see the seed loop and the input loop
            // below), so at() cannot throw here.
            uint32_t operandIndexB = equivalentOperandsAToB.at(operandIndexA);

            uint32_t operationIndexA = defsA.at(operandIndexA);
            uint32_t operationIndexB = defsB.at(operandIndexB);
            auto it = equivalentOperationsAToB.find(operationIndexA);
            if (it != equivalentOperationsAToB.end()) {
                // operationIndexA was already matched; it must have been
                // matched to operationIndexB, or we have a conflict.
                if (it->second != operationIndexB) {
                    RETURN_FALSE();
                }
                continue;
            }

            // We haven't identified an equivalent operation for
            // operationIndexA.

            if ((operationIndexA >= kPseudoDefiningOperationBase) !=
                (operationIndexB >= kPseudoDefiningOperationBase)) {
                RETURN_FALSE();
            }
            // Either both operands have pseudo-definitions, or neither
            // does.
            if (operationIndexA >= kPseudoDefiningOperationBase) {
                // Both operands have pseudo-definitions; they are
                // equivalent only if the pseudo-definition indexes are
                // identical.
                if (operationIndexA != operationIndexB) {
                    RETURN_FALSE();
                }
                equivalentOperationsAToB[operationIndexA] = operationIndexB;
                ++pseudoDefinitionCount;
                continue;
            }

            // If we get here, neither operation A nor operation B is a
            // pseudo-definition.

            const Operation& operationA = modelA->getOperation(operationIndexA);
            const Operation& operationB = modelB->getOperation(operationIndexB);
            if (operationA.type != operationB.type ||
                operationA.inputs.size() != operationB.inputs.size() ||
                operationA.outputs.size() != operationB.outputs.size()) {
                RETURN_FALSE();
            }
            equivalentOperationsAToB[operationIndexA] = operationIndexB;
            // Propagate equivalence to the operations' inputs, enqueueing
            // any operand pair we haven't seen before.
            for (uint32_t i = 0, e = operationA.inputs.size(); i < e; i++) {
                uint32_t inputA = operationA.inputs[i];
                uint32_t inputB = operationB.inputs[i];
                auto it = equivalentOperandsAToB.find(inputA);
                if (it != equivalentOperandsAToB.end()) {
                    // Already mapped; check for a conflict.
                    if (it->second != inputB) {
                        RETURN_FALSE();
                    }
                    continue;
                }
                // We haven't identified an equivalent operand for inputA.
                if (!compare(modelA->getOperand(inputA), modelB->getOperand(inputB))) {
                    RETURN_FALSE();
                }
                equivalentOperandsAToB[inputA] = inputB;
                workQueueOperandsA.push(inputA);
            }
        }

        // Sanity check: every operand, and every operation (real or
        // pseudo), must have been visited and matched.
        if (modelA->operandCount() != defsA.size() ||
            modelA->operandCount() != defsB.size() ||
            modelA->operandCount() != equivalentOperandsAToB.size() ||
            modelA->operationCount() + pseudoDefinitionCount != equivalentOperationsAToB.size()) {
            RETURN_FALSE();
        }

        RETURN_TRUE();
    }
740 
741     /*-------------------------------------------------------------------------------------*/
742 
compare(std::shared_ptr<const ExecutionStep> step,const WrapperModel * model,std::shared_ptr<Device> device)743     bool compare(std::shared_ptr<const ExecutionStep> step,
744                  const WrapperModel* model, std::shared_ptr<Device> device) {
745         return (step->getDevice() == device) &&
746                 compare(step->getSubModel(),
747                         reinterpret_cast<const ModelBuilder*>(model->getHandle()));
748     }
749 };
750 
// Exercises the basic partitioning outcomes on a two-operation model:
// a SIMPLE plan on the best driver, a SIMPLE plan falling back to the
// CPU, and a COMPOUND plan split across two drivers.
TEST_F(PartitioningTest, SimpleModel) {
    // Linear chain: opnd2 = op0(opnd0, opnd1); opnd4 = op1(opnd2, opnd3).
    // The two operations are of different kinds (0 and 1) so they can be
    // targeted to different devices below.
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 = model.addOperation2To1(0, opnd0, opnd1);
    uint32_t opnd3 = model.addFloatOperand();
    uint32_t opnd4 = model.addOperation2To1(1, opnd2, opnd3);
    model.identifyInputsAndOutputs({ opnd0, opnd1, opnd3 }, { opnd4 });
    model.finish();
    ASSERT_TRUE(model.isValid());
    graphDump("SimpleModel", model);

    // Simple partition (two devices are each capable of everything, one is the best).
    const auto devicesA = makeDevices(
        {
            {"bad", { .float32Performance = { .execTime = 0.9, .powerUsage = 0.9 },
                            .quantized8Performance = { .execTime = 0.9, .powerUsage = 0.9 } }, ~0U},
            {"good", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                            .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, ~0U}
        });
    ExecutionPlan planA;
    ASSERT_EQ(model.partitionTheWork(devicesA, ExecutePreference::PREFER_LOW_POWER, &planA),
              ANEURALNETWORKS_NO_ERROR);
    // Whole model runs on the single best device, "good".
    ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_NE(planA.forTest_simpleGetDevice().get(), nullptr);
    ASSERT_EQ(planA.forTest_simpleGetDevice()->getName(), "good");

    // Simple partition (two devices are each capable of everything, none better than CPU).
    const auto devicesC = makeDevices(
        {
            {"bad", { .float32Performance = { .execTime = 1.1, .powerUsage = 1.1 },
                            .quantized8Performance = { .execTime = 1.1, .powerUsage = 1.1 } }, ~0U},
            {"bad2", { .float32Performance = { .execTime = 1.0, .powerUsage = 1.0 },
                            .quantized8Performance = { .execTime = 1.0, .powerUsage = 1.0 } }, ~0U}
        });
    ExecutionPlan planC;
    ASSERT_EQ(model.partitionTheWork(devicesC, ExecutePreference::PREFER_LOW_POWER, &planC),
              ANEURALNETWORKS_NO_ERROR);
    // A null device in a SIMPLE plan indicates CPU execution here.
    ASSERT_EQ(planC.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_EQ(planC.forTest_simpleGetDevice(), nullptr);

    // Compound partition (two devices, each is capable of one of the
    // two operations).  We could do more extensive checking here --
    // for example, verify that each step within the plan has the
    // correct (model and submodel)x(inputs and outputs).
    const auto devicesB = makeDevices(
        {
            {"0", { .float32Performance = { .execTime = 0.9, .powerUsage = 0.9 },
                            .quantized8Performance = { .execTime = 0.9, .powerUsage = 0.9 } }, 1<<0},
            {"1", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                            .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, 1<<1}
        });
    ExecutionPlan planB;
    ASSERT_EQ(model.partitionTheWork(devicesB, ExecutePreference::PREFER_LOW_POWER, &planB),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planB.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& stepsB = planB.forTest_compoundGetSteps();
    ASSERT_EQ(stepsB.size(), size_t(2));
    {
        // Build a model to compare against the submodel from stepsB[0].
        // Step 0 runs operation 0 on device "0"; opnd2 becomes a
        // temporary produced for consumption by step 1.
        PartitioningModel modelB0;
        uint32_t b0Opnd0 = modelB0.addFloatOperand();
        uint32_t b0Opnd1 = modelB0.addFloatOperand();
        uint32_t b0Opnd2 = modelB0.addOperation2To1(0, b0Opnd0, b0Opnd1);
        modelB0.identifyInputsAndOutputs({ b0Opnd0, b0Opnd1 }, { b0Opnd2 });
        modelB0.finish();
        ASSERT_TRUE(modelB0.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(stepsB[0], &modelB0, devicesB[0])));
        ASSERT_EQ(stepsB[0]->getModelInputs(),
                  (RemapVectorType{ { opnd0, b0Opnd0 }, { opnd1, b0Opnd1 } }));
        ASSERT_EQ(stepsB[0]->getModelOutputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(stepsB[0]->getTempsAsSubModelInputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(stepsB[0]->getTempsAsSubModelOutputs(),
                  (SubModelOutputSetType{ { opnd2, b0Opnd2 } }));
        ASSERT_EQ(stepsB[0]->getOutputsAsSubModelInputs(),
                  (RemapVectorType{}));
    }
    {
        // Build a model to compare against the submodel from stepsB[1].
        // Step 1 runs operation 1 on device "1", consuming the temporary
        // opnd2 produced by step 0.
        PartitioningModel modelB1;
        uint32_t b1Opnd2 = modelB1.addFloatOperand();
        uint32_t b1Opnd3 = modelB1.addFloatOperand();
        uint32_t b1Opnd4 = modelB1.addOperation2To1(1, b1Opnd2, b1Opnd3);
        // Note: In the partitioning algorithm, submodel inputs follow
        // model inputs.  In the original model "model", opnd2 is not
        // an input; so in the submodel "modelB1", the corresponding
        // input b1Opnd2 is a submodel input, and must follow the
        // model input b1Opnd3.
        modelB1.identifyInputsAndOutputs({ b1Opnd3, b1Opnd2 }, { b1Opnd4 });
        modelB1.finish();
        ASSERT_TRUE(modelB1.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(stepsB[1], &modelB1, devicesB[1])));
        ASSERT_EQ(stepsB[1]->getModelInputs(),
                  (RemapVectorType{ { opnd3, b1Opnd3 } }));
        ASSERT_EQ(stepsB[1]->getModelOutputs(),
                  (RemapVectorType{ { opnd4, b1Opnd4 } }));
        ASSERT_EQ(stepsB[1]->getTempsAsSubModelInputs(),
                  (RemapVectorType{ { opnd2, b1Opnd2 } }));
        ASSERT_EQ(stepsB[1]->getTempsAsSubModelOutputs(),
                  (SubModelOutputSetType{}));
        ASSERT_EQ(stepsB[1]->getOutputsAsSubModelInputs(),
                  (RemapVectorType{}));
    }
}
857 
// Verifies partitioning when some operations can execute only on the
// CPU: the plan should come out as three steps -- device, CPU, then
// the same device again.
TEST_F(PartitioningTest, Cpu) {
    // Here's a model where some operations execute only on the Cpu.
    // To make things interesting, we produce three partitions --
    // device, cpu, same-device.

    static const uint32_t kCpuOp = 1;
    static const uint32_t kDevOp = 2;

    // The single device "1" supports only kDevOp, so kCpuOp operations
    // must fall to the CPU.
    const auto devices = makeDevices(
        {
            {"1", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                    .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, 1<<kDevOp}
        });

    PartitioningModel model;

    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();

    // First device partition: two kDevOp operations.
    uint32_t opnd2 = model.addOperation2To1(kDevOp, opnd0, opnd1);
    uint32_t opnd3 = model.addOperation2To1(kDevOp, opnd0, opnd2);

    // CPU partition: two kCpuOp operations consuming device outputs.
    uint32_t opnd4 = model.addOperation2To1(kCpuOp, opnd0, opnd3);
    uint32_t opnd5 = model.addOperation2To1(kCpuOp, opnd2, opnd4);

    uint32_t opnd6 = model.addFloatOperand();

    // Second device partition: two more kDevOp operations.
    uint32_t opnd7 = model.addOperation2To1(kDevOp, opnd3, opnd5);
    uint32_t opnd8 = model.addOperation2To1(kDevOp, opnd6, opnd7);

    model.identifyInputsAndOutputs({ opnd0, opnd1, opnd6 }, { opnd4, opnd8 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    ExecutionPlan plan;
    ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& steps = plan.forTest_compoundGetSteps();
    ASSERT_EQ(steps.size(), size_t(3));
    {
        const auto& step0 = steps[0];

        // Build a model to compare against the submodel from steps[0].
        // Step 0 runs the first two kDevOp operations on device "1";
        // both results (opnd2, opnd3) are temporaries for later steps.
        PartitioningModel model0;
        uint32_t m0Opnd0 = model0.addFloatOperand();
        uint32_t m0Opnd1 = model0.addFloatOperand();
        uint32_t m0Opnd2 = model0.addOperation2To1(kDevOp, m0Opnd0, m0Opnd1);
        uint32_t m0Opnd3 = model0.addOperation2To1(kDevOp, m0Opnd0, m0Opnd2);
        model0.identifyInputsAndOutputs({ m0Opnd0, m0Opnd1 }, { m0Opnd2, m0Opnd3 });
        model0.finish();
        ASSERT_TRUE(model0.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(step0, &model0, devices[0])));
        ASSERT_EQ(step0->getModelInputs(),
                  (RemapVectorType{ { opnd0, m0Opnd0 }, { opnd1, m0Opnd1 } }));
        ASSERT_EQ(step0->getModelOutputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(step0->getTempsAsSubModelInputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(step0->getTempsAsSubModelOutputs(),
                  (SubModelOutputSetType{ { opnd2, m0Opnd2 }, { opnd3, m0Opnd3 } }));
        ASSERT_EQ(step0->getOutputsAsSubModelInputs(),
                  (RemapVectorType{}));
    }
    {
        const auto& step1 = steps[1];

        // Build a model to compare against the submodel from steps[1].
        // Step 1 runs the two kCpuOp operations (compared against a null
        // device below, i.e., the CPU); opnd4 is a model output and
        // opnd5 is a temporary consumed by step 2.
        PartitioningModel model1;
        uint32_t m1Opnd0 = model1.addFloatOperand();
        uint32_t m1Opnd3 = model1.addFloatOperand();
        uint32_t m1Opnd4 = model1.addOperation2To1(kCpuOp, m1Opnd0, m1Opnd3);
        uint32_t m1Opnd2 = model1.addFloatOperand();
        uint32_t m1Opnd5 = model1.addOperation2To1(kCpuOp, m1Opnd2, m1Opnd4);
        model1.identifyInputsAndOutputs({ m1Opnd0, m1Opnd3, m1Opnd2 }, { m1Opnd4, m1Opnd5 });
        model1.finish();
        ASSERT_TRUE(model1.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(step1, &model1, nullptr)));
        ASSERT_EQ(step1->getModelInputs(),
                  (RemapVectorType{ { opnd0, m1Opnd0 } }));
        ASSERT_EQ(step1->getModelOutputs(),
                  (RemapVectorType{ { opnd4, m1Opnd4 } }));
        ASSERT_EQ(step1->getTempsAsSubModelInputs(),
                  (RemapVectorType{ { opnd3, m1Opnd3 }, { opnd2, m1Opnd2 } }));
        ASSERT_EQ(step1->getTempsAsSubModelOutputs(),
                  (SubModelOutputSetType{ { opnd5, m1Opnd5 } }));
        ASSERT_EQ(step1->getOutputsAsSubModelInputs(),
                  (RemapVectorType{}));
    }
    {
        const auto& step2 = steps[2];

        // Build a model to compare against the submodel from steps[2].
        // Step 2 returns to device "1" for the last two kDevOp
        // operations, consuming temporaries from steps 0 and 1.
        PartitioningModel model2;
        uint32_t m2Opnd3 = model2.addFloatOperand();
        uint32_t m2Opnd5 = model2.addFloatOperand();
        uint32_t m2Opnd7 = model2.addOperation2To1(kDevOp, m2Opnd3, m2Opnd5);
        uint32_t m2Opnd6 = model2.addFloatOperand();
        uint32_t m2Opnd8 = model2.addOperation2To1(kDevOp, m2Opnd6, m2Opnd7);
        model2.identifyInputsAndOutputs({ m2Opnd6, m2Opnd3, m2Opnd5 }, { m2Opnd8 });
        model2.finish();
        ASSERT_TRUE(model2.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(step2, &model2, devices[0])));
        ASSERT_EQ(step2->getModelInputs(),
                  (RemapVectorType{ { opnd6, m2Opnd6 } }));
        ASSERT_EQ(step2->getModelOutputs(),
                  (RemapVectorType{ { opnd8, m2Opnd8 } }));
        ASSERT_EQ(step2->getTempsAsSubModelInputs(),
                  (RemapVectorType{ { opnd3, m2Opnd3 }, { opnd5, m2Opnd5 } }));
        ASSERT_EQ(step2->getTempsAsSubModelOutputs(),
                  (SubModelOutputSetType{}));
        ASSERT_EQ(step2->getOutputsAsSubModelInputs(),
                  (RemapVectorType{}));
    }
}
973 
// Verifies the three compile-time partitioning modes (no partitioning,
// partitioning with CPU fallback, partitioning without fallback) on a
// model whose partitioning must fail because an intermediate operand
// has no dimensions.
TEST_F(PartitioningTest, SetPartitioning) {
    // opnd2 deliberately has no dimensions (Dimensioned::NO) to force
    // the partitioning failure exercised below.
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 = model.addOperation2To1(0, opnd0, opnd1, PartitioningModel::Dimensioned::NO);
    uint32_t opnd3 = model.addFloatOperand();
    uint32_t opnd4 = model.addOperation2To1(1, opnd2, opnd3);
    model.identifyInputsAndOutputs({ opnd0, opnd1, opnd3 }, { opnd4 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    // We expect that we cannot successfully partition, because we
    // have an intermediate operand (opnd2) without dimensions, and
    // this is not currently handled.

    // One device that can and should execute operation 0.
    const auto devices = makeDevices({
            {"hw", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                            .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, (1<<0)},
        });

    // Test kPartitioningNo.  We should not even attempt partitioning,
    // so there should be no execution plan.
    PartitioningCompilation cPNo(&model);
    ASSERT_EQ(cPNo.setPartitioning(DeviceManager::kPartitioningNo), Result::NO_ERROR);
    ASSERT_EQ(cPNo.finish(devices), Result::NO_ERROR);
    ASSERT_EQ(cPNo.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::EMPTY);

    // Test kPartitioningWithFallback.  We should attempt
    // partitioning, reach the end of the partitioning process (so we
    // have an execution plan), discover the dimensionless
    // intermediate operand, and still return success (because of
    // fallback).
    PartitioningCompilation cPWithFallback(&model);
    ASSERT_EQ(cPWithFallback.setPartitioning(DeviceManager::kPartitioningWithFallback), Result::NO_ERROR);
    ASSERT_EQ(cPWithFallback.finish(devices), Result::NO_ERROR);
    ASSERT_EQ(cPWithFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::ERROR);

    // Test kPartitioningWithoutFallback.  We should attempt
    // partitioning, and fail.
    PartitioningCompilation cPWithoutFallback(&model);
    ASSERT_EQ(cPWithoutFallback.setPartitioning(DeviceManager::kPartitioningWithoutFallback), Result::NO_ERROR);
    ASSERT_EQ(cPWithoutFallback.finish(devices), Result::OP_FAILED);
    ASSERT_TRUE(cPWithoutFallback.getExecutionPlan().forTest_hasSubModelOutputsOfUnknownSize());
    ASSERT_EQ(cPWithoutFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::ERROR);
}
1020 
1021 // Regression test for http://b/69166603:
1022 //     "partitioned compilation and execution yields wrong results when model output is submodel input"
// Regression coverage: a model output (opnd2) that is also consumed by
// a later partition must appear in the later step's
// getOutputsAsSubModelInputs(), not be lost.
TEST_F(PartitioningTest, ModelOutputAsSubmodelInput) {
    // opnd2 is both a model output and the (doubled) input of the second
    // operation, which will land in a different partition.
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 = model.addOperation2To1(0, opnd0, opnd1);
    uint32_t opnd3 = model.addOperation2To1(1, opnd2, opnd2);
    model.identifyInputsAndOutputs({ opnd0, opnd1 }, { opnd2, opnd3 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Compound partition (two devices, each is capable of one of the
    // two operations).  We could do more extensive checking here --
    // for example, verify that each step within the plan has the
    // correct (model and submodel)x(inputs and outputs).
    const auto devices = makeDevices(
        {
            {"0", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                            .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, 1<<0},
            {"1", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                            .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, 1<<1}
        });
    ExecutionPlan plan;
    ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& steps = plan.forTest_compoundGetSteps();
    ASSERT_EQ(steps.size(), size_t(2));
    {
        // Build a model to compare against the submodel from steps[0].
        // Step 0 produces opnd2 as a model output (not as a temporary).
        PartitioningModel model0;
        uint32_t m0Opnd0 = model0.addFloatOperand();
        uint32_t m0Opnd1 = model0.addFloatOperand();
        uint32_t m0Opnd2 = model0.addOperation2To1(0, m0Opnd0, m0Opnd1);
        model0.identifyInputsAndOutputs({ m0Opnd0, m0Opnd1 }, { m0Opnd2 });
        model0.finish();
        ASSERT_TRUE(model0.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(steps[0], &model0, devices[0])));
        ASSERT_EQ(steps[0]->getModelInputs(),
                  (RemapVectorType{ { opnd0, m0Opnd0 }, { opnd1, m0Opnd1 } }));
        ASSERT_EQ(steps[0]->getModelOutputs(),
                  (RemapVectorType{ { opnd2, m0Opnd2 } }));
        ASSERT_EQ(steps[0]->getTempsAsSubModelInputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(steps[0]->getTempsAsSubModelOutputs(),
                  (SubModelOutputSetType{}));
        ASSERT_EQ(steps[0]->getOutputsAsSubModelInputs(),
                  (RemapVectorType{}));
    }
    {
        // Build a model to compare against the submodel from steps[1].
        // Step 1 consumes the model output opnd2 as a submodel input --
        // the case b/69166603 got wrong.
        PartitioningModel model1;
        uint32_t m1Opnd2 = model1.addFloatOperand();
        uint32_t m1Opnd3 = model1.addOperation2To1(1, m1Opnd2, m1Opnd2);
        model1.identifyInputsAndOutputs({ m1Opnd2 }, { m1Opnd3 });
        model1.finish();
        ASSERT_TRUE(model1.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(steps[1], &model1, devices[1])));
        ASSERT_EQ(steps[1]->getModelInputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(steps[1]->getModelOutputs(),
                  (RemapVectorType{ { opnd3, m1Opnd3 } }));
        ASSERT_EQ(steps[1]->getTempsAsSubModelInputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(steps[1]->getTempsAsSubModelOutputs(),
                  (SubModelOutputSetType{}));
        ASSERT_EQ(steps[1]->getOutputsAsSubModelInputs(),
                  (RemapVectorType{ { opnd2, m1Opnd2 } }));
    }
}
1092 
// Verifies driver selection for models containing OEM operations: only
// drivers advertising OEM support may be chosen, even when a non-OEM
// driver (or the CPU) has better performance, and compilation fails
// outright when no OEM-capable driver exists.
TEST_F(PartitioningTest, OemOperations) {
    // Trivial model consisting solely of OEM operation.
    PartitioningModel model;
    uint32_t inOpnd = model.addFloatOperand();
    uint32_t outOpnd = model.addOperationOEM1To1(inOpnd);
    model.identifyInputsAndOutputs({ inOpnd }, { outOpnd });
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Verify that the best driver that can run an OEM operation is
    // used, even if it is not better than the CPU: "goodOEM" must beat
    // "badOEM", while the faster "noOEM" is ineligible.
    const auto devicesBestOEM = makeDevices({
            {"badOEM",
             {.float32Performance = {.execTime = 1.5, .powerUsage = 1.5},
              .quantized8Performance = {.execTime = 1.5, .powerUsage = 1.5}},
             ~0U, PartitioningDriver::OEMYes},
            {"noOEM",
             {.float32Performance = {.execTime = 0.5, .powerUsage = 0.5},
              .quantized8Performance = {.execTime = 0.5, .powerUsage = 0.5}},
             ~0U, PartitioningDriver::OEMNo},
            {"goodOEM",
             {.float32Performance = {.execTime = 1.2, .powerUsage = 1.2},
              .quantized8Performance = {.execTime = 1.2, .powerUsage = 1.2}},
             ~0U, PartitioningDriver::OEMYes},
    });
    PartitioningCompilation compilationBestOEM(&model);
    ASSERT_EQ(compilationBestOEM.finish(devicesBestOEM), Result::NO_ERROR);
    const auto& planBestOEM = compilationBestOEM.getExecutionPlan();
    ASSERT_EQ(planBestOEM.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_NE(planBestOEM.forTest_simpleGetDevice().get(), nullptr);
    ASSERT_EQ(planBestOEM.forTest_simpleGetDevice()->getName(), "goodOEM");

    // Verify that we get an error if no driver can run an OEM operation.
    const auto devicesNoOEM = makeDevices({
            {"noOEM",
             {.float32Performance = {.execTime = 0.5, .powerUsage = 0.5},
              .quantized8Performance = {.execTime = 0.5, .powerUsage = 0.5}},
             ~0U, PartitioningDriver::OEMNo},
    });
    PartitioningCompilation compilationNoOEM(&model);
    ASSERT_EQ(compilationNoOEM.finish(devicesNoOEM), Result::BAD_DATA);

    // Verify that we get an error if there are no drivers (only CPU fallback).
    PartitioningCompilation compilationNoDrivers(&model);
    ASSERT_EQ(compilationNoDrivers.finish({} /* no drivers */), Result::BAD_DATA);
}
1137 
// Verifies that device selection honors the relaxed-precision flag:
// a non-relaxed model picks the device with the best float32
// performance, while a relaxed model picks the one with the best
// relaxedFloat32toFloat16 performance.
TEST_F(PartitioningTest, RelaxedFP) {
    const auto devices = makeDevices({
            // Best choice for non-relaxed model.
            {"f32",
             {.float32Performance = {.execTime = 0.8, .powerUsage = 0.8},
              .relaxedFloat32toFloat16Performance = {.execTime = 0.9, .powerUsage = 0.9}},
             ~0U},
            // Best choice for relaxed model.
            {"f16",
             {.float32Performance = {.execTime = 0.9, .powerUsage = 0.9},
              .relaxedFloat32toFloat16Performance = {.execTime = 0.8, .powerUsage = 0.8}},
             ~0U},
    });

    // Builds a one-operation model, optionally relaxed, and checks that
    // partitioning produces a SIMPLE plan on the expected device.
    const auto runTrivialTest = [&devices](bool doRelax, const char* expectDevice) {
        SCOPED_TRACE(expectDevice);
        PartitioningModel model;
        uint32_t lhs = model.addFloatOperand();
        uint32_t rhs = model.addFloatOperand();
        uint32_t sum = model.addOperation2To1(0, lhs, rhs);
        model.identifyInputsAndOutputs({ lhs, rhs }, { sum });
        model.relaxComputationFloat32toFloat16(doRelax);
        model.finish();
        ASSERT_TRUE(model.isValid());
        // Verify that the model will be executed on the appropriate device.
        ExecutionPlan plan;
        ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
                  ANEURALNETWORKS_NO_ERROR);
        ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
        ASSERT_EQ(plan.forTest_simpleGetDevice()->getName(), expectDevice);
    };

    ASSERT_NO_FATAL_FAILURE(runTrivialTest(false, "f32"));
    ASSERT_NO_FATAL_FAILURE(runTrivialTest(true, "f16"));
}
1173 
1174 }  // namespace
1175