1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CompilationBuilder.h"
18 #include "ExecutionPlan.h"
19 #include "HalInterfaces.h"
20 #include "Manager.h"
21 #include "ModelBuilder.h"
22 #include "NeuralNetworks.h"
23 #include "NeuralNetworksOEM.h"
24 #include "SampleDriver.h"
25 #include "TestNeuralNetworksWrapper.h"
26 #include "Utils.h"
27 #include "ValidateHal.h"
28 
29 #include <gtest/gtest.h>
30 
31 #include <filesystem>
32 #include <functional>
33 #include <map>
34 #include <queue>
35 #include <type_traits>
36 
37 // Uncomment the following line to generate some debugging output that
38 // may be useful when analyzing failures:
39 //
40 // #define VERBOSE VERBOSE
41 
42 // These tests do whitebox testing of the graph partitioning
43 // algorithm.  It is "whitebox" in the sense that we're not evaluating
44 // whether a particular partitioning is legal, or "good enough"
45 // according to some metric, but whether it exactly matches the
46 // expected behavior of the current partitioning algorithm.
47 //
48 // A key part of the current partitioning algorithm is to determine
49 // which device among the available devices should be the one to
50 // execute a particular operation from the graph.  This determination
51 // is made "locally" -- i.e., it does not depend on the graph
52 // topology, only on the properties of the operation in question.
53 // IDevice::getSupportedOperations() indicates which operations in a
54 // graph can be executed on a device, and IDevice::getCapabilities()
55 // indicates how "good" that device is for executing particular kinds
56 // of operations.  For each operation, the partitioning algorithm
57 // picks the "best" device that is capable of executing that
58 // operation; if no device can do so, then the algorithm picks the
59 // cpu.
60 //
61 // As part of this testing approach, we want to make it easy to
62 // specify which operations in a test graph can be executed on which
63 // devices.  We accomplish this in the following way:
64 // - A unary OEM operation is available.
65 // - There is a collection of operations (each of which has two inputs
66 //   and one output):
67 //   - Eight kinds of operations available at driver version V1_0 or
68 //     later.  They are represented in the graph as ADD or MUL with a
69 //     particular activation function -- two opcodes times four
70 //     activation functions means eight available operation kinds.
71 //     This is a low-level representation detail -- when we specify the
72 //     behavior of the device or build a graph, we do so in terms of
73 //     operation encodings 0..7.
74 //   - Eight kinds of operations available at driver version V1_1 or
75 //     later.  They are represented in the graph as DIV or SUB with
76 //     a particular activation function, exactly analogous to ADD
77 //     and MUL above.  We use operation encodings 8..15 for them.
78 //   - Four kinds of operations available at driver version V1_2 or
79 //     later.  They are represented in the graph as MAXIMUM,
80 //     MINIMUM, POW, or PRELU.  These operations take no activation
81 //     function, so we only get 4 operation kinds, for which we
82 //     use operation encodings 16..19.
83 // When we instantiate a device for testing purposes, we specify what subset of
84 // those operations the device is able to execute.
85 //
86 // In order to determine whether or not a partitioning matches the
87 // expected partitioning, we check the number of partitions, check
88 // which device each partition targets, and compare each partition's
89 // subgraph, model inputs, model outputs, submodel inputs, and
90 // submodel outputs against what is expected.  In order to perform
91 // that comparison, we build a model to compare against a partition's
92 // submodel and run a graph comparison algorithm on it.  The graph
93 // comparison and the inputs and outputs comparisons are syntactic
94 // rather than semantic comparisons -- they don't allow for
95 // reorderings of inputs and outputs.  Because of this, we need to
96 // know exactly how the partitioning algorithm orders inputs and
97 // outputs in order to construct the models and operand lists to
98 // compare against.  Here are some relevant behaviors of the
99 // partitioning algorithm:
100 //
101 // - It builds a subgraph by walking operations in forward topological
102 //   order, and adding each operation's input operands and output
103 //   operands in index order (input followed by output) when that
104 //   operation is added.  (It does not add an input that has already
105 //   been added.)
106 // - It finds model inputs, model outputs, and submodel inputs in
107 //   the order the corresponding operands were added to the subgraph
108 //   (see ExecutionStep methods getModelInputs(), getModelOutputs(),
109 //   getTempsAsSubModelInputs(), getOutputsAsSubModelInputs()).
110 // - It finds temps as submodel outputs in numerical order of corresponding
111 //   operand number in the original model (see ExecutionStep method
112 //   getTempsAsSubModelOutputs()).
113 // - When it calls identifyInputsAndOutputs() on the submodel, it
114 //   passes inputs from getModelInputs() in order, followed by temps as
115 //   submodel inputs from getTempsAsSubModelInputs() in order,
116 //   followed by outputs as submodel inputs from
117 //   getOutputsAsSubModelInputs() in order; and it passes outputs from
118 //   getModelOutputs() in order followed by submodel outputs from
119 //   getTempsAsSubModelOutputs() in order.
120 //
121 // TODO: Maybe the logic for comparing a partition to an expected
122 //       model should be changed to tolerate reorderings of inputs and
123 //       outputs, so that when we build models and lists to compare
124 //       against, we don't need to worry about input and output
125 //       orderings.  But is there a way to do this that still lets us
126 //       verify that we have the correct relationships between
127 //       an (original) model's inputs and outputs and each submodel's
128 //       inputs and outputs, as well as the correct relationship
129 //       between submodel inputs and outputs across partitions?
130 
namespace {

// Timing value reported through execution callbacks by the dummy prepared
// model below, which always fails with DEVICE_UNAVAILABLE (UINT64_MAX marks
// "no measurement").
const Timing kBadTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};
134 
135 using CompilationBuilder = ::android::nn::CompilationBuilder;
136 using Device = ::android::nn::Device;
137 using DeviceManager = ::android::nn::DeviceManager;
138 using ExecutePreference = ::android::nn::test_wrapper::ExecutePreference;
139 using ExecutionPlan = ::android::nn::ExecutionPlan;
140 using ExecutionStep = ::android::nn::ExecutionStep;
141 using HalVersion = ::android::nn::HalVersion;
142 using HidlModel = ::android::hardware::neuralnetworks::V1_2::Model;
143 using HidlToken =
144         ::android::hardware::hidl_array<uint8_t, ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN>;
145 using ModelBuilder = ::android::nn::ModelBuilder;
146 using Result = ::android::nn::test_wrapper::Result;
147 using SampleDriver = ::android::nn::sample_driver::SampleDriver;
148 using WrapperSymmPerChannelQuantParams = ::android::nn::test_wrapper::SymmPerChannelQuantParams;
149 using WrapperCompilation = ::android::nn::test_wrapper::Compilation;
150 using WrapperModel = ::android::nn::test_wrapper::Model;
151 using WrapperOperandType = ::android::nn::test_wrapper::OperandType;
152 using WrapperType = ::android::nn::test_wrapper::Type;
153 
154 template <typename T> using sp = ::android::sp<T>;
155 template <typename T>
156 using MQDescriptorSync = ::android::hardware::MQDescriptorSync<T>;
157 
makeCapabilities(float perf)158 Capabilities makeCapabilities(float perf) {
159     PerformanceInfo perfInfo = {.execTime = perf, .powerUsage = perf};
160     return {.relaxedFloat32toFloat16PerformanceScalar = perfInfo,
161             .relaxedFloat32toFloat16PerformanceTensor = perfInfo,
162             .operandPerformance = ::android::nn::nonExtensionOperandPerformance(perfInfo)};
163 };
164 
update(Capabilities * capabilities,OperandType type,float perf)165 void update(Capabilities* capabilities, OperandType type, float perf) {
166     PerformanceInfo perfInfo = {.execTime = perf, .powerUsage = perf};
167     ::android::nn::update(&capabilities->operandPerformance, type, perfInfo);
168 }
169 
lookupExecTime(const Capabilities & capabilities,OperandType type)170 float lookupExecTime(const Capabilities& capabilities, OperandType type) {
171     return ::android::nn::lookup(capabilities.operandPerformance, type).execTime;
172 }
173 
// Each of ADD/MUL/DIV/SUB takes one of four fused activation function codes
// (0..3), so each of those opcodes accounts for kNumFuseCodes encodings.
const uint32_t kNumFuseCodes = 4;
// Sentinel returned by lookupOperation() when an operation is not one of the
// encoded kinds (see the file comment for the encoding scheme).
const uint32_t kBadOperation = ~0;

// V1_0 operations: encodings 0..7 (ADD then MUL, times four fuse codes).
const uint32_t kFirstEncodingADD = 0;
const uint32_t kFirstEncodingMUL = kFirstEncodingADD + kNumFuseCodes;
const uint32_t kFirstEncodingV1_0 = kFirstEncodingADD;
const uint32_t kLastEncodingV1_0 = kFirstEncodingMUL + kNumFuseCodes - 1;

// V1_1 operations: encodings 8..15 (DIV then SUB, times four fuse codes).
const uint32_t kFirstEncodingDIV = kLastEncodingV1_0 + 1;
const uint32_t kFirstEncodingSUB = kFirstEncodingDIV + kNumFuseCodes;
const uint32_t kFirstEncodingV1_1 = kFirstEncodingDIV;
const uint32_t kLastEncodingV1_1 = kFirstEncodingSUB + kNumFuseCodes - 1;

// V1_2 operations: encodings 16..19 (MAXIMUM, MINIMUM, POW, PRELU take no
// fuse code, so each gets a single encoding).
const uint32_t kFirstEncodingMAXIMUM = kLastEncodingV1_1 + 1;
const uint32_t kFirstEncodingMINIMUM = kFirstEncodingMAXIMUM + 1;
const uint32_t kFirstEncodingPOW = kFirstEncodingMINIMUM + 1;
const uint32_t kFirstEncodingPRELU = kFirstEncodingPOW + 1;
const uint32_t kFirstEncodingV1_2 = kFirstEncodingMAXIMUM;
const uint32_t kLastEncodingV1_2 = kFirstEncodingPRELU;

// Maps an operation type to its first encoding (for opcodes that take a fuse
// code, the encoding of fuse code 0; the fuse code is added to this base).
const std::map<OperationType, uint32_t> operationToFirstEncoding = {
        {OperationType::ADD, kFirstEncodingADD},
        {OperationType::MUL, kFirstEncodingMUL},
        {OperationType::DIV, kFirstEncodingDIV},
        {OperationType::SUB, kFirstEncodingSUB},
        {OperationType::MAXIMUM, kFirstEncodingMAXIMUM},
        {OperationType::MINIMUM, kFirstEncodingMINIMUM},
        {OperationType::POW, kFirstEncodingPOW},
        {OperationType::PRELU, kFirstEncodingPRELU},
};

// Sorted in reverse order (std::greater) so that we can use map::lower_bound to
// find an entry whose key is numerically less than or equal to a search value.
// mapped_type is (OperandCode, hasFuseCode).
const std::map<uint32_t, std::pair<uint32_t, bool>, std::greater<>> firstEncodingToOperation = {
        {kFirstEncodingADD, {ANEURALNETWORKS_ADD, true}},
        {kFirstEncodingMUL, {ANEURALNETWORKS_MUL, true}},
        {kFirstEncodingDIV, {ANEURALNETWORKS_DIV, true}},
        {kFirstEncodingSUB, {ANEURALNETWORKS_SUB, true}},
        {kFirstEncodingMAXIMUM, {ANEURALNETWORKS_MAXIMUM, false}},
        {kFirstEncodingMINIMUM, {ANEURALNETWORKS_MINIMUM, false}},
        {kFirstEncodingPOW, {ANEURALNETWORKS_POW, false}},
        {kFirstEncodingPRELU, {ANEURALNETWORKS_PRELU, false}},
};
221 
222 // Look up the operation with the specified index in a graph, and return the
223 // operation encoding; or, if for some reason this is not one of the encoded
224 // operations, then return kBadOperation.
lookupOperation(std::function<const Operation & (uint32_t)> getOperation,std::function<const Operand & (uint32_t)> getOperand,std::function<const uint8_t * (uint32_t)> getValue,uint32_t operationIndex)225 uint32_t lookupOperation(std::function<const Operation&(uint32_t)> getOperation,
226                          std::function<const Operand&(uint32_t)> getOperand,
227                          std::function<const uint8_t*(uint32_t)> getValue,
228                          uint32_t operationIndex) {
229     const Operation& operation = getOperation(operationIndex);
230     switch (operation.type) {
231         case OperationType::ADD:
232         case OperationType::MUL:
233         case OperationType::DIV:
234         case OperationType::SUB: {
235             // input2 is the fused activation function
236             const Operand& input2 = getOperand(operation.inputs[2]);
237             if ((input2.type == OperandType::INT32) &&
238                 (input2.lifetime == OperandLifeTime::CONSTANT_COPY)) {
239                 int32_t value;
240                 CHECK_EQ(sizeof(value), input2.location.length);
241                 memcpy(&value,
242                        getValue(input2.location.offset),
243                        input2.location.length);
244                 return value + operationToFirstEncoding.at(operation.type);
245             }
246             break;
247         }
248         default: {
249             auto it = operationToFirstEncoding.find(operation.type);
250             if (it != operationToFirstEncoding.end()) {
251                 return it->second;
252             }
253             break;
254         }
255     }
256     return kBadOperation;
257 }
258 
lookupOperation(const HidlModel & model,uint32_t operationIndex)259 uint32_t lookupOperation(const HidlModel& model, uint32_t operationIndex) {
260     return lookupOperation(
261         [&model](uint32_t index) -> const Operation& {
262             return model.operations[index];
263         },
264         [&model](uint32_t index) -> const Operand& {
265             return model.operands[index];
266         },
267         [&model](uint32_t offset) {return &model.operandValues[offset];},
268         operationIndex);
269 }
270 
271 #ifdef VERBOSE
272 // This is a debugging utility function
dump(const char * name,const ModelBuilder * model)273 void dump(const char* name, const ModelBuilder* model) {
274     HidlModel hidlModel;
275     model->setHidlModel(&hidlModel);
276     std::cout << name << ": " << toString(hidlModel) << std::endl;
277     std::cout << "inputs: " << toString(hidlModel.inputIndexes) << std::endl;
278     std::cout << "outputs: " << toString(hidlModel.outputIndexes) << std::endl;
279     for (size_t i = 0, e = hidlModel.operations.size(); i < e; i++) {
280         std::cout << "operation[" << i << "]: " << toString(hidlModel.operations[i]) << std::endl;
281     }
282 }
283 #endif
284 
285 // This is an IDevice for testing purposes.  It only has a few
286 // interesting properties, all of which are specified as constructor
287 // arguments: device capabilities; which subset of operation kinds
288 // (0..19) does the device support; does the device support the OEM
289 // operation.  The subset is represented with a bitmask, in which
290 // operation kind K corresponds to the bit (1 << K).
291 class PartitioningDriver : public SampleDriver {
292 private:
293     // Dummy class -- a prepared model must not be nullptr.
294     class PartitioningPreparedModel : public IPreparedModel {
295     public:
execute(const Request &,const sp<V1_0::IExecutionCallback> &)296      Return<ErrorStatus> execute(const Request&, const sp<V1_0::IExecutionCallback>&) override {
297          return ErrorStatus::DEVICE_UNAVAILABLE;
298      }
execute_1_2(const Request &,MeasureTiming,const sp<V1_2::IExecutionCallback> &)299      Return<ErrorStatus> execute_1_2(const Request&, MeasureTiming,
300                                      const sp<V1_2::IExecutionCallback>&) override {
301          return ErrorStatus::DEVICE_UNAVAILABLE;
302      }
executeSynchronously(const Request &,MeasureTiming,executeSynchronously_cb cb)303      Return<void> executeSynchronously(const Request&, MeasureTiming,
304                                        executeSynchronously_cb cb) override {
305          cb(ErrorStatus::DEVICE_UNAVAILABLE, {}, kBadTiming);
306          return Void();
307      }
configureExecutionBurst(const sp<V1_2::IBurstCallback> &,const MQDescriptorSync<V1_2::FmqRequestDatum> &,const MQDescriptorSync<V1_2::FmqResultDatum> &,configureExecutionBurst_cb cb)308      Return<void> configureExecutionBurst(
309              const sp<V1_2::IBurstCallback>& /*callback*/,
310              const MQDescriptorSync<V1_2::FmqRequestDatum>& /*requestChannel*/,
311              const MQDescriptorSync<V1_2::FmqResultDatum>& /*resultChannel*/,
312              configureExecutionBurst_cb cb) override {
313          cb(ErrorStatus::DEVICE_UNAVAILABLE, nullptr);
314          return Void();
315      }
316     };
317 public:
318     enum OEM {
319         OEMNo,          // rejected by getSupportedOperations and prepareModel
320         OEMIndecisive,  // accepted by getSupportedOperations but not prepareModel
321         OEMYes,         // accepted by getSupportedOperations and prepareModel
322     };
323 
PartitioningDriver(const char * name,const char * version,Capabilities capabilities,uint32_t operationMask,OEM oem=OEMNo)324     PartitioningDriver(const char* name, const char* version, Capabilities capabilities,
325                        uint32_t operationMask, OEM oem = OEMNo)
326         : SampleDriver(name),
327           mVersionString(version),
328           mCapabilities(capabilities),
329           mOperationMask(operationMask),
330           mOEM(oem) {}
~PartitioningDriver()331     ~PartitioningDriver() override {}
332 
getVersionString(getVersionString_cb cb)333     Return<void> getVersionString(getVersionString_cb cb) override {
334         cb(ErrorStatus::NONE, mVersionString);
335         return Void();
336     }
337 
prepareModel_1_2(const Model & model,ExecutionPreference,const hidl_vec<hidl_handle> &,const hidl_vec<hidl_handle> &,const HidlToken &,const sp<IPreparedModelCallback> & cb)338     Return<ErrorStatus> prepareModel_1_2(const Model& model, ExecutionPreference,
339                                          const hidl_vec<hidl_handle>&, const hidl_vec<hidl_handle>&,
340                                          const HidlToken&,
341                                          const sp<IPreparedModelCallback>& cb) override {
342         ErrorStatus status = ErrorStatus::NONE;
343         if (mOEM != OEMYes) {
344             for (const auto& operation : model.operations) {
345                 if (operation.type == OperationType::OEM_OPERATION) {
346                     status = ErrorStatus::INVALID_ARGUMENT;
347                     break;
348                 }
349             }
350         }
351         cb->notify_1_2(status, new PartitioningPreparedModel);
352         return status;
353     }
354 
getStatus()355     Return<DeviceStatus> getStatus() override {
356         return DeviceStatus::AVAILABLE;
357     }
358 
getCapabilities_1_2(getCapabilities_1_2_cb cb)359     Return<void> getCapabilities_1_2(getCapabilities_1_2_cb cb) override {
360         cb(ErrorStatus::NONE, mCapabilities);
361         return Void();
362     }
363 
getSupportedOperations_1_2(const Model & model,getSupportedOperations_cb cb)364     Return<void> getSupportedOperations_1_2(const Model& model,
365                                             getSupportedOperations_cb cb) override {
366         if (!android::nn::validateModel(model)) {
367             cb(ErrorStatus::INVALID_ARGUMENT, std::vector<bool>());
368             return Void();
369         }
370 
371         const size_t count = model.operations.size();
372         std::vector<bool> supported(count);
373         for (size_t i = 0; i < count; i++) {
374             if (model.operations[i].type == OperationType::OEM_OPERATION) {
375                 supported[i] = (mOEM != OEMNo);
376                 continue;
377             }
378             supported[i] = false;
379             uint32_t operation = lookupOperation(model, i);
380             if ((operation != kBadOperation) && (mOperationMask & (1 << operation))) {
381                 supported[i] = true;
382             }
383         }
384         cb(ErrorStatus::NONE, supported);
385         return Void();
386     }
387 
getNumberOfCacheFilesNeeded(getNumberOfCacheFilesNeeded_cb cb)388     Return<void> getNumberOfCacheFilesNeeded(getNumberOfCacheFilesNeeded_cb cb) override {
389         cb(ErrorStatus::NONE, /*numModelCache=*/1, /*numDataCache=*/1);
390         return Void();
391     }
392 
prepareModelFromCache(const hidl_vec<hidl_handle> &,const hidl_vec<hidl_handle> &,const HidlToken &,const sp<V1_2::IPreparedModelCallback> & callback)393     Return<ErrorStatus> prepareModelFromCache(
394             const hidl_vec<hidl_handle>&, const hidl_vec<hidl_handle>&, const HidlToken&,
395             const sp<V1_2::IPreparedModelCallback>& callback) override {
396         callback->notify_1_2(ErrorStatus::NONE, new PartitioningPreparedModel);
397         return ErrorStatus::NONE;
398     }
399 
400    private:
401     std::string mVersionString;
402     Capabilities mCapabilities;
403     uint32_t mOperationMask;
404     OEM mOEM;
405 };
406 
407 // Like PartitioningDriver, but implementing 1.1
408 class PartitioningDriverV1_1 : public V1_1::IDevice {
409    public:
PartitioningDriverV1_1(const char * name,const char * version,Capabilities capabilities,uint32_t operationMask,PartitioningDriver::OEM oem=PartitioningDriver::OEMNo)410     PartitioningDriverV1_1(const char* name, const char* version, Capabilities capabilities,
411                            uint32_t operationMask,
412                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
413         : mDriverV1_2(new PartitioningDriver(name, version, capabilities, operationMask, oem)) {}
getCapabilities_1_1(getCapabilities_1_1_cb _hidl_cb)414     Return<void> getCapabilities_1_1(getCapabilities_1_1_cb _hidl_cb) override {
415         return mDriverV1_2->getCapabilities_1_1(_hidl_cb);
416     }
getSupportedOperations_1_1(const V1_1::Model & model,getSupportedOperations_1_1_cb _hidl_cb)417     Return<void> getSupportedOperations_1_1(const V1_1::Model& model,
418                                             getSupportedOperations_1_1_cb _hidl_cb) override {
419         return mDriverV1_2->getSupportedOperations_1_1(model, _hidl_cb);
420     }
prepareModel_1_1(const V1_1::Model & model,ExecutionPreference preference,const sp<V1_0::IPreparedModelCallback> & actualCallback)421     Return<ErrorStatus> prepareModel_1_1(
422             const V1_1::Model& model, ExecutionPreference preference,
423             const sp<V1_0::IPreparedModelCallback>& actualCallback) override {
424         return mDriverV1_2->prepareModel_1_1(model, preference, actualCallback);
425     }
getStatus()426     Return<DeviceStatus> getStatus() override { return mDriverV1_2->getStatus(); }
getCapabilities(getCapabilities_cb _hidl_cb)427     Return<void> getCapabilities(getCapabilities_cb _hidl_cb) override {
428         return mDriverV1_2->getCapabilities(_hidl_cb);
429     }
getSupportedOperations(const V1_0::Model & model,getSupportedOperations_cb _hidl_cb)430     Return<void> getSupportedOperations(const V1_0::Model& model,
431                                         getSupportedOperations_cb _hidl_cb) override {
432         return mDriverV1_2->getSupportedOperations(model, _hidl_cb);
433     }
prepareModel(const V1_0::Model & model,const sp<V1_0::IPreparedModelCallback> & actualCallback)434     Return<ErrorStatus> prepareModel(
435             const V1_0::Model& model,
436             const sp<V1_0::IPreparedModelCallback>& actualCallback) override {
437         return mDriverV1_2->prepareModel(model, actualCallback);
438     }
439 
440    private:
441     const sp<V1_2::IDevice> mDriverV1_2;
442 };
443 
444 // Like PartitioningDriver, but implementing 1.0
445 class PartitioningDriverV1_0 : public V1_0::IDevice {
446    public:
PartitioningDriverV1_0(const char * name,const char * version,Capabilities capabilities,uint32_t operationMask,PartitioningDriver::OEM oem=PartitioningDriver::OEMNo)447     PartitioningDriverV1_0(const char* name, const char* version, Capabilities capabilities,
448                            uint32_t operationMask,
449                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
450         : mDriverV1_2(new PartitioningDriver(name, version, capabilities, operationMask, oem)) {}
getCapabilities(getCapabilities_cb _hidl_cb)451     Return<void> getCapabilities(getCapabilities_cb _hidl_cb) override {
452         return mDriverV1_2->getCapabilities(_hidl_cb);
453     }
getSupportedOperations(const V1_0::Model & model,getSupportedOperations_cb _hidl_cb)454     Return<void> getSupportedOperations(const V1_0::Model& model,
455                                         getSupportedOperations_cb _hidl_cb) override {
456         return mDriverV1_2->getSupportedOperations(model, _hidl_cb);
457     }
prepareModel(const V1_0::Model & model,const sp<V1_0::IPreparedModelCallback> & actualCallback)458     Return<ErrorStatus> prepareModel(
459             const V1_0::Model& model,
460             const sp<V1_0::IPreparedModelCallback>& actualCallback) override {
461         return mDriverV1_2->prepareModel(model, actualCallback);
462     }
getStatus()463     Return<DeviceStatus> getStatus() override { return mDriverV1_2->getStatus(); }
464 
465    private:
466     const sp<V1_2::IDevice> mDriverV1_2;
467 };
468 
469 // This class adds some simple abstractions and utilities on top of
470 // WrapperModel.  For example, it provides methods that work in terms of
471 // operation kind (0..7); and because we care about graph topology rather than
472 // details of operand types and values, it greatly simplifies the process of
473 // creating operands.
474 class PartitioningModel : private WrapperModel {
475    public:
476     using WrapperModel::finish;
477     using WrapperModel::getHandle;
478     using WrapperModel::identifyInputsAndOutputs;
479     using WrapperModel::isValid;
480     using WrapperModel::relaxComputationFloat32toFloat16;
481 
482     // Create a tensor operand of the specified type, and return the
483     // corresponding operand index.
    // Float32 tensor operand; returns its operand index.
    uint32_t addFloatOperand() { return addOperand(WrapperType::TENSOR_FLOAT32); }
    // Quantized (asymmetric uint8) tensor operand; returns its operand index.
    uint32_t addQuantOperand() { return addOperand(WrapperType::TENSOR_QUANT8_ASYMM); }
486 
487     // Create an operand of the specified type, and return the corresponding
488     // operand index.
addOperand(WrapperType wrapperType)489     uint32_t addOperand(WrapperType wrapperType) {
490         switch (static_cast<int>(wrapperType)) {
491             case ANEURALNETWORKS_BOOL:
492             case ANEURALNETWORKS_FLOAT16:
493             case ANEURALNETWORKS_FLOAT32:
494             case ANEURALNETWORKS_INT32:
495             case ANEURALNETWORKS_UINT32:
496             case ANEURALNETWORKS_OEM_SCALAR: {
497                 WrapperOperandType wrapperOperandType(wrapperType, {});
498                 mWrapperOperandType.push_back(wrapperOperandType);
499                 return WrapperModel::addOperand(&wrapperOperandType);
500             }
501 
502             case ANEURALNETWORKS_TENSOR_BOOL8:
503             case ANEURALNETWORKS_TENSOR_FLOAT16:
504             case ANEURALNETWORKS_TENSOR_FLOAT32:
505             case ANEURALNETWORKS_TENSOR_OEM_BYTE: {
506                 WrapperOperandType wrapperOperandType(wrapperType, {1});
507                 mWrapperOperandType.push_back(wrapperOperandType);
508                 return WrapperModel::addOperand(&wrapperOperandType);
509             }
510 
511             case ANEURALNETWORKS_TENSOR_INT32:
512             case ANEURALNETWORKS_TENSOR_QUANT8_ASYMM:
513             case ANEURALNETWORKS_TENSOR_QUANT8_SYMM:
514             case ANEURALNETWORKS_TENSOR_QUANT16_ASYMM:
515             case ANEURALNETWORKS_TENSOR_QUANT16_SYMM: {
516                 WrapperOperandType wrapperOperandType(wrapperType, {1}, 1.0f);
517                 mWrapperOperandType.push_back(wrapperOperandType);
518                 return WrapperModel::addOperand(&wrapperOperandType);
519             }
520 
521             case ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL: {
522                 WrapperOperandType wrapperOperandType(wrapperType, {1}, 0.0f, 0,
523                                                       WrapperSymmPerChannelQuantParams({1.0f}, 0));
524                 mWrapperOperandType.push_back(wrapperOperandType);
525                 return WrapperModel::addOperand(&wrapperOperandType);
526             }
527 
528             default:
529                 ADD_FAILURE() << "Unexpected type " << static_cast<uint32_t>(wrapperType);
530                 return ~uint32_t(0);
531         }
532     }
533 
534     enum class Dimensioned { NO, YES };
535 
536     // Create a V1_0 operation with two inputs and one output, specifying the
537     // operation kind (where 0 is the first V1_0 operation) and the input
538     // operand indexes.
539     // Returns the output operand index.
addOperation2To1V1_0(uint32_t operation,const uint32_t input0,const uint32_t input1,Dimensioned dimensionedOutput=Dimensioned::YES)540     uint32_t addOperation2To1V1_0(uint32_t operation, const uint32_t input0, const uint32_t input1,
541                                   Dimensioned dimensionedOutput = Dimensioned::YES) {
542         CHECK_LE(operation, kLastEncodingV1_0 - kFirstEncodingV1_0);
543         return addOperation2To1(operation + kFirstEncodingV1_0, input0, input1, dimensionedOutput);
544     }
545 
546     // Create a V1_1 operation with two inputs and one output, specifying the
547     // operation kind (where 0 is the first V1_1 operation) and the input
548     // operand indexes.
549     // Returns the output operand index.
addOperation2To1V1_1(uint32_t operation,const uint32_t input0,const uint32_t input1,Dimensioned dimensionedOutput=Dimensioned::YES)550     uint32_t addOperation2To1V1_1(uint32_t operation, const uint32_t input0, const uint32_t input1,
551                                   Dimensioned dimensionedOutput = Dimensioned::YES) {
552         CHECK_LE(operation, kLastEncodingV1_1 - kFirstEncodingV1_1);
553         return addOperation2To1(operation + kFirstEncodingV1_1, input0, input1, dimensionedOutput);
554     }
555 
556     // Create a V1_2 operation with two inputs and one output, specifying the
557     // operation kind (where 0 is the first V1_2 operation) and the input
558     // operand indexes.
559     // Returns the output operand index.
addOperation2To1V1_2(uint32_t operation,const uint32_t input0,const uint32_t input1,Dimensioned dimensionedOutput=Dimensioned::YES)560     uint32_t addOperation2To1V1_2(uint32_t operation, const uint32_t input0, const uint32_t input1,
561                                   Dimensioned dimensionedOutput = Dimensioned::YES) {
562         CHECK_LE(operation, kLastEncodingV1_2 - kFirstEncodingV1_2);
563         return addOperation2To1(operation + kFirstEncodingV1_2, input0, input1, dimensionedOutput);
564     }
565 
566     // Create an OEM operation with one input and one output,
567     // specifying the input operand index.  Returns the output operand
568     // index.
addOperationOEM1To1(const uint32_t input,Dimensioned dimensionedOutput=Dimensioned::YES)569     uint32_t addOperationOEM1To1(const uint32_t input,
570                                  Dimensioned dimensionedOutput = Dimensioned::YES) {
571         uint32_t output = addOperandOfSameType(input, dimensionedOutput);
572         addOperation(ANEURALNETWORKS_OEM_OPERATION, { input }, { output });
573         return output;
574     }
575 
576     // Run the partitioning algorithm to create an ExecutionPlan.
partitionTheWork(const std::vector<std::shared_ptr<Device>> & devices,ExecutePreference preference,ExecutionPlan * plan)577     int partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
578                          ExecutePreference preference, ExecutionPlan* plan) {
579         return reinterpret_cast<ModelBuilder*>(getHandle())->partitionTheWork(
580             devices, static_cast<uint32_t>(preference), plan);
581     }
582 
#ifdef VERBOSE
    // This is a debugging utility function.  It forwards this wrapper's
    // underlying ModelBuilder to the file-scope ::dump helper, labeled with
    // "name".
    void dump(const char* name) const {
        const ModelBuilder* mb = reinterpret_cast<const ModelBuilder*>(getHandle());
        ::dump(name, mb);
    }
#endif
590 
591 private:
592  // Create an operation with two inputs and one output, specifying
593  // the operation kind and the input operand indexes.
594  // Returns the output operand index.
addOperation2To1(uint32_t operation,const uint32_t input0,const uint32_t input1,Dimensioned dimensionedOutput=Dimensioned::YES)595  uint32_t addOperation2To1(uint32_t operation, const uint32_t input0, const uint32_t input1,
596                            Dimensioned dimensionedOutput = Dimensioned::YES) {
597      auto it = firstEncodingToOperation.lower_bound(operation);
598      CHECK(it != firstEncodingToOperation.end());
599      ANeuralNetworksOperationType type = it->second.first;
600      if (it->second.second) {
601          int32_t fuseCode = operation - it->first;
602          uint32_t input2 = addIntOperand(fuseCode);
603          uint32_t output = addOperandOfSameType(input0, dimensionedOutput);
604          addOperation(type, {input0, input1, input2}, {output});
605          return output;
606      } else {
607          uint32_t output = addOperandOfSameType(input0, dimensionedOutput);
608          addOperation(type, {input0, input1}, {output});
609          return output;
610      }
611  }
612 
613  // Create a scalar integer operand of the specified value, and
614  // return the corresponding operand index.
addIntOperand(int32_t value)615  uint32_t addIntOperand(int32_t value) {
616      uint32_t operand = addOperand(WrapperType::INT32);
617      setOperandValue(operand, &value, sizeof(value));
618      return operand;
619     }
620 
621     // Create an operand of the same type as the specified operand,
622     // and return the operand index of the new operand.
addOperandOfSameType(uint32_t operand,Dimensioned dimensioned=Dimensioned::YES)623     uint32_t addOperandOfSameType(uint32_t operand, Dimensioned dimensioned = Dimensioned::YES) {
624         WrapperOperandType type = mWrapperOperandType.at(operand);
625         for (auto& dimension : type.dimensions) {
626             dimension = (dimensioned == Dimensioned::YES);
627         }
628         mWrapperOperandType.push_back(type);
629         return WrapperModel::addOperand(&type);
630     }
631 
632     // operand index to operand type
633     std::vector<WrapperOperandType> mWrapperOperandType;
634 };
635 
636 // This class adds some utilities on top of WrapperCompilation.
637 class PartitioningCompilation : public WrapperCompilation {
638 public:
PartitioningCompilation(const PartitioningModel * model,const std::vector<std::shared_ptr<Device>> & devices)639  PartitioningCompilation(const PartitioningModel* model,
640                          const std::vector<std::shared_ptr<Device>>& devices) {
641      ModelBuilder* m = reinterpret_cast<ModelBuilder*>(model->getHandle());
642      CompilationBuilder* c = nullptr;
643      int result = m->createCompilation(&c, devices);
644      EXPECT_EQ(result, 0);
645      mCompilation = reinterpret_cast<ANeuralNetworksCompilation*>(c);
646  }
647 
setPartitioning(uint32_t partitioning)648  Result setPartitioning(uint32_t partitioning) {
649      return static_cast<Result>(builder()->setPartitioning(partitioning));
650     }
651 
652     using WrapperCompilation::finish;
653 
getExecutionPlan() const654     const ExecutionPlan& getExecutionPlan() const {
655         return builder()->forTest_getExecutionPlan();
656     }
657 
658 private:
builder()659     CompilationBuilder* builder() {
660         return reinterpret_cast<CompilationBuilder*>(getHandle());
661     }
662 
builder() const663     const CompilationBuilder* builder() const {
664         return reinterpret_cast<const CompilationBuilder*>(getHandle());
665     }
666 };
667 
// RETURN_TRUE() and RETURN_FALSE(MESSAGE) return a bool from the enclosing
// function; when VERBOSE is defined they also log the source line (and, for
// RETURN_FALSE, an optional streamed MESSAGE) to aid failure analysis.
#ifdef VERBOSE
#define RETURN_TRUE()                                                          \
    {                                                                          \
        std::cerr << "returning true from " << __LINE__ << std::endl;          \
        return true;                                                           \
    }
#else
#define RETURN_TRUE()                                                          \
    {                                                                          \
        return true;                                                           \
    }
#endif
#ifdef VERBOSE
#define RETURN_FALSE(MESSAGE)                                                  \
    {                                                                          \
        std::cerr << "returning false from " << __LINE__ MESSAGE << std::endl; \
        return false;                                                          \
    }
#else
#define RETURN_FALSE(MESSAGE)                                                  \
    {                                                                          \
        return false;                                                          \
    }
#endif
692 
693 class PartitioningTest : public ::testing::Test {
694 protected:
695     using RemapVectorType = ExecutionStep::RemapVectorType;
696     using SubModelOutputSetType = ExecutionStep::SubModelOutputSetType;
697 
SetUp()698     virtual void SetUp() {
699     }
700 
701     // From a vector of DeviceSpecification, create a vector of
702     // Devices.
    struct DeviceSpecification {
        // Fully explicit form: capabilities are supplied directly.
        DeviceSpecification(const std::string& name, const Capabilities& capabilities,
                            uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : mName(name),
              mVersionString(kVersionString),
              mCapabilities(capabilities),
              mOperationMask(operationMask),
              mOEM(oem) {}
        // Uniform performance "perf" for both regular and relaxed execution.
        DeviceSpecification(const std::string& name, float perf, uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : DeviceSpecification(name, perf, perf, operationMask, oem) {}
        // Separate performance for relaxed-precision execution.
        DeviceSpecification(const std::string& name, float perf, float perfRelaxed,
                            uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : DeviceSpecification(name, kVersionString, perf, perfRelaxed, operationMask, oem) {}
        // Custom driver version string, uniform performance.
        DeviceSpecification(const std::string& name, const std::string& version, float perf,
                            uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : DeviceSpecification(name, version, perf, perf, operationMask, oem) {}
        // Most general form: synthesizes Capabilities from the scalar
        // performance numbers (uniform across all operand types).
        DeviceSpecification(const std::string& name, const std::string& version, float perf,
                            float perfRelaxed, uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo)
            : mName(name), mVersionString(version), mOperationMask(operationMask), mOEM(oem) {
            PerformanceInfo perfRelaxedInfo = {.execTime = perfRelaxed, .powerUsage = perfRelaxed};
            mCapabilities = {.relaxedFloat32toFloat16PerformanceScalar = perfRelaxedInfo,
                             .relaxedFloat32toFloat16PerformanceTensor = perfRelaxedInfo,
                             .operandPerformance = ::android::nn::nonExtensionOperandPerformance(
                                     {.execTime = perf, .powerUsage = perf})};
        }
        // Specify the HAL version the driver reports, with one low-order-
        // aligned operation mask per HAL version (see makeOperationMask).
        DeviceSpecification(const std::string& name, float perf, HalVersion halVersion,
                            uint32_t operationMaskV1_0, uint32_t operationMaskV1_1 = 0,
                            uint32_t operationMaskV1_2 = 0)
            : DeviceSpecification(name, perf, perf,
                                  makeOperationMask(halVersion, operationMaskV1_0,
                                                    operationMaskV1_1, operationMaskV1_2)) {
            mHalVersion = halVersion;
        }

        std::string mName;
        std::string mVersionString;
        Capabilities mCapabilities;
        HalVersion mHalVersion = HalVersion::LATEST;
        uint32_t mOperationMask;
        PartitioningDriver::OEM mOEM = PartitioningDriver::OEMNo;

        static constexpr char kVersionString[] = "JUST_AN_EXAMPLE";

       private:
        // This function takes three operation masks aligned at the low-order
        // bit -- one mask each for V1_0, V1_1, and V1_2 -- and produces a single
        // composite operation mask, formed by shifting each of the input
        // operation masks appropriately and ORing the results together.
        //
        // For convenience, any bits of an input mask that are too high order
        // for that mask are discarded -- this allows ~0 to be a legal input
        // mask.
        //
        // For the sake of example, assume that each low order mask is 4 bits
        // wide, and take some artistic license to write literals in binary.
        // Then:
        //
        //     assert(makeOperationMask(HalVersion::V1_2, 0b0110, 0b1001, 0b0101) ==
        //            0b 0101 1001 0110);
        //
        // This is used by a DeviceSpecification constructor to build a mask of
        // operations to be supported by the device.
        static uint32_t makeOperationMask(HalVersion halVersion, uint32_t operationMaskV1_0,
                                          uint32_t operationMaskV1_1, uint32_t operationMaskV1_2) {
            // A mask for a version the device doesn't support must be empty.
            if (halVersion < HalVersion::V1_2) {
                CHECK(!operationMaskV1_2);
            }
            if (halVersion < HalVersion::V1_1) {
                CHECK(!operationMaskV1_1);
            }
            auto maskOfWidth = [](uint32_t width) -> uint32_t { return (1U << width) - 1; };
            static const uint32_t kOperationMaskV1_0 =
                    maskOfWidth(kLastEncodingV1_0 - kFirstEncodingV1_0 + 1);
            static const uint32_t kOperationMaskV1_1 =
                    maskOfWidth(kLastEncodingV1_1 - kFirstEncodingV1_1 + 1);
            static const uint32_t kOperationMaskV1_2 =
                    maskOfWidth(kLastEncodingV1_2 - kFirstEncodingV1_2 + 1);
            return ((operationMaskV1_0 & kOperationMaskV1_0) << kFirstEncodingV1_0) |
                   ((operationMaskV1_1 & kOperationMaskV1_1) << kFirstEncodingV1_1) |
                   ((operationMaskV1_2 & kOperationMaskV1_2) << kFirstEncodingV1_2);
        }
    };
makeDevices(std::vector<DeviceSpecification> specifications)790     static std::vector<std::shared_ptr<Device>> makeDevices(
791             std::vector<DeviceSpecification> specifications) {
792         std::vector<std::shared_ptr<Device>> devices;
793         for (const auto& specification : specifications) {
794             V1_0::IDevice* halDriver = nullptr;
795             switch (specification.mHalVersion) {
796                 case HalVersion::V1_2:
797                     halDriver = new PartitioningDriver(
798                             specification.mName.c_str(), specification.mVersionString.c_str(),
799                             specification.mCapabilities, specification.mOperationMask,
800                             specification.mOEM);
801                     break;
802                 case HalVersion::V1_1:
803                     halDriver = new PartitioningDriverV1_1(
804                             specification.mName.c_str(), specification.mVersionString.c_str(),
805                             specification.mCapabilities, specification.mOperationMask,
806                             specification.mOEM);
807                     break;
808                 case HalVersion::V1_0:
809                     halDriver = new PartitioningDriverV1_0(
810                             specification.mName.c_str(), specification.mVersionString.c_str(),
811                             specification.mCapabilities, specification.mOperationMask,
812                             specification.mOEM);
813                     break;
814                 default:
815                     ADD_FAILURE() << "Unexpected";
816             }
817             auto device = DeviceManager::forTest_makeDriverDevice(specification.mName, halDriver);
818             devices.push_back(device);
819         }
820         devices.push_back(DeviceManager::getCpuDevice());
821         return devices;
822     }
823 
824     /*-- Graph comparision ----------------------------------------------------------------*/
825 
826     // An operand with certain values for its lifetime does not have a
827     // defining operation in the graph.  For the purposes of the graph
828     // comparison algorithm, we encode the "defining operation" index of
829     // such an operand as follows:
830     // - NO_VALUE       kPseudoDefiningOperationNoValue
831     // - MODEL_INPUT    kPseudoDefiningOperationModelInput0 + (position in list of inputs)
832     // - CONSTANT_COPY  kPseudoDefiningOperationConstantCopy0 + (constant value)
833     //                    Note: For the graphs we build in this test, we
834     //                          only expect to see 4-byte constants within
835     //                          a very restricted range, so we only make
836     //                          room for such constants in our encoding
837     //                          space.
838     // We do not expect to see CONSTANT_REFERENCE, and so we do not handle
839     // it.
840     //
841     // The encoding is intended to be relatively human readable; it is not
842     // designed to represent some optimal balance of ranges for the items
843     // within its scope (actual operations, inputs, constants).
844 
    // Encodings of "pseudo defining operations" for operands that have no
    // real defining operation; see the comment block above for the scheme.
    enum PseudoDefiningOperationEncodings : uint32_t {
        kPseudoDefiningOperationModelInput0   = 0x80000000U,
        kPseudoDefiningOperationConstantCopy0 = 0x90000000U,
        kPseudoDefiningOperationNoValue       = 0xeeeeeeeeU,

        // lowest value for special encoding
        kPseudoDefiningOperationBase          = 0x80000000U,

        // range of encoded input or constant
        kPseudoDefiningOperationRange         = 0x10000000U,
    };
856 
857     // Build a map from operand to defining operation.
858     // TODO: Replace map with vector?
    // Build a map from operand index to defining-operation index (or
    // pseudo-defining-operation encoding -- see
    // PseudoDefiningOperationEncodings).  Uses gtest fatal assertions, so it
    // must be called via ASSERT_NO_FATAL_FAILURE or followed by a
    // HasFatalFailure() check.
    void buildDefinitionMap(const ModelBuilder* model,
                            std::map<uint32_t, uint32_t>* defMap) {
        // actual definitions: each operation defines its outputs
        ASSERT_LT(model->operationCount(), kPseudoDefiningOperationBase);
        for (uint32_t i = 0, e = model->operationCount(); i < e; i++) {
            const Operation& operation = model->getOperation(i);
            for (uint32_t output : operation.outputs) {
                (*defMap)[output] = i;
            }
        }
        // inputs: encoded as kPseudoDefiningOperationModelInput0 + position
        ASSERT_LT(model->inputCount(), kPseudoDefiningOperationRange);
        for (uint32_t i = 0, e = model->inputCount(); i < e; i++) {
            (*defMap)[model->getInputOperandIndex(i)] = kPseudoDefiningOperationModelInput0 + i;
        }
        // look for NO_VALUE and CONSTANT_COPY
        for (uint32_t i = 0, e = model->operandCount(); i < e; i++) {
            const Operand& operand = model->getOperand(i);
            switch (operand.lifetime) {
                case OperandLifeTime::NO_VALUE:
                    (*defMap)[i] = kPseudoDefiningOperationNoValue;
                    break;
                case OperandLifeTime::CONSTANT_COPY: {
                    // Only 4-byte constants in a restricted range are
                    // expected (see the encoding-scheme comment above).
                    ASSERT_EQ(operand.location.length, sizeof(uint32_t));
                    uint32_t value;
                    memcpy(&value, model->getPointerToOperandValue(operand.location.offset), sizeof(uint32_t));
                    ASSERT_LT(value, kPseudoDefiningOperationNoValue);
                    (*defMap)[i] = kPseudoDefiningOperationConstantCopy0 + value;
                    break;
                }
                case OperandLifeTime::TEMPORARY_VARIABLE:
                case OperandLifeTime::MODEL_INPUT:
                case OperandLifeTime::MODEL_OUTPUT:
                    // already handled
                    break;
                default:
                    FAIL();
                    break;
            }
        }
        // sanity check: every operand must have a (pseudo) definition
        ASSERT_EQ(model->operandCount(), defMap->size());
    }
902 
#ifdef VERBOSE
    // Debugging utility: prints "name: { (key, value), ... }", rendering any
    // pseudo-definition encodings (values >= kPseudoDefiningOperationBase) in
    // hex.
    void dump(const char* name, const std::map<uint32_t, uint32_t>* aMap) {
        auto writeNum = [](uint32_t num) {
            if (num >= kPseudoDefiningOperationBase) {
                std::cout << "0x" << std::hex << num << std::dec;
            } else {
                std::cout << num;
            }
        };

        std::cout << name << ": { ";
        const char* separator = "";
        for (const auto& entry : *aMap) {
            std::cout << separator;
            separator = ", ";
            std::cout << "(";
            writeNum(entry.first);
            std::cout << ", ";
            writeNum(entry.second);
            std::cout << ")";
        }
        std::cout << " }" << std::endl;
    }
#endif
930 
compare(const Operand & operandA,const Operand & operandB)931     bool compare(const Operand& operandA, const Operand& operandB) {
932         if (operandA.type != operandB.type ||
933             operandA.dimensions != operandB.dimensions ||
934             operandA.numberOfConsumers != operandB.numberOfConsumers ||
935             operandA.scale != operandB.scale ||
936             operandA.zeroPoint != operandB.zeroPoint) {
937             return false;
938         }
939         return true;
940     }
941 
942     // Compare two graphs.  We ignore operand and operation indexes (i.e.,
943     // two nodes can be the same even if they are numbered differently)
944     // but we also ignore semantics (e.g., even if an operation kind is
945     // such that the operand is commutative, we still pay attention to the
946     // order of its input operands).
947     //
948     // The comparison algorithm works by walking modelA from outputs
949     // towards inputs, along the edge from each operand to its
950     // defining operation, and then along the edges to the operation's
951     // input operands.  At each step along the way, we try to match up
952     // operands and operations from modelA with equivalent operands
953     // and operations from modelB.
954     //
955     // We start by assuming that modelA's outputs and modelB's outputs
956     // match positionally (e.g., modelA's first output operand is
957     // equivalent to modelB's first output operand).  Once we've
958     // discovered two equivalent operands (such as those outputs), we
959     // place them in a work queue.  We repeatedly pull operands off
960     // the queue and compare their defining operations and those
961     // operations' input operands, to discover more pairs of
962     // equivalent operands.  If we ever find operations that do not
963     // match (e.g., because operation kind differs), or operands that
964     // do not match (e.g., because operand type differs); or if we
965     // ever find a conflict (we've already decided that operand A's
966     // equivalent operand is B0, but it looks like we need its
967     // equivalent operand to be B1); then the graphs compare unequal.
968     // Otherwise, we'll eventually exhaust the work queue, and
969     // conclude that the graphs compare equal.
970     //
971     // As a side effect of the comparison, we produce a map
972     // *inputsAndOutputsBToA that maps from each of the model input and output
973     // operand numbers of modelB to the corresponding operand numbers of modelA.
974     // If the comparison returns false, the contents of the map are undefined.
    // Compare two graphs for equivalence; see the algorithm description in
    // the comment block above.  Returns true iff the graphs compare equal; on
    // success also fills *inputsAndOutputsBToA (modelB input/output operand
    // number -> corresponding modelA operand number).
    bool compare(const ModelBuilder* modelA, const ModelBuilder* modelB,
                 std::map<uint32_t, uint32_t>* inputsAndOutputsBToA) {
        CHECK(inputsAndOutputsBToA != nullptr);
        EXPECT_TRUE(inputsAndOutputsBToA->empty());

#ifdef VERBOSE
        ::dump("compare(A)", modelA);
        ::dump("compare(B)", modelB);
#endif

        // Cheap rejection: equivalent graphs must agree on all counts.
        if (modelA->operandCount()   != modelB->operandCount()   ||
            modelA->operationCount() != modelB->operationCount() ||
            modelA->inputCount()     != modelB->inputCount()     ||
            modelA->outputCount()    != modelB->outputCount()) {
            RETURN_FALSE();
        }

        // Maps from operand index to index of defining operation.
        std::map<uint32_t, uint32_t> defsA, defsB;
        buildDefinitionMap(modelA, &defsA);
        buildDefinitionMap(modelB, &defsB);
        if (HasFatalFailure()) return false;

        // Maps from operand index in modelA to equivalent operand index
        // in modelB; and from operation index in modelA to equivalent
        // operation index in modelB.
        std::map<uint32_t, uint32_t> equivalentOperandsAToB;
        std::map<uint32_t, uint32_t> equivalentOperationsAToB;

        // Queue of operand indexes from modelA, each of whose defining
        // operations are to be checked for equivalence with modelB.
        std::queue<uint32_t> workQueueOperandsA;

        // Seed operand equivalence map and work queue from model outputs
        // (assumed to match positionally).
        for (uint32_t i = 0, e = modelA->outputCount(); i < e; i++) {
            uint32_t outputA = modelA->getOutputOperandIndex(i);
            uint32_t outputB = modelB->getOutputOperandIndex(i);
            if (!compare(modelA->getOperand(outputA), modelB->getOperand(outputB))) {
                RETURN_FALSE();
            }
            equivalentOperandsAToB[outputA] = outputB;
            workQueueOperandsA.push(outputA);
        }

#ifdef VERBOSE
        dump("defsA", &defsA);
        dump("defsB", &defsB);
#endif

        // Process the queue.
        uint32_t pseudoDefinitionCount = 0;
        while (!workQueueOperandsA.empty()) {
#ifdef VERBOSE
            dump("equivalentOperandsAToB", &equivalentOperandsAToB);
            dump("equivalentOperationsAToB", &equivalentOperationsAToB);
#endif
            uint32_t operandIndexA = workQueueOperandsA.front();
#ifdef VERBOSE
            std::cout << "operandIndexA: " << operandIndexA << std::endl;
#endif
            workQueueOperandsA.pop();
            uint32_t operandIndexB = equivalentOperandsAToB.at(operandIndexA);

            uint32_t operationIndexA = defsA.at(operandIndexA);
            uint32_t operationIndexB = defsB.at(operandIndexB);
            auto it = equivalentOperationsAToB.find(operationIndexA);
            if (it != equivalentOperationsAToB.end()) {
                // Already matched; just check for a conflict.
                if (it->second != operationIndexB) {
                    RETURN_FALSE();
                }
                continue;
            }

            // We haven't identified an equivalent operation for
            // operationIndexA.

            if ((operationIndexA >= kPseudoDefiningOperationBase) !=
                (operationIndexB >= kPseudoDefiningOperationBase)) {
                RETURN_FALSE();
            }
            // Either both operands have pseudo-definitions, or neither
            // does.
            if (operationIndexA >= kPseudoDefiningOperationBase) {
                // Both operands have pseudo-definitions.  Pseudo-definitions
                // match only if their encodings are identical.
                if (operationIndexA != operationIndexB) {
                    RETURN_FALSE();
                }
                equivalentOperationsAToB[operationIndexA] = operationIndexB;
                ++pseudoDefinitionCount;
                continue;
            }

            // If we get here, neither operation A nor operation B is a
            // pseudo-definition.

            const Operation& operationA = modelA->getOperation(operationIndexA);
            const Operation& operationB = modelB->getOperation(operationIndexB);
            if (operationA.type != operationB.type ||
                operationA.inputs.size() != operationB.inputs.size() ||
                operationA.outputs.size() != operationB.outputs.size()) {
                RETURN_FALSE();
            }
            equivalentOperationsAToB[operationIndexA] = operationIndexB;
            // Match the operations' inputs positionally, enqueueing newly
            // discovered operand equivalences.
            for (uint32_t i = 0, e = operationA.inputs.size(); i < e; i++) {
                uint32_t inputA = operationA.inputs[i];
                uint32_t inputB = operationB.inputs[i];
                auto it = equivalentOperandsAToB.find(inputA);
                if (it != equivalentOperandsAToB.end()) {
                    if (it->second != inputB) {
                        RETURN_FALSE();
                    }
                    continue;
                }
                // We haven't identified an equivalent operand for inputA.
                if (!compare(modelA->getOperand(inputA), modelB->getOperand(inputB))) {
                    RETURN_FALSE();
                }
                equivalentOperandsAToB[inputA] = inputB;
                workQueueOperandsA.push(inputA);
            }
        }

        // Sanity check: every operand and operation must have been matched.
        if (modelA->operandCount() != defsA.size() ||
            modelA->operandCount() != defsB.size() ||
            modelA->operandCount() != equivalentOperandsAToB.size() ||
            modelA->operationCount() + pseudoDefinitionCount != equivalentOperationsAToB.size()) {
            RETURN_FALSE();
        }

        // Build *inputsAndOutputsBToA
        for (uint32_t aInputIndex : modelA->getInputOperandIndexes()) {
            (*inputsAndOutputsBToA)[equivalentOperandsAToB.at(aInputIndex)] = aInputIndex;
        }
        for (uint32_t aOutputIndex : modelA->getOutputOperandIndexes()) {
            (*inputsAndOutputsBToA)[equivalentOperandsAToB.at(aOutputIndex)] = aOutputIndex;
        }

        RETURN_TRUE();
    }
1115 
1116     /*-------------------------------------------------------------------------------------*/
1117 
1118     // As a side effect of the comparison, we produce a map
1119     // *inputsAndOutputsModelToStep that maps from each of the model input and
1120     // output operand numbers of "model" to the corresponding operand numbers of
1121     // the submodel from "step".  If the comparison returns false, the contents
1122     // of the map are undefined.
compare(std::shared_ptr<const ExecutionStep> step,const PartitioningModel * model,std::shared_ptr<Device> device,std::map<uint32_t,uint32_t> * inputsAndOutputsModelToStep)1123     bool compare(std::shared_ptr<const ExecutionStep> step, const PartitioningModel* model,
1124                  std::shared_ptr<Device> device,
1125                  std::map<uint32_t, uint32_t>* inputsAndOutputsModelToStep) {
1126         return (step->getDevice() == device) &&
1127                compare(step->getSubModel(),
1128                        reinterpret_cast<const ModelBuilder*>(model->getHandle()),
1129                        inputsAndOutputsModelToStep);
1130     }
1131 
    // Check that "step" executes on "device", that its submodel is
    // graph-equivalent to "model", and that each category of step
    // inputs/outputs matches the expected vectors/sets (which are expressed
    // in terms of "model" operand numbers and translated via the map produced
    // by the graph comparison).  Uses gtest fatal assertions, so call via
    // ASSERT_NO_FATAL_FAILURE.
    void compare(std::shared_ptr<const ExecutionStep> step, const PartitioningModel* model,
                 std::shared_ptr<Device> device, const RemapVectorType& modelInputs,
                 const RemapVectorType& modelOutputs, const RemapVectorType& tempsAsSubModelInputs,
                 const SubModelOutputSetType& tempsAsSubModelOutputs,
                 const RemapVectorType& outputsAsSubModelInputs) {
        std::map<uint32_t, uint32_t> inputsAndOutputsModelToStep;
        ASSERT_NO_FATAL_FAILURE(
                ASSERT_TRUE(compare(step, model, device, &inputsAndOutputsModelToStep)));
        ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep, step->getModelInputs(),
                                        modelInputs));
        ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep, step->getModelOutputs(),
                                        modelOutputs));
        ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep,
                                        step->getTempsAsSubModelInputs(), tempsAsSubModelInputs));
        ASSERT_TRUE(compareSubModelOutputSets(inputsAndOutputsModelToStep,
                                              step->getTempsAsSubModelOutputs(),
                                              tempsAsSubModelOutputs));
        ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep,
                                        step->getOutputsAsSubModelInputs(),
                                        outputsAsSubModelInputs));
    }
1153 
1154    private:
compareRemapVectors(const std::map<uint32_t,uint32_t> & inputsAndOutputsModelToStep,const RemapVectorType & step,RemapVectorType model)1155     static bool compareRemapVectors(const std::map<uint32_t, uint32_t>& inputsAndOutputsModelToStep,
1156                                     const RemapVectorType& step, RemapVectorType model) {
1157         std::transform(model.begin(), model.end(), model.begin(),
1158                        [&inputsAndOutputsModelToStep](const RemapVectorType::value_type& val) {
1159                            return std::make_pair(val.first,
1160                                                  inputsAndOutputsModelToStep.at(val.second));
1161                        });
1162         return step == model;
1163     }
1164 
compareSubModelOutputSets(const std::map<uint32_t,uint32_t> & inputsAndOutputsModelToStep,const SubModelOutputSetType & step,const SubModelOutputSetType & model)1165     static bool compareSubModelOutputSets(
1166             const std::map<uint32_t, uint32_t>& inputsAndOutputsModelToStep,
1167             const SubModelOutputSetType& step, const SubModelOutputSetType& model) {
1168         SubModelOutputSetType modelTransformed;
1169         std::transform(
1170                 model.begin(), model.end(), std::inserter(modelTransformed, modelTransformed.end()),
1171                 [&inputsAndOutputsModelToStep](const SubModelOutputSetType::value_type& val) {
1172                     return std::make_pair(val.first, inputsAndOutputsModelToStep.at(val.second));
1173                 });
1174         return step == modelTransformed;
1175     }
1176 };
1177 
TEST_F(PartitioningTest,SimpleModel)1178 TEST_F(PartitioningTest, SimpleModel) {
1179     PartitioningModel model;
1180     uint32_t opnd0 = model.addFloatOperand();
1181     uint32_t opnd1 = model.addFloatOperand();
1182     uint32_t opnd2 = model.addOperation2To1V1_0(0, opnd0, opnd1);
1183     uint32_t opnd3 = model.addFloatOperand();
1184     uint32_t opnd4 = model.addOperation2To1V1_0(1, opnd2, opnd3);
1185     model.identifyInputsAndOutputs({ opnd0, opnd1, opnd3 }, { opnd4 });
1186     model.finish();
1187     ASSERT_TRUE(model.isValid());
1188 
1189     // Simple partition (two devices are each capable of everything, one is the best).
1190     // No need to compare the original model to the model from the plan -- we
1191     // didn't actually do any partitioning.
1192     const auto devicesA = makeDevices({{"bad", 0.9, ~0U}, {"good", 0.5, ~0U}});
1193     ExecutionPlan planA;
1194     ASSERT_EQ(model.partitionTheWork(devicesA, ExecutePreference::PREFER_LOW_POWER, &planA),
1195               ANEURALNETWORKS_NO_ERROR);
1196     ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
1197     ASSERT_NE(planA.forTest_simpleGetDevice().get(), nullptr);
1198     ASSERT_STREQ(planA.forTest_simpleGetDevice()->getName(), "good");
1199 
1200     // Simple partition (two devices are each capable of everything, none better than CPU).
1201     // No need to compare the original model to the model from the plan -- we
1202     // didn't actually do any partitioning.
1203     const auto devicesC = makeDevices({{"bad", 1.1, ~0U}, {"bad2", 1.0, ~0U}});
1204     ExecutionPlan planC;
1205     ASSERT_EQ(model.partitionTheWork(devicesC, ExecutePreference::PREFER_LOW_POWER, &planC),
1206               ANEURALNETWORKS_NO_ERROR);
1207     ASSERT_EQ(planC.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
1208     ASSERT_EQ(planC.forTest_simpleGetDevice(), DeviceManager::getCpuDevice());
1209 
1210     // Compound partition (two devices, each is capable of one of the
1211     // two operations).  We could do more extensive checking here --
1212     // for example, verify that each step within the plan has the
1213     // correct (model and submodel)x(inputs and outputs).
1214     const auto devicesB = makeDevices({{"0", 0.9, 1 << 0}, {"1", 0.5, 1 << 1}});
1215     ExecutionPlan planB;
1216     ASSERT_EQ(model.partitionTheWork(devicesB, ExecutePreference::PREFER_LOW_POWER, &planB),
1217               ANEURALNETWORKS_NO_ERROR);
1218     ASSERT_EQ(planB.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
1219     const auto& stepsB = planB.forTest_compoundGetSteps();
1220     ASSERT_EQ(stepsB.size(), size_t(2));
1221     {
1222         // Build a model to compare against the submodel from stepsB[0].
1223         PartitioningModel modelB0;
1224         uint32_t b0Opnd0 = modelB0.addFloatOperand();
1225         uint32_t b0Opnd1 = modelB0.addFloatOperand();
1226         uint32_t b0Opnd2 = modelB0.addOperation2To1V1_0(0, b0Opnd0, b0Opnd1);
1227         modelB0.identifyInputsAndOutputs({ b0Opnd0, b0Opnd1 }, { b0Opnd2 });
1228         modelB0.finish();
1229         ASSERT_TRUE(modelB0.isValid());
1230 
1231         ASSERT_NO_FATAL_FAILURE(
1232                 compare(stepsB[0], &modelB0, devicesB[0],
1233                         RemapVectorType{{opnd0, b0Opnd0}, {opnd1, b0Opnd1}},  // modelInputs
1234                         RemapVectorType{},                                    // modelOutputs
1235                         RemapVectorType{},                        // tempsAsSubModelInputs
1236                         SubModelOutputSetType{{opnd2, b0Opnd2}},  // tempsAsSubModelOutputs
1237                         RemapVectorType{}));                      // outputsAsSubModelInputs;
1238     }
1239     {
1240         // Build a model to compare against the submodel from stepsB[1].
1241         PartitioningModel modelB1;
1242         uint32_t b1Opnd2 = modelB1.addFloatOperand();
1243         uint32_t b1Opnd3 = modelB1.addFloatOperand();
1244         uint32_t b1Opnd4 = modelB1.addOperation2To1V1_0(1, b1Opnd2, b1Opnd3);
1245         // Note: In the partitioning algorithm, submodel inputs follow
1246         // model inputs.  In the original model "model", opnd2 is not
1247         // an input; so in the submodel "modelB1", the corresponding
1248         // input b1Opnd2 is a submodel input, and must follow the
1249         // model input b1Opnd3.
1250         modelB1.identifyInputsAndOutputs({ b1Opnd3, b1Opnd2 }, { b1Opnd4 });
1251         modelB1.finish();
1252         ASSERT_TRUE(modelB1.isValid());
1253 
1254         ASSERT_NO_FATAL_FAILURE(compare(stepsB[1], &modelB1, devicesB[1],
1255                                         RemapVectorType{{opnd3, b1Opnd3}},  // modelInputs
1256                                         RemapVectorType{{opnd4, b1Opnd4}},  // modelOutputs
1257                                         RemapVectorType{{opnd2, b1Opnd2}},  // tempsAsSubModelInputs
1258                                         SubModelOutputSetType{},  // tempsAsSubModelOutputs
1259                                         RemapVectorType{}));      // outputsAsSubModelInputs
1260     }
1261 }
1262 
TEST_F(PartitioningTest,SliceModel)1263 TEST_F(PartitioningTest, SliceModel) {
1264     PartitioningModel model;
1265     uint32_t opnd0 = model.addFloatOperand();
1266     uint32_t opnd1 = model.addFloatOperand();
1267     uint32_t opnd2 = model.addOperation2To1V1_0(0, opnd0, opnd1);
1268     uint32_t opnd3 = model.addOperation2To1V1_0(1, opnd0, opnd1);
1269     uint32_t opnd4 = model.addOperation2To1V1_1(0, opnd0, opnd1);
1270     uint32_t opnd5 = model.addOperation2To1V1_2(0, opnd2, opnd3);
1271     model.identifyInputsAndOutputs({opnd0, opnd1}, {opnd2, opnd4, opnd5});
1272     model.finish();
1273     ASSERT_TRUE(model.isValid());
1274 
1275     // Simple partition (V1_0, V1_1, V1_2 devices are available; V1_2 has best perf).
1276     // No need to compare the original model to the model from the plan -- we
1277     // didn't actually do any partitioning.
1278     const auto devicesA = makeDevices({{"V1_0", 0.8, HalVersion::V1_0, ~0U},
1279                                        {"V1_1", 0.7, HalVersion::V1_1, ~0U, ~0U},
1280                                        {"V1_2", 0.6, HalVersion::V1_2, ~0U, ~0U, ~0U}});
1281     ExecutionPlan planA;
1282     ASSERT_EQ(model.partitionTheWork(devicesA, ExecutePreference::PREFER_LOW_POWER, &planA),
1283               ANEURALNETWORKS_NO_ERROR);
1284     ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
1285     ASSERT_NE(planA.forTest_simpleGetDevice().get(), nullptr);
1286     ASSERT_STREQ(planA.forTest_simpleGetDevice()->getName(), "V1_2");
1287 
1288     // Compound partition (V1_0, V1_1, V1_2 devices are available, in decreasing
1289     // order of performance; model is distributed across all three devices).
1290     const auto devicesB = makeDevices({{"V1_0", 0.6, HalVersion::V1_0, ~0U},
1291                                        {"V1_1", 0.7, HalVersion::V1_1, ~0U, ~0U},
1292                                        {"V1_2", 0.8, HalVersion::V1_2, ~0U, ~0U, ~0U}});
1293     ExecutionPlan planB;
1294     ASSERT_EQ(model.partitionTheWork(devicesB, ExecutePreference::PREFER_LOW_POWER, &planB),
1295               ANEURALNETWORKS_NO_ERROR);
1296     ASSERT_EQ(planB.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
1297     const auto& stepsB = planB.forTest_compoundGetSteps();
1298     ASSERT_EQ(stepsB.size(), size_t(3));
1299     {
1300         // Build a model to compare against the submodel from stepsB[0].
1301         PartitioningModel modelB0;
1302         uint32_t b0Opnd0 = modelB0.addFloatOperand();
1303         uint32_t b0Opnd1 = modelB0.addFloatOperand();
1304         uint32_t b0Opnd2 = modelB0.addOperation2To1V1_1(0, b0Opnd0, b0Opnd1);
1305         modelB0.identifyInputsAndOutputs({b0Opnd0, b0Opnd1}, {b0Opnd2});
1306         modelB0.finish();
1307         ASSERT_TRUE(modelB0.isValid());
1308 
1309         ASSERT_NO_FATAL_FAILURE(
1310                 compare(stepsB[0], &modelB0, devicesB[1],
1311                         RemapVectorType{{opnd0, b0Opnd0}, {opnd1, b0Opnd1}},  // modelInputs
1312                         RemapVectorType{{opnd4, b0Opnd2}},                    // modelOutputs
1313                         RemapVectorType{},        // tempsAsSubModelInputs
1314                         SubModelOutputSetType{},  // tempsAsSubModelOutputs
1315                         RemapVectorType{}));      // outputsAsSubModelInputs
1316     }
1317     {
1318         // Build a model to compare against the submodel from stepsB[1].
1319         PartitioningModel modelB1;
1320         uint32_t b1Opnd0 = modelB1.addFloatOperand();
1321         uint32_t b1Opnd1 = modelB1.addFloatOperand();
1322         uint32_t b1Opnd2 = modelB1.addOperation2To1V1_0(0, b1Opnd0, b1Opnd1);
1323         uint32_t b1Opnd3 = modelB1.addOperation2To1V1_0(1, b1Opnd0, b1Opnd1);
1324         modelB1.identifyInputsAndOutputs({b1Opnd0, b1Opnd1}, {b1Opnd2, b1Opnd3});
1325         modelB1.finish();
1326         ASSERT_TRUE(modelB1.isValid());
1327 
1328         ASSERT_NO_FATAL_FAILURE(
1329                 compare(stepsB[1], &modelB1, devicesB[0],
1330                         RemapVectorType{{opnd0, b1Opnd0}, {opnd1, b1Opnd1}},  // modelInputs
1331                         RemapVectorType{{opnd2, b1Opnd2}},                    // modelOutputs
1332                         RemapVectorType{},                        // tempsAsSubModelInputs
1333                         SubModelOutputSetType{{opnd3, b1Opnd3}},  // tempsAsSubModelOutputs
1334                         RemapVectorType{}));                      // outputsAsSubModelInputs
1335     }
1336     {
1337         // Build a model to compare against the submodel from stepsB[2].
1338         PartitioningModel modelB2;
1339         uint32_t b2Opnd0 = modelB2.addFloatOperand();
1340         uint32_t b2Opnd1 = modelB2.addFloatOperand();
1341         uint32_t b2Opnd2 = modelB2.addOperation2To1V1_2(0, b2Opnd0, b2Opnd1);
1342         // Note: In the partitioning algorithm, temps that are
1343         // submodel inputs precede model outputs that are submodel
1344         // inputs.  In the original model "model", opnd3 is a temp and
1345         // opnd2 is a model output; so in the submodel "modelB2", the
1346         // corresponding inputs b2Opnd1 and b2Opnd0 must appear in
1347         // that order.
1348         modelB2.identifyInputsAndOutputs({b2Opnd1, b2Opnd0}, {b2Opnd2});
1349         modelB2.finish();
1350         ASSERT_TRUE(modelB2.isValid());
1351 
1352         ASSERT_NO_FATAL_FAILURE(
1353                 compare(stepsB[2], &modelB2, devicesB[2], RemapVectorType{},  // modelInputs
1354                         RemapVectorType{{opnd5, b2Opnd2}},                    // modelOutputs
1355                         RemapVectorType{{opnd3, b2Opnd1}},    // tempsAsSubModelInputs
1356                         SubModelOutputSetType{},              // tempsAsSubModelOutputs
1357                         RemapVectorType{{opnd2, b2Opnd0}}));  // outputsAsSubModelInputs
1358     }
1359 
1360     // TODO: Make sure this still works when we have multiple devices
1361     // of same version available for slicing. An easy (?) choice would
1362     // be to route the two different V1_0 operations to different
1363     // devices.
1364 }
1365 
TEST_F(PartitioningTest, SliceModelToEmpty) {
    // Trivial model consisting of a single V1_2 operation.
    PartitioningModel model;
    const uint32_t op0 = model.addFloatOperand();
    const uint32_t op1 = model.addFloatOperand();
    const uint32_t op2 = model.addOperation2To1V1_2(0, op0, op1);
    model.identifyInputsAndOutputs({op0, op1}, {op2});
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Only the V1_2 device can handle any operation in the model, so the
    // plan must be SIMPLE on that device even though the older devices
    // nominally perform better.  No need to compare the original model to
    // the model from the plan -- we didn't actually do any partitioning.
    const auto devices = makeDevices({{"V1_0", 0.6, HalVersion::V1_0, ~0U},
                                      {"V1_1", 0.7, HalVersion::V1_1, ~0U, ~0U},
                                      {"V1_2", 0.8, HalVersion::V1_2, ~0U, ~0U, ~0U}});
    ExecutionPlan plan;
    ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_NE(plan.forTest_simpleGetDevice().get(), nullptr);
    ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), "V1_2");
}
1388 
TEST_F(PartitioningTest,Cpu)1389 TEST_F(PartitioningTest, Cpu) {
1390     // Here's a model where some operations execute only on the Cpu.
1391     // To make things interesting, we produce three partitions --
1392     // device, cpu, same-device.
1393 
1394     static const uint32_t kCpuOp = 1;
1395     static const uint32_t kDevOp = 2;
1396 
1397     const auto devices = makeDevices({{"1", 0.5, 1 << kDevOp}});
1398 
1399     PartitioningModel model;
1400 
1401     uint32_t opnd0 = model.addFloatOperand();
1402     uint32_t opnd1 = model.addFloatOperand();
1403 
1404     uint32_t opnd2 = model.addOperation2To1V1_0(kDevOp, opnd0, opnd1);
1405     uint32_t opnd3 = model.addOperation2To1V1_0(kDevOp, opnd0, opnd2);
1406 
1407     uint32_t opnd4 = model.addOperation2To1V1_0(kCpuOp, opnd0, opnd3);
1408     uint32_t opnd5 = model.addOperation2To1V1_0(kCpuOp, opnd2, opnd4);
1409 
1410     uint32_t opnd6 = model.addFloatOperand();
1411 
1412     uint32_t opnd7 = model.addOperation2To1V1_0(kDevOp, opnd3, opnd5);
1413     uint32_t opnd8 = model.addOperation2To1V1_0(kDevOp, opnd6, opnd7);
1414 
1415     model.identifyInputsAndOutputs({ opnd0, opnd1, opnd6 }, { opnd4, opnd8 });
1416     model.finish();
1417     ASSERT_TRUE(model.isValid());
1418 
1419     ExecutionPlan plan;
1420     ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
1421               ANEURALNETWORKS_NO_ERROR);
1422     ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
1423     const auto& steps = plan.forTest_compoundGetSteps();
1424     ASSERT_EQ(steps.size(), size_t(3));
1425     {
1426         const auto& step0 = steps[0];
1427 
1428         // Build a model to compare against the submodel from steps[0].
1429         PartitioningModel model0;
1430         uint32_t m0Opnd0 = model0.addFloatOperand();
1431         uint32_t m0Opnd1 = model0.addFloatOperand();
1432         uint32_t m0Opnd2 = model0.addOperation2To1V1_0(kDevOp, m0Opnd0, m0Opnd1);
1433         uint32_t m0Opnd3 = model0.addOperation2To1V1_0(kDevOp, m0Opnd0, m0Opnd2);
1434         model0.identifyInputsAndOutputs({ m0Opnd0, m0Opnd1 }, { m0Opnd2, m0Opnd3 });
1435         model0.finish();
1436         ASSERT_TRUE(model0.isValid());
1437 
1438         ASSERT_NO_FATAL_FAILURE(
1439                 compare(step0, &model0, devices[0],
1440                         RemapVectorType{{opnd0, m0Opnd0}, {opnd1, m0Opnd1}},  // modelInputs
1441                         RemapVectorType{},                                    // modelOutputs
1442                         RemapVectorType{},  // tempsAsSubModelInputs
1443                         SubModelOutputSetType{{opnd2, m0Opnd2},
1444                                               {opnd3, m0Opnd3}},  // tempsAsSubModelOutputs
1445                         RemapVectorType{}));                      // outputsAsSubModelInputs
1446     }
1447     {
1448         const auto& step1 = steps[1];
1449 
1450         // Build a model to compare against the submodel from steps[1].
1451         PartitioningModel model1;
1452         uint32_t m1Opnd0 = model1.addFloatOperand();
1453         uint32_t m1Opnd3 = model1.addFloatOperand();
1454         uint32_t m1Opnd4 = model1.addOperation2To1V1_0(kCpuOp, m1Opnd0, m1Opnd3);
1455         uint32_t m1Opnd2 = model1.addFloatOperand();
1456         uint32_t m1Opnd5 = model1.addOperation2To1V1_0(kCpuOp, m1Opnd2, m1Opnd4);
1457         model1.identifyInputsAndOutputs({ m1Opnd0, m1Opnd3, m1Opnd2 }, { m1Opnd4, m1Opnd5 });
1458         model1.finish();
1459         ASSERT_TRUE(model1.isValid());
1460 
1461         ASSERT_NO_FATAL_FAILURE(compare(
1462                 step1, &model1, DeviceManager::getCpuDevice(),
1463                 RemapVectorType{{opnd0, m1Opnd0}},                    // modelInputs
1464                 RemapVectorType{{opnd4, m1Opnd4}},                    // modelOutputs
1465                 RemapVectorType{{opnd3, m1Opnd3}, {opnd2, m1Opnd2}},  // tempsAsSubModelInputs
1466                 SubModelOutputSetType{{opnd5, m1Opnd5}},              // tempsAsSubModelOutputs
1467                 RemapVectorType{}));                                  // outputsAsSubModelInputs
1468     }
1469     {
1470         const auto& step2 = steps[2];
1471 
1472         // Build a model to compare against the submodel from steps[2].
1473         PartitioningModel model2;
1474         uint32_t m2Opnd3 = model2.addFloatOperand();
1475         uint32_t m2Opnd5 = model2.addFloatOperand();
1476         uint32_t m2Opnd7 = model2.addOperation2To1V1_0(kDevOp, m2Opnd3, m2Opnd5);
1477         uint32_t m2Opnd6 = model2.addFloatOperand();
1478         uint32_t m2Opnd8 = model2.addOperation2To1V1_0(kDevOp, m2Opnd6, m2Opnd7);
1479         model2.identifyInputsAndOutputs({ m2Opnd6, m2Opnd3, m2Opnd5 }, { m2Opnd8 });
1480         model2.finish();
1481         ASSERT_TRUE(model2.isValid());
1482 
1483         ASSERT_NO_FATAL_FAILURE(compare(
1484                 step2, &model2, devices[0], RemapVectorType{{opnd6, m2Opnd6}},  // modelInputs
1485                 RemapVectorType{{opnd8, m2Opnd8}},                              // modelOutputs
1486                 RemapVectorType{{opnd3, m2Opnd3}, {opnd5, m2Opnd5}},  // tempsAsSubModelInputs
1487                 SubModelOutputSetType{},                              // tempsAsSubModelOutputs
1488                 RemapVectorType{}));                                  // outputsAsSubModelInputs
1489     }
1490 }
1491 
TEST_F(PartitioningTest, SetPartitioning) {
    // The first operation produces an intermediate operand (op2) without
    // dimensions.  Partitioning cannot currently handle a dimensionless
    // intermediate, so we expect it to be unable to produce a working plan.
    PartitioningModel model;
    const uint32_t op0 = model.addFloatOperand();
    const uint32_t op1 = model.addFloatOperand();
    const uint32_t op2 =
            model.addOperation2To1V1_0(0, op0, op1, PartitioningModel::Dimensioned::NO);
    const uint32_t op3 = model.addFloatOperand();
    const uint32_t op4 = model.addOperation2To1V1_0(1, op2, op3);
    model.identifyInputsAndOutputs({op0, op1, op3}, {op4});
    model.finish();
    ASSERT_TRUE(model.isValid());

    // One device that can and should execute operation 0.
    const auto devices = makeDevices({{"hw", 0.5, (1 << 0)}});

    // kPartitioningNo: partitioning is not even attempted, so we get a
    // SIMPLE plan on the CPU.  No need to compare the original model to the
    // model from the plan -- we didn't actually do any partitioning.
    PartitioningCompilation compilationNo(&model, devices);
    ASSERT_EQ(compilationNo.setPartitioning(DeviceManager::kPartitioningNo), Result::NO_ERROR);
    ASSERT_EQ(compilationNo.finish(), Result::NO_ERROR);
    ASSERT_EQ(compilationNo.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_EQ(compilationNo.getExecutionPlan().forTest_simpleGetDevice(),
              DeviceManager::getCpuDevice());

    // kPartitioningWithFallback: partitioning runs to completion (leaving an
    // unsuccessful execution plan because of the dimensionless intermediate
    // operand), then compilation falls back to the CPU with a SIMPLE plan and
    // succeeds.  Again, no model comparison is needed.
    PartitioningCompilation compilationFallback(&model, devices);
    ASSERT_EQ(compilationFallback.setPartitioning(DeviceManager::kPartitioningWithFallback),
              Result::NO_ERROR);
    ASSERT_EQ(compilationFallback.finish(), Result::NO_ERROR);
    ASSERT_EQ(compilationFallback.getExecutionPlan().forTest_getKind(),
              ExecutionPlan::Kind::SIMPLE);
    ASSERT_EQ(compilationFallback.getExecutionPlan().forTest_simpleGetDevice(),
              DeviceManager::getCpuDevice());

    // kPartitioningWithoutFallback: partitioning is attempted and fails, and
    // with no fallback the compilation itself fails.
    PartitioningCompilation compilationNoFallback(&model, devices);
    ASSERT_EQ(compilationNoFallback.setPartitioning(DeviceManager::kPartitioningWithoutFallback),
              Result::NO_ERROR);
    ASSERT_EQ(compilationNoFallback.finish(), Result::OP_FAILED);
    ASSERT_TRUE(compilationNoFallback.getExecutionPlan().forTest_hasSubModelOutputsOfUnknownSize());
    ASSERT_EQ(compilationNoFallback.getExecutionPlan().forTest_getKind(),
              ExecutionPlan::Kind::ERROR);
}
1543 
1544 // Regression test for http://b/69166603:
1545 //     "partitioned compilation and execution yields wrong results when model output is submodel input"
TEST_F(PartitioningTest,ModelOutputAsSubmodelInput)1546 TEST_F(PartitioningTest, ModelOutputAsSubmodelInput) {
1547     PartitioningModel model;
1548     uint32_t opnd0 = model.addFloatOperand();
1549     uint32_t opnd1 = model.addFloatOperand();
1550     uint32_t opnd2 = model.addOperation2To1V1_0(0, opnd0, opnd1);
1551     uint32_t opnd3 = model.addOperation2To1V1_0(1, opnd2, opnd2);
1552     model.identifyInputsAndOutputs({ opnd0, opnd1 }, { opnd2, opnd3 });
1553     model.finish();
1554     ASSERT_TRUE(model.isValid());
1555 
1556     // Compound partition (two devices, each is capable of one of the
1557     // two operations).  We could do more extensive checking here --
1558     // for example, verify that each step within the plan has the
1559     // correct (model and submodel)x(inputs and outputs).
1560     const auto devices = makeDevices({{"0", 0.5, 1 << 0}, {"1", 0.5, 1 << 1}});
1561     ExecutionPlan plan;
1562     ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
1563               ANEURALNETWORKS_NO_ERROR);
1564     ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
1565     const auto& steps = plan.forTest_compoundGetSteps();
1566     ASSERT_EQ(steps.size(), size_t(2));
1567     {
1568         // Build a model to compare against the submodel from steps[0].
1569         PartitioningModel model0;
1570         uint32_t m0Opnd0 = model0.addFloatOperand();
1571         uint32_t m0Opnd1 = model0.addFloatOperand();
1572         uint32_t m0Opnd2 = model0.addOperation2To1V1_0(0, m0Opnd0, m0Opnd1);
1573         model0.identifyInputsAndOutputs({ m0Opnd0, m0Opnd1 }, { m0Opnd2 });
1574         model0.finish();
1575         ASSERT_TRUE(model0.isValid());
1576         ASSERT_NO_FATAL_FAILURE(
1577                 compare(steps[0], &model0, devices[0],
1578                         RemapVectorType{{opnd0, m0Opnd0}, {opnd1, m0Opnd1}},  // modelInputs
1579                         RemapVectorType{{opnd2, m0Opnd2}},                    // modelOutputs
1580                         RemapVectorType{},        // tempsAsSubModelInputs
1581                         SubModelOutputSetType{},  // tempsAsSubModelOutputs
1582                         RemapVectorType{}));      // outputsAsSubModelInputs
1583     }
1584     {
1585         // Build a model to compare against the submodel from steps[1].
1586         PartitioningModel model1;
1587         uint32_t m1Opnd2 = model1.addFloatOperand();
1588         uint32_t m1Opnd3 = model1.addOperation2To1V1_0(1, m1Opnd2, m1Opnd2);
1589         model1.identifyInputsAndOutputs({ m1Opnd2 }, { m1Opnd3 });
1590         model1.finish();
1591         ASSERT_TRUE(model1.isValid());
1592 
1593         ASSERT_NO_FATAL_FAILURE(
1594                 compare(steps[1], &model1, devices[1], RemapVectorType{},  // modelInputs
1595                         RemapVectorType{{opnd3, m1Opnd3}},                 // modelOutputs
1596                         RemapVectorType{},                                 // tempsAsSubModelInputs
1597                         SubModelOutputSetType{},                           // tempsAsSubModelOutputs
1598                         RemapVectorType{{opnd2, m1Opnd2}}));  // outputsAsSubModelInputs
1599     }
1600 }
1601 
TEST_F(PartitioningTest, OemOperations) {
    // Trivial model consisting solely of an OEM operation.
    PartitioningModel model;
    const uint32_t opIn = model.addFloatOperand();
    const uint32_t opOut = model.addOperationOEM1To1(opIn);
    model.identifyInputsAndOutputs({opIn}, {opOut});
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Verify that the best driver that can run an OEM operation is used,
    // even if it is not better than the CPU.  No need to compare the
    // original model to the model from the plan -- we didn't actually do
    // any partitioning.
    const auto bestOemDevices = makeDevices({{"badOEM", 1.5, ~0U, PartitioningDriver::OEMYes},
                                             {"noOEM", 0.5, ~0U, PartitioningDriver::OEMNo},
                                             {"goodOEM", 1.2, ~0U, PartitioningDriver::OEMYes}});
    PartitioningCompilation bestOemCompilation(&model, bestOemDevices);
    ASSERT_EQ(bestOemCompilation.finish(), Result::NO_ERROR);
    const auto& bestOemPlan = bestOemCompilation.getExecutionPlan();
    ASSERT_EQ(bestOemPlan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_NE(bestOemPlan.forTest_simpleGetDevice().get(), nullptr);
    ASSERT_STREQ(bestOemPlan.forTest_simpleGetDevice()->getName(), "goodOEM");

    // Verify that we get an error if no driver can run an OEM operation.
    const auto noOemDevices = makeDevices({{"noOEM", 0.5, ~0U, PartitioningDriver::OEMNo}});
    PartitioningCompilation noOemCompilation(&model, noOemDevices);
    ASSERT_EQ(noOemCompilation.finish(), Result::BAD_DATA);

    // Verify that we get an error if a driver can SUPPORT but not PREPARE an
    // OEM operation.
    const auto indecisiveOemDevices =
            makeDevices({{"indecisiveOEM", 0.5, ~0U, PartitioningDriver::OEMIndecisive}});
    PartitioningCompilation indecisiveOemCompilation(&model, indecisiveOemDevices);
    ASSERT_NE(indecisiveOemCompilation.finish(), Result::NO_ERROR);

    // Verify that we get an error if there are no drivers (only CPU fallback).
    PartitioningCompilation noDriverCompilation(&model, makeDevices({}) /* no drivers */);
    ASSERT_EQ(noDriverCompilation.finish(), Result::BAD_DATA);
}
1640 
TEST_F(PartitioningTest, RelaxedFP) {
    const auto devices = makeDevices({// Best choice for a non-relaxed model.
                                      {"f32", 0.8, 0.9 /* relaxed */, ~0U},
                                      // Best choice for a relaxed model.
                                      {"f16", 0.9, 0.8 /* relaxed */, ~0U}});

    // Builds a trivial one-operation model with the given relaxation setting
    // and verifies that it lands on the expected device.
    const auto runTrivialTest = [&devices](bool doRelax, const char* expectDevice) {
        SCOPED_TRACE(expectDevice);
        PartitioningModel model;
        const uint32_t op0 = model.addFloatOperand();
        const uint32_t op1 = model.addFloatOperand();
        const uint32_t op2 = model.addOperation2To1V1_0(0, op0, op1);
        model.identifyInputsAndOutputs({op0, op1}, {op2});
        model.relaxComputationFloat32toFloat16(doRelax);
        model.finish();
        ASSERT_TRUE(model.isValid());
        // No need to compare the original model to the model from the plan --
        // we didn't actually do any partitioning.
        ExecutionPlan plan;
        ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
                  ANEURALNETWORKS_NO_ERROR);
        ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
        ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), expectDevice);
    };

    ASSERT_NO_FATAL_FAILURE(runTrivialTest(false, "f32"));
    ASSERT_NO_FATAL_FAILURE(runTrivialTest(true, "f16"));
}
1671 
TEST_F(PartitioningTest,Perf)1672 TEST_F(PartitioningTest, Perf) {
1673     // The various type names used here are confusing.
1674     //
1675     // OperandType (from HAL file), WrapperType (from NeuralNetworksWrapper.h),
1676     // and OperandCode (from NeuralNetworks.h) are different enums representing
1677     // the same type kind -- e.g., OperandType::FLOAT32, WrapperType::FLOAT32,
1678     // ANEURALNETWORKS_FLOAT32.  Corresponding enumerators have the same value.
1679     //
1680     // WrapperOperandType is the NeuralNetworksWrapper.h representation of a
1681     // full operand type (WrapperType plus dimensions plus other attributes).
1682 
    // For a given operand type, verifies that the partitioner picks the device
    // whose Capabilities advertise the best (lowest) execution time for that type.
    auto TestType = [](OperandType operandType) {
        SCOPED_TRACE(toString(operandType));
        // Trivial model consisting solely of OEM operation.  We
        // pick OEM operation because this allows us to use
        // inputs and outputs of any number and type.
        PartitioningModel model;
        uint32_t opndIn = model.addOperand(static_cast<WrapperType>(operandType));
        uint32_t opndOut = model.addOperationOEM1To1(opndIn);
        model.identifyInputsAndOutputs({opndIn}, {opndOut});
        model.finish();
        ASSERT_TRUE(model.isValid());

        // Reference device: performance 0.5 for every operand type.
        const Capabilities baseCapabilities = makeCapabilities(0.5);

        {
            // better than base
            Capabilities goodCapabilities = baseCapabilities;
            update(&goodCapabilities, operandType, 0.25);

            const auto devices =
                    makeDevices({{"base", baseCapabilities, ~0U, PartitioningDriver::OEMYes},
                                 {"good", goodCapabilities, ~0U, PartitioningDriver::OEMYes}});

            // Verify that model will be executed on "good".
            // No need to compare the original model to the model from the plan -- we
            // didn't actually do any partitioning.
            ExecutionPlan plan;
            ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
                      ANEURALNETWORKS_NO_ERROR);
            ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
            ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), "good");
        }

        {
            // worse than base
            Capabilities badCapabilities = baseCapabilities;
            update(&badCapabilities, operandType, 0.75);
            const auto devices =
                    makeDevices({{"base", baseCapabilities, ~0U, PartitioningDriver::OEMYes},
                                 {"bad", badCapabilities, ~0U, PartitioningDriver::OEMYes}});

            // Verify that model will be executed on "base".
            // No need to compare the original model to the model from the plan -- we
            // didn't actually do any partitioning.
            ExecutionPlan plan;
            ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
                      ANEURALNETWORKS_NO_ERROR);
            ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
            ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), "base");
        }
    };
1734 
1735     for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MIN);
1736          type <= static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MAX); ++type) {
1737         TestType(static_cast<OperandType>(type));
1738     }
1739     for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::OEM_MIN);
1740          type <= static_cast<uint32_t>(OperandTypeRange::OEM_MAX); ++type) {
1741         TestType(static_cast<OperandType>(type));
1742     }
1743 }
1744 
1745 // Test token rehashing during the compilation step.
1746 class CacheTest : public PartitioningTest {
1747    protected:
SetUp()1748     virtual void SetUp() override {
1749         PartitioningTest::SetUp();
1750         char cacheDirTemp[] = "/data/local/tmp/TestCompilationCachingXXXXXX";
1751         char* cacheDir = mkdtemp(cacheDirTemp);
1752         ASSERT_NE(cacheDir, nullptr);
1753         mCacheDir = cacheDir;
1754     }
1755 
TearDown()1756     virtual void TearDown() override {
1757         if (!::testing::Test::HasFailure()) {
1758             std::filesystem::remove_all(mCacheDir);
1759         }
1760         PartitioningTest::TearDown();
1761     }
1762 
expectUniqueTokens(const std::vector<std::vector<uint8_t>> & tokens)1763     void expectUniqueTokens(const std::vector<std::vector<uint8_t>>& tokens) {
1764         for (uint32_t i = 0; i < tokens.size(); i++) {
1765             SCOPED_TRACE(i);
1766             for (uint32_t j = i + 1; j < tokens.size(); j++) {
1767                 SCOPED_TRACE(j);
1768                 EXPECT_NE(tokens[i], tokens[j]);
1769             }
1770         }
1771     }
1772 
1773     // Launch a single run of the partitioner against the provided model and device list with
1774     // cache token privided as tokenIn. Find the partition for the device with deviceName.
1775     // Record the tranformed token into tokenOut.
1776     // If tokenIn is empty, no caching information will be provided to the partitioner.
getTransformedCacheTokenSingle(const PartitioningModel & model,const std::vector<std::shared_ptr<Device>> & devices,const char * deviceName,const std::vector<uint8_t> & tokenIn,ExecutePreference preference,std::vector<uint8_t> * tokenOut)1777     void getTransformedCacheTokenSingle(const PartitioningModel& model,
1778                                         const std::vector<std::shared_ptr<Device>>& devices,
1779                                         const char* deviceName, const std::vector<uint8_t>& tokenIn,
1780                                         ExecutePreference preference,
1781                                         std::vector<uint8_t>* tokenOut) {
1782         // Compile the model and get the execution plan.
1783         PartitioningCompilation compilation(&model, devices);
1784         if (!tokenIn.empty()) {
1785             compilation.setCaching(mCacheDir.c_str(), tokenIn);
1786         }
1787         compilation.setPreference(preference);
1788         ASSERT_EQ(compilation.finish(), Result::NO_ERROR);
1789         const ExecutionPlan& plan = compilation.getExecutionPlan();
1790 
1791         // Find the cache info for the device.
1792         const uint8_t* token = nullptr;
1793         if (plan.forTest_getKind() == ExecutionPlan::Kind::SIMPLE) {
1794             ASSERT_STREQ(plan.forTest_simpleGetDevice()->getName(), deviceName);
1795             token = plan.forTest_simpleGetCacheToken();
1796         } else if (plan.forTest_getKind() == ExecutionPlan::Kind::COMPOUND) {
1797             const auto& steps = plan.forTest_compoundGetSteps();
1798             bool found = false;
1799             for (const auto& step : steps) {
1800                 // In general, two or more partitions can be on the same device. However, this will
1801                 // not happen on the test models with only 2 operations.
1802                 if (strcmp(step->getDevice()->getName(), deviceName) == 0) {
1803                     ASSERT_FALSE(found);
1804                     token = step->forTest_getCacheToken();
1805                     found = true;
1806                 }
1807             }
1808             ASSERT_TRUE(found);
1809         } else {
1810             FAIL();
1811         }
1812 
1813         // Retrieve the transformed token from the cache info.
1814         if (token == nullptr) {
1815             tokenOut->clear();
1816         } else {
1817             tokenOut->resize(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN);
1818             std::copy(token, token + ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, tokenOut->begin());
1819         }
1820     }
1821 
1822     // A wrapper of getTransformedCacheTokenSingle, which runs getTransformedCacheTokenSingle
1823     // multiple times and checks if the transformation provides consistent result.
getTransformedCacheToken(const PartitioningModel & model,const std::vector<std::shared_ptr<Device>> & devices,const char * deviceName,const std::vector<uint8_t> & tokenIn,ExecutePreference preference,std::vector<uint8_t> * tokenOut)1824     void getTransformedCacheToken(const PartitioningModel& model,
1825                                   const std::vector<std::shared_ptr<Device>>& devices,
1826                                   const char* deviceName, const std::vector<uint8_t>& tokenIn,
1827                                   ExecutePreference preference, std::vector<uint8_t>* tokenOut) {
1828         getTransformedCacheTokenSingle(model, devices, deviceName, tokenIn, preference, tokenOut);
1829 
1830         // Test if the runtime maps to the same cache token every time for the same compilation
1831         // setup.
1832         for (uint32_t i = 0; i < 10; i++) {
1833             std::vector<uint8_t> token;
1834             SCOPED_TRACE(i);
1835             getTransformedCacheTokenSingle(model, devices, deviceName, tokenIn, preference, &token);
1836             EXPECT_EQ(*tokenOut, token);
1837         }
1838     }
1839 
CreateModelForCachingTests(PartitioningModel * model)1840     void CreateModelForCachingTests(PartitioningModel* model) {
1841         uint32_t opnd0 = model->addFloatOperand();
1842         uint32_t opnd1 = model->addFloatOperand();
1843         uint32_t opnd2 = model->addOperation2To1V1_0(0, opnd0, opnd1);
1844         uint32_t opnd3 = model->addFloatOperand();
1845         uint32_t opnd4 = model->addOperation2To1V1_0(1, opnd2, opnd3);
1846         model->identifyInputsAndOutputs({opnd0, opnd1, opnd3}, {opnd4});
1847         model->finish();
1848         ASSERT_TRUE(model->isValid());
1849     }
1850 
1851     std::string mCacheDir;
1852 };
1853 
1854 // Test the case when no token is provided by the application and the execution plan has a
1855 // simple body.
TEST_F(CacheTest,CacheTokenNoneSimpleBody)1856 TEST_F(CacheTest, CacheTokenNoneSimpleBody) {
1857     PartitioningModel model;
1858     CreateModelForCachingTests(&model);
1859 
1860     // deviceA can execute the whole model.
1861     const auto deviceA = makeDevices({
1862             {"deviceA", 0.5, ~0U},
1863     });
1864 
1865     std::vector<uint8_t> tokenIn, tokenOut;
1866     getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
1867                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut);
1868     EXPECT_TRUE(tokenOut.empty());
1869 }
1870 
1871 // Test if the runtime maps to different cache tokens for devices with different names in
1872 // execution plan with a simple body.
TEST_F(CacheTest,CacheTokenDifferentDeviceNamesSimpleBody)1873 TEST_F(CacheTest, CacheTokenDifferentDeviceNamesSimpleBody) {
1874     PartitioningModel model;
1875     CreateModelForCachingTests(&model);
1876 
1877     // Two devices that can both execute the whole model.
1878     const auto deviceA = makeDevices({{"deviceA", 0.5, ~0U}});
1879     const auto deviceB = makeDevices({{"deviceB", 0.5, ~0U}});
1880 
1881     std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
1882     std::vector<uint8_t> deviceAToken, deviceBToken;
1883     getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
1884                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceAToken);
1885     getTransformedCacheToken(model, deviceB, "deviceB", tokenIn,
1886                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceBToken);
1887     expectUniqueTokens({deviceAToken, deviceBToken});
1888 }
1889 
1890 // Test if the runtime maps to different cache tokens for devices with different version strings in
1891 // execution plan with a simple body.
TEST_F(CacheTest,CacheTokenDifferentDeviceVersionStringsSimpleBody)1892 TEST_F(CacheTest, CacheTokenDifferentDeviceVersionStringsSimpleBody) {
1893     PartitioningModel model;
1894     CreateModelForCachingTests(&model);
1895 
1896     // Two devices that can both execute the whole model.
1897     const auto deviceA_1_0 = makeDevices({{"deviceA", "1.0", 0.5, ~0U}});
1898     const auto deviceA_1_1 = makeDevices({{"deviceA", "1.1", 0.5, ~0U}});
1899 
1900     std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
1901     std::vector<uint8_t> deviceA_1_0_Token, deviceA_1_1_Token;
1902     getTransformedCacheToken(model, deviceA_1_0, "deviceA", tokenIn,
1903                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceA_1_0_Token);
1904     getTransformedCacheToken(model, deviceA_1_1, "deviceA", tokenIn,
1905                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceA_1_1_Token);
1906     expectUniqueTokens({deviceA_1_0_Token, deviceA_1_1_Token});
1907 }
1908 
1909 // Test if the runtime maps to different cache tokens for compilations with different preferences
1910 // in execution plan with a simple body.
TEST_F(CacheTest,CacheTokenDifferentPreferencesSimpleBody)1911 TEST_F(CacheTest, CacheTokenDifferentPreferencesSimpleBody) {
1912     PartitioningModel model;
1913     CreateModelForCachingTests(&model);
1914 
1915     // One device that can execute the whole model.
1916     const auto deviceA = makeDevices({{"deviceA", 0.5, ~0U}});
1917 
1918     std::vector<uint8_t> fastToken, powerToken, sustainedToken;
1919     std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
1920     getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
1921                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &fastToken);
1922     getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
1923                              ExecutePreference::PREFER_LOW_POWER, &powerToken);
1924     getTransformedCacheToken(model, deviceA, "deviceA", tokenIn,
1925                              ExecutePreference::PREFER_SUSTAINED_SPEED, &sustainedToken);
1926     expectUniqueTokens({fastToken, powerToken, sustainedToken});
1927 }
1928 
1929 // Test if the runtime maps to different cache tokens for compilations with different tokens
1930 // provided by application in execution plan with a simple body.
TEST_F(CacheTest,CacheTokenDifferentTokensSimpleBody)1931 TEST_F(CacheTest, CacheTokenDifferentTokensSimpleBody) {
1932     PartitioningModel model;
1933     CreateModelForCachingTests(&model);
1934 
1935     // One device that can execute the whole model.
1936     const auto deviceA = makeDevices({{"deviceA", 0.5, ~0U}});
1937 
1938     std::vector<uint8_t> tokenOut1, tokenOut2;
1939     std::vector<uint8_t> tokenIn1(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
1940     std::vector<uint8_t> tokenIn2(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 1);
1941     getTransformedCacheToken(model, deviceA, "deviceA", tokenIn1,
1942                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut1);
1943     getTransformedCacheToken(model, deviceA, "deviceA", tokenIn2,
1944                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut2);
1945     expectUniqueTokens({tokenOut1, tokenOut2});
1946 }
1947 
1948 // Test the case when no token is provided by the application and the execution plan has a
1949 // compound body.
TEST_F(CacheTest,CacheTokenNoneCompoundBody)1950 TEST_F(CacheTest, CacheTokenNoneCompoundBody) {
1951     PartitioningModel model;
1952     CreateModelForCachingTests(&model);
1953 
1954     // DeviceA executes the first operation only.
1955     const auto devices = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});
1956 
1957     std::vector<uint8_t> tokenIn, tokenOut;
1958     getTransformedCacheToken(model, devices, "deviceA", tokenIn,
1959                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut);
1960     EXPECT_TRUE(tokenOut.empty());
1961     getTransformedCacheToken(model, devices, "deviceB", tokenIn,
1962                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut);
1963     EXPECT_TRUE(tokenOut.empty());
1964 }
1965 
1966 // Test if the runtime maps to different cache tokens for devices with different names in
1967 // execution plan with a compound body.
TEST_F(CacheTest,CacheTokenDifferentDeviceNamesCompoundBody)1968 TEST_F(CacheTest, CacheTokenDifferentDeviceNamesCompoundBody) {
1969     PartitioningModel model;
1970     CreateModelForCachingTests(&model);
1971 
1972     // DeviceA executes the first operation only.
1973     const auto devices1 = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceC", 0.5, 1 << 1}});
1974     // DeviceB executes the first operation only.
1975     const auto devices2 = makeDevices({{"deviceB", 0.8, ~0U}, {"deviceC", 0.5, 1 << 1}});
1976 
1977     std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
1978     std::vector<uint8_t> deviceAToken, deviceBToken;
1979     getTransformedCacheToken(model, devices1, "deviceA", tokenIn,
1980                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceAToken);
1981     getTransformedCacheToken(model, devices2, "deviceB", tokenIn,
1982                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceBToken);
1983     expectUniqueTokens({deviceAToken, deviceBToken});
1984 }
1985 
1986 // Test if the runtime maps to different cache tokens for devices with different names in
1987 // execution plan with a compound body.
TEST_F(CacheTest,CacheTokenDifferentDeviceVersionStringsCompoundBody)1988 TEST_F(CacheTest, CacheTokenDifferentDeviceVersionStringsCompoundBody) {
1989     PartitioningModel model;
1990     CreateModelForCachingTests(&model);
1991 
1992     // DeviceA executes the first operation only.
1993     const auto devices1 = makeDevices({{"deviceA", "1.0", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});
1994     // DeviceB executes the first operation only.
1995     const auto devices2 = makeDevices({{"deviceA", "1.1", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});
1996 
1997     std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
1998     std::vector<uint8_t> deviceA_1_0_Token, deviceA_1_1_Token;
1999     getTransformedCacheToken(model, devices1, "deviceA", tokenIn,
2000                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceA_1_0_Token);
2001     getTransformedCacheToken(model, devices2, "deviceA", tokenIn,
2002                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &deviceA_1_1_Token);
2003     expectUniqueTokens({deviceA_1_0_Token, deviceA_1_1_Token});
2004 }
2005 
2006 // Test if the runtime maps to different cache tokens for compilations with different preferences
2007 // in execution plan with a compound body.
TEST_F(CacheTest,CacheTokenDifferentPreferencesCompoundBody)2008 TEST_F(CacheTest, CacheTokenDifferentPreferencesCompoundBody) {
2009     PartitioningModel model;
2010     CreateModelForCachingTests(&model);
2011 
2012     // DeviceA executes the first operation only.
2013     const auto devices = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});
2014 
2015     std::vector<uint8_t> fastToken, powerToken, sustainedToken;
2016     std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
2017     getTransformedCacheToken(model, devices, "deviceA", tokenIn,
2018                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &fastToken);
2019     getTransformedCacheToken(model, devices, "deviceA", tokenIn,
2020                              ExecutePreference::PREFER_LOW_POWER, &powerToken);
2021     getTransformedCacheToken(model, devices, "deviceA", tokenIn,
2022                              ExecutePreference::PREFER_SUSTAINED_SPEED, &sustainedToken);
2023     expectUniqueTokens({fastToken, powerToken, sustainedToken});
2024 }
2025 
2026 // Test if the runtime maps to different cache tokens for compilations with different tokens
2027 // provided by application in execution plan with a compound body.
TEST_F(CacheTest,CacheTokenDifferentTokensCompoundBody)2028 TEST_F(CacheTest, CacheTokenDifferentTokensCompoundBody) {
2029     PartitioningModel model;
2030     CreateModelForCachingTests(&model);
2031 
2032     // DeviceA executes the first operation only.
2033     const auto devices = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});
2034 
2035     std::vector<uint8_t> tokenOut1, tokenOut2;
2036     std::vector<uint8_t> tokenIn1(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
2037     std::vector<uint8_t> tokenIn2(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 1);
2038     getTransformedCacheToken(model, devices, "deviceA", tokenIn1,
2039                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut1);
2040     getTransformedCacheToken(model, devices, "deviceA", tokenIn2,
2041                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut2);
2042     expectUniqueTokens({tokenOut1, tokenOut2});
2043 }
2044 
2045 // Test if the runtime maps to different cache tokens for compilations with different partitioning
2046 // outcome in execution plan with a compound body.
TEST_F(CacheTest,CacheTokenDifferentPartitionsCompoundBody)2047 TEST_F(CacheTest, CacheTokenDifferentPartitionsCompoundBody) {
2048     PartitioningModel model;
2049     CreateModelForCachingTests(&model);
2050 
2051     // DeviceA executes the whole model.
2052     const auto devices1 = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 0U}});
2053     // DeviceA executes the first operation only.
2054     const auto devices2 = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 1}});
2055     // DeviceA executes the second operation only.
2056     const auto devices3 = makeDevices({{"deviceA", 0.8, ~0U}, {"deviceB", 0.5, 1 << 0}});
2057 
2058     std::vector<uint8_t> tokenIn(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN, 0);
2059     std::vector<uint8_t> tokenOut1, tokenOut2, tokenOut3;
2060     getTransformedCacheToken(model, devices1, "deviceA", tokenIn,
2061                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut1);
2062     getTransformedCacheToken(model, devices2, "deviceA", tokenIn,
2063                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut2);
2064     getTransformedCacheToken(model, devices3, "deviceA", tokenIn,
2065                              ExecutePreference::PREFER_FAST_SINGLE_ANSWER, &tokenOut3);
2066     expectUniqueTokens({tokenOut1, tokenOut2, tokenOut3});
2067 }
2068 
// Very basic tests of some of the PerformanceInfo functionality.
// Placed in this file because partitioning is the consumer of this functionality.
// Empty fixture: the tests need no shared setup or teardown.
class PerfTest : public ::testing::Test {};
2072 
// Verifies that update() stores per-type performance values into Capabilities,
// that lookupExecTime() retrieves exactly what was stored, and that a type
// outside any known range falls back to FLT_MAX.
TEST_F(PerfTest, Lookup) {
    // Derive an arbitrary (but reproducible) performance value from an OperandType.
    // We'll use this to ensure that we can save and then recover a type's performance.
    auto typePerf = [](OperandType type) { return float(static_cast<uint32_t>(type)); };

    // Apply fn to every OperandType in both the FUNDAMENTAL and the OEM ranges.
    // (Extracted to avoid repeating the two range loops for store and for lookup.)
    auto forEachOperandType = [](auto&& fn) {
        for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MIN);
             type <= static_cast<uint32_t>(OperandTypeRange::FUNDAMENTAL_MAX); ++type) {
            fn(static_cast<OperandType>(type));
        }
        for (uint32_t type = static_cast<uint32_t>(OperandTypeRange::OEM_MIN);
             type <= static_cast<uint32_t>(OperandTypeRange::OEM_MAX); ++type) {
            fn(static_cast<OperandType>(type));
        }
    };

    // Start with a sentinel performance (-1.0) that update() will overwrite.
    Capabilities capabilities = makeCapabilities(-1.0f);

    // Store a distinct performance value for every known type.
    forEachOperandType([&](OperandType operandType) {
        update(&capabilities, operandType, typePerf(operandType));
    });

    // Make sure lookup retrieves the values stored by update.
    forEachOperandType([&](OperandType operandType) {
        SCOPED_TRACE(toString(operandType));
        EXPECT_EQ(lookupExecTime(capabilities, operandType), typePerf(operandType));
    });

    // Check the behavior of a missing type: lookup must report "infinitely slow".
    OperandType operandType =
            static_cast<OperandType>(static_cast<uint32_t>(OperandTypeRange::BASE_MAX) + 1);
    EXPECT_EQ(lookupExecTime(capabilities, operandType), FLT_MAX);
}
2112 
2113 }  // namespace
2114