1/*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "CompilationBuilder.h"
18#include "ExecutionPlan.h"
19#include "GraphDump.h"
20#include "HalInterfaces.h"
21#include "Manager.h"
22#include "ModelBuilder.h"
23#include "NeuralNetworks.h"
24#include "NeuralNetworksOEM.h"
25#include "NeuralNetworksWrapper.h"
26#include "SampleDriver.h"
27#include "Utils.h"
28#include "ValidateHal.h"
29
30#include <gtest/gtest.h>
31
32#include <map>
33#include <queue>
34
35// Uncomment the following line to generate some debugging output that
36// may be useful when analyzing failures:
37//
38// #define VERBOSE VERBOSE
39
40// Uncomment the following line to generate DOT graphs.
41//
42// #define GRAPH GRAPH
43
44// These tests do whitebox testing of the graph partitioning
45// algorithm.  It is "whitebox" in the sense that we're not evaluating
46// whether a particular partitioning is legal, or "good enough"
47// according to some metric, but whether it exactly matches the
48// expected behavior of the current partitioning algorithm.
49//
50// A key part of the current partitioning algorithm is to determine
51// which device among the available devices should be the one to
52// execute a particular operation from the graph.  This determination
53// is made "locally" -- i.e., it does not depend on the graph
54// topology, only on the properties of the operation in question.
55// IDevice::getSupportedOperations() indicates which operations in a
56// graph can be executed on a device, and IDevice::getCapabilities()
57// indicates how "good" that device is for executing particular kinds
58// of operations.  For each operation, the partitioning algorithm
59// picks the "best" device that is capable of executing that
60// operation; if no device can do so, then the algorithm picks the
61// cpu.
62//
63// As part of this testing approach, we want to make it easy to
64// specify which operations in a test graph can be executed on which
65// devices.  We accomplish this with an abstraction: There are eight
66// different kinds of operations (each of which has two inputs and one
67// output), and when we instantiate a device for testing purposes, we
68// specify what subset of those eight kinds of operations the device
69// is able to execute.
70//
71// The eight kinds of operations are represented in the graph as ADD
72// or MUL with a particular activation function -- two opcodes times
73// four activation functions means eight available operation kinds.
74// This is a low-level representation detail -- when we specify the
75// behavior of the device or build a graph, we do so in terms of
76// operation encodings 0..7.
77//
78// In order to determine whether or not a partitioning matches the
79// expected partitioning, we check the number of partitions, check
80// which device each partition targets, and compare each partition's
81// subgraph, model inputs, model outputs, submodel inputs, and
82// submodel outputs against what is expected.  In order to perform
83// that comparison, we build a model to compare against a partition's
84// submodel and run a graph comparison algorithm on it.  The graph
85// comparison and the inputs and outputs comparisons are syntactic
86// rather than semantic comparisons -- they don't allow for
87// reorderings of inputs and outputs.  Because of this, we need to
88// know exactly how the partitioning algorithm orders inputs and
89// outputs in order to construct the models and operand lists to
90// compare against.  Here are some relevant behaviors of the
91// partitioning algorithm:
92//
93// - It builds a subgraph by walking operations in forward topological
94//   order, and adding each operation's input operands and output
95//   operands in index order (input followed by output) when that
96//   operation is added.  (It does not add an input that has already
97//   been added.)
98// - It finds model inputs, model outputs, and submodel inputs in
99//   the order the corresponding operands were added to the subgraph
100//   (see ExecutionStep methods getModelInputs(), getModelOutputs(),
101//   getTempsAsSubModelInputs(), getOutputsAsSubModelInputs()).
102// - It finds temps as submodel outputs in numerical order of corresponding
103//   operand number in the original model (see ExecutionStep method
104//   getTempsAsSubModelOutputs()).
105// - When it calls identifyInputsAndOutputs() on the submodel, it
106//   passes inputs from getModelInputs() in order, followed by temps as
107//   submodel inputs from getTempsAsSubModelInputs() in order,
108//   followed by outputs as submodel inputs from
109//   getOutputsAsSubModelInputs() in order; and it passes outputs from
110//   getModelOutputs() in order followed by submodel outputs from
111//   getTempsAsSubModelOutputs() in order.
112//
113// TODO: Maybe the logic for comparing a partition to an expected
114//       model should be changed to tolerate reorderings of inputs and
115//       outputs, so that when we build models and lists to compare
116//       against, we don't need to worry about input and output
117//       orderings.  But is there a way to do this that still lets us
118//       verify that we have the correct relationships between
119//       an (original) model's inputs and outputs and each submodel's
120//       inputs and outputs, as well as the correct relationship
121//       between submodel inputs and outputs across partitions?
122
123namespace {
124
125using CompilationBuilder = ::android::nn::CompilationBuilder;
126using Device = ::android::nn::Device;
127using DeviceManager = ::android::nn::DeviceManager;
128using ExecutePreference = ::android::nn::wrapper::ExecutePreference;
129using ExecutionPlan = ::android::nn::ExecutionPlan;
130using ExecutionStep = ::android::nn::ExecutionStep;
131using HidlModel = ::android::hardware::neuralnetworks::V1_1::Model;
132using ModelBuilder = ::android::nn::ModelBuilder;
133using Result = ::android::nn::wrapper::Result;
134using SampleDriver = ::android::nn::sample_driver::SampleDriver;
135using WrapperCompilation = ::android::nn::wrapper::Compilation;
136using WrapperModel = ::android::nn::wrapper::Model;
137using WrapperOperandType = ::android::nn::wrapper::OperandType;
138using WrapperType = ::android::nn::wrapper::Type;
139
140template <typename T> using sp = ::android::sp<T>;
141
// We employ an operation numbering scheme:
// - 0..FuseCode-1 = ADD with the appropriate activation function
// - FuseCode..2*FuseCode-1 = MUL with the appropriate activation function
const uint32_t kNumFuseCodes = 4;
// Sentinel returned by lookupOperation() when an operation is not one
// of the encoded ADD/MUL kinds.
const uint32_t kBadOperation = ~0;
147
148// Look up the operation with the specified index in a graph, and
149// return the operation encoding -- 0..7; or, if for some reason this
150// is not one of the encoded operations, then return kBadOperation.
151uint32_t lookupOperation(std::function<const Operation&(uint32_t)> getOperation,
152                         std::function<const Operand&(uint32_t)> getOperand,
153                         std::function<const uint8_t*(uint32_t)> getValue,
154                         uint32_t operationIndex) {
155    const Operation& operation = getOperation(operationIndex);
156    switch (operation.type) {
157        case OperationType::ADD:
158        case OperationType::MUL: {
159            // input2 is the fused activation function
160            const Operand& input2 = getOperand(operation.inputs[2]);
161            if ((input2.type == OperandType::INT32) &&
162                (input2.lifetime == OperandLifeTime::CONSTANT_COPY)) {
163                int32_t value;
164                memcpy(&value,
165                       getValue(input2.location.offset),
166                       input2.location.length);
167                if (operation.type == OperationType::MUL) {
168                    value += kNumFuseCodes;
169                }
170                return value;
171            }
172            break;
173        }
174        default:
175            break;
176    }
177    return kBadOperation;
178}
179
180uint32_t lookupOperation(const HidlModel& model, uint32_t operationIndex) {
181    return lookupOperation(
182        [&model](uint32_t index) -> const Operation& {
183            return model.operations[index];
184        },
185        [&model](uint32_t index) -> const Operand& {
186            return model.operands[index];
187        },
188        [&model](uint32_t offset) {return &model.operandValues[offset];},
189        operationIndex);
190}
191
#ifdef VERBOSE
// This is a debugging utility function.
// Prints the HIDL representation of |model| to std::cout: the whole
// model, its input and output operand indexes, and one line per
// operation in model order.
void dump(const char* name, const ModelBuilder* model) {
    HidlModel hidlModel;
    model->setHidlModel(&hidlModel);
    std::cout << name << ": " << toString(hidlModel) << std::endl;
    std::cout << "inputs: " << toString(hidlModel.inputIndexes) << std::endl;
    std::cout << "outputs: " << toString(hidlModel.outputIndexes) << std::endl;
    for (size_t i = 0, e = hidlModel.operations.size(); i < e; i++) {
        std::cout << "operation[" << i << "]: " << toString(hidlModel.operations[i]) << std::endl;
    }
}
#endif
205
#ifdef GRAPH
// Emits a DOT rendering of the HIDL model via ::android::nn::graphDump()
// (see GraphDump.h).
inline void hidlGraphDump(const char* name, const HidlModel& model) {
    ::android::nn::graphDump(name, model);
}
#endif
211
// Dumps a wrapper model as a DOT graph when GRAPH is defined; otherwise
// a no-op (the parameters are then unused, hence [[maybe_unused]]).
void graphDump([[maybe_unused]] const char* name, [[maybe_unused]] const WrapperModel& model) {
#ifdef GRAPH
    // The wrapper's opaque handle is really a ModelBuilder*.
    HidlModel hidlModel;
    reinterpret_cast<const ModelBuilder*>(model.getHandle())->setHidlModel(&hidlModel);
    hidlGraphDump(name, hidlModel);
#endif
}
219
220// This is an IDevice for testing purposes.  It only has a few
221// interesting properties, all of which are specified as constructor
222// arguments: device capabilities; which subset of operation kinds
223// (0..7) does the device support; does the device support the OEM
224// operation.  The subset is represented with a bitmask, in which
225// operation kind K corresponds to the bit (1 << K).
226class PartitioningDriver : public SampleDriver {
227private:
228    // Dummy class -- a prepared model must not be nullptr.
229    class PartitioningPreparedModel : public IPreparedModel {
230    public:
231        Return<ErrorStatus> execute(const Request&,
232                                    const sp<IExecutionCallback>&) override {
233            return ErrorStatus::DEVICE_UNAVAILABLE;
234        }
235    };
236public:
237    enum OEM { OEMNo, OEMYes };
238
239    PartitioningDriver(const char *name, Capabilities capabilities,
240                       uint32_t operationMask, OEM oem = OEMNo) :
241            SampleDriver(name), mCapabilities(capabilities),
242            mOperationMask(operationMask), mOEM(oem) {}
243    ~PartitioningDriver() override {}
244
245    Return<ErrorStatus> prepareModel_1_1(const Model&, ExecutionPreference,
246                                         const sp<IPreparedModelCallback>& cb) override {
247        cb->notify(ErrorStatus::NONE, new PartitioningPreparedModel);
248        return ErrorStatus::NONE;
249    }
250
251    Return<DeviceStatus> getStatus() override {
252        return DeviceStatus::AVAILABLE;
253    }
254
255    Return<void> getCapabilities_1_1(getCapabilities_1_1_cb cb) override {
256        cb(ErrorStatus::NONE, mCapabilities);
257        return Void();
258    }
259
260    Return<void> getSupportedOperations_1_1(const Model& model,
261                                            getSupportedOperations_cb cb) override {
262        if (!android::nn::validateModel(model)) {
263            cb(ErrorStatus::INVALID_ARGUMENT, std::vector<bool>());
264            return Void();
265        }
266
267        const size_t count = model.operations.size();
268        std::vector<bool> supported(count);
269        for (size_t i = 0; i < count; i++) {
270            if (model.operations[i].type == OperationType::OEM_OPERATION) {
271                supported[i] = (mOEM == OEMYes);
272                continue;
273            }
274            supported[i] = false;
275            uint32_t operation = lookupOperation(model, i);
276            if ((operation != kBadOperation) && (mOperationMask & (1 << operation))) {
277                supported[i] = true;
278            }
279        }
280        cb(ErrorStatus::NONE, supported);
281        return Void();
282    }
283
284private:
285    Capabilities mCapabilities;
286    uint32_t mOperationMask;
287    OEM mOEM;
288};
289
290// This class adds some simple abstractions and utilities on top of
291// ::android::nn::wrapper::Model.  For example, it provides methods
292// that work in terms of operation kind (0..7); and because we care
293// about graph topology rather than details of operand types and
294// values, it greatly simplifies the process of creating operands.
295class PartitioningModel : public WrapperModel {
296public:
297    // Create a tensor operand of the specified type, and return the
298    // corresponding operand index.
299    uint32_t addFloatOperand() {
300        static const WrapperOperandType type(WrapperType::TENSOR_FLOAT32, { 1 });
301        return addOperand(&type);
302    }
303    uint32_t addQuantOperand() {
304        static const WrapperOperandType type(WrapperType::TENSOR_QUANT8_ASYMM, { 1 });
305        return addOperand(&type);
306    }
307
308    // Create an operation with two inputs and one output, specifying
309    // the operation kind (0..7) and the input operand indexes.
310    // Returns the output operand index.
311    enum class Dimensioned { NO, YES };
312    uint32_t addOperation2To1(uint32_t operation, const uint32_t input0, const uint32_t input1,
313                              Dimensioned dimensionedOutput = Dimensioned::YES) {
314        ANeuralNetworksOperationType type =
315                (operation < kNumFuseCodes ? ANEURALNETWORKS_ADD : ANEURALNETWORKS_MUL);
316        int32_t fuseCode = (operation < kNumFuseCodes ? operation : operation - kNumFuseCodes);
317        uint32_t input2 = addIntOperand(fuseCode);
318        uint32_t output = addOperandOfSameType(input0, dimensionedOutput);
319        addOperation(type, { input0, input1, input2 }, { output });
320        return output;
321    }
322
323    // Create an OEM operation with one input and one output,
324    // specifying the input operand index.  Returns the output operand
325    // index.
326    uint32_t addOperationOEM1To1(const uint32_t input,
327                                 Dimensioned dimensionedOutput = Dimensioned::YES) {
328        uint32_t output = addOperandOfSameType(input, dimensionedOutput);
329        addOperation(ANEURALNETWORKS_OEM_OPERATION, { input }, { output });
330        return output;
331    }
332
333    // Run the partitioning algorithm to create an ExecutionPlan.
334    int partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
335                         ExecutePreference preference, ExecutionPlan* plan) {
336        return reinterpret_cast<ModelBuilder*>(getHandle())->partitionTheWork(
337            devices, static_cast<uint32_t>(preference), plan);
338    }
339
340#ifdef VERBOSE
341    // This is a debugging utility function.
342    void dump(const char* name) const {
343        const ModelBuilder* mb = reinterpret_cast<const ModelBuilder*>(getHandle());
344        ::dump(name, mb);
345    }
346#endif
347
348private:
349
350    // Create a scalar integer operand of the specified value, and
351    // return the corresponding operand index.
352    uint32_t addIntOperand(int32_t value) {
353        static const WrapperOperandType type(WrapperType::INT32, { });
354        uint32_t operand = addOperand(&type);
355        setOperandValue(operand, &value, sizeof(value));
356        return operand;
357    }
358
359    // Create an operand of the same type as the specified operand,
360    // and return the operand index of the new operand.
361    uint32_t addOperandOfSameType(uint32_t operand, Dimensioned dimensioned = Dimensioned::YES) {
362        const Operand& operandStruct =
363                reinterpret_cast<const ModelBuilder*>(getHandle())->getOperand(operand);
364        WrapperOperandType type(static_cast<WrapperType>(operandStruct.type), { 1 });
365        if (dimensioned == Dimensioned::NO) {
366            for (auto& dimension : type.dimensions) {
367                dimension = 0;
368            }
369        }
370        return addOperand(&type);
371    }
372};
373
374// This class adds some utilities on top of ::android::nn::wrapper::Compilation.
375class PartitioningCompilation : public WrapperCompilation {
376public:
377    PartitioningCompilation(const WrapperModel* model) : WrapperCompilation(model) { }
378
379    Result setPartitioning(uint32_t partitioning) {
380        return static_cast<Result>(builder()->setPartitioning(partitioning));
381    }
382
383    using WrapperCompilation::finish;
384    Result finish(const std::vector<std::shared_ptr<Device>>& devices) {
385        return static_cast<Result>(builder()->finish(devices));
386    }
387
388    const ExecutionPlan& getExecutionPlan() const {
389        return builder()->forTest_getExecutionPlan();
390    }
391
392private:
393    CompilationBuilder* builder() {
394        return reinterpret_cast<CompilationBuilder*>(getHandle());
395    }
396
397    const CompilationBuilder* builder() const {
398        return reinterpret_cast<const CompilationBuilder*>(getHandle());
399    }
400};
401
// RETURN_TRUE()/RETURN_FALSE(MESSAGE) are used by the graph- and
// partition-comparison methods below.  In VERBOSE builds they also log
// the source line of the return; otherwise they are plain returns.
#ifdef VERBOSE
#define RETURN_TRUE()                                                          \
    {                                                                          \
        std::cerr << "returning true from " << __LINE__ << std::endl;          \
        return true;                                                           \
    }
#else
#define RETURN_TRUE()                                                          \
    {                                                                          \
        return true;                                                           \
    }
#endif
#ifdef VERBOSE
// MESSAGE must be empty or a fragment beginning with "<<", since it is
// spliced directly into the stream expression after __LINE__.
#define RETURN_FALSE(MESSAGE)                                                  \
    {                                                                          \
        std::cerr << "returning false from " << __LINE__ MESSAGE << std::endl; \
        return false;                                                          \
    }
#else
#define RETURN_FALSE(MESSAGE)                                                  \
    {                                                                          \
        return false;                                                          \
    }
#endif
426
427class PartitioningTest : public ::testing::Test {
428protected:
429    using RemapVectorType = ExecutionStep::RemapVectorType;
430    using SubModelOutputSetType = ExecutionStep::SubModelOutputSetType;
431
    // No per-test setup is needed; present to satisfy ::testing::Test.
    virtual void SetUp() {
    }
434
    // Bundles the constructor arguments for one PartitioningDriver;
    // makeDevices() below turns a vector of DeviceSpecification into a
    // vector of Devices.
    struct DeviceSpecification {
        DeviceSpecification(const std::string &name, Capabilities capabilities,
                            uint32_t operationMask,
                            PartitioningDriver::OEM oem = PartitioningDriver::OEMNo) :
                mName(name), mCapabilities(capabilities),
                mOperationMask(operationMask), mOEM(oem) { }
        std::string mName;             // device name
        Capabilities mCapabilities;    // reported via getCapabilities_1_1()
        uint32_t mOperationMask;       // bit (1 << K) set iff kind K supported
        PartitioningDriver::OEM mOEM;  // whether the OEM operation is supported
    };
448    static std::vector<std::shared_ptr<Device>>
449    makeDevices(std::vector<DeviceSpecification> specifications) {
450        std::vector<std::shared_ptr<Device>> devices;
451        for (const auto& specification : specifications) {
452            devices.push_back(std::make_shared<Device>(
453                specification.mName,
454                new PartitioningDriver(specification.mName.c_str(),
455                                       specification.mCapabilities,
456                                       specification.mOperationMask,
457                                       specification.mOEM)));
458            if (!devices.back()->initialize()) {
459                EXPECT_NE("failed to initialize device", nullptr);
460                return {};
461            }
462        }
463        return devices;
464    }
465
    /*-- Graph comparison -----------------------------------------------------------------*/
467
468    // An operand with certain values for its lifetime does not have a
469    // defining operation in the graph.  For the purposes of the graph
470    // comparison algorithm, we encode the "defining operation" index of
471    // such an operand as follows:
472    // - NO_VALUE       kPseudoDefiningOperationNoValue
473    // - MODEL_INPUT    kPseudoDefiningOperationModelInput0 + (position in list of inputs)
474    // - CONSTANT_COPY  kPseudoDefiningOperationConstantCopy0 + (constant value)
475    //                    Note: For the graphs we build in this test, we
476    //                          only expect to see 4-byte constants within
477    //                          a very restricted range, so we only make
478    //                          room for such constants in our encoding
479    //                          space.
480    // We do not expect to see CONSTANT_REFERENCE, and so we do not handle
481    // it.
482    //
483    // The encoding is intended to be relatively human readable; it is not
484    // designed to represent some optimal balance of ranges for the items
485    // within its scope (actual operations, inputs, constants).
486
    // Encoding values for the scheme described in the block comment above.
    enum PseudoDefiningOperationEncodings : uint32_t {
        kPseudoDefiningOperationModelInput0   = 0x80000000U,
        kPseudoDefiningOperationConstantCopy0 = 0x90000000U,
        kPseudoDefiningOperationNoValue       = 0xeeeeeeeeU,

        // lowest value for special encoding; anything >= this is a
        // pseudo-definition rather than a real operation index
        kPseudoDefiningOperationBase          = 0x80000000U,

        // range of encoded input or constant
        kPseudoDefiningOperationRange         = 0x10000000U,
    };
498
    // Build a map from operand to defining operation.  On return,
    // (*defMap)[i] is either the index of the operation that produces
    // operand i, or one of the pseudo-definition encodings above.
    // Uses gtest ASSERT_*/FAIL macros, so callers must check
    // HasFatalFailure() afterwards.
    // TODO: Replace map with vector?
    void buildDefinitionMap(const ModelBuilder* model,
                            std::map<uint32_t, uint32_t>* defMap) {
        // actual definitions: map each operation's outputs to that operation
        ASSERT_LT(model->operationCount(), kPseudoDefiningOperationBase);
        for (uint32_t i = 0, e = model->operationCount(); i < e; i++) {
            const Operation& operation = model->getOperation(i);
            for (uint32_t output : operation.outputs) {
                (*defMap)[output] = i;
            }
        }
        // inputs: encoded by position in the model's input list
        ASSERT_LT(model->inputCount(), kPseudoDefiningOperationRange);
        for (uint32_t i = 0, e = model->inputCount(); i < e; i++) {
            (*defMap)[model->getInputOperandIndex(i)] = kPseudoDefiningOperationModelInput0 + i;
        }
        // look for NO_VALUE and CONSTANT_COPY
        for (uint32_t i = 0, e = model->operandCount(); i < e; i++) {
            const Operand& operand = model->getOperand(i);
            switch (operand.lifetime) {
                case OperandLifeTime::NO_VALUE:
                    (*defMap)[i] = kPseudoDefiningOperationNoValue;
                    break;
                case OperandLifeTime::CONSTANT_COPY: {
                    // Only 4-byte constants are expected (see encoding
                    // notes above); the constant's value is folded
                    // directly into the encoding.
                    ASSERT_EQ(operand.location.length, sizeof(uint32_t));
                    uint32_t value;
                    memcpy(&value, model->getPointerToOperandValue(operand.location.offset), sizeof(uint32_t));
                    ASSERT_LT(value, kPseudoDefiningOperationNoValue);
                    (*defMap)[i] = kPseudoDefiningOperationConstantCopy0 + value;
                    break;
                }
                case OperandLifeTime::TEMPORARY_VARIABLE:
                case OperandLifeTime::MODEL_INPUT:
                case OperandLifeTime::MODEL_OUTPUT:
                    // already handled
                    break;
                default:
                    FAIL();
                    break;
            }
        }
        // sanity check: every operand got exactly one definition
        ASSERT_EQ(model->operandCount(), defMap->size());
    }
544
#ifdef VERBOSE
    // Debugging utility: prints an operand->definition map as a set of
    // (operand, definition) pairs, rendering pseudo-definition encodings
    // in hex for readability.
    void dump(const char* name, const std::map<uint32_t, uint32_t>* aMap) {
        auto writeNum = [](uint32_t num) {
            if (num >= kPseudoDefiningOperationBase) {
                std::cout << "0x" << std::hex << num << std::dec;
            } else {
                std::cout << num;
            }
        };

        std::cout << name << ": { ";
        bool gotOne = false;
        for (const auto& entry : *aMap) {
            if (gotOne) {
                std::cout << ", ";
            } else {
                gotOne = true;
            }
            std::cout << "(";
            writeNum(entry.first);
            std::cout << ", ";
            writeNum(entry.second);
            std::cout << ")";
        }
        std::cout << " }" << std::endl;
    }
#endif
572
573    bool compare(const Operand& operandA, const Operand& operandB) {
574        if (operandA.type != operandB.type ||
575            operandA.dimensions != operandB.dimensions ||
576            operandA.numberOfConsumers != operandB.numberOfConsumers ||
577            operandA.scale != operandB.scale ||
578            operandA.zeroPoint != operandB.zeroPoint) {
579            return false;
580        }
581        return true;
582    }
583
584    // Compare two graphs.  We ignore operand and operation indexes (i.e.,
585    // two nodes can be the same even if they are numbered differently)
586    // but we also ignore semantics (e.g., even if an operation kind is
587    // such that the operand is commutative, we still pay attention to the
588    // order of its input operands).
589    //
590    // The comparison algorithm works by walking modelA from outputs
591    // towards inputs, along the edge from each operand to its
592    // defining operation, and then along the edges to the operation's
593    // input operands.  At each step along the way, we try to match up
594    // operands and operations from modelA with equivalent operands
595    // and operations from modelB.
596    //
597    // We start by assuming that modelA's outputs and modelB's outputs
598    // match positionally (e.g., modelA's first output operand is
599    // equivalent to modelB's first output operand).  Once we've
600    // discovered two equivalent operands (such as those outputs), we
601    // place them in a work queue.  We repeatedly pull operands off
602    // the queue and compare their defining operations and those
603    // operations' input operands, to discover more pairs of
604    // equivalent operands.  If we ever find operations that do not
605    // match (e.g., because operation kind differs), or operands that
606    // do not match (e.g., because operand type differs); or if we
607    // ever find a conflict (we've already decided that operand A's
608    // equivalent operand is B0, but it looks like we need its
609    // equivalent operand to be B1); then the graphs compare unequal.
610    // Otherwise, we'll eventually exhaust the work queue, and
611    // conclude that the graphs compare equal.
    // Compares two models for structural (syntactic) equality; see the
    // algorithm description in the block comment above.  Returns false
    // on the first mismatch; also returns false if buildDefinitionMap()
    // registered a fatal gtest failure.
    bool compare(const ModelBuilder* modelA, const ModelBuilder* modelB) {
#ifdef VERBOSE
        ::dump("compare(A)", modelA);
        ::dump("compare(B)", modelB);
#endif

        // Cheap structural checks first: equal graphs must agree on all
        // four counts.
        if (modelA->operandCount()   != modelB->operandCount()   ||
            modelA->operationCount() != modelB->operationCount() ||
            modelA->inputCount()     != modelB->inputCount()     ||
            modelA->outputCount()    != modelB->outputCount()) {
            RETURN_FALSE();
        }

        // Maps from operand index to index of defining operation.
        std::map<uint32_t, uint32_t> defsA, defsB;
        buildDefinitionMap(modelA, &defsA);
        buildDefinitionMap(modelB, &defsB);
        if (HasFatalFailure()) return false;

        // Maps from operand index in modelA to equivalent operand index
        // in modelB; and from operation index in modelA to equivalent
        // operation index in modelB.
        std::map<uint32_t, uint32_t> equivalentOperandsAToB;
        std::map<uint32_t, uint32_t> equivalentOperationsAToB;

        // Queue of operand indexes from modelA, each of whose defining
        // operations are to be checked for equivalence with modelB.
        std::queue<uint32_t> workQueueOperandsA;

        // Seed operand equivalence map and work queue from model outputs
        // (assumed to match positionally).
        for (uint32_t i = 0, e = modelA->outputCount(); i < e; i++) {
            uint32_t outputA = modelA->getOutputOperandIndex(i);
            uint32_t outputB = modelB->getOutputOperandIndex(i);
            if (!compare(modelA->getOperand(outputA), modelB->getOperand(outputB))) {
                RETURN_FALSE();
            }
            equivalentOperandsAToB[outputA] = outputB;
            workQueueOperandsA.push(outputA);
        }

#ifdef VERBOSE
        dump("defsA", &defsA);
        dump("defsB", &defsB);
#endif

        // Process the queue.
        uint32_t pseudoDefinitionCount = 0;
        while (!workQueueOperandsA.empty()) {
#ifdef VERBOSE
            dump("equivalentOperandsAToB", &equivalentOperandsAToB);
            dump("equivalentOperationsAToB", &equivalentOperationsAToB);
#endif
            uint32_t operandIndexA = workQueueOperandsA.front();
#ifdef VERBOSE
            std::cout << "operandIndexA: " << operandIndexA << std::endl;
#endif
            workQueueOperandsA.pop();
            uint32_t operandIndexB = equivalentOperandsAToB.at(operandIndexA);

            uint32_t operationIndexA = defsA.at(operandIndexA);
            uint32_t operationIndexB = defsB.at(operandIndexB);
            auto it = equivalentOperationsAToB.find(operationIndexA);
            if (it != equivalentOperationsAToB.end()) {
                // Already matched this operation; just confirm the match
                // is consistent.
                if (it->second != operationIndexB) {
                    RETURN_FALSE();
                }
                continue;
            }

            // We haven't identified an equivalent operation for
            // operationIndexA.

            if ((operationIndexA >= kPseudoDefiningOperationBase) !=
                (operationIndexB >= kPseudoDefiningOperationBase)) {
                RETURN_FALSE();
            }
            // Either both operands have pseudo-definitions, or neither
            // does.
            if (operationIndexA >= kPseudoDefiningOperationBase) {
                // Both operands have pseudo-definitions.  The encodings
                // incorporate input position or constant value, so they
                // must match exactly.
                if (operationIndexA != operationIndexB) {
                    RETURN_FALSE();
                }
                equivalentOperationsAToB[operationIndexA] = operationIndexB;
                ++pseudoDefinitionCount;
                continue;
            }

            // If we get here, neither operation A nor operation B is a
            // pseudo-definition.

            const Operation& operationA = modelA->getOperation(operationIndexA);
            const Operation& operationB = modelB->getOperation(operationIndexB);
            if (operationA.type != operationB.type ||
                operationA.inputs.size() != operationB.inputs.size() ||
                operationA.outputs.size() != operationB.outputs.size()) {
                RETURN_FALSE();
            }
            equivalentOperationsAToB[operationIndexA] = operationIndexB;
            // Match the operations' inputs positionally, discovering new
            // operand equivalences to explore.
            for (uint32_t i = 0, e = operationA.inputs.size(); i < e; i++) {
                uint32_t inputA = operationA.inputs[i];
                uint32_t inputB = operationB.inputs[i];
                auto it = equivalentOperandsAToB.find(inputA);
                if (it != equivalentOperandsAToB.end()) {
                    if (it->second != inputB) {
                        RETURN_FALSE();
                    }
                    continue;
                }
                // We haven't identified an equivalent operand for inputA.
                if (!compare(modelA->getOperand(inputA), modelB->getOperand(inputB))) {
                    RETURN_FALSE();
                }
                equivalentOperandsAToB[inputA] = inputB;
                workQueueOperandsA.push(inputA);
            }
        }

        // Sanity check: every operand and every operation (real or
        // pseudo) must have been matched exactly once.
        if (modelA->operandCount() != defsA.size() ||
            modelA->operandCount() != defsB.size() ||
            modelA->operandCount() != equivalentOperandsAToB.size() ||
            modelA->operationCount() + pseudoDefinitionCount != equivalentOperationsAToB.size()) {
            RETURN_FALSE();
        }

        RETURN_TRUE();
    }
740
741    /*-------------------------------------------------------------------------------------*/
742
743    bool compare(std::shared_ptr<const ExecutionStep> step,
744                 const WrapperModel* model, std::shared_ptr<Device> device) {
745        return (step->getDevice() == device) &&
746                compare(step->getSubModel(),
747                        reinterpret_cast<const ModelBuilder*>(model->getHandle()));
748    }
749};
750
TEST_F(PartitioningTest, SimpleModel) {
    // Two-operation chain: opnd2 = op0(opnd0, opnd1); opnd4 = op1(opnd2, opnd3).
    // opnd2 is an intermediate (neither a model input nor output).
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 = model.addOperation2To1(0, opnd0, opnd1);
    uint32_t opnd3 = model.addFloatOperand();
    uint32_t opnd4 = model.addOperation2To1(1, opnd2, opnd3);
    model.identifyInputsAndOutputs({ opnd0, opnd1, opnd3 }, { opnd4 });
    model.finish();
    ASSERT_TRUE(model.isValid());
    graphDump("SimpleModel", model);

    // Simple partition (two devices are each capable of everything, one is the best).
    // ~0U operation mask: each device claims support for all operation kinds.
    // Expect a SIMPLE plan that assigns the whole model to "good".
    const auto devicesA = makeDevices(
        {
            {"bad", { .float32Performance = { .execTime = 0.9, .powerUsage = 0.9 },
                            .quantized8Performance = { .execTime = 0.9, .powerUsage = 0.9 } }, ~0U},
            {"good", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                            .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, ~0U}
        });
    ExecutionPlan planA;
    ASSERT_EQ(model.partitionTheWork(devicesA, ExecutePreference::PREFER_LOW_POWER, &planA),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_NE(planA.forTest_simpleGetDevice().get(), nullptr);
    ASSERT_EQ(planA.forTest_simpleGetDevice()->getName(), "good");

    // Simple partition (two devices are each capable of everything, none better than CPU).
    // Performance numbers >= 1.0 mean no device beats CPU, so the SIMPLE plan's
    // device is null (CPU fallback).
    const auto devicesC = makeDevices(
        {
            {"bad", { .float32Performance = { .execTime = 1.1, .powerUsage = 1.1 },
                            .quantized8Performance = { .execTime = 1.1, .powerUsage = 1.1 } }, ~0U},
            {"bad2", { .float32Performance = { .execTime = 1.0, .powerUsage = 1.0 },
                            .quantized8Performance = { .execTime = 1.0, .powerUsage = 1.0 } }, ~0U}
        });
    ExecutionPlan planC;
    ASSERT_EQ(model.partitionTheWork(devicesC, ExecutePreference::PREFER_LOW_POWER, &planC),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planC.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_EQ(planC.forTest_simpleGetDevice(), nullptr);

    // Compound partition (two devices, each is capable of one of the
    // two operations).  We could do more extensive checking here --
    // for example, verify that each step within the plan has the
    // correct (model and submodel)x(inputs and outputs).
    // Operation masks 1<<0 and 1<<1 restrict each device to exactly one
    // operation kind, forcing a two-step COMPOUND plan.
    const auto devicesB = makeDevices(
        {
            {"0", { .float32Performance = { .execTime = 0.9, .powerUsage = 0.9 },
                            .quantized8Performance = { .execTime = 0.9, .powerUsage = 0.9 } }, 1<<0},
            {"1", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                            .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, 1<<1}
        });
    ExecutionPlan planB;
    ASSERT_EQ(model.partitionTheWork(devicesB, ExecutePreference::PREFER_LOW_POWER, &planB),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planB.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& stepsB = planB.forTest_compoundGetSteps();
    ASSERT_EQ(stepsB.size(), size_t(2));
    {
        // Build a model to compare against the submodel from stepsB[0].
        // Step 0 runs operation 0 on device "0"; opnd2 leaves the step as a
        // temporary consumed by step 1.
        PartitioningModel modelB0;
        uint32_t b0Opnd0 = modelB0.addFloatOperand();
        uint32_t b0Opnd1 = modelB0.addFloatOperand();
        uint32_t b0Opnd2 = modelB0.addOperation2To1(0, b0Opnd0, b0Opnd1);
        modelB0.identifyInputsAndOutputs({ b0Opnd0, b0Opnd1 }, { b0Opnd2 });
        modelB0.finish();
        ASSERT_TRUE(modelB0.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(stepsB[0], &modelB0, devicesB[0])));
        ASSERT_EQ(stepsB[0]->getModelInputs(),
                  (RemapVectorType{ { opnd0, b0Opnd0 }, { opnd1, b0Opnd1 } }));
        ASSERT_EQ(stepsB[0]->getModelOutputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(stepsB[0]->getTempsAsSubModelInputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(stepsB[0]->getTempsAsSubModelOutputs(),
                  (SubModelOutputSetType{ { opnd2, b0Opnd2 } }));
        ASSERT_EQ(stepsB[0]->getOutputsAsSubModelInputs(),
                  (RemapVectorType{}));
    }
    {
        // Build a model to compare against the submodel from stepsB[1].
        // Step 1 runs operation 1 on device "1", consuming the temporary
        // opnd2 produced by step 0 and producing the model output opnd4.
        PartitioningModel modelB1;
        uint32_t b1Opnd2 = modelB1.addFloatOperand();
        uint32_t b1Opnd3 = modelB1.addFloatOperand();
        uint32_t b1Opnd4 = modelB1.addOperation2To1(1, b1Opnd2, b1Opnd3);
        // Note: In the partitioning algorithm, submodel inputs follow
        // model inputs.  In the original model "model", opnd2 is not
        // an input; so in the submodel "modelB1", the corresponding
        // input b1Opnd2 is a submodel input, and must follow the
        // model input b1Opnd3.
        modelB1.identifyInputsAndOutputs({ b1Opnd3, b1Opnd2 }, { b1Opnd4 });
        modelB1.finish();
        ASSERT_TRUE(modelB1.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(stepsB[1], &modelB1, devicesB[1])));
        ASSERT_EQ(stepsB[1]->getModelInputs(),
                  (RemapVectorType{ { opnd3, b1Opnd3 } }));
        ASSERT_EQ(stepsB[1]->getModelOutputs(),
                  (RemapVectorType{ { opnd4, b1Opnd4 } }));
        ASSERT_EQ(stepsB[1]->getTempsAsSubModelInputs(),
                  (RemapVectorType{ { opnd2, b1Opnd2 } }));
        ASSERT_EQ(stepsB[1]->getTempsAsSubModelOutputs(),
                  (SubModelOutputSetType{}));
        ASSERT_EQ(stepsB[1]->getOutputsAsSubModelInputs(),
                  (RemapVectorType{}));
    }
}
857
TEST_F(PartitioningTest, Cpu) {
    // Here's a model where some operations execute only on the Cpu.
    // To make things interesting, we produce three partitions --
    // device, cpu, same-device.

    static const uint32_t kCpuOp = 1;
    static const uint32_t kDevOp = 2;

    // One driver device, supporting only kDevOp (mask 1<<kDevOp); the
    // kCpuOp operations must therefore fall back to the CPU.
    const auto devices = makeDevices(
        {
            {"1", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                    .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, 1<<kDevOp}
        });

    PartitioningModel model;

    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();

    // First device partition: two kDevOp operations.
    uint32_t opnd2 = model.addOperation2To1(kDevOp, opnd0, opnd1);
    uint32_t opnd3 = model.addOperation2To1(kDevOp, opnd0, opnd2);

    // CPU partition: two kCpuOp operations consuming device results.
    uint32_t opnd4 = model.addOperation2To1(kCpuOp, opnd0, opnd3);
    uint32_t opnd5 = model.addOperation2To1(kCpuOp, opnd2, opnd4);

    uint32_t opnd6 = model.addFloatOperand();

    // Second device partition (same device as the first): two more kDevOp
    // operations consuming results from both earlier partitions.
    uint32_t opnd7 = model.addOperation2To1(kDevOp, opnd3, opnd5);
    uint32_t opnd8 = model.addOperation2To1(kDevOp, opnd6, opnd7);

    model.identifyInputsAndOutputs({ opnd0, opnd1, opnd6 }, { opnd4, opnd8 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    ExecutionPlan plan;
    ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& steps = plan.forTest_compoundGetSteps();
    ASSERT_EQ(steps.size(), size_t(3));
    {
        const auto& step0 = steps[0];

        // Build a model to compare against the submodel from steps[0].
        // Expected: the first device partition (operations producing
        // opnd2 and opnd3), with both results leaving as temporaries.
        PartitioningModel model0;
        uint32_t m0Opnd0 = model0.addFloatOperand();
        uint32_t m0Opnd1 = model0.addFloatOperand();
        uint32_t m0Opnd2 = model0.addOperation2To1(kDevOp, m0Opnd0, m0Opnd1);
        uint32_t m0Opnd3 = model0.addOperation2To1(kDevOp, m0Opnd0, m0Opnd2);
        model0.identifyInputsAndOutputs({ m0Opnd0, m0Opnd1 }, { m0Opnd2, m0Opnd3 });
        model0.finish();
        ASSERT_TRUE(model0.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(step0, &model0, devices[0])));
        ASSERT_EQ(step0->getModelInputs(),
                  (RemapVectorType{ { opnd0, m0Opnd0 }, { opnd1, m0Opnd1 } }));
        ASSERT_EQ(step0->getModelOutputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(step0->getTempsAsSubModelInputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(step0->getTempsAsSubModelOutputs(),
                  (SubModelOutputSetType{ { opnd2, m0Opnd2 }, { opnd3, m0Opnd3 } }));
        ASSERT_EQ(step0->getOutputsAsSubModelInputs(),
                  (RemapVectorType{}));
    }
    {
        const auto& step1 = steps[1];

        // Build a model to compare against the submodel from steps[1].
        // Expected: the CPU partition (null device, checked via compare
        // with nullptr), producing model output opnd4 and temporary opnd5.
        PartitioningModel model1;
        uint32_t m1Opnd0 = model1.addFloatOperand();
        uint32_t m1Opnd3 = model1.addFloatOperand();
        uint32_t m1Opnd4 = model1.addOperation2To1(kCpuOp, m1Opnd0, m1Opnd3);
        uint32_t m1Opnd2 = model1.addFloatOperand();
        uint32_t m1Opnd5 = model1.addOperation2To1(kCpuOp, m1Opnd2, m1Opnd4);
        model1.identifyInputsAndOutputs({ m1Opnd0, m1Opnd3, m1Opnd2 }, { m1Opnd4, m1Opnd5 });
        model1.finish();
        ASSERT_TRUE(model1.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(step1, &model1, nullptr)));
        ASSERT_EQ(step1->getModelInputs(),
                  (RemapVectorType{ { opnd0, m1Opnd0 } }));
        ASSERT_EQ(step1->getModelOutputs(),
                  (RemapVectorType{ { opnd4, m1Opnd4 } }));
        ASSERT_EQ(step1->getTempsAsSubModelInputs(),
                  (RemapVectorType{ { opnd3, m1Opnd3 }, { opnd2, m1Opnd2 } }));
        ASSERT_EQ(step1->getTempsAsSubModelOutputs(),
                  (SubModelOutputSetType{ { opnd5, m1Opnd5 } }));
        ASSERT_EQ(step1->getOutputsAsSubModelInputs(),
                  (RemapVectorType{}));
    }
    {
        const auto& step2 = steps[2];

        // Build a model to compare against the submodel from steps[2].
        // Expected: the second device partition (back on devices[0]),
        // consuming temporaries opnd3/opnd5 and producing model output opnd8.
        PartitioningModel model2;
        uint32_t m2Opnd3 = model2.addFloatOperand();
        uint32_t m2Opnd5 = model2.addFloatOperand();
        uint32_t m2Opnd7 = model2.addOperation2To1(kDevOp, m2Opnd3, m2Opnd5);
        uint32_t m2Opnd6 = model2.addFloatOperand();
        uint32_t m2Opnd8 = model2.addOperation2To1(kDevOp, m2Opnd6, m2Opnd7);
        model2.identifyInputsAndOutputs({ m2Opnd6, m2Opnd3, m2Opnd5 }, { m2Opnd8 });
        model2.finish();
        ASSERT_TRUE(model2.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(step2, &model2, devices[0])));
        ASSERT_EQ(step2->getModelInputs(),
                  (RemapVectorType{ { opnd6, m2Opnd6 } }));
        ASSERT_EQ(step2->getModelOutputs(),
                  (RemapVectorType{ { opnd8, m2Opnd8 } }));
        ASSERT_EQ(step2->getTempsAsSubModelInputs(),
                  (RemapVectorType{ { opnd3, m2Opnd3 }, { opnd5, m2Opnd5 } }));
        ASSERT_EQ(step2->getTempsAsSubModelOutputs(),
                  (SubModelOutputSetType{}));
        ASSERT_EQ(step2->getOutputsAsSubModelInputs(),
                  (RemapVectorType{}));
    }
}
973
TEST_F(PartitioningTest, SetPartitioning) {
    // Two-operation chain where the intermediate operand opnd2 is created
    // without dimensions (Dimensioned::NO), which the partitioner cannot
    // currently handle across a partition boundary.
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 = model.addOperation2To1(0, opnd0, opnd1, PartitioningModel::Dimensioned::NO);
    uint32_t opnd3 = model.addFloatOperand();
    uint32_t opnd4 = model.addOperation2To1(1, opnd2, opnd3);
    model.identifyInputsAndOutputs({ opnd0, opnd1, opnd3 }, { opnd4 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    // We expect that we cannot successfully partition, because we
    // have an intermediate operand (opnd2) without dimensions, and
    // this is not currently handled.

    // One device that can and should execute operation 0.
    const auto devices = makeDevices({
            {"hw", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                            .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, (1<<0)},
        });

    // Test kPartitioningNo.  We should not even attempt partitioning,
    // so there should be no execution plan.
    PartitioningCompilation cPNo(&model);
    ASSERT_EQ(cPNo.setPartitioning(DeviceManager::kPartitioningNo), Result::NO_ERROR);
    ASSERT_EQ(cPNo.finish(devices), Result::NO_ERROR);
    ASSERT_EQ(cPNo.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::EMPTY);

    // Test kPartitioningWithFallback.  We should attempt
    // partitioning, reach the end of the partitioning process (so we
    // have an execution plan), discover the dimensionless
    // intermediate operand, and still return success (because of
    // fallback).  The plan itself is left in the ERROR state.
    PartitioningCompilation cPWithFallback(&model);
    ASSERT_EQ(cPWithFallback.setPartitioning(DeviceManager::kPartitioningWithFallback), Result::NO_ERROR);
    ASSERT_EQ(cPWithFallback.finish(devices), Result::NO_ERROR);
    ASSERT_EQ(cPWithFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::ERROR);

    // Test kPartitioningWithoutFallback.  We should attempt
    // partitioning, and fail: finish() returns OP_FAILED, and the plan
    // records that a submodel output had unknown size.
    PartitioningCompilation cPWithoutFallback(&model);
    ASSERT_EQ(cPWithoutFallback.setPartitioning(DeviceManager::kPartitioningWithoutFallback), Result::NO_ERROR);
    ASSERT_EQ(cPWithoutFallback.finish(devices), Result::OP_FAILED);
    ASSERT_TRUE(cPWithoutFallback.getExecutionPlan().forTest_hasSubModelOutputsOfUnknownSize());
    ASSERT_EQ(cPWithoutFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::ERROR);
}
1020
1021// Regression test for http://b/69166603:
1022//     "partitioned compilation and execution yields wrong results when model output is submodel input"
// Regression test for http://b/69166603:
//     "partitioned compilation and execution yields wrong results when model output is submodel input"
TEST_F(PartitioningTest, ModelOutputAsSubmodelInput) {
    // opnd2 is BOTH a model output and the input of operation 1, so the
    // second step must consume a model OUTPUT (not a temporary) as a
    // submodel input -- the case that b/69166603 got wrong.
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 = model.addOperation2To1(0, opnd0, opnd1);
    uint32_t opnd3 = model.addOperation2To1(1, opnd2, opnd2);
    model.identifyInputsAndOutputs({ opnd0, opnd1 }, { opnd2, opnd3 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Compound partition (two devices, each is capable of one of the
    // two operations).  We could do more extensive checking here --
    // for example, verify that each step within the plan has the
    // correct (model and submodel)x(inputs and outputs).
    const auto devices = makeDevices(
        {
            {"0", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                            .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, 1<<0},
            {"1", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                            .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, 1<<1}
        });
    ExecutionPlan plan;
    ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& steps = plan.forTest_compoundGetSteps();
    ASSERT_EQ(steps.size(), size_t(2));
    {
        // Build a model to compare against the submodel from steps[0].
        // Expected: operation 0 on device "0"; opnd2 is a MODEL output
        // of the step (not a temporary), despite also feeding steps[1].
        PartitioningModel model0;
        uint32_t m0Opnd0 = model0.addFloatOperand();
        uint32_t m0Opnd1 = model0.addFloatOperand();
        uint32_t m0Opnd2 = model0.addOperation2To1(0, m0Opnd0, m0Opnd1);
        model0.identifyInputsAndOutputs({ m0Opnd0, m0Opnd1 }, { m0Opnd2 });
        model0.finish();
        ASSERT_TRUE(model0.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(steps[0], &model0, devices[0])));
        ASSERT_EQ(steps[0]->getModelInputs(),
                  (RemapVectorType{ { opnd0, m0Opnd0 }, { opnd1, m0Opnd1 } }));
        ASSERT_EQ(steps[0]->getModelOutputs(),
                  (RemapVectorType{ { opnd2, m0Opnd2 } }));
        ASSERT_EQ(steps[0]->getTempsAsSubModelInputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(steps[0]->getTempsAsSubModelOutputs(),
                  (SubModelOutputSetType{}));
        ASSERT_EQ(steps[0]->getOutputsAsSubModelInputs(),
                  (RemapVectorType{}));
    }
    {
        // Build a model to compare against the submodel from steps[1].
        // Expected: operation 1 on device "1", and opnd2 arrives via
        // getOutputsAsSubModelInputs() -- the key expectation of this
        // regression test.
        PartitioningModel model1;
        uint32_t m1Opnd2 = model1.addFloatOperand();
        uint32_t m1Opnd3 = model1.addOperation2To1(1, m1Opnd2, m1Opnd2);
        model1.identifyInputsAndOutputs({ m1Opnd2 }, { m1Opnd3 });
        model1.finish();
        ASSERT_TRUE(model1.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(steps[1], &model1, devices[1])));
        ASSERT_EQ(steps[1]->getModelInputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(steps[1]->getModelOutputs(),
                  (RemapVectorType{ { opnd3, m1Opnd3 } }));
        ASSERT_EQ(steps[1]->getTempsAsSubModelInputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(steps[1]->getTempsAsSubModelOutputs(),
                  (SubModelOutputSetType{}));
        ASSERT_EQ(steps[1]->getOutputsAsSubModelInputs(),
                  (RemapVectorType{ { opnd2, m1Opnd2 } }));
    }
}
1092
TEST_F(PartitioningTest, OemOperations) {
    // Trivial model consisting solely of OEM operation.
    PartitioningModel model;
    uint32_t opndIn = model.addFloatOperand();
    uint32_t opndOut = model.addOperationOEM1To1(opndIn);
    model.identifyInputsAndOutputs({ opndIn }, { opndOut });
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Verify that the best driver than can run an OEM operation is
    // used, even if it is not better than the CPU.
    // "goodOEM" (execTime 1.2) beats "badOEM" (1.5); "noOEM" (0.5) is
    // fastest overall but ineligible because it cannot run OEM operations,
    // and the CPU cannot run them either.
    const auto devicesBestOEM = makeDevices(
        {
            {"badOEM", { .float32Performance = { .execTime = 1.5, .powerUsage = 1.5 },
                         .quantized8Performance = { .execTime = 1.5, .powerUsage = 1.5 } },
                        ~0U, PartitioningDriver::OEMYes},
            {"noOEM", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                        .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } },
                        ~0U, PartitioningDriver::OEMNo},
            {"goodOEM", { .float32Performance = { .execTime = 1.2, .powerUsage = 1.2 },
                          .quantized8Performance = { .execTime = 1.2, .powerUsage = 1.2 } },
                        ~0U, PartitioningDriver::OEMYes}
        });
    PartitioningCompilation compilationBestOEM(&model);
    ASSERT_EQ(compilationBestOEM.finish(devicesBestOEM), Result::NO_ERROR);
    const auto& planBestOEM = compilationBestOEM.getExecutionPlan();
    ASSERT_EQ(planBestOEM.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_NE(planBestOEM.forTest_simpleGetDevice().get(), nullptr);
    ASSERT_EQ(planBestOEM.forTest_simpleGetDevice()->getName(), "goodOEM");

    // Verify that we get an error if no driver can run an OEM operation.
    const auto devicesNoOEM = makeDevices(
        {
            {"noOEM", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                        .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } },
                        ~0U, PartitioningDriver::OEMNo}
        });
    PartitioningCompilation compilationNoOEM(&model);
    ASSERT_EQ(compilationNoOEM.finish(devicesNoOEM), Result::BAD_DATA);

    // Verify that we get an error if there are no drivers (only CPU fallback).
    PartitioningCompilation compilationNoDrivers(&model);
    ASSERT_EQ(compilationNoDrivers.finish({} /* no drivers */), Result::BAD_DATA);
}
1137
1138TEST_F(PartitioningTest, RelaxedFP) {
1139    const auto devices = makeDevices(
1140        {
1141            // Best choice for non-relaxed model.
1142            {"f32", { .float32Performance = { .execTime = 0.8, .powerUsage = 0.8 },
1143                      .relaxedFloat32toFloat16Performance = { .execTime = 0.9, .powerUsage = 0.9 }},
1144                    ~0U},
1145            // Best choice for relaxed model.
1146            {"f16", { .float32Performance = { .execTime = 0.9, .powerUsage = 0.9 },
1147                      .relaxedFloat32toFloat16Performance = { .execTime = 0.8, .powerUsage = 0.8 }},
1148                    ~0U}
1149        });
1150
1151    auto TrivialTest = [&devices](bool doRelax, const char* expectDevice) {
1152        // Trivial model consisting solely of one operation.
1153        SCOPED_TRACE(expectDevice);
1154        PartitioningModel model;
1155        uint32_t opnd0 = model.addFloatOperand();
1156        uint32_t opnd1 = model.addFloatOperand();
1157        uint32_t opnd2 = model.addOperation2To1(0, opnd0, opnd1);
1158        model.identifyInputsAndOutputs({ opnd0, opnd1 }, { opnd2 });
1159        model.relaxComputationFloat32toFloat16(doRelax);
1160        model.finish();
1161        ASSERT_TRUE(model.isValid());
1162        // Verify that the model will be executed on the appropriate device.
1163        ExecutionPlan plan;
1164        ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
1165                  ANEURALNETWORKS_NO_ERROR);
1166        ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
1167        ASSERT_EQ(plan.forTest_simpleGetDevice()->getName(), expectDevice);
1168    };
1169
1170    ASSERT_NO_FATAL_FAILURE(TrivialTest(false, "f32"));
1171    ASSERT_NO_FATAL_FAILURE(TrivialTest(true, "f16"));
1172}
1173
1174}  // namespace
1175