1/* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "CompilationBuilder.h" 18#include "ExecutionPlan.h" 19#include "GraphDump.h" 20#include "HalInterfaces.h" 21#include "Manager.h" 22#include "ModelBuilder.h" 23#include "NeuralNetworks.h" 24#include "NeuralNetworksOEM.h" 25#include "NeuralNetworksWrapper.h" 26#include "SampleDriver.h" 27#include "Utils.h" 28#include "ValidateHal.h" 29 30#include <gtest/gtest.h> 31 32#include <map> 33#include <queue> 34 35// Uncomment the following line to generate some debugging output that 36// may be useful when analyzing failures: 37// 38// #define VERBOSE VERBOSE 39 40// Uncomment the following line to generate DOT graphs. 41// 42// #define GRAPH GRAPH 43 44// These tests do whitebox testing of the graph partitioning 45// algorithm. It is "whitebox" in the sense that we're not evaluating 46// whether a particular partitioning is legal, or "good enough" 47// according to some metric, but whether it exactly matches the 48// expected behavior of the current partitioning algorithm. 49// 50// A key part of the current partitioning algorithm is to determine 51// which device among the available devices should be the one to 52// execute a particular operation from the graph. This determination 53// is made "locally" -- i.e., it does not depend on the graph 54// topology, only on the properties of the operation in question. 
55// IDevice::getSupportedOperations() indicates which operations in a 56// graph can be executed on a device, and IDevice::getCapabilities() 57// indicates how "good" that device is for executing particular kinds 58// of operations. For each operation, the partitioning algorithm 59// picks the "best" device that is capable of executing that 60// operation; if no device can do so, then the algorithm picks the 61// cpu. 62// 63// As part of this testing approach, we want to make it easy to 64// specify which operations in a test graph can be executed on which 65// devices. We accomplish this with an abstraction: There are eight 66// different kinds of operations (each of which has two inputs and one 67// output), and when we instantiate a device for testing purposes, we 68// specify what subset of those eight kinds of operations the device 69// is able to execute. 70// 71// The eight kinds of operations are represented in the graph as ADD 72// or MUL with a particular activation function -- two opcodes times 73// four activation functions means eight available operation kinds. 74// This is a low-level representation detail -- when we specify the 75// behavior of the device or build a graph, we do so in terms of 76// operation encodings 0..7. 77// 78// In order to determine whether or not a partitioning matches the 79// expected partitioning, we check the number of partitions, check 80// which device each partition targets, and compare each partition's 81// subgraph, model inputs, model outputs, submodel inputs, and 82// submodel outputs against what is expected. In order to perform 83// that comparison, we build a model to compare against a partition's 84// submodel and run a graph comparison algorithm on it. The graph 85// comparison and the inputs and outputs comparisons are syntactic 86// rather than semantic comparisons -- they don't allow for 87// reorderings of inputs and outputs. 
Because of this, we need to 88// know exactly how the partitioning algorithm orders inputs and 89// outputs in order to construct the models and operand lists to 90// compare against. Here are some relevant behaviors of the 91// partitioning algorithm: 92// 93// - It builds a subgraph by walking operations in forward topological 94// order, and adding each operation's input operands and output 95// operands in index order (input followed by output) when that 96// operation is added. (It does not add an input that has already 97// been added.) 98// - It finds model inputs, model outputs, and submodel inputs in 99// the order the corresponding operands were added to the subgraph 100// (see ExecutionStep methods getModelInputs(), getModelOutputs(), 101// getTempsAsSubModelInputs(), getOutputsAsSubModelInputs()). 102// - It finds temps as submodel outputs in numerical order of corresponding 103// operand number in the original model (see ExecutionStep method 104// getTempsAsSubModelOutputs()). 105// - When it calls identifyInputsAndOutputs() on the submodel, it 106// passes inputs from getModelInputs() in order, followed by temps as 107// submodel inputs from getTempsAsSubModelInputs() in order, 108// followed by outputs as submodel inputs from 109// getOutputsAsSubModelInputs() in order; and it passes outputs from 110// getModelOutputs() in order followed by submodel outputs from 111// getTempsAsSubModelOutputs() in order. 112// 113// TODO: Maybe the logic for comparing a partition to an expected 114// model should be changed to tolerate reorderings of inputs and 115// outputs, so that when we build models and lists to compare 116// against, we don't need to worry about input and output 117// orderings. 
// But is there a way to do this that still lets us
// verify that we have the correct relationships between
// an (original) model's inputs and outputs and each submodel's
// inputs and outputs, as well as the correct relationship
// between submodel inputs and outputs across partitions?

namespace {

// Short aliases for the runtime and HIDL types these tests exercise.
using CompilationBuilder = ::android::nn::CompilationBuilder;
using Device = ::android::nn::Device;
using DeviceManager = ::android::nn::DeviceManager;
using ExecutePreference = ::android::nn::wrapper::ExecutePreference;
using ExecutionPlan = ::android::nn::ExecutionPlan;
using ExecutionStep = ::android::nn::ExecutionStep;
using HidlModel = ::android::hardware::neuralnetworks::V1_1::Model;
using ModelBuilder = ::android::nn::ModelBuilder;
using Result = ::android::nn::wrapper::Result;
using SampleDriver = ::android::nn::sample_driver::SampleDriver;
using WrapperCompilation = ::android::nn::wrapper::Compilation;
using WrapperModel = ::android::nn::wrapper::Model;
using WrapperOperandType = ::android::nn::wrapper::OperandType;
using WrapperType = ::android::nn::wrapper::Type;

template <typename T> using sp = ::android::sp<T>;

// We employ an operation numbering scheme:
// - 0..FuseCode-1 = ADD with the appropriate activation function
// - FuseCode..2*FuseCode-1 = MUL with the appropriate activation function
const uint32_t kNumFuseCodes = 4;
// Sentinel returned by lookupOperation() when an operation is not one of
// the eight encoded ADD/MUL variants.
const uint32_t kBadOperation = ~0;

// Look up the operation with the specified index in a graph, and
// return the operation encoding -- 0..7; or, if for some reason this
// is not one of the encoded operations, then return kBadOperation.
151uint32_t lookupOperation(std::function<const Operation&(uint32_t)> getOperation, 152 std::function<const Operand&(uint32_t)> getOperand, 153 std::function<const uint8_t*(uint32_t)> getValue, 154 uint32_t operationIndex) { 155 const Operation& operation = getOperation(operationIndex); 156 switch (operation.type) { 157 case OperationType::ADD: 158 case OperationType::MUL: { 159 // input2 is the fused activation function 160 const Operand& input2 = getOperand(operation.inputs[2]); 161 if ((input2.type == OperandType::INT32) && 162 (input2.lifetime == OperandLifeTime::CONSTANT_COPY)) { 163 int32_t value; 164 memcpy(&value, 165 getValue(input2.location.offset), 166 input2.location.length); 167 if (operation.type == OperationType::MUL) { 168 value += kNumFuseCodes; 169 } 170 return value; 171 } 172 break; 173 } 174 default: 175 break; 176 } 177 return kBadOperation; 178} 179 180uint32_t lookupOperation(const HidlModel& model, uint32_t operationIndex) { 181 return lookupOperation( 182 [&model](uint32_t index) -> const Operation& { 183 return model.operations[index]; 184 }, 185 [&model](uint32_t index) -> const Operand& { 186 return model.operands[index]; 187 }, 188 [&model](uint32_t offset) {return &model.operandValues[offset];}, 189 operationIndex); 190} 191 192#ifdef VERBOSE 193// This is a debugging utility function 194void dump(const char* name, const ModelBuilder* model) { 195 HidlModel hidlModel; 196 model->setHidlModel(&hidlModel); 197 std::cout << name << ": " << toString(hidlModel) << std::endl; 198 std::cout << "inputs: " << toString(hidlModel.inputIndexes) << std::endl; 199 std::cout << "outputs: " << toString(hidlModel.outputIndexes) << std::endl; 200 for (size_t i = 0, e = hidlModel.operations.size(); i < e; i++) { 201 std::cout << "operation[" << i << "]: " << toString(hidlModel.operations[i]) << std::endl; 202 } 203} 204#endif 205 206#ifdef GRAPH 207inline void hidlGraphDump(const char* name, const HidlModel& model) { 208 ::android::nn::graphDump(name, 
                             model);
}
#endif

// Dump a wrapper model as a DOT graph when GRAPH is defined; otherwise a no-op.
void graphDump([[maybe_unused]] const char* name, [[maybe_unused]] const WrapperModel& model) {
#ifdef GRAPH
    HidlModel hidlModel;
    reinterpret_cast<const ModelBuilder*>(model.getHandle())->setHidlModel(&hidlModel);
    hidlGraphDump(name, hidlModel);
#endif
}

// This is an IDevice for testing purposes.  It only has a few
// interesting properties, all of which are specified as constructor
// arguments: device capabilities; which subset of operation kinds
// (0..7) does the device support; does the device support the OEM
// operation.  The subset is represented with a bitmask, in which
// operation kind K corresponds to the bit (1 << K).
class PartitioningDriver : public SampleDriver {
private:
    // Dummy class -- a prepared model must not be nullptr.
    class PartitioningPreparedModel : public IPreparedModel {
    public:
        Return<ErrorStatus> execute(const Request&,
                                    const sp<IExecutionCallback>&) override {
            // These tests only partition; any attempt to execute fails loudly.
            return ErrorStatus::DEVICE_UNAVAILABLE;
        }
    };
public:
    enum OEM { OEMNo, OEMYes };

    PartitioningDriver(const char *name, Capabilities capabilities,
                       uint32_t operationMask, OEM oem = OEMNo) :
            SampleDriver(name), mCapabilities(capabilities),
            mOperationMask(operationMask), mOEM(oem) {}
    ~PartitioningDriver() override {}

    // "Preparation" always succeeds immediately with the dummy prepared model.
    Return<ErrorStatus> prepareModel_1_1(const Model&, ExecutionPreference,
                                         const sp<IPreparedModelCallback>& cb) override {
        cb->notify(ErrorStatus::NONE, new PartitioningPreparedModel);
        return ErrorStatus::NONE;
    }

    Return<DeviceStatus> getStatus() override {
        return DeviceStatus::AVAILABLE;
    }

    // Report the capabilities supplied at construction time.
    Return<void> getCapabilities_1_1(getCapabilities_1_1_cb cb) override {
        cb(ErrorStatus::NONE, mCapabilities);
        return Void();
    }

    // An operation is supported iff its test encoding (0..7) has its bit set
    // in mOperationMask, or it is the OEM operation and mOEM == OEMYes.
    Return<void> getSupportedOperations_1_1(const Model& model,
                                            getSupportedOperations_cb cb) override {
        if (!android::nn::validateModel(model)) {
            cb(ErrorStatus::INVALID_ARGUMENT, std::vector<bool>());
            return Void();
        }

        const size_t count = model.operations.size();
        std::vector<bool> supported(count);
        for (size_t i = 0; i < count; i++) {
            if (model.operations[i].type == OperationType::OEM_OPERATION) {
                supported[i] = (mOEM == OEMYes);
                continue;
            }
            supported[i] = false;
            uint32_t operation = lookupOperation(model, i);
            // Supported only if the operation decodes to one of the eight
            // test encodings and the corresponding mask bit is set.
            if ((operation != kBadOperation) && (mOperationMask & (1 << operation))) {
                supported[i] = true;
            }
        }
        cb(ErrorStatus::NONE, supported);
        return Void();
    }

private:
    Capabilities mCapabilities;  // reported verbatim by getCapabilities_1_1()
    uint32_t mOperationMask;     // bit K set => operation kind K is supported
    OEM mOEM;                    // whether the OEM operation is supported
};

// This class adds some simple abstractions and utilities on top of
// ::android::nn::wrapper::Model.  For example, it provides methods
// that work in terms of operation kind (0..7); and because we care
// about graph topology rather than details of operand types and
// values, it greatly simplifies the process of creating operands.
class PartitioningModel : public WrapperModel {
public:
    // Create a tensor operand of the specified type, and return the
    // corresponding operand index.
    uint32_t addFloatOperand() {
        static const WrapperOperandType type(WrapperType::TENSOR_FLOAT32, { 1 });
        return addOperand(&type);
    }
    uint32_t addQuantOperand() {
        static const WrapperOperandType type(WrapperType::TENSOR_QUANT8_ASYMM, { 1 });
        return addOperand(&type);
    }

    // Create an operation with two inputs and one output, specifying
    // the operation kind (0..7) and the input operand indexes.
    // Returns the output operand index.
    enum class Dimensioned { NO, YES };
    uint32_t addOperation2To1(uint32_t operation, const uint32_t input0, const uint32_t input1,
                              Dimensioned dimensionedOutput = Dimensioned::YES) {
        // Kinds 0..3 encode ADD; kinds 4..7 encode MUL.
        ANeuralNetworksOperationType type =
                (operation < kNumFuseCodes ?
                 ANEURALNETWORKS_ADD : ANEURALNETWORKS_MUL);
        // The kind modulo kNumFuseCodes selects the fused activation function.
        int32_t fuseCode = (operation < kNumFuseCodes ? operation : operation - kNumFuseCodes);
        uint32_t input2 = addIntOperand(fuseCode);
        uint32_t output = addOperandOfSameType(input0, dimensionedOutput);
        addOperation(type, { input0, input1, input2 }, { output });
        return output;
    }

    // Create an OEM operation with one input and one output,
    // specifying the input operand index.  Returns the output operand
    // index.
    uint32_t addOperationOEM1To1(const uint32_t input,
                                 Dimensioned dimensionedOutput = Dimensioned::YES) {
        uint32_t output = addOperandOfSameType(input, dimensionedOutput);
        addOperation(ANEURALNETWORKS_OEM_OPERATION, { input }, { output });
        return output;
    }

    // Run the partitioning algorithm to create an ExecutionPlan.
    int partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
                         ExecutePreference preference, ExecutionPlan* plan) {
        return reinterpret_cast<ModelBuilder*>(getHandle())->partitionTheWork(
            devices, static_cast<uint32_t>(preference), plan);
    }

#ifdef VERBOSE
    // This is a debugging utility function.
    void dump(const char* name) const {
        const ModelBuilder* mb = reinterpret_cast<const ModelBuilder*>(getHandle());
        ::dump(name, mb);
    }
#endif

private:

    // Create a scalar integer operand of the specified value, and
    // return the corresponding operand index.
    uint32_t addIntOperand(int32_t value) {
        static const WrapperOperandType type(WrapperType::INT32, { });
        uint32_t operand = addOperand(&type);
        setOperandValue(operand, &value, sizeof(value));
        return operand;
    }

    // Create an operand of the same type as the specified operand,
    // and return the operand index of the new operand.
    uint32_t addOperandOfSameType(uint32_t operand, Dimensioned dimensioned = Dimensioned::YES) {
        const Operand& operandStruct =
                reinterpret_cast<const ModelBuilder*>(getHandle())->getOperand(operand);
        WrapperOperandType type(static_cast<WrapperType>(operandStruct.type), { 1 });
        if (dimensioned == Dimensioned::NO) {
            // Dimensioned::NO means "unknown extent" -- encoded as dimension 0.
            for (auto& dimension : type.dimensions) {
                dimension = 0;
            }
        }
        return addOperand(&type);
    }
};

// This class adds some utilities on top of ::android::nn::wrapper::Compilation:
// access to the partitioning knob and to the resulting ExecutionPlan.
class PartitioningCompilation : public WrapperCompilation {
public:
    PartitioningCompilation(const WrapperModel* model) : WrapperCompilation(model) { }

    Result setPartitioning(uint32_t partitioning) {
        return static_cast<Result>(builder()->setPartitioning(partitioning));
    }

    using WrapperCompilation::finish;
    // Finish compilation against an explicit device list (test-only entry point).
    Result finish(const std::vector<std::shared_ptr<Device>>& devices) {
        return static_cast<Result>(builder()->finish(devices));
    }

    const ExecutionPlan& getExecutionPlan() const {
        return builder()->forTest_getExecutionPlan();
    }

private:
    CompilationBuilder* builder() {
        return reinterpret_cast<CompilationBuilder*>(getHandle());
    }

    const CompilationBuilder* builder() const {
        return reinterpret_cast<const CompilationBuilder*>(getHandle());
    }
};

// RETURN_TRUE/RETURN_FALSE wrap the comparison-helper return statements so
// that, when VERBOSE is defined, the line number of the decision is logged.
#ifdef VERBOSE
#define RETURN_TRUE()                                                          \
    {                                                                          \
        std::cerr << "returning true from " << __LINE__ << std::endl;          \
        return true;                                                           \
    }
#else
#define RETURN_TRUE()                                                          \
    {                                                                          \
        return true;                                                           \
    }
#endif
#ifdef VERBOSE
#define RETURN_FALSE(MESSAGE)                                                  \
    {                                                                          \
        std::cerr << "returning false from " << __LINE__ MESSAGE << std::endl; \
        return false;                                                          \
    }
#else
#define RETURN_FALSE(MESSAGE)                                                  \
    {                                                                          \
        return false;                                                          \
    }
#endif

class PartitioningTest : public ::testing::Test {
protected:
    using RemapVectorType = ExecutionStep::RemapVectorType;
430 using SubModelOutputSetType = ExecutionStep::SubModelOutputSetType; 431 432 virtual void SetUp() { 433 } 434 435 // From a vector of DeviceSpecification, create a vector of 436 // Devices. 437 struct DeviceSpecification { 438 DeviceSpecification(const std::string &name, Capabilities capabilities, 439 uint32_t operationMask, 440 PartitioningDriver::OEM oem = PartitioningDriver::OEMNo) : 441 mName(name), mCapabilities(capabilities), 442 mOperationMask(operationMask), mOEM(oem) { } 443 std::string mName; 444 Capabilities mCapabilities; 445 uint32_t mOperationMask; 446 PartitioningDriver::OEM mOEM; 447 }; 448 static std::vector<std::shared_ptr<Device>> 449 makeDevices(std::vector<DeviceSpecification> specifications) { 450 std::vector<std::shared_ptr<Device>> devices; 451 for (const auto& specification : specifications) { 452 devices.push_back(std::make_shared<Device>( 453 specification.mName, 454 new PartitioningDriver(specification.mName.c_str(), 455 specification.mCapabilities, 456 specification.mOperationMask, 457 specification.mOEM))); 458 if (!devices.back()->initialize()) { 459 EXPECT_NE("failed to initialize device", nullptr); 460 return {}; 461 } 462 } 463 return devices; 464 } 465 466 /*-- Graph comparision ----------------------------------------------------------------*/ 467 468 // An operand with certain values for its lifetime does not have a 469 // defining operation in the graph. For the purposes of the graph 470 // comparison algorithm, we encode the "defining operation" index of 471 // such an operand as follows: 472 // - NO_VALUE kPseudoDefiningOperationNoValue 473 // - MODEL_INPUT kPseudoDefiningOperationModelInput0 + (position in list of inputs) 474 // - CONSTANT_COPY kPseudoDefiningOperationConstantCopy0 + (constant value) 475 // Note: For the graphs we build in this test, we 476 // only expect to see 4-byte constants within 477 // a very restricted range, so we only make 478 // room for such constants in our encoding 479 // space. 
    // We do not expect to see CONSTANT_REFERENCE, and so we do not handle
    // it.
    //
    // The encoding is intended to be relatively human readable; it is not
    // designed to represent some optimal balance of ranges for the items
    // within its scope (actual operations, inputs, constants).

    enum PseudoDefiningOperationEncodings : uint32_t {
        kPseudoDefiningOperationModelInput0   = 0x80000000U,
        kPseudoDefiningOperationConstantCopy0 = 0x90000000U,
        kPseudoDefiningOperationNoValue       = 0xeeeeeeeeU,

        // lowest value for special encoding
        kPseudoDefiningOperationBase = 0x80000000U,

        // range of encoded input or constant
        kPseudoDefiningOperationRange = 0x10000000U,
    };

    // Build a map from operand to defining operation (real operation index,
    // or one of the pseudo-definition encodings above).
    // TODO: Replace map with vector?
    void buildDefinitionMap(const ModelBuilder* model,
                            std::map<uint32_t, uint32_t>* defMap) {
        // actual definitions: each operation defines its output operands
        ASSERT_LT(model->operationCount(), kPseudoDefiningOperationBase);
        for (uint32_t i = 0, e = model->operationCount(); i < e; i++) {
            const Operation& operation = model->getOperation(i);
            for (uint32_t output : operation.outputs) {
                (*defMap)[output] = i;
            }
        }
        // inputs: pseudo-defined by their position in the input list
        ASSERT_LT(model->inputCount(), kPseudoDefiningOperationRange);
        for (uint32_t i = 0, e = model->inputCount(); i < e; i++) {
            (*defMap)[model->getInputOperandIndex(i)] = kPseudoDefiningOperationModelInput0 + i;
        }
        // look for NO_VALUE and CONSTANT_COPY
        for (uint32_t i = 0, e = model->operandCount(); i < e; i++) {
            const Operand& operand = model->getOperand(i);
            switch (operand.lifetime) {
                case OperandLifeTime::NO_VALUE:
                    (*defMap)[i] = kPseudoDefiningOperationNoValue;
                    break;
                case OperandLifeTime::CONSTANT_COPY: {
                    // Only 4-byte constants are expected (see encoding note above).
                    ASSERT_EQ(operand.location.length, sizeof(uint32_t));
                    uint32_t value;
                    memcpy(&value, model->getPointerToOperandValue(operand.location.offset), sizeof(uint32_t));
                    ASSERT_LT(value,
                              kPseudoDefiningOperationNoValue);
                    (*defMap)[i] = kPseudoDefiningOperationConstantCopy0 + value;
                    break;
                }
                case OperandLifeTime::TEMPORARY_VARIABLE:
                case OperandLifeTime::MODEL_INPUT:
                case OperandLifeTime::MODEL_OUTPUT:
                    // already handled
                    break;
                default:
                    FAIL();
                    break;
            }
        }
        // sanity check: every operand must have exactly one definition
        ASSERT_EQ(model->operandCount(), defMap->size());
    }

#ifdef VERBOSE
    // Debugging utility: print an operand->definition map, rendering
    // pseudo-definition encodings in hex so they are recognizable.
    void dump(const char* name, const std::map<uint32_t, uint32_t>* aMap) {
        auto writeNum = [](uint32_t num) {
            if (num >= kPseudoDefiningOperationBase) {
                std::cout << "0x" << std::hex << num << std::dec;
            } else {
                std::cout << num;
            }
        };

        std::cout << name << ": { ";
        bool gotOne = false;
        for (const auto& entry : *aMap) {
            if (gotOne) {
                std::cout << ", ";
            } else {
                gotOne = true;
            }
            std::cout << "(";
            writeNum(entry.first);
            std::cout << ", ";
            writeNum(entry.second);
            std::cout << ")";
        }
        std::cout << " }" << std::endl;
    }
#endif

    // Syntactic equality of two operands: type, dimensions, consumer count,
    // and quantization parameters must all match.
    bool compare(const Operand& operandA, const Operand& operandB) {
        if (operandA.type != operandB.type ||
            operandA.dimensions != operandB.dimensions ||
            operandA.numberOfConsumers != operandB.numberOfConsumers ||
            operandA.scale != operandB.scale ||
            operandA.zeroPoint != operandB.zeroPoint) {
            return false;
        }
        return true;
    }

    // Compare two graphs.  We ignore operand and operation indexes (i.e.,
    // two nodes can be the same even if they are numbered differently)
    // but we also ignore semantics (e.g., even if an operation kind is
    // such that the operand is commutative, we still pay attention to the
    // order of its input operands).
    //
    // The comparison algorithm works by walking modelA from outputs
    // towards inputs, along the edge from each operand to its
    // defining operation, and then along the edges to the operation's
    // input operands.
    // At each step along the way, we try to match up
    // operands and operations from modelA with equivalent operands
    // and operations from modelB.
    //
    // We start by assuming that modelA's outputs and modelB's outputs
    // match positionally (e.g., modelA's first output operand is
    // equivalent to modelB's first output operand).  Once we've
    // discovered two equivalent operands (such as those outputs), we
    // place them in a work queue.  We repeatedly pull operands off
    // the queue and compare their defining operations and those
    // operations' input operands, to discover more pairs of
    // equivalent operands.  If we ever find operations that do not
    // match (e.g., because operation kind differs), or operands that
    // do not match (e.g., because operand type differs); or if we
    // ever find a conflict (we've already decided that operand A's
    // equivalent operand is B0, but it looks like we need its
    // equivalent operand to be B1); then the graphs compare unequal.
    // Otherwise, we'll eventually exhaust the work queue, and
    // conclude that the graphs compare equal.
    bool compare(const ModelBuilder* modelA, const ModelBuilder* modelB) {
#ifdef VERBOSE
        ::dump("compare(A)", modelA);
        ::dump("compare(B)", modelB);
#endif

        // Equivalent graphs must agree on all four counts.
        if (modelA->operandCount() != modelB->operandCount() ||
            modelA->operationCount() != modelB->operationCount() ||
            modelA->inputCount() != modelB->inputCount() ||
            modelA->outputCount() != modelB->outputCount()) {
            RETURN_FALSE();
        }

        // Maps from operand index to index of defining operation.
        std::map<uint32_t, uint32_t> defsA, defsB;
        buildDefinitionMap(modelA, &defsA);
        buildDefinitionMap(modelB, &defsB);
        if (HasFatalFailure()) return false;

        // Maps from operand index in modelA to equivalent operand index
        // in modelB; and from operation index in modelA to equivalent
        // operation index in modelB.
        std::map<uint32_t, uint32_t> equivalentOperandsAToB;
        std::map<uint32_t, uint32_t> equivalentOperationsAToB;

        // Queue of operand indexes from modelA, each of whose defining
        // operations are to be checked for equivalence with modelB.
        std::queue<uint32_t> workQueueOperandsA;

        // Seed operand equivalence map and work queue from model outputs
        // (assumed to correspond positionally).
        for (uint32_t i = 0, e = modelA->outputCount(); i < e; i++) {
            uint32_t outputA = modelA->getOutputOperandIndex(i);
            uint32_t outputB = modelB->getOutputOperandIndex(i);
            if (!compare(modelA->getOperand(outputA), modelB->getOperand(outputB))) {
                RETURN_FALSE();
            }
            equivalentOperandsAToB[outputA] = outputB;
            workQueueOperandsA.push(outputA);
        }

#ifdef VERBOSE
        dump("defsA", &defsA);
        dump("defsB", &defsB);
#endif

        // Process the queue.
        uint32_t pseudoDefinitionCount = 0;
        while (!workQueueOperandsA.empty()) {
#ifdef VERBOSE
            dump("equivalentOperandsAToB", &equivalentOperandsAToB);
            dump("equivalentOperationsAToB", &equivalentOperationsAToB);
#endif
            uint32_t operandIndexA = workQueueOperandsA.front();
#ifdef VERBOSE
            std::cout << "operandIndexA: " << operandIndexA << std::endl;
#endif
            workQueueOperandsA.pop();
            uint32_t operandIndexB = equivalentOperandsAToB.at(operandIndexA);

            // Follow the edge from each operand to its defining operation
            // (which may be a pseudo-definition encoding).
            uint32_t operationIndexA = defsA.at(operandIndexA);
            uint32_t operationIndexB = defsB.at(operandIndexB);
            auto it = equivalentOperationsAToB.find(operationIndexA);
            if (it != equivalentOperationsAToB.end()) {
                // Already matched; a conflicting match means inequality.
                if (it->second != operationIndexB) {
                    RETURN_FALSE();
                }
                continue;
            }

            // We haven't identified an equivalent operation for
            // operationIndexA.

            if ((operationIndexA >= kPseudoDefiningOperationBase) !=
                (operationIndexB >= kPseudoDefiningOperationBase)) {
                RETURN_FALSE();
            }
            // Either both operands have pseudo-definitions, or neither
            // does.
            if (operationIndexA >= kPseudoDefiningOperationBase) {
                // Both operands have pseudo-definitions.  Pseudo-definitions
                // match only if their encodings are identical.
                if (operationIndexA != operationIndexB) {
                    RETURN_FALSE();
                }
                equivalentOperationsAToB[operationIndexA] = operationIndexB;
                ++pseudoDefinitionCount;
                continue;
            }

            // If we get here, neither operation A nor operation B is a
            // pseudo-definition.

            const Operation& operationA = modelA->getOperation(operationIndexA);
            const Operation& operationB = modelB->getOperation(operationIndexB);
            if (operationA.type != operationB.type ||
                operationA.inputs.size() != operationB.inputs.size() ||
                operationA.outputs.size() != operationB.outputs.size()) {
                RETURN_FALSE();
            }
            equivalentOperationsAToB[operationIndexA] = operationIndexB;
            // Inputs are matched positionally (no reordering is tolerated).
            for (uint32_t i = 0, e = operationA.inputs.size(); i < e; i++) {
                uint32_t inputA = operationA.inputs[i];
                uint32_t inputB = operationB.inputs[i];
                auto it = equivalentOperandsAToB.find(inputA);
                if (it != equivalentOperandsAToB.end()) {
                    if (it->second != inputB) {
                        RETURN_FALSE();
                    }
                    continue;
                }
                // We haven't identified an equivalent operand for inputA.
                if (!compare(modelA->getOperand(inputA), modelB->getOperand(inputB))) {
                    RETURN_FALSE();
                }
                equivalentOperandsAToB[inputA] = inputB;
                workQueueOperandsA.push(inputA);
            }
        }

        // Sanity check: every operand and every definition (real or pseudo)
        // must have been matched exactly once.
        if (modelA->operandCount() != defsA.size() ||
            modelA->operandCount() != defsB.size() ||
            modelA->operandCount() != equivalentOperandsAToB.size() ||
            modelA->operationCount() + pseudoDefinitionCount != equivalentOperationsAToB.size()) {
            RETURN_FALSE();
        }

        RETURN_TRUE();
    }

    /*-------------------------------------------------------------------------------------*/

    // A partition step matches iff it targets the expected device and its
    // submodel is graph-equivalent to the expected model.
    bool compare(std::shared_ptr<const ExecutionStep> step,
                 const WrapperModel* model, std::shared_ptr<Device> device) {
        return (step->getDevice() == device) &&
                compare(step->getSubModel(),
                        reinterpret_cast<const ModelBuilder*>(model->getHandle()));
    }
};

TEST_F(PartitioningTest, SimpleModel) {
    // Two-operation chain: opnd4 = mul(add(opnd0, opnd1), opnd3)
    // using operation kinds 0 (ADD) and 1 (MUL... kind 1 is ADD with a
    // different fuse code; see the operation numbering scheme above).
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 = model.addOperation2To1(0, opnd0, opnd1);
    uint32_t opnd3 = model.addFloatOperand();
    uint32_t opnd4 = model.addOperation2To1(1, opnd2, opnd3);
    model.identifyInputsAndOutputs({ opnd0, opnd1, opnd3 }, { opnd4 });
    model.finish();
    ASSERT_TRUE(model.isValid());
    graphDump("SimpleModel", model);

    // Simple partition (two devices are each capable of everything, one is the best).
    const auto devicesA = makeDevices(
        {
            {"bad", { .float32Performance = { .execTime = 0.9, .powerUsage = 0.9 },
                      .quantized8Performance = { .execTime = 0.9, .powerUsage = 0.9 } }, ~0U},
            {"good", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                       .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, ~0U}
        });
    ExecutionPlan planA;
    ASSERT_EQ(model.partitionTheWork(devicesA, ExecutePreference::PREFER_LOW_POWER, &planA),
              ANEURALNETWORKS_NO_ERROR);
    // Lower performance numbers are better, so the whole model is assigned
    // to "good" as a single (SIMPLE) partition.
    ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_NE(planA.forTest_simpleGetDevice().get(), nullptr);
    ASSERT_EQ(planA.forTest_simpleGetDevice()->getName(), "good");

    // Simple partition (two devices are each capable of everything, none better than CPU).
    const auto devicesC = makeDevices(
        {
            {"bad", { .float32Performance = { .execTime = 1.1, .powerUsage = 1.1 },
                      .quantized8Performance = { .execTime = 1.1, .powerUsage = 1.1 } }, ~0U},
            {"bad2", { .float32Performance = { .execTime = 1.0, .powerUsage = 1.0 },
                       .quantized8Performance = { .execTime = 1.0, .powerUsage = 1.0 } }, ~0U}
        });
    ExecutionPlan planC;
    ASSERT_EQ(model.partitionTheWork(devicesC, ExecutePreference::PREFER_LOW_POWER, &planC),
              ANEURALNETWORKS_NO_ERROR);
    // A null device in a SIMPLE plan represents the CPU fallback.
    ASSERT_EQ(planC.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_EQ(planC.forTest_simpleGetDevice(), nullptr);

    // Compound partition (two devices, each is capable of one of the
    // two operations).  We could do more extensive checking here --
    // for example, verify that each step within the plan has the
    // correct (model and submodel)x(inputs and outputs).
    const auto devicesB = makeDevices(
        {
            {"0", { .float32Performance = { .execTime = 0.9, .powerUsage = 0.9 },
                    .quantized8Performance = { .execTime = 0.9, .powerUsage = 0.9 } }, 1<<0},
            {"1", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                    .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, 1<<1}
        });
    ExecutionPlan planB;
    ASSERT_EQ(model.partitionTheWork(devicesB, ExecutePreference::PREFER_LOW_POWER, &planB),
              ANEURALNETWORKS_NO_ERROR);
    // Each device supports exactly one of the two operation kinds, so the
    // plan must be COMPOUND with one step per device.
    ASSERT_EQ(planB.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& stepsB = planB.forTest_compoundGetSteps();
    ASSERT_EQ(stepsB.size(), size_t(2));
    {
        // Build a model to compare against the submodel from stepsB[0].
        PartitioningModel modelB0;
        uint32_t b0Opnd0 = modelB0.addFloatOperand();
        uint32_t b0Opnd1 = modelB0.addFloatOperand();
        uint32_t b0Opnd2 = modelB0.addOperation2To1(0, b0Opnd0, b0Opnd1);
        modelB0.identifyInputsAndOutputs({ b0Opnd0, b0Opnd1 }, { b0Opnd2 });
        modelB0.finish();
        ASSERT_TRUE(modelB0.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(stepsB[0], &modelB0, devicesB[0])));
        // opnd2 is consumed by the second step, so it is a temp flowing out
        // of this step (a submodel output), not a model output.
        ASSERT_EQ(stepsB[0]->getModelInputs(),
                  (RemapVectorType{ { opnd0, b0Opnd0 }, { opnd1, b0Opnd1 } }));
        ASSERT_EQ(stepsB[0]->getModelOutputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(stepsB[0]->getTempsAsSubModelInputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(stepsB[0]->getTempsAsSubModelOutputs(),
                  (SubModelOutputSetType{ { opnd2, b0Opnd2 } }));
        ASSERT_EQ(stepsB[0]->getOutputsAsSubModelInputs(),
                  (RemapVectorType{}));
    }
    {
        // Build a model to compare against the submodel from stepsB[1].
        PartitioningModel modelB1;
        uint32_t b1Opnd2 = modelB1.addFloatOperand();
        uint32_t b1Opnd3 = modelB1.addFloatOperand();
        uint32_t b1Opnd4 = modelB1.addOperation2To1(1, b1Opnd2, b1Opnd3);
        // Note: In the partitioning algorithm, submodel inputs follow
        // model inputs.
In the original model "model", opnd2 is not 838 // an input; so in the submodel "modelB1", the corresponding 839 // input b1Opnd2 is a submodel input, and must follow the 840 // model input b1Opnd3. 841 modelB1.identifyInputsAndOutputs({ b1Opnd3, b1Opnd2 }, { b1Opnd4 }); 842 modelB1.finish(); 843 ASSERT_TRUE(modelB1.isValid()); 844 ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(stepsB[1], &modelB1, devicesB[1]))); 845 ASSERT_EQ(stepsB[1]->getModelInputs(), 846 (RemapVectorType{ { opnd3, b1Opnd3 } })); 847 ASSERT_EQ(stepsB[1]->getModelOutputs(), 848 (RemapVectorType{ { opnd4, b1Opnd4 } })); 849 ASSERT_EQ(stepsB[1]->getTempsAsSubModelInputs(), 850 (RemapVectorType{ { opnd2, b1Opnd2 } })); 851 ASSERT_EQ(stepsB[1]->getTempsAsSubModelOutputs(), 852 (SubModelOutputSetType{})); 853 ASSERT_EQ(stepsB[1]->getOutputsAsSubModelInputs(), 854 (RemapVectorType{})); 855 } 856} 857 858TEST_F(PartitioningTest, Cpu) { 859 // Here's a model where some operations execute only on the Cpu. 860 // To make things interesting, we produce three partitions -- 861 // device, cpu, same-device. 
862 863 static const uint32_t kCpuOp = 1; 864 static const uint32_t kDevOp = 2; 865 866 const auto devices = makeDevices( 867 { 868 {"1", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 }, 869 .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, 1<<kDevOp} 870 }); 871 872 PartitioningModel model; 873 874 uint32_t opnd0 = model.addFloatOperand(); 875 uint32_t opnd1 = model.addFloatOperand(); 876 877 uint32_t opnd2 = model.addOperation2To1(kDevOp, opnd0, opnd1); 878 uint32_t opnd3 = model.addOperation2To1(kDevOp, opnd0, opnd2); 879 880 uint32_t opnd4 = model.addOperation2To1(kCpuOp, opnd0, opnd3); 881 uint32_t opnd5 = model.addOperation2To1(kCpuOp, opnd2, opnd4); 882 883 uint32_t opnd6 = model.addFloatOperand(); 884 885 uint32_t opnd7 = model.addOperation2To1(kDevOp, opnd3, opnd5); 886 uint32_t opnd8 = model.addOperation2To1(kDevOp, opnd6, opnd7); 887 888 model.identifyInputsAndOutputs({ opnd0, opnd1, opnd6 }, { opnd4, opnd8 }); 889 model.finish(); 890 ASSERT_TRUE(model.isValid()); 891 892 ExecutionPlan plan; 893 ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan), 894 ANEURALNETWORKS_NO_ERROR); 895 ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND); 896 const auto& steps = plan.forTest_compoundGetSteps(); 897 ASSERT_EQ(steps.size(), size_t(3)); 898 { 899 const auto& step0 = steps[0]; 900 901 // Build a model to compare against the submodel from steps[0]. 
902 PartitioningModel model0; 903 uint32_t m0Opnd0 = model0.addFloatOperand(); 904 uint32_t m0Opnd1 = model0.addFloatOperand(); 905 uint32_t m0Opnd2 = model0.addOperation2To1(kDevOp, m0Opnd0, m0Opnd1); 906 uint32_t m0Opnd3 = model0.addOperation2To1(kDevOp, m0Opnd0, m0Opnd2); 907 model0.identifyInputsAndOutputs({ m0Opnd0, m0Opnd1 }, { m0Opnd2, m0Opnd3 }); 908 model0.finish(); 909 ASSERT_TRUE(model0.isValid()); 910 ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(step0, &model0, devices[0]))); 911 ASSERT_EQ(step0->getModelInputs(), 912 (RemapVectorType{ { opnd0, m0Opnd0 }, { opnd1, m0Opnd1 } })); 913 ASSERT_EQ(step0->getModelOutputs(), 914 (RemapVectorType{})); 915 ASSERT_EQ(step0->getTempsAsSubModelInputs(), 916 (RemapVectorType{})); 917 ASSERT_EQ(step0->getTempsAsSubModelOutputs(), 918 (SubModelOutputSetType{ { opnd2, m0Opnd2 }, { opnd3, m0Opnd3 } })); 919 ASSERT_EQ(step0->getOutputsAsSubModelInputs(), 920 (RemapVectorType{})); 921 } 922 { 923 const auto& step1 = steps[1]; 924 925 // Build a model to compare against the submodel from steps[1]. 
926 PartitioningModel model1; 927 uint32_t m1Opnd0 = model1.addFloatOperand(); 928 uint32_t m1Opnd3 = model1.addFloatOperand(); 929 uint32_t m1Opnd4 = model1.addOperation2To1(kCpuOp, m1Opnd0, m1Opnd3); 930 uint32_t m1Opnd2 = model1.addFloatOperand(); 931 uint32_t m1Opnd5 = model1.addOperation2To1(kCpuOp, m1Opnd2, m1Opnd4); 932 model1.identifyInputsAndOutputs({ m1Opnd0, m1Opnd3, m1Opnd2 }, { m1Opnd4, m1Opnd5 }); 933 model1.finish(); 934 ASSERT_TRUE(model1.isValid()); 935 ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(step1, &model1, nullptr))); 936 ASSERT_EQ(step1->getModelInputs(), 937 (RemapVectorType{ { opnd0, m1Opnd0 } })); 938 ASSERT_EQ(step1->getModelOutputs(), 939 (RemapVectorType{ { opnd4, m1Opnd4 } })); 940 ASSERT_EQ(step1->getTempsAsSubModelInputs(), 941 (RemapVectorType{ { opnd3, m1Opnd3 }, { opnd2, m1Opnd2 } })); 942 ASSERT_EQ(step1->getTempsAsSubModelOutputs(), 943 (SubModelOutputSetType{ { opnd5, m1Opnd5 } })); 944 ASSERT_EQ(step1->getOutputsAsSubModelInputs(), 945 (RemapVectorType{})); 946 } 947 { 948 const auto& step2 = steps[2]; 949 950 // Build a model to compare against the submodel from steps[2]. 
951 PartitioningModel model2; 952 uint32_t m2Opnd3 = model2.addFloatOperand(); 953 uint32_t m2Opnd5 = model2.addFloatOperand(); 954 uint32_t m2Opnd7 = model2.addOperation2To1(kDevOp, m2Opnd3, m2Opnd5); 955 uint32_t m2Opnd6 = model2.addFloatOperand(); 956 uint32_t m2Opnd8 = model2.addOperation2To1(kDevOp, m2Opnd6, m2Opnd7); 957 model2.identifyInputsAndOutputs({ m2Opnd6, m2Opnd3, m2Opnd5 }, { m2Opnd8 }); 958 model2.finish(); 959 ASSERT_TRUE(model2.isValid()); 960 ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(step2, &model2, devices[0]))); 961 ASSERT_EQ(step2->getModelInputs(), 962 (RemapVectorType{ { opnd6, m2Opnd6 } })); 963 ASSERT_EQ(step2->getModelOutputs(), 964 (RemapVectorType{ { opnd8, m2Opnd8 } })); 965 ASSERT_EQ(step2->getTempsAsSubModelInputs(), 966 (RemapVectorType{ { opnd3, m2Opnd3 }, { opnd5, m2Opnd5 } })); 967 ASSERT_EQ(step2->getTempsAsSubModelOutputs(), 968 (SubModelOutputSetType{})); 969 ASSERT_EQ(step2->getOutputsAsSubModelInputs(), 970 (RemapVectorType{})); 971 } 972} 973 974TEST_F(PartitioningTest, SetPartitioning) { 975 PartitioningModel model; 976 uint32_t opnd0 = model.addFloatOperand(); 977 uint32_t opnd1 = model.addFloatOperand(); 978 uint32_t opnd2 = model.addOperation2To1(0, opnd0, opnd1, PartitioningModel::Dimensioned::NO); 979 uint32_t opnd3 = model.addFloatOperand(); 980 uint32_t opnd4 = model.addOperation2To1(1, opnd2, opnd3); 981 model.identifyInputsAndOutputs({ opnd0, opnd1, opnd3 }, { opnd4 }); 982 model.finish(); 983 ASSERT_TRUE(model.isValid()); 984 985 // We expect that we cannot successfully partition, because we 986 // have an intermediate operand (opnd2) without dimensions, and 987 // this is not currently handled. 988 989 // One device that can and should execute operation 0. 990 const auto devices = makeDevices({ 991 {"hw", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 }, 992 .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, (1<<0)}, 993 }); 994 995 // Test kPartitioningNo. 
We should not even attempt partitioning, 996 // so there should be no execution plan. 997 PartitioningCompilation cPNo(&model); 998 ASSERT_EQ(cPNo.setPartitioning(DeviceManager::kPartitioningNo), Result::NO_ERROR); 999 ASSERT_EQ(cPNo.finish(devices), Result::NO_ERROR); 1000 ASSERT_EQ(cPNo.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::EMPTY); 1001 1002 // Test kPartitioningWithFallback. We should attempt 1003 // partitioning, reach the end of the partitioning process (so we 1004 // have an execution plan), discover the dimensionless 1005 // intermediate operand, and still return success (because of 1006 // fallback). 1007 PartitioningCompilation cPWithFallback(&model); 1008 ASSERT_EQ(cPWithFallback.setPartitioning(DeviceManager::kPartitioningWithFallback), Result::NO_ERROR); 1009 ASSERT_EQ(cPWithFallback.finish(devices), Result::NO_ERROR); 1010 ASSERT_EQ(cPWithFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::ERROR); 1011 1012 // Test kPartitioningWithoutFallback. We should attempt 1013 // partitioning, and fail. 
1014 PartitioningCompilation cPWithoutFallback(&model); 1015 ASSERT_EQ(cPWithoutFallback.setPartitioning(DeviceManager::kPartitioningWithoutFallback), Result::NO_ERROR); 1016 ASSERT_EQ(cPWithoutFallback.finish(devices), Result::OP_FAILED); 1017 ASSERT_TRUE(cPWithoutFallback.getExecutionPlan().forTest_hasSubModelOutputsOfUnknownSize()); 1018 ASSERT_EQ(cPWithoutFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::ERROR); 1019} 1020 1021// Regression test for http://b/69166603: 1022// "partitioned compilation and execution yields wrong results when model output is submodel input" 1023TEST_F(PartitioningTest, ModelOutputAsSubmodelInput) { 1024 PartitioningModel model; 1025 uint32_t opnd0 = model.addFloatOperand(); 1026 uint32_t opnd1 = model.addFloatOperand(); 1027 uint32_t opnd2 = model.addOperation2To1(0, opnd0, opnd1); 1028 uint32_t opnd3 = model.addOperation2To1(1, opnd2, opnd2); 1029 model.identifyInputsAndOutputs({ opnd0, opnd1 }, { opnd2, opnd3 }); 1030 model.finish(); 1031 ASSERT_TRUE(model.isValid()); 1032 1033 // Compound partition (two devices, each is capable of one of the 1034 // two operations). We could do more extensive checking here -- 1035 // for example, verify that each step within the plan has the 1036 // correct (model and submodel)x(inputs and outputs). 
1037 const auto devices = makeDevices( 1038 { 1039 {"0", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 }, 1040 .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, 1<<0}, 1041 {"1", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 }, 1042 .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, 1<<1} 1043 }); 1044 ExecutionPlan plan; 1045 ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan), 1046 ANEURALNETWORKS_NO_ERROR); 1047 ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND); 1048 const auto& steps = plan.forTest_compoundGetSteps(); 1049 ASSERT_EQ(steps.size(), size_t(2)); 1050 { 1051 // Build a model to compare against the submodel from steps[0]. 1052 PartitioningModel model0; 1053 uint32_t m0Opnd0 = model0.addFloatOperand(); 1054 uint32_t m0Opnd1 = model0.addFloatOperand(); 1055 uint32_t m0Opnd2 = model0.addOperation2To1(0, m0Opnd0, m0Opnd1); 1056 model0.identifyInputsAndOutputs({ m0Opnd0, m0Opnd1 }, { m0Opnd2 }); 1057 model0.finish(); 1058 ASSERT_TRUE(model0.isValid()); 1059 ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(steps[0], &model0, devices[0]))); 1060 ASSERT_EQ(steps[0]->getModelInputs(), 1061 (RemapVectorType{ { opnd0, m0Opnd0 }, { opnd1, m0Opnd1 } })); 1062 ASSERT_EQ(steps[0]->getModelOutputs(), 1063 (RemapVectorType{ { opnd2, m0Opnd2 } })); 1064 ASSERT_EQ(steps[0]->getTempsAsSubModelInputs(), 1065 (RemapVectorType{})); 1066 ASSERT_EQ(steps[0]->getTempsAsSubModelOutputs(), 1067 (SubModelOutputSetType{})); 1068 ASSERT_EQ(steps[0]->getOutputsAsSubModelInputs(), 1069 (RemapVectorType{})); 1070 } 1071 { 1072 // Build a model to compare against the submodel from steps[1]. 
1073 PartitioningModel model1; 1074 uint32_t m1Opnd2 = model1.addFloatOperand(); 1075 uint32_t m1Opnd3 = model1.addOperation2To1(1, m1Opnd2, m1Opnd2); 1076 model1.identifyInputsAndOutputs({ m1Opnd2 }, { m1Opnd3 }); 1077 model1.finish(); 1078 ASSERT_TRUE(model1.isValid()); 1079 ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(steps[1], &model1, devices[1]))); 1080 ASSERT_EQ(steps[1]->getModelInputs(), 1081 (RemapVectorType{})); 1082 ASSERT_EQ(steps[1]->getModelOutputs(), 1083 (RemapVectorType{ { opnd3, m1Opnd3 } })); 1084 ASSERT_EQ(steps[1]->getTempsAsSubModelInputs(), 1085 (RemapVectorType{})); 1086 ASSERT_EQ(steps[1]->getTempsAsSubModelOutputs(), 1087 (SubModelOutputSetType{})); 1088 ASSERT_EQ(steps[1]->getOutputsAsSubModelInputs(), 1089 (RemapVectorType{ { opnd2, m1Opnd2 } })); 1090 } 1091} 1092 1093TEST_F(PartitioningTest, OemOperations) { 1094 // Trivial model consisting solely of OEM operation. 1095 PartitioningModel model; 1096 uint32_t opndIn = model.addFloatOperand(); 1097 uint32_t opndOut = model.addOperationOEM1To1(opndIn); 1098 model.identifyInputsAndOutputs({ opndIn }, { opndOut }); 1099 model.finish(); 1100 ASSERT_TRUE(model.isValid()); 1101 1102 // Verify that the best driver than can run an OEM operation is 1103 // used, even if it is not better than the CPU. 
1104 const auto devicesBestOEM = makeDevices( 1105 { 1106 {"badOEM", { .float32Performance = { .execTime = 1.5, .powerUsage = 1.5 }, 1107 .quantized8Performance = { .execTime = 1.5, .powerUsage = 1.5 } }, 1108 ~0U, PartitioningDriver::OEMYes}, 1109 {"noOEM", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 }, 1110 .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, 1111 ~0U, PartitioningDriver::OEMNo}, 1112 {"goodOEM", { .float32Performance = { .execTime = 1.2, .powerUsage = 1.2 }, 1113 .quantized8Performance = { .execTime = 1.2, .powerUsage = 1.2 } }, 1114 ~0U, PartitioningDriver::OEMYes} 1115 }); 1116 PartitioningCompilation compilationBestOEM(&model); 1117 ASSERT_EQ(compilationBestOEM.finish(devicesBestOEM), Result::NO_ERROR); 1118 const auto& planBestOEM = compilationBestOEM.getExecutionPlan(); 1119 ASSERT_EQ(planBestOEM.forTest_getKind(), ExecutionPlan::Kind::SIMPLE); 1120 ASSERT_NE(planBestOEM.forTest_simpleGetDevice().get(), nullptr); 1121 ASSERT_EQ(planBestOEM.forTest_simpleGetDevice()->getName(), "goodOEM"); 1122 1123 // Verify that we get an error if no driver can run an OEM operation. 1124 const auto devicesNoOEM = makeDevices( 1125 { 1126 {"noOEM", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 }, 1127 .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, 1128 ~0U, PartitioningDriver::OEMNo} 1129 }); 1130 PartitioningCompilation compilationNoOEM(&model); 1131 ASSERT_EQ(compilationNoOEM.finish(devicesNoOEM), Result::BAD_DATA); 1132 1133 // Verify that we get an error if there are no drivers (only CPU fallback). 1134 PartitioningCompilation compilationNoDrivers(&model); 1135 ASSERT_EQ(compilationNoDrivers.finish({} /* no drivers */), Result::BAD_DATA); 1136} 1137 1138TEST_F(PartitioningTest, RelaxedFP) { 1139 const auto devices = makeDevices( 1140 { 1141 // Best choice for non-relaxed model. 
1142 {"f32", { .float32Performance = { .execTime = 0.8, .powerUsage = 0.8 }, 1143 .relaxedFloat32toFloat16Performance = { .execTime = 0.9, .powerUsage = 0.9 }}, 1144 ~0U}, 1145 // Best choice for relaxed model. 1146 {"f16", { .float32Performance = { .execTime = 0.9, .powerUsage = 0.9 }, 1147 .relaxedFloat32toFloat16Performance = { .execTime = 0.8, .powerUsage = 0.8 }}, 1148 ~0U} 1149 }); 1150 1151 auto TrivialTest = [&devices](bool doRelax, const char* expectDevice) { 1152 // Trivial model consisting solely of one operation. 1153 SCOPED_TRACE(expectDevice); 1154 PartitioningModel model; 1155 uint32_t opnd0 = model.addFloatOperand(); 1156 uint32_t opnd1 = model.addFloatOperand(); 1157 uint32_t opnd2 = model.addOperation2To1(0, opnd0, opnd1); 1158 model.identifyInputsAndOutputs({ opnd0, opnd1 }, { opnd2 }); 1159 model.relaxComputationFloat32toFloat16(doRelax); 1160 model.finish(); 1161 ASSERT_TRUE(model.isValid()); 1162 // Verify that the model will be executed on the appropriate device. 1163 ExecutionPlan plan; 1164 ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan), 1165 ANEURALNETWORKS_NO_ERROR); 1166 ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE); 1167 ASSERT_EQ(plan.forTest_simpleGetDevice()->getName(), expectDevice); 1168 }; 1169 1170 ASSERT_NO_FATAL_FAILURE(TrivialTest(false, "f32")); 1171 ASSERT_NO_FATAL_FAILURE(TrivialTest(true, "f16")); 1172} 1173 1174} // namespace 1175