| /* |
| * Copyright (C) 2017 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include "ExecutionPlan.h" |
| #include "HalInterfaces.h" |
| #include "Manager.h" |
| #include "ModelBuilder.h" |
| #include "NeuralNetworks.h" |
| #include "NeuralNetworksWrapper.h" |
| #include "Utils.h" |
| |
| #include <gtest/gtest.h> |
| |
| #include <map> |
| #include <queue> |
| |
| // Uncomment the following line to generate some debugging output that |
| // may be useful when analyzing failures: |
| // |
| // #define VERBOSE VERBOSE |
| |
| // These tests do whitebox testing of the graph partitioning |
| // algorithm. It is "whitebox" in the sense that we're not evaluating |
| // whether a particular partitioning is legal, or "good enough" |
| // according to some metric, but whether it exactly matches the |
| // expected behavior of the current partitioning algorithm. |
| // |
| // A key part of the current partitioning algorithm is to determine |
| // which device among the available devices should be the one to |
| // execute a particular operation from the graph. This determination |
| // is made "locally" -- i.e., it does not depend on the graph |
| // topology, only on the properties of the operation in question. |
| // IDevice::getSupportedOperations() indicates which operations in a |
| // graph can be executed on a device, and IDevice::getCapabilities() |
| // indicates how "good" that device is for executing particular kinds |
| // of operations. For each operation, the partitioning algorithm |
| // picks the "best" device that is capable of executing that |
| // operation; if no device can do so, then the algorithm picks the |
| // cpu. |
| // |
| // As part of this testing approach, we want to make it easy to |
| // specify which operations in a test graph can be executed on which |
| // devices. We accomplish this with an abstraction: There are eight |
| // different kinds of operations (each of which has two inputs and one |
| // output), and when we instantiate a device for testing purposes, we |
| // specify what subset of those eight kinds of operations the device |
| // is able to execute. |
| // |
| // The eight kinds of operations are represented in the graph as ADD |
| // or MUL with a particular activation function -- two opcodes times |
| // four activation functions means eight available operation kinds. |
| // This is a low-level representation detail -- when we specify the |
| // behavior of the device or build a graph, we do so in terms of |
| // operation encodings 0..7. |
| // |
| // In order to determine whether or not a partitioning matches the |
| // expected partitioning, we check the number of partitions, check |
| // which device each partition targets, and compare each partition's |
| // subgraph, model inputs, model outputs, submodel inputs, and |
| // submodel outputs against what is expected. In order to perform |
| // that comparison, we build a model to compare against a partition's |
| // submodel and run a graph comparison algorithm on it. The graph |
| // comparison and the inputs and outputs comparisons are syntactic |
| // rather than semantic comparisons -- they don't allow for |
| // reorderings of inputs and outputs. Because of this, we need to |
| // know exactly how the partitioning algorithm orders inputs and |
| // outputs in order to construct the models and operand lists to |
| // compare against. Here are some relevant behaviors of the |
| // partitioning algorithm: |
| // |
| // - It builds a subgraph by walking operations in forward topological |
| // order, and adding each operation's input operands and output |
| // operands in index order (input followed by output) when that |
| // operation is added. (It does not add an input that has already |
| // been added.) |
| // - It finds model inputs, model outputs, and submodel inputs in |
| // the order the corresponding operands were added to the subgraph |
| // (see ExecutionStep methods getModelInputs(), getModelOutputs(), |
| // getSubModelInputs()). |
| // - It finds submodel outputs in numerical order of corresponding |
| // operand number in the original model (see ExecutionStep method |
| // getSubModelOutputs()). |
| // - When it calls identifyInputsAndOutputs() on the submodel, it |
| // passes inputs from getModelInputs() in order followed by submodel |
| // inputs from getSubModelInputs() in order; and it passes outputs |
| // from getModelOutputs() in order followed by submodel outputs from |
| // getSubModelOutputs() in order. |
| // |
| // TODO: Maybe the logic for comparing a partition to an expected |
| // model should be changed to tolerate reorderings of inputs and |
| // outputs, so that when we build models and lists to compare |
| // against, we don't need to worry about input and output |
| // orderings. But is there a way to do this that still lets us |
| // verify that we have the correct relationships between |
| // an (original) model's inputs and outputs and each submodel's |
| // inputs and outputs, as well as the correct relationship |
| // between submodel inputs and outputs across partitions? |
| |
| namespace { |
| |
| using Device = ::android::nn::Device; |
| using ExecutePreference = ::android::nn::wrapper::ExecutePreference; |
| using ExecutionPlan = ::android::nn::ExecutionPlan; |
| using ExecutionStep = ::android::nn::ExecutionStep; |
| using HidlModel = ::android::hardware::neuralnetworks::V1_0::Model; |
| using ModelBuilder = ::android::nn::ModelBuilder; |
| using WrapperModel = ::android::nn::wrapper::Model; |
| using WrapperOperandType = ::android::nn::wrapper::OperandType; |
| using WrapperType = ::android::nn::wrapper::Type; |
| |
| template <typename T> using sp = ::android::sp<T>; |
| |
// We employ an operation numbering scheme:
// - 0..kNumFuseCodes-1 = ADD with the appropriate activation function
// - kNumFuseCodes..2*kNumFuseCodes-1 = MUL with the appropriate activation function
const uint32_t kNumFuseCodes = 4;
// Sentinel returned by lookupOperation() when an operation is not one
// of the encoded operation kinds (0..7).
const uint32_t kBadOperation = ~0;
| |
| // Look up the operation with the specified index in a graph, and |
| // return the operation encoding -- 0..7; or, if for some reason this |
| // is not one of the encoded operations, then return kBadOperation. |
| uint32_t lookupOperation(std::function<const Operation&(uint32_t)> getOperation, |
| std::function<const Operand&(uint32_t)> getOperand, |
| std::function<const uint8_t*(uint32_t)> getValue, |
| uint32_t operationIndex) { |
| const Operation& operation = getOperation(operationIndex); |
| switch (operation.type) { |
| case OperationType::ADD: |
| case OperationType::MUL: { |
| // input2 is the fused activation function |
| const Operand& input2 = getOperand(operation.inputs[2]); |
| if ((input2.type == OperandType::INT32) && |
| (input2.lifetime == OperandLifeTime::CONSTANT_COPY)) { |
| int32_t value; |
| memcpy(&value, |
| getValue(input2.location.offset), |
| input2.location.length); |
| if (operation.type == OperationType::MUL) { |
| value += kNumFuseCodes; |
| } |
| return value; |
| } |
| break; |
| } |
| default: |
| break; |
| } |
| return kBadOperation; |
| } |
| |
| uint32_t lookupOperation(const HidlModel& model, uint32_t operationIndex) { |
| return lookupOperation( |
| [&model](uint32_t index) -> const Operation& { |
| return model.operations[index]; |
| }, |
| [&model](uint32_t index) -> const Operand& { |
| return model.operands[index]; |
| }, |
| [&model](uint32_t offset) {return &model.operandValues[offset];}, |
| operationIndex); |
| } |
| |
#ifdef VERBOSE
// This is a debugging utility function: converts "model" to its HIDL
// representation and prints the model, its input/output operand
// indexes, and each operation to std::cout, labeled with "name".
void dump(const char* name, const ModelBuilder* model) {
    HidlModel hidlModel;
    model->setHidlModel(&hidlModel);
    std::cout << name << ": " << toString(hidlModel) << std::endl;
    std::cout << "inputs: " << toString(hidlModel.inputIndexes) << std::endl;
    std::cout << "outputs: " << toString(hidlModel.outputIndexes) << std::endl;
    for (size_t i = 0, e = hidlModel.operations.size(); i < e; i++) {
        std::cout << "operation[" << i << "]: " << toString(hidlModel.operations[i]) << std::endl;
    }
}
#endif
| |
| // This is an IDevice for testing purposes. It only has two |
| // interesting properties, both of which are specified as constructor |
| // arguments: device capabilities, and which subset of operation kinds |
| // (0..7) does the device support. The subset is represented with a |
| // bitmask, in which operation kind K corresponds to the bit (1 << K). |
| class PartitioningIDevice : public IDevice { |
| public: |
| PartitioningIDevice(Capabilities capabilities, uint32_t operationMask) : |
| mCapabilities(capabilities), mOperationMask(operationMask) {} |
| ~PartitioningIDevice() override {} |
| |
| Return<ErrorStatus> prepareModel(const HidlModel&, |
| const sp<IPreparedModelCallback>& cb) override { |
| cb->notify(ErrorStatus::NONE, nullptr); |
| return ErrorStatus::NONE; |
| } |
| Return<DeviceStatus> getStatus() override { |
| return DeviceStatus::AVAILABLE; |
| } |
| |
| Return<void> getCapabilities(getCapabilities_cb cb) override { |
| cb(ErrorStatus::NONE, mCapabilities); |
| return Void(); |
| } |
| Return<void> getSupportedOperations(const HidlModel& model, |
| getSupportedOperations_cb cb) override { |
| if (!android::nn::validateModel(model)) { |
| cb(ErrorStatus::INVALID_ARGUMENT, std::vector<bool>()); |
| return Void(); |
| } |
| |
| const size_t count = model.operations.size(); |
| std::vector<bool> supported(count); |
| for (size_t i = 0; i < count; i++) { |
| supported[i] = false; |
| uint32_t operation = lookupOperation(model, i); |
| if ((operation != kBadOperation) && (mOperationMask & (1 << operation))) { |
| supported[i] = true; |
| } |
| } |
| cb(ErrorStatus::NONE, supported); |
| return Void(); |
| } |
| private: |
| Capabilities mCapabilities; |
| uint32_t mOperationMask; |
| }; |
| |
| // This class adds some simple abstractions and utilities on top of |
| // ::android::nn::wrapper::Model. For example, it provides methods |
| // that work in terms of operation kind (0..7); and because we care |
| // about graph topology rather than details of operand types and |
| // values, it greatly simplifies the process of creating operands. |
| class PartitioningModel : public WrapperModel { |
| public: |
| // Create a tensor operand of the specified type, and return the |
| // corresponding operand index. |
| uint32_t addFloatOperand() { |
| static const WrapperOperandType type(WrapperType::TENSOR_FLOAT32, { 1 }); |
| return addOperand(&type); |
| } |
| uint32_t addQuantOperand() { |
| static const WrapperOperandType type(WrapperType::TENSOR_QUANT8_ASYMM, { 1 }); |
| return addOperand(&type); |
| } |
| |
| // Create an operation with two inputs and one output, specifying |
| // the operation kind (0..7) and the input operand indexes. |
| // Returns the output operand index. |
| uint32_t addOperation2To1(uint32_t operation, const uint32_t input0, const uint32_t input1) { |
| ANeuralNetworksOperationType type = |
| (operation < kNumFuseCodes ? ANEURALNETWORKS_ADD : ANEURALNETWORKS_MUL); |
| int32_t fuseCode = (operation < kNumFuseCodes ? operation : operation - kNumFuseCodes); |
| uint32_t input2 = addIntOperand(fuseCode); |
| uint32_t output = addOperandOfSameType(input0); |
| addOperation(type, { input0, input1, input2 }, { output }); |
| return output; |
| } |
| |
| // Run the partitioning algorithm to create an ExecutionPlan. |
| int partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices, |
| ExecutePreference preference, ExecutionPlan* plan) { |
| return reinterpret_cast<ModelBuilder*>(getHandle())->partitionTheWork( |
| devices, static_cast<uint32_t>(preference), plan); |
| } |
| |
| #ifdef VERBOSE |
| // This is a debugging utility function. |
| void dump(const char* name) const { |
| const ModelBuilder* mb = reinterpret_cast<const ModelBuilder*>(getHandle()); |
| ::dump(name, mb); |
| } |
| #endif |
| |
| private: |
| |
| // Create a scalar integer operand of the specified value, and |
| // return the corresponding operand index. |
| uint32_t addIntOperand(int32_t value) { |
| static const WrapperOperandType type(WrapperType::INT32, { }); |
| uint32_t operand = addOperand(&type); |
| setOperandValue(operand, &value, sizeof(value)); |
| return operand; |
| } |
| |
| // Create an operand of the same type as the specified operand, |
| // and return the operand index of the new operand. |
| uint32_t addOperandOfSameType(uint32_t operand) { |
| const Operand& operandStruct = |
| reinterpret_cast<const ModelBuilder*>(getHandle())->getOperand(operand); |
| WrapperOperandType type(static_cast<WrapperType>(operandStruct.type), { 1 }); |
| return addOperand(&type); |
| } |
| }; |
| |
// RETURN_TRUE() and RETURN_FALSE(MESSAGE) return true/false from the
// enclosing bool-valued function.  When VERBOSE is defined, they also
// log the returning source line to std::cerr; RETURN_FALSE
// additionally accepts an optional stream-style MESSAGE (e.g.,
// << "some reason") that is appended to the log line.  Without
// VERBOSE, MESSAGE is discarded.
#ifdef VERBOSE
#define RETURN_TRUE() \
    { \
        std::cerr << "returning true from " << __LINE__ << std::endl; \
        return true; \
    }
#else
#define RETURN_TRUE() \
    { \
        return true; \
    }
#endif
#ifdef VERBOSE
#define RETURN_FALSE(MESSAGE) \
    { \
        std::cerr << "returning false from " << __LINE__ MESSAGE << std::endl; \
        return false; \
    }
#else
#define RETURN_FALSE(MESSAGE) \
    { \
        return false; \
    }
#endif
| |
// Test fixture for checking the results of the partitioning
// algorithm.  Provides utilities to construct test devices
// (makeDevices()), to compare two graphs for syntactic equivalence
// (the compare() overloads below), and to compare an ExecutionStep
// against an expected (model, device) pair.
class PartitioningTest : public ::testing::Test {
protected:
    // workaround for private types in ExecutionStep
    using RemapVectorType = decltype(static_cast<ExecutionStep*>(nullptr)->getModelInputs());
    using SubModelOutputSetType = decltype(static_cast<ExecutionStep*>(nullptr)->getSubModelOutputs());

    virtual void SetUp() {
    }

    // From a vector of triples (tuples), each of the form (name,
    // capabilities, bitmask of supported operation kinds), create a
    // vector of Devices.
    static std::vector<std::shared_ptr<Device>>
    makeDevices(std::vector<std::tuple<std::string, Capabilities, uint32_t>> specifications) {
        std::vector<std::shared_ptr<Device>> devices;
        for (const auto& specification : specifications) {
            devices.push_back(std::make_shared<Device>(
                std::get<0>(specification),
                new PartitioningIDevice(std::get<1>(specification), std::get<2>(specification))));
            devices.back()->initialize();
        }
        return devices;
    }

    /*-- Graph comparison -----------------------------------------------------------------*/

    // An operand with certain values for its lifetime does not have a
    // defining operation in the graph. For the purposes of the graph
    // comparison algorithm, we encode the "defining operation" index of
    // such an operand as follows:
    // - NO_VALUE kPseudoDefiningOperationNoValue
    // - MODEL_INPUT kPseudoDefiningOperationModelInput0 + (position in list of inputs)
    // - CONSTANT_COPY kPseudoDefiningOperationConstantCopy0 + (constant value)
    // Note: For the graphs we build in this test, we
    // only expect to see 4-byte constants within
    // a very restricted range, so we only make
    // room for such constants in our encoding
    // space.
    // We do not expect to see CONSTANT_REFERENCE, and so we do not handle
    // it.
    //
    // The encoding is intended to be relatively human readable; it is not
    // designed to represent some optimal balance of ranges for the items
    // within its scope (actual operations, inputs, constants).

    enum PseudoDefiningOperationEncodings : uint32_t {
        kPseudoDefiningOperationModelInput0 = 0x80000000U,
        kPseudoDefiningOperationConstantCopy0 = 0x90000000U,
        kPseudoDefiningOperationNoValue = 0xeeeeeeeeU,

        // lowest value for special encoding
        kPseudoDefiningOperationBase = 0x80000000U,

        // range of encoded input or constant
        kPseudoDefiningOperationRange = 0x10000000U,
    };

    // Build a map from operand to defining operation.  Each operand
    // index in "model" maps either to the index of the operation that
    // produces it, or to one of the pseudo-definition encodings above
    // (for model inputs, NO_VALUE operands, and CONSTANT_COPY
    // operands).  Uses gtest fatal assertions on malformed input, so
    // callers should check HasFatalFailure() afterwards.
    // TODO: Replace map with vector?
    void buildDefinitionMap(const ModelBuilder* model,
                            std::map<uint32_t, uint32_t>* defMap) {
        // actual definitions
        ASSERT_LT(model->operationCount(), kPseudoDefiningOperationBase);
        for (uint32_t i = 0, e = model->operationCount(); i < e; i++) {
            const Operation& operation = model->getOperation(i);
            for (uint32_t output : operation.outputs) {
                (*defMap)[output] = i;
            }
        }
        // inputs
        ASSERT_LT(model->inputCount(), kPseudoDefiningOperationRange);
        for (uint32_t i = 0, e = model->inputCount(); i < e; i++) {
            (*defMap)[model->getInputOperandIndex(i)] = kPseudoDefiningOperationModelInput0 + i;
        }
        // look for NO_VALUE and CONSTANT_COPY
        for (uint32_t i = 0, e = model->operandCount(); i < e; i++) {
            const Operand& operand = model->getOperand(i);
            switch (operand.lifetime) {
                case OperandLifeTime::NO_VALUE:
                    (*defMap)[i] = kPseudoDefiningOperationNoValue;
                    break;
                case OperandLifeTime::CONSTANT_COPY: {
                    // Only 4-byte constants are supported by the
                    // encoding (see comment above the enum).
                    ASSERT_EQ(operand.location.length, sizeof(uint32_t));
                    uint32_t value;
                    memcpy(&value, model->getPointerToOperandValue(operand.location.offset), sizeof(uint32_t));
                    ASSERT_LT(value, kPseudoDefiningOperationNoValue);
                    (*defMap)[i] = kPseudoDefiningOperationConstantCopy0 + value;
                    break;
                }
                case OperandLifeTime::TEMPORARY_VARIABLE:
                case OperandLifeTime::MODEL_INPUT:
                case OperandLifeTime::MODEL_OUTPUT:
                    // already handled
                    break;
                default:
                    FAIL();
                    break;
            }
        }
        // sanity check
        ASSERT_EQ(model->operandCount(), defMap->size());
    }

#ifdef VERBOSE
    // Debugging utility: prints an operand-to-definition (or other)
    // map, rendering pseudo-definition encodings in hex.
    void dump(const char* name, const std::map<uint32_t, uint32_t>* aMap) {
        auto writeNum = [](uint32_t num) {
            if (num >= kPseudoDefiningOperationBase) {
                std::cout << "0x" << std::hex << num << std::dec;
            } else {
                std::cout << num;
            }
        };

        std::cout << name << ": { ";
        bool gotOne = false;
        for (const auto& entry : *aMap) {
            if (gotOne) {
                std::cout << ", ";
            } else {
                gotOne = true;
            }
            std::cout << "(";
            writeNum(entry.first);
            std::cout << ", ";
            writeNum(entry.second);
            std::cout << ")";
        }
        std::cout << " }" << std::endl;
    }
#endif

    // Compare two operands for equivalence: type, dimensions, number
    // of consumers, and quantization parameters must all match.
    // Lifetime and location are not compared.
    bool compare(const Operand& operandA, const Operand& operandB) {
        if (operandA.type != operandB.type ||
            operandA.dimensions != operandB.dimensions ||
            operandA.numberOfConsumers != operandB.numberOfConsumers ||
            operandA.scale != operandB.scale ||
            operandA.zeroPoint != operandB.zeroPoint) {
            return false;
        }
        return true;
    }

    // Compare two graphs. We ignore operand and operation indexes (i.e.,
    // two nodes can be the same even if they are numbered differently)
    // but we also ignore semantics (e.g., even if an operation kind is
    // such that the operand is commutative, we still pay attention to the
    // order of its input operands).
    //
    // The comparison algorithm works by walking modelA from outputs
    // towards inputs, along the edge from each operand to its
    // defining operation, and then along the edges to the operation's
    // input operands. At each step along the way, we try to match up
    // operands and operations from modelA with equivalent operands
    // and operations from modelB.
    //
    // We start by assuming that modelA's outputs and modelB's outputs
    // match positionally (e.g., modelA's first output operand is
    // equivalent to modelB's first output operand). Once we've
    // discovered two equivalent operands (such as those outputs), we
    // place them in a work queue. We repeatedly pull operands off
    // the queue and compare their defining operations and those
    // operations' input operands, to discover more pairs of
    // equivalent operands. If we ever find operations that do not
    // match (e.g., because operation kind differs), or operands that
    // do not match (e.g., because operand type differs); or if we
    // ever find a conflict (we've already decided that operand A's
    // equivalent operand is B0, but it looks like we need its
    // equivalent operand to be B1); then the graphs compare unequal.
    // Otherwise, we'll eventually exhaust the work queue, and
    // conclude that the graphs compare equal.
    bool compare(const ModelBuilder* modelA, const ModelBuilder* modelB) {
#ifdef VERBOSE
        ::dump("compare(A)", modelA);
        ::dump("compare(B)", modelB);
#endif

        // Cheap rejection test: sizes of the graphs must agree.
        if (modelA->operandCount() != modelB->operandCount() ||
            modelA->operationCount() != modelB->operationCount() ||
            modelA->inputCount() != modelB->inputCount() ||
            modelA->outputCount() != modelB->outputCount()) {
            RETURN_FALSE();
        }

        // Maps from operand index to index of defining operation.
        std::map<uint32_t, uint32_t> defsA, defsB;
        buildDefinitionMap(modelA, &defsA);
        buildDefinitionMap(modelB, &defsB);
        if (HasFatalFailure()) return false;

        // Maps from operand index in modelA to equivalent operand index
        // in modelB; and from operation index in modelA to equivalent
        // operation index in modelB.
        std::map<uint32_t, uint32_t> equivalentOperandsAToB;
        std::map<uint32_t, uint32_t> equivalentOperationsAToB;

        // Queue of operand indexes from modelA, each of whose defining
        // operations are to be checked for equivalence with modelB.
        std::queue<uint32_t> workQueueOperandsA;

        // Seed operand equivalence map and work queue from model outputs.
        for (uint32_t i = 0, e = modelA->outputCount(); i < e; i++) {
            uint32_t outputA = modelA->getOutputOperandIndex(i);
            uint32_t outputB = modelB->getOutputOperandIndex(i);
            if (!compare(modelA->getOperand(outputA), modelB->getOperand(outputB))) {
                RETURN_FALSE();
            }
            equivalentOperandsAToB[outputA] = outputB;
            workQueueOperandsA.push(outputA);
        }

#ifdef VERBOSE
        dump("defsA", &defsA);
        dump("defsB", &defsB);
#endif

        // Process the queue.
        uint32_t pseudoDefinitionCount = 0;
        while (!workQueueOperandsA.empty()) {
#ifdef VERBOSE
            dump("equivalentOperandsAToB", &equivalentOperandsAToB);
            dump("equivalentOperationsAToB", &equivalentOperationsAToB);
#endif
            uint32_t operandIndexA = workQueueOperandsA.front();
#ifdef VERBOSE
            std::cout << "operandIndexA: " << operandIndexA << std::endl;
#endif
            workQueueOperandsA.pop();
            // Every queued operand already has an established equivalent.
            uint32_t operandIndexB = equivalentOperandsAToB.at(operandIndexA);

            uint32_t operationIndexA = defsA.at(operandIndexA);
            uint32_t operationIndexB = defsB.at(operandIndexB);
            auto it = equivalentOperationsAToB.find(operationIndexA);
            if (it != equivalentOperationsAToB.end()) {
                // Defining operation already matched; it must match
                // consistently with what we previously decided.
                if (it->second != operationIndexB) {
                    RETURN_FALSE();
                }
                continue;
            }

            // We haven't identified an equivalent operation for
            // operationIndexA.

            if ((operationIndexA >= kPseudoDefiningOperationBase) !=
                (operationIndexB >= kPseudoDefiningOperationBase)) {
                RETURN_FALSE();
            }
            // Either both operands have pseudo-definitions, or neither
            // does.
            if (operationIndexA >= kPseudoDefiningOperationBase) {
                // Both operands have pseudo-definitions.  Those
                // encodings are position/value dependent, so they
                // must match exactly.
                if (operationIndexA != operationIndexB) {
                    RETURN_FALSE();
                }
                equivalentOperationsAToB[operationIndexA] = operationIndexB;
                ++pseudoDefinitionCount;
                continue;
            }

            // If we get here, neither operation A nor operation B is a
            // pseudo-definition.

            const Operation& operationA = modelA->getOperation(operationIndexA);
            const Operation& operationB = modelB->getOperation(operationIndexB);
            if (operationA.type != operationB.type ||
                operationA.inputs.size() != operationB.inputs.size() ||
                operationA.outputs.size() != operationB.outputs.size()) {
                RETURN_FALSE();
            }
            equivalentOperationsAToB[operationIndexA] = operationIndexB;
            // Match up the operations' inputs positionally, extending
            // the operand equivalence map and the work queue.
            for (uint32_t i = 0, e = operationA.inputs.size(); i < e; i++) {
                uint32_t inputA = operationA.inputs[i];
                uint32_t inputB = operationB.inputs[i];
                auto it = equivalentOperandsAToB.find(inputA);
                if (it != equivalentOperandsAToB.end()) {
                    if (it->second != inputB) {
                        RETURN_FALSE();
                    }
                    continue;
                }
                // We haven't identified an equivalent operand for inputA.
                if (!compare(modelA->getOperand(inputA), modelB->getOperand(inputB))) {
                    RETURN_FALSE();
                }
                equivalentOperandsAToB[inputA] = inputB;
                workQueueOperandsA.push(inputA);
            }
        }

        // Sanity check
        if (modelA->operandCount() != defsA.size() ||
            modelA->operandCount() != defsB.size() ||
            modelA->operandCount() != equivalentOperandsAToB.size() ||
            modelA->operationCount() + pseudoDefinitionCount != equivalentOperationsAToB.size()) {
            RETURN_FALSE();
        }

        RETURN_TRUE();
    }

    /*-------------------------------------------------------------------------------------*/

    // Compare an ExecutionStep against expectations: the step must
    // target "device" (which may be nullptr, for a CPU step), and the
    // step's submodel must compare equal (per the graph comparison
    // above) to "model".
    bool compare(std::shared_ptr<const ExecutionStep> step,
                 const WrapperModel* model, std::shared_ptr<Device> device) {
        return (step->getDevice() == device) &&
            compare(step->getSubModel().get(),
                    reinterpret_cast<const ModelBuilder*>(model->getHandle()));
    }
};
| |
// Build a two-operation chain:
//     opnd2 = operation-kind-0(opnd0, opnd1)
//     opnd4 = operation-kind-1(opnd2, opnd3)
// and partition it twice: first with two devices that can each
// execute everything (expecting a SIMPLE plan on the better device),
// then with two devices that can each execute only one of the two
// operation kinds (expecting a two-step COMPOUND plan).
TEST_F(PartitioningTest, SimpleModel) {
    PartitioningModel model;
    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();
    uint32_t opnd2 = model.addOperation2To1(0, opnd0, opnd1);
    uint32_t opnd3 = model.addFloatOperand();
    uint32_t opnd4 = model.addOperation2To1(1, opnd2, opnd3);
    model.identifyInputsAndOutputs({ opnd0, opnd1, opnd3 }, { opnd4 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    // Simple partition (two devices are each capable of everything, one is the best).
    const auto devicesA = makeDevices(
        {
            {"bad", { .float32Performance = { .execTime = 1.5, .powerUsage = 1.5 },
                      .quantized8Performance = { .execTime = 1.5, .powerUsage = 1.5 } }, ~0},
            {"good", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                       .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, ~0}
        });
    ExecutionPlan planA;
    ASSERT_EQ(model.partitionTheWork(devicesA, ExecutePreference::PREFER_LOW_POWER, &planA),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
    ASSERT_EQ(planA.forTest_simpleGetDevice()->getName(), "good");

    // Compound partition (two devices, each is capable of one of the
    // two operations). We could do more extensive checking here --
    // for example, verify that each step within the plan has the
    // correct (model and submodel)x(inputs and outputs).
    const auto devicesB = makeDevices(
        {
            {"0", { .float32Performance = { .execTime = 1.5, .powerUsage = 1.5 },
                    .quantized8Performance = { .execTime = 1.5, .powerUsage = 1.5 } }, 1<<0},
            {"1", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                    .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, 1<<1}
        });
    ExecutionPlan planB;
    ASSERT_EQ(model.partitionTheWork(devicesB, ExecutePreference::PREFER_LOW_POWER, &planB),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(planB.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& stepsB = planB.forTest_compoundGetSteps();
    ASSERT_EQ(stepsB.size(), size_t(2));
    {
        // Build a model to compare against the submodel from stepsB[0].
        // stepsB[0] should run the kind-0 operation on device "0".
        PartitioningModel modelB0;
        uint32_t b0Opnd0 = modelB0.addFloatOperand();
        uint32_t b0Opnd1 = modelB0.addFloatOperand();
        uint32_t b0Opnd2 = modelB0.addOperation2To1(0, b0Opnd0, b0Opnd1);
        modelB0.identifyInputsAndOutputs({ b0Opnd0, b0Opnd1 }, { b0Opnd2 });
        modelB0.finish();
        ASSERT_TRUE(modelB0.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(stepsB[0], &modelB0, devicesB[0])));
        ASSERT_EQ(stepsB[0]->getModelInputs(),
                  (RemapVectorType{ { opnd0, b0Opnd0 }, { opnd1, b0Opnd1 } }));
        ASSERT_EQ(stepsB[0]->getModelOutputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(stepsB[0]->getSubModelInputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(stepsB[0]->getSubModelOutputs(),
                  (SubModelOutputSetType{ { opnd2, b0Opnd2 } }));
    }
    {
        // Build a model to compare against the submodel from stepsB[1].
        // stepsB[1] should run the kind-1 operation on device "1",
        // consuming the first step's result (opnd2) as a submodel input.
        PartitioningModel modelB1;
        uint32_t b1Opnd2 = modelB1.addFloatOperand();
        uint32_t b1Opnd3 = modelB1.addFloatOperand();
        uint32_t b1Opnd4 = modelB1.addOperation2To1(1, b1Opnd2, b1Opnd3);
        // Note: In the partitioning algorithm, submodel inputs follow
        // model inputs. In the original model "model", opnd2 is not
        // an input; so in the submodel "modelB1", the corresponding
        // input b1Opnd2 is a submodel input, and must follow the
        // model input b1Opnd3.
        modelB1.identifyInputsAndOutputs({ b1Opnd3, b1Opnd2 }, { b1Opnd4 });
        modelB1.finish();
        ASSERT_TRUE(modelB1.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(stepsB[1], &modelB1, devicesB[1])));
        ASSERT_EQ(stepsB[1]->getModelInputs(),
                  (RemapVectorType{ { opnd3, b1Opnd3 } }));
        ASSERT_EQ(stepsB[1]->getModelOutputs(),
                  (RemapVectorType{ { opnd4, b1Opnd4 } }));
        ASSERT_EQ(stepsB[1]->getSubModelInputs(),
                  (RemapVectorType{ { opnd2, b1Opnd2 } }));
        ASSERT_EQ(stepsB[1]->getSubModelOutputs(),
                  (SubModelOutputSetType{}));
    }
}
| |
TEST_F(PartitioningTest, Cpu) {
    // Here's a model where some operations execute only on the Cpu.
    // To make things interesting, we produce three partitions --
    // device, cpu, same-device.
    //
    // The operations are:
    //     opnd2 = DevOp(opnd0, opnd1)
    //     opnd3 = DevOp(opnd0, opnd2)
    //     opnd4 = CpuOp(opnd0, opnd3)
    //     opnd5 = CpuOp(opnd2, opnd4)
    //     opnd7 = DevOp(opnd3, opnd5)
    //     opnd8 = DevOp(opnd6, opnd7)
    // with model inputs { opnd0, opnd1, opnd6 } and model outputs
    // { opnd4, opnd8 }.

    static const uint32_t kCpuOp = 1;
    static const uint32_t kDevOp = 2;

    // The single device supports only operation kind kDevOp; kCpuOp
    // operations must therefore fall back to the cpu.
    const auto devices = makeDevices(
        {
            {"1", { .float32Performance = { .execTime = 0.5, .powerUsage = 0.5 },
                    .quantized8Performance = { .execTime = 0.5, .powerUsage = 0.5 } }, 1<<kDevOp}
        });

    PartitioningModel model;

    uint32_t opnd0 = model.addFloatOperand();
    uint32_t opnd1 = model.addFloatOperand();

    uint32_t opnd2 = model.addOperation2To1(kDevOp, opnd0, opnd1);
    uint32_t opnd3 = model.addOperation2To1(kDevOp, opnd0, opnd2);

    uint32_t opnd4 = model.addOperation2To1(kCpuOp, opnd0, opnd3);
    uint32_t opnd5 = model.addOperation2To1(kCpuOp, opnd2, opnd4);

    uint32_t opnd6 = model.addFloatOperand();

    uint32_t opnd7 = model.addOperation2To1(kDevOp, opnd3, opnd5);
    uint32_t opnd8 = model.addOperation2To1(kDevOp, opnd6, opnd7);

    model.identifyInputsAndOutputs({ opnd0, opnd1, opnd6 }, { opnd4, opnd8 });
    model.finish();
    ASSERT_TRUE(model.isValid());

    ExecutionPlan plan;
    ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
              ANEURALNETWORKS_NO_ERROR);
    ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
    const auto& steps = plan.forTest_compoundGetSteps();
    ASSERT_EQ(steps.size(), size_t(3));
    {
        const auto& step0 = steps[0];

        // Build a model to compare against the submodel from steps[0].
        // steps[0] should contain the first two kDevOp operations,
        // targeting the device.
        PartitioningModel model0;
        uint32_t m0Opnd0 = model0.addFloatOperand();
        uint32_t m0Opnd1 = model0.addFloatOperand();
        uint32_t m0Opnd2 = model0.addOperation2To1(kDevOp, m0Opnd0, m0Opnd1);
        uint32_t m0Opnd3 = model0.addOperation2To1(kDevOp, m0Opnd0, m0Opnd2);
        model0.identifyInputsAndOutputs({ m0Opnd0, m0Opnd1 }, { m0Opnd2, m0Opnd3 });
        model0.finish();
        ASSERT_TRUE(model0.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(step0, &model0, devices[0])));
        ASSERT_EQ(step0->getModelInputs(),
                  (RemapVectorType{ { opnd0, m0Opnd0 }, { opnd1, m0Opnd1 } }));
        ASSERT_EQ(step0->getModelOutputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(step0->getSubModelInputs(),
                  (RemapVectorType{}));
        ASSERT_EQ(step0->getSubModelOutputs(),
                  (SubModelOutputSetType{ { opnd2, m0Opnd2 }, { opnd3, m0Opnd3 } }));
    }
    {
        const auto& step1 = steps[1];

        // Build a model to compare against the submodel from steps[1].
        // steps[1] should contain the two kCpuOp operations; its
        // device is expected to be nullptr (cpu fallback).
        PartitioningModel model1;
        uint32_t m1Opnd0 = model1.addFloatOperand();
        uint32_t m1Opnd3 = model1.addFloatOperand();
        uint32_t m1Opnd4 = model1.addOperation2To1(kCpuOp, m1Opnd0, m1Opnd3);
        uint32_t m1Opnd2 = model1.addFloatOperand();
        uint32_t m1Opnd5 = model1.addOperation2To1(kCpuOp, m1Opnd2, m1Opnd4);
        model1.identifyInputsAndOutputs({ m1Opnd0, m1Opnd3, m1Opnd2 }, { m1Opnd4, m1Opnd5 });
        model1.finish();
        ASSERT_TRUE(model1.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(step1, &model1, nullptr)));
        ASSERT_EQ(step1->getModelInputs(),
                  (RemapVectorType{ { opnd0, m1Opnd0 } }));
        ASSERT_EQ(step1->getModelOutputs(),
                  (RemapVectorType{ { opnd4, m1Opnd4 } }));
        ASSERT_EQ(step1->getSubModelInputs(),
                  (RemapVectorType{ { opnd3, m1Opnd3 }, { opnd2, m1Opnd2 } }));
        ASSERT_EQ(step1->getSubModelOutputs(),
                  (SubModelOutputSetType{ { opnd5, m1Opnd5 } }));
    }
    {
        const auto& step2 = steps[2];

        // Build a model to compare against the submodel from steps[2].
        // steps[2] should contain the last two kDevOp operations, back
        // on the same device as steps[0].
        PartitioningModel model2;
        uint32_t m2Opnd3 = model2.addFloatOperand();
        uint32_t m2Opnd5 = model2.addFloatOperand();
        uint32_t m2Opnd7 = model2.addOperation2To1(kDevOp, m2Opnd3, m2Opnd5);
        uint32_t m2Opnd6 = model2.addFloatOperand();
        uint32_t m2Opnd8 = model2.addOperation2To1(kDevOp, m2Opnd6, m2Opnd7);
        model2.identifyInputsAndOutputs({ m2Opnd6, m2Opnd3, m2Opnd5 }, { m2Opnd8 });
        model2.finish();
        ASSERT_TRUE(model2.isValid());
        ASSERT_NO_FATAL_FAILURE(ASSERT_TRUE(compare(step2, &model2, devices[0])));
        ASSERT_EQ(step2->getModelInputs(),
                  (RemapVectorType{ { opnd6, m2Opnd6 } }));
        ASSERT_EQ(step2->getModelOutputs(),
                  (RemapVectorType{ { opnd8, m2Opnd8 } }));
        ASSERT_EQ(step2->getSubModelInputs(),
                  (RemapVectorType{ { opnd3, m2Opnd3 }, { opnd5, m2Opnd5 } }));
        ASSERT_EQ(step2->getSubModelOutputs(),
                  (SubModelOutputSetType{}));
    }
}
| |
| } // namespace |