| /* |
| * Copyright (C) 2017 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #define LOG_TAG "ExecutionPlan" |
| |
| #include "ExecutionPlan.h" |
| |
| #include <cutils/native_handle.h> |
| #include <fcntl.h> |
| #include <openssl/sha.h> |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| |
| #include <algorithm> |
| #include <functional> |
| #include <map> |
| #include <memory> |
| #include <mutex> |
| #include <queue> |
| #include <set> |
| #include <string> |
| #include <type_traits> |
| #include <unordered_set> |
| #include <utility> |
| #include <vector> |
| |
| #include "BurstBuilder.h" |
| #include "Callbacks.h" |
| #include "CompilationBuilder.h" |
| #include "ControlFlow.h" |
| #include "CpuExecutor.h" |
| #include "ExecutionBuilder.h" |
| #include "ExecutionBurstController.h" |
| #include "GraphDump.h" |
| #include "Manager.h" |
| #include "MetaModel.h" |
| #include "ModelBuilder.h" |
| #include "OperationsUtils.h" |
| #include "TokenHasher.h" |
| #include "Tracing.h" |
| #include "TypeManager.h" |
| #include "Utils.h" |
| |
| namespace android { |
| namespace nn { |
| |
| namespace { |
| |
| using namespace hal; |
| |
| // The index of the main model in SourceModels. |
| constexpr uint32_t kMainModelInSourceModels = 0; |
| |
// Compiles the model on device.
// If compilation caching is available, then depending on ExecutionPlan::mState, the token may
// have been initialized only from the user-provided token (SIMPLE body), or may already have
// been re-hashed with the indices of the operations to be executed (COMPOUND body). This
// function re-hashes the token further with the device name, the device version string, the
// execution preference, and the compilation priority.
| int compile(const Device& device, const ModelBuilder& model, int executionPreference, |
| int compilationPriority, const std::optional<Deadline>& deadline, |
| const std::string& cacheDir, TokenHasher* token, |
| std::shared_ptr<PreparedModel>* preparedModel) { |
| CHECK(token != nullptr); |
| CHECK(preparedModel != nullptr); |
| *preparedModel = nullptr; |
| |
| std::optional<CacheToken> cacheToken; |
| if (device.isCachingSupported() && token->ok() && |
| token->updateFromString(device.getName().c_str()) && |
| token->updateFromString(device.getVersionString().c_str()) && |
| token->update(&executionPreference, sizeof(executionPreference)) && |
| token->update(&compilationPriority, sizeof(compilationPriority)) && token->finish()) { |
| cacheToken.emplace(token->getCacheToken()); |
| } |
| |
| const ModelFactory makeModel = [&model] { return model.makeHidlModel(); }; |
| const ExecutionPreference preference = static_cast<ExecutionPreference>(executionPreference); |
| const Priority priority = convertToHalPriority(compilationPriority); |
| const auto [n, returnedPreparedModel] = |
| device.prepareModel(makeModel, preference, priority, deadline, cacheDir, cacheToken); |
| *preparedModel = returnedPreparedModel; |
| return n; |
| } |
| |
using OperationReadyCallback = std::function<void(uint32_t)>;
| |
| int copyOperandExtraParams(ModelBuilder& model, uint32_t toOperandIndex, |
| const Operand& fromOperand) { |
| if (fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL && |
| fromOperand.extraParams.getDiscriminator() == |
| OperandExtraParams::hidl_discriminator::channelQuant) { |
| auto& fromChannelQuant = fromOperand.extraParams.channelQuant(); |
| ANeuralNetworksSymmPerChannelQuantParams toChannelQuant = { |
| .channelDim = fromChannelQuant.channelDim, |
| .scaleCount = static_cast<uint32_t>(fromChannelQuant.scales.size()), |
| .scales = fromChannelQuant.scales.data(), |
| }; |
| return model.setOperandSymmPerChannelQuantParams(toOperandIndex, toChannelQuant); |
| } else if (isExtensionOperandType(fromOperand.type) && |
| fromOperand.extraParams.getDiscriminator() == |
| OperandExtraParams::hidl_discriminator::extension) { |
        const hidl_vec<uint8_t>& extensionData = fromOperand.extraParams.extension();
| return model.setOperandExtensionData(toOperandIndex, extensionData.data(), |
| extensionData.size()); |
| } else if (fromOperand.extraParams.getDiscriminator() != |
| OperandExtraParams::hidl_discriminator::none || |
| fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) { |
| LOG(ERROR) << "Type " << toString(fromOperand.type) |
| << " has an unexpected extraParams discriminator: " |
| << static_cast<int>(fromOperand.extraParams.getDiscriminator()); |
| return ANEURALNETWORKS_BAD_DATA; |
| } else { |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| } |
| |
| // This class tracks whether we know the value of an operand as operations |
| // are processed. |
| class OperandTracker { |
| public: |
    // Creates the tracker for this model. Figures out which operations can be
    // executed right away and calls cb for each one of them.
| OperandTracker(const ModelBuilder* model, OperationReadyCallback cb); |
    // Marks the specified operation as having been processed. Now that the
    // outputs of the operation are known, other operations may become ready
    // to run. Calls cb for each one of them.
| void markProcessed(uint32_t operationIndex, OperationReadyCallback cb); |
| |
| private: |
| const ModelBuilder* mModel; |
| std::multimap<uint32_t, uint32_t> mOperandToOperations; |
| std::vector<uint32_t> mUnknownInputCount; // For each operation |
| }; |
| |
| OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb) |
| : mModel(model) { |
| const auto& operations = mModel->getOperations(); |
| mUnknownInputCount.resize(operations.size()); |
| for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) { |
| const Operation& operation = operations[operationIndex]; |
| uint32_t count = 0; |
| for (uint32_t operandIndex : operation.inputs) { |
| auto lifetime = mModel->getOperand(operandIndex).lifetime; |
| if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE || |
| lifetime == OperandLifeTime::SUBGRAPH_OUTPUT) { |
| count++; |
| mOperandToOperations.emplace(operandIndex, operationIndex); |
| } |
| } |
| if (count == 0) { |
| cb(operationIndex); |
| } |
| mUnknownInputCount[operationIndex] = count; |
| } |
| } |
| |
| void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) { |
| // Mark all its outputs as known. |
| const Operation& operation = mModel->getOperations()[operationIndex]; |
| for (uint32_t operandIndex : operation.outputs) { |
| auto range = mOperandToOperations.equal_range(operandIndex); |
| for (auto i = range.first; i != range.second; i++) { |
| uint32_t& count = mUnknownInputCount[i->second]; |
| if (--count == 0) { |
| cb(i->second); |
| } |
| } |
| } |
| } |
| |
| } // namespace |
| |
| ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex, uint32_t sourceModelIndex, |
| std::shared_ptr<Device> device) |
| : mPlan(plan), |
| mIndex(stepIndex), |
| mSourceModelIndex(sourceModelIndex), |
| mStepModel(), |
| mDevice(device), |
| mToken(plan->getCacheToken()) {} |
| |
| // Adds an operand if it has not been added already. |
| // Sets the index in the step model for the corresponding operand. |
| int ExecutionStep::addOperand(uint32_t sourceOperandIndex, uint32_t* stepOperandIndex, |
| OperandKind kind) { |
| // Have we added this operand already? |
| auto i = mOperandMap.find(sourceOperandIndex); |
| if (i != mOperandMap.end()) { |
| CHECK(kind == INPUT); |
| *stepOperandIndex = i->second; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| // First time we add this operand. |
| *stepOperandIndex = mStepModel.operandCount(); |
| mOperandMap.emplace(sourceOperandIndex, *stepOperandIndex); |
| |
| // Add the operand to the step model. |
| const ModelBuilder& sourceModel = *getSourceModel(); |
| const Operand& operand = sourceModel.getOperand(sourceOperandIndex); |
| ANeuralNetworksOperandType type = { |
| .type = static_cast<int32_t>(operand.type), |
| .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()), |
| .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr, |
| .scale = operand.scale, |
| .zeroPoint = operand.zeroPoint, |
| }; |
| |
| int n = mStepModel.addOperand(type); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| LOG(ERROR) << "Previous error occurred when partitioning the graph"; |
| return n; |
| } |
| |
| n = copyOperandExtraParams(mStepModel, *stepOperandIndex, operand); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| LOG(ERROR) << "Error when copying extra parameters to the operand"; |
| return n; |
| } |
| |
| // Sets its value. |
| switch (operand.lifetime) { |
| case OperandLifeTime::CONSTANT_COPY: { |
| const uint8_t* data = sourceModel.getPointerToOperandValue(operand.location.offset); |
| n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| LOG(ERROR) << "Previous error occurred when partitioning the graph"; |
| return n; |
| } |
| } break; |
| case OperandLifeTime::CONSTANT_REFERENCE: { |
| const Memory* memory = sourceModel.getMemories()[operand.location.poolIndex]; |
| n = mStepModel.setOperandValueFromMemory( |
| *stepOperandIndex, memory, operand.location.offset, operand.location.length); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| LOG(ERROR) << "Previous error occurred when partitioning the graph"; |
| return n; |
| } |
| } break; |
| case OperandLifeTime::NO_VALUE: { |
| n = mStepModel.setOperandValue(*stepOperandIndex, nullptr, 0); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| LOG(ERROR) << "Previous error occurred when partitioning the graph"; |
| return n; |
| } |
| } break; |
| case OperandLifeTime::TEMPORARY_VARIABLE: { // handled similarly to SUBGRAPH_OUTPUT |
| if (kind == INPUT) { |
| // The first time we've seen this operand is as an |
| // input. That means it must be defined by a |
| // different partition, and is an input to this one. |
| mTempsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex); |
| } else { |
| // The first time we've seen this operand is as an |
| // output. It may be an input to a different |
| // partition, so keep track of it. |
| mPlan->recordTemporaryDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex), |
| mIndex); |
| } |
| } break; |
| case OperandLifeTime::SUBGRAPH_INPUT: { |
| mModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex); |
| } break; |
| case OperandLifeTime::SUBGRAPH_OUTPUT: { // handled similarly to TEMPORARY_VARIABLE |
| if (kind == INPUT) { |
| // The first time we've seen this operand is as an |
| // input. That means it must be defined by a |
| // different partition, and is an input to this one. |
| mOutputsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex); |
| } else { |
| // The first time we've seen this operand is as an |
| // output. |
| mModelOutputs.emplace_back(sourceOperandIndex, *stepOperandIndex); |
| } |
| } break; |
| case OperandLifeTime::SUBGRAPH: { |
| const ModelBuilder* model = sourceModel.getReferencedModel(operand); |
| n = mStepModel.setOperandValueFromModel(*stepOperandIndex, model); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| LOG(ERROR) << "Previous error occurred when partitioning the graph"; |
| return n; |
| } |
| } break; |
| default: { |
| CHECK(!"unexpected"); |
| } break; |
| } |
| |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| int ExecutionStep::addOperation(int operationIndex) { |
| const Operation& operation = getSourceModel()->getOperation(operationIndex); |
| if (mToken.ok()) { |
| mToken.update(&operationIndex, sizeof(operationIndex)); |
| } |
| |
| // Convert the input and output operand indexes. |
| // |
| // We expect operations to be added in topological order. Therefore: |
| // |
| // - We may not have seen an input if it is a model input, a |
| // constant, or an operand written by a different partition. |
| // |
| // - We should not have seen any outputs. |
| auto addOperands = [this](const hidl_vec<uint32_t>& sourceModelOperands, |
| std::vector<uint32_t>* stepModelOperands, OperandKind kind) -> int { |
| const uint32_t operandCount = static_cast<uint32_t>(sourceModelOperands.size()); |
| for (uint32_t i = 0; i < operandCount; i++) { |
| NN_RETURN_IF_ERROR(addOperand(sourceModelOperands[i], &stepModelOperands->at(i), kind)); |
| } |
| return ANEURALNETWORKS_NO_ERROR; |
| }; |
| |
| const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size()); |
| const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size()); |
| std::vector<uint32_t> inputs(inputCount); |
| std::vector<uint32_t> outputs(outputCount); |
| NN_RETURN_IF_ERROR(addOperands(operation.inputs, &inputs, INPUT)); |
| NN_RETURN_IF_ERROR(addOperands(operation.outputs, &outputs, OUTPUT)); |
| return mStepModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(), |
| outputCount, outputs.data()); |
| } |
| |
| void ExecutionStep::mapInputsAndOutputs( |
| std::shared_ptr<StepExecutor> executor, const Memory* temporaryMemory, |
| const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOffsetOfTemporary, |
| const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex, |
| const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex, |
| const std::map<SourceOperandIndex, ConstantReferenceLocation>& |
| sourceOperandToConstantReference) const { |
| auto mapInput = [&](uint32_t stepModelOperandIndex, uint32_t stepInputIndex) { |
| SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex); |
| if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex); |
| it != sourceOperandToOffsetOfTemporary.end()) { |
| executor->setInputFromMemory(stepInputIndex, temporaryMemory, it->second); |
| } else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex); |
| it != sourceOperandToInputIndex.end()) { |
| executor->mapInput(it->second, stepInputIndex); |
| } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex); |
| it != sourceOperandToOutputIndex.end()) { |
| executor->mapOutputToInput(it->second, stepInputIndex); |
| } else if (auto it = sourceOperandToConstantReference.find(sourceOperandIndex); |
| it != sourceOperandToConstantReference.end()) { |
| // Constant partition boundary operand. This could be an IF branch |
| // model input or a WHILE variable initializer. |
| executor->setInputFromMemory(stepInputIndex, it->second.memory, it->second.offset); |
| } else { |
| CHECK(false) << "Cannot map step input " << stepInputIndex << " from operand " |
| << toString(sourceOperandIndex); |
| } |
| }; |
| auto mapOutput = [&](uint32_t stepModelOperandIndex, uint32_t stepOutputIndex) { |
| SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex); |
| if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex); |
| it != sourceOperandToOffsetOfTemporary.end()) { |
| executor->setOutputFromMemory(stepOutputIndex, temporaryMemory, it->second); |
| } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex); |
| it != sourceOperandToOutputIndex.end()) { |
| executor->mapOutput(it->second, stepOutputIndex); |
| } else { |
| CHECK(false) << "Cannot map step output " << stepOutputIndex << " from operand " |
| << toString(sourceOperandIndex); |
| } |
| }; |
| for (uint32_t i = 0, n = mStepModelInputs.size(); i < n; ++i) { |
| mapInput(mStepModelInputs[i].first, i); |
| } |
| for (uint32_t i = 0, n = mStepModelOutputs.size(); i < n; ++i) { |
| mapOutput(mStepModelOutputs[i].first, i); |
| } |
| } |
| |
| void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() { |
| auto recordAsOutputIfTemporary = [this](const SourceOperandIndex& sourceOperandIndex) { |
| const auto it = mTemporaryToDefiningExecutionStep.find(sourceOperandIndex); |
| if (it == mTemporaryToDefiningExecutionStep.end()) { |
| // The operand is not a temporary or is not defined by an |
| // ExecutionStep (i.e. it's an output of an IF or a WHILE). |
| // The latter case is handled by ExecutionPlan::makeController(). |
| return; |
| } |
| uint32_t stepIndex = it->second; |
| CHECK_LT(stepIndex, mSteps.size()); |
| mSteps[stepIndex]->executionStep()->recordTempAsStepModelOutput(sourceOperandIndex.second); |
| }; |
| for (const auto& logicalStep : mSteps) { |
| if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { |
| for (const auto& input : step->getTempsAsStepModelInputs()) { |
| SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), input.first); |
| recordAsOutputIfTemporary(sourceOperandIndex); |
| } |
| } else if (const IfStep* step = logicalStep->tryIfStep()) { |
| recordAsOutputIfTemporary(step->conditionOperandIndex); |
| for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) { |
| recordAsOutputIfTemporary(sourceOperandIndex); |
| } |
| } else if (const WhileStep* step = logicalStep->tryWhileStep()) { |
| for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) { |
| recordAsOutputIfTemporary(sourceOperandIndex); |
| } |
| } else { |
| CHECK(logicalStep->isGoto()); |
| } |
| } |
| } |
| |
void ExecutionStep::recordTempAsStepModelOutput(uint32_t sourceOperandIndex) {
    const auto it = mOperandMap.find(sourceOperandIndex);
    CHECK(it != mOperandMap.end());
    mTempsAsStepModelOutputs.emplace(sourceOperandIndex, it->second);
}
| |
| const ModelBuilder* ExecutionStep::getSourceModel() const { |
| return mPlan->getSourceModels().getModel(mSourceModelIndex); |
| } |
| |
| void ExecutionStep::logStepModel() const { |
| VLOG(COMPILATION) << "ExecutionStep::finishStepModel, step " << mIndex; |
| |
| auto logRemapEntry = [](std::string& toLog, const std::pair<uint32_t, uint32_t>& e) { |
| if (!toLog.empty()) { |
| toLog += ", "; |
| } |
| toLog += toString(e.first); |
| toLog += "->"; |
| toLog += toString(e.second); |
| }; |
| |
| auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) { |
| std::string toLog; |
| for (const auto& e : map) { |
| logRemapEntry(toLog, e); |
| } |
| VLOG(COMPILATION) << name << ": " << toLog; |
| }; |
| auto logRemapSet = [&logRemapEntry](const char* name, const StepModelOutputSetType& set) { |
| std::string toLog; |
| for (const auto& e : set) { |
| logRemapEntry(toLog, e); |
| } |
| VLOG(COMPILATION) << name << ": " << toLog; |
| }; |
| |
| logRemapVector("step model inputs", mStepModelInputs); |
| logRemapVector("step model outputs", mStepModelOutputs); |
| logRemapVector("model inputs", mModelInputs); |
| logRemapVector("model outputs", mModelOutputs); |
| logRemapVector("temps as step model inputs", mTempsAsStepModelInputs); |
| logRemapSet("temps as step model outputs", mTempsAsStepModelOutputs); |
| logRemapVector("outputs as step model inputs", mOutputsAsStepModelInputs); |
| } |
| |
| static bool hasUnknownSize(const Operand& operand) { |
| if (operand.dimensions.size() == 0) { |
| return TypeManager::get()->isTensorType(operand.type); |
| } |
| for (uint32_t dimension : operand.dimensions) { |
| if (dimension == 0) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| int ExecutionStep::finishStepModel(const ModelBuilder* mainModel, bool* hasOutputOfUnknownSize, |
| int32_t executionPreference, int32_t priority) { |
| CHECK(mDevice != nullptr); |
| |
| for (const auto& stepModelOutput : mTempsAsStepModelOutputs) { |
| const Operand& operand = mStepModel.getOperand(stepModelOutput.second); |
| if (hasUnknownSize(operand)) { |
| *hasOutputOfUnknownSize = true; |
| VLOG(COMPILATION) << "StepModelOutput (operand#" << toString(stepModelOutput.first) |
| << " of source graph) has unknown size: " << toString(operand); |
| } |
| } |
| |
| mStepModel.relaxComputationFloat32toFloat16(mainModel->isComputationFloat32RelaxedToFloat16()); |
| |
| mStepModelInputs.insert(mStepModelInputs.end(), mModelInputs.begin(), mModelInputs.end()); |
| mStepModelInputs.insert(mStepModelInputs.end(), mTempsAsStepModelInputs.begin(), |
| mTempsAsStepModelInputs.end()); |
| mStepModelInputs.insert(mStepModelInputs.end(), mOutputsAsStepModelInputs.begin(), |
| mOutputsAsStepModelInputs.end()); |
| |
| mStepModelOutputs.insert(mStepModelOutputs.end(), mModelOutputs.begin(), mModelOutputs.end()); |
| mStepModelOutputs.insert(mStepModelOutputs.end(), mTempsAsStepModelOutputs.begin(), |
| mTempsAsStepModelOutputs.end()); |
| |
| if (mSourceModelIndex == kMainModelInSourceModels) { |
| std::map<uint32_t, uint32_t> mainModelOperandToInputIndex; |
| for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) { |
| mainModelOperandToInputIndex[mainModel->getInputOperandIndex(i)] = i; |
| } |
| std::map<uint32_t, uint32_t> mainModelOperandToOutputIndex; |
| for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) { |
| mainModelOperandToOutputIndex[mainModel->getOutputOperandIndex(i)] = i; |
| } |
| |
| // mInputIndexStepModelToMainModel is ordered by step model input index and relies on |
| // mModelInputs being the first inputs, as specified by mStepModelInputs. |
| mInputIndexStepModelToMainModel.resize(mModelInputs.size()); |
| std::transform(mModelInputs.begin(), mModelInputs.end(), |
| mInputIndexStepModelToMainModel.begin(), |
| [&mainModelOperandToInputIndex](auto& e) { |
| uint32_t sourceOperandIndex = e.first; |
| return mainModelOperandToInputIndex[sourceOperandIndex]; |
| }); |
| |
| // mOutputIndexStepModelToMainModel is ordered by step model output index and relies on |
| // mModelOutputs being the first outputs, as specified by mStepModelOutputs. |
| mOutputIndexStepModelToMainModel.resize(mModelOutputs.size()); |
| std::transform(mModelOutputs.begin(), mModelOutputs.end(), |
| mOutputIndexStepModelToMainModel.begin(), |
| [&mainModelOperandToOutputIndex](auto& e) { |
| uint32_t sourceOperandIndex = e.first; |
| return mainModelOperandToOutputIndex[sourceOperandIndex]; |
| }); |
| |
        // mOutputsAsStepModelInputsIndexToMainModel is indexed by position within
        // mOutputsAsStepModelInputs, which are the last inputs, as specified by
        // mStepModelInputs.
| mOutputsAsStepModelInputsIndexToMainModel.resize(mOutputsAsStepModelInputs.size()); |
| std::transform(mOutputsAsStepModelInputs.begin(), mOutputsAsStepModelInputs.end(), |
| mOutputsAsStepModelInputsIndexToMainModel.begin(), |
| [&mainModelOperandToOutputIndex](auto& e) { |
| uint32_t sourceOperandIndex = e.first; |
| return mainModelOperandToOutputIndex[sourceOperandIndex]; |
| }); |
| } |
| |
| if (VLOG_IS_ON(COMPILATION)) { |
| logStepModel(); |
| } |
| |
| std::vector<uint32_t> inputs(mStepModelInputs.size()); |
| std::vector<uint32_t> outputs(mStepModelOutputs.size()); |
| std::transform(mStepModelInputs.begin(), mStepModelInputs.end(), inputs.begin(), |
| [](auto& e) { return e.second; }); |
| std::transform(mStepModelOutputs.begin(), mStepModelOutputs.end(), outputs.begin(), |
| [](auto& e) { return e.second; }); |
| NN_RETURN_IF_ERROR(mStepModel.identifyInputsAndOutputs(inputs.size(), inputs.data(), |
| outputs.size(), outputs.data())); |
| NN_RETURN_IF_ERROR(mStepModel.finish()); |
| |
| // TODO: Move compilation elsewhere? |
| VLOG(COMPILATION) << "ExecutionStep::finishStepModel, compilation on " << mDevice->getName(); |
| return compile(*mDevice, mStepModel, executionPreference, priority, {}, *mPlan->getCacheDir(), |
| &mToken, &mPreparedStepModel); |
| } |
| |
| void ExecutionStep::dump() const { |
| if (VLOG_IS_ON(COMPILATION)) { |
| VLOG(COMPILATION) << "Step#" << mIndex << ": execute on " << mDevice->getName(); |
| logModelToInfo(mStepModel.makeHidlModel()); |
| } |
| } |
| |
| std::string toString(const IfStep& step) { |
| std::ostringstream oss; |
| oss << "Step#" << step.index << ": if " << toString(step.conditionOperandIndex) |
| << " then=" << step.thenStepIndex << " else=" << step.elseStepIndex; |
| return oss.str(); |
| } |
| |
| std::string toString(const WhileStep& step) { |
| std::ostringstream oss; |
| oss << "Step#" << step.index << ": while cond=" << step.condStepIndex |
| << " body=" << step.bodyStepIndex << " exit=" << step.exitStepIndex; |
| return oss.str(); |
| } |
| |
| std::string toString(const GotoStep& step) { |
| std::ostringstream oss; |
| oss << "Step#" << step.index << ": goto " << step.gotoStepIndex; |
| return oss.str(); |
| } |
| |
| void LogicalStep::dump() const { |
| if (VLOG_IS_ON(COMPILATION)) { |
| if (const IfStep* step = tryIfStep()) { |
| VLOG(COMPILATION) << toString(*step); |
| } else if (const WhileStep* step = tryWhileStep()) { |
| VLOG(COMPILATION) << toString(*step); |
| } else if (const GotoStep* step = tryGotoStep()) { |
| VLOG(COMPILATION) << toString(*step); |
| } else { |
| executionStep()->dump(); |
| } |
| } |
| } |
| |
| int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels, |
| int32_t executionPreference, int32_t priority, |
| const std::optional<Deadline>& deadline) { |
| CHECK(!deadline.has_value()); |
| const ModelBuilder* mainModel = sourceModels->getModel(kMainModelInSourceModels); |
| |
| auto containsUnknownSize = [sourceModels](const std::vector<SourceOperandIndex>& operands) { |
| for (const auto& sourceOperandIndex : operands) { |
| const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first); |
| const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second); |
| if (hasUnknownSize(operand)) { |
| return true; |
| } |
| } |
| return false; |
| }; |
| |
| findTempsAsStepModelOutputs(); |
| for (const auto& logicalStep : mSteps) { |
| if (ExecutionStep* step = logicalStep->tryExecutionStep()) { |
| int n = step->finishStepModel(mainModel, &mHasStepModelOutputOfUnknownSize, |
| executionPreference, priority); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| VLOG(COMPILATION) |
| << "ExecutionPlan::CompoundBody::finish -- finishStepModel failed"; |
| return n; |
| } |
| } else if (IfStep* step = logicalStep->tryIfStep()) { |
| if (containsUnknownSize(step->outerOutputOperands)) { |
| mHasStepModelOutputOfUnknownSize = true; |
| } |
| } else if (WhileStep* step = logicalStep->tryWhileStep()) { |
| if (containsUnknownSize(step->outerOutputOperands)) { |
| mHasStepModelOutputOfUnknownSize = true; |
| } |
| } else { |
| CHECK(logicalStep->isGoto()); |
| } |
| } |
| if (mHasStepModelOutputOfUnknownSize) { |
| VLOG(COMPILATION) |
| << "ExecutionPlan::CompoundBody::finish -- mHasStepModelOutputOfUnknownSize"; |
| return ANEURALNETWORKS_OP_FAILED; |
| } |
| |
| for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) { |
| SourceOperandIndex index(kMainModelInSourceModels, mainModel->getInputOperandIndex(i)); |
| mSourceOperandToInputIndex[index] = i; |
| } |
| for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) { |
| SourceOperandIndex index(kMainModelInSourceModels, mainModel->getOutputOperandIndex(i)); |
| mSourceOperandToOutputIndex[index] = i; |
| } |
| |
| findControlFlowBoundaryConstants(sourceModels); |
| |
| mSuccessfulFinish = true; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| void ExecutionPlan::CompoundBody::findControlFlowBoundaryConstants( |
| const SourceModels* sourceModels) { |
| auto handleBoundaryConstants = [this, |
| sourceModels](const SourceOperandIndex& sourceOperandIndex) { |
| const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first); |
| const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second); |
| const DataLocation& location = operand.location; |
| if (operand.lifetime == OperandLifeTime::CONSTANT_COPY) { |
| mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = { |
| .buffer = sourceModel->getPointerToOperandValue(location.offset), |
| .length = location.length, |
| }; |
| } else if (operand.lifetime == OperandLifeTime::CONSTANT_REFERENCE) { |
| mSourceOperandToBoundaryConstantReference[sourceOperandIndex] = { |
| .memory = sourceModel->getMemories()[location.poolIndex], |
| .offset = location.offset, |
| .length = location.length, |
| }; |
| } |
| }; |
| for (const auto& logicalStep : mSteps) { |
| if (const IfStep* step = logicalStep->tryIfStep()) { |
| handleBoundaryConstants(step->conditionOperandIndex); |
| for (const auto& sourceOperandIndex : step->outerInputOperands) { |
| handleBoundaryConstants(sourceOperandIndex); |
| } |
| } else if (const WhileStep* step = logicalStep->tryWhileStep()) { |
| for (const auto& sourceOperandIndex : step->outerInputOperands) { |
| handleBoundaryConstants(sourceOperandIndex); |
| } |
| } |
| } |
| } |
| |
| int ExecutionPlan::SimpleBody::finish(const SourceModels*, int32_t executionPreference, |
| int32_t priority, const std::optional<Deadline>& deadline) { |
| CHECK(mDevice != nullptr); |
| VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation"; |
| const int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheDir, |
| &mToken, &mPreparedModel); |
| mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR); |
| return n; |
| } |
| |
| int ExecutionPlan::finish(int32_t executionPreference, int32_t priority, |
| const std::optional<Deadline>& deadline) { |
| CHECK(mBody != nullptr); |
| return mBody->finish(&getSourceModels(), executionPreference, priority, deadline); |
| } |
| |
| ExecutionPlan::Controller::Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder, |
| const BurstBuilder* burstBuilder) |
| : Controller(plan, executionBuilder, burstBuilder, 0, {}, {}, {}, {}, {}, {}) {} |
| |
| ExecutionPlan::Controller::Controller( |
| const ExecutionPlan* plan, ExecutionBuilder* executionBuilder, |
| const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries, |
| std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary, |
| std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2, |
| std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex, |
| std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex, |
| const std::map<SourceOperandIndex, ConstantCopyLocation>& sourceOperandToConstantCopy, |
| std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference) |
| : mPlan(plan), |
| mExecutionBuilder(executionBuilder), |
| mBurstBuilder(burstBuilder), |
| mSourceOperandToOffsetOfTemporary(std::move(sourceOperandToOffsetOfTemporary)), |
| mSourceOperandToOffsetOfTemporary2(std::move(sourceOperandToOffsetOfTemporary2)), |
| mSourceOperandToInputIndex(std::move(sourceOperandToInputIndex)), |
| mSourceOperandToOutputIndex(std::move(sourceOperandToOutputIndex)), |
| mSourceOperandToConstantReference(std::move(sourceOperandToConstantReference)), |
| mNextStepIndex(0), |
| mLastStepIndex(kBadStepIndex) { |
| if (totalSizeOfTemporaries == 0) { |
| return; |
| } |
| int n; |
| std::tie(n, mTemporaries) = MemoryAshmem::create(totalSizeOfTemporaries); |
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
        mNextStepIndex = kBadStepIndex;
        // Return early: mTemporaries is null, so the constant copies below cannot be made.
        return;
    }
| for (const auto& [sourceOperandIndex, location] : sourceOperandToConstantCopy) { |
| memcpy(mTemporaries->getPointer() + mSourceOperandToOffsetOfTemporary[sourceOperandIndex], |
| location.buffer, location.length); |
| } |
| } |
| |
// Attempt to create a burst object for each PreparedModel/Partition. If a burst
// controller object cannot be made, return a nullptr in its place to indicate that
// the regular execution path should be used. This can occur either because the
// PreparedModel was nullptr (the CPU was the best choice), or because the
// IPreparedModel was of insufficient version or failed to configure the burst.
| std::vector<std::shared_ptr<ExecutionBurstController>> ExecutionPlan::makeBursts( |
| int preference) const { |
| switch (mState) { |
| // burst object for each partition in the compound case |
| case COMPOUND: { |
| std::vector<std::shared_ptr<ExecutionBurstController>> bursts; |
| bursts.reserve(compound()->mSteps.size()); |
| for (const auto& logicalStep : compound()->mSteps) { |
| if (!logicalStep->isExecution()) { |
| bursts.push_back(nullptr); |
| continue; |
| } |
| if (const auto preparedModel = |
| logicalStep->executionStep()->getPreparedStepModel()) { |
| const bool preferPowerOverLatency = |
| (preference == ANEURALNETWORKS_PREFER_LOW_POWER); |
| bursts.push_back( |
| preparedModel->configureExecutionBurst(preferPowerOverLatency)); |
| } else { |
| bursts.push_back(nullptr); |
| } |
| } |
| return bursts; |
| } |
| // single burst object for the simple case |
| case SIMPLE: { |
| std::vector<std::shared_ptr<ExecutionBurstController>> burst; |
| auto simpleBody = simple(); |
| if (const auto preparedModel = simpleBody->mPreparedModel) { |
| const bool preferPowerOverLatency = |
| (preference == ANEURALNETWORKS_PREFER_LOW_POWER); |
| burst.push_back(preparedModel->configureExecutionBurst(preferPowerOverLatency)); |
| } else { |
| burst.push_back(nullptr); |
| } |
| return burst; |
| } |
| // no burst objects made |
| default: |
| return {}; |
| } |
| } |
| |
| std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController( |
| ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const { |
| CHECK(isValid()); |
| if (mState == SIMPLE) { |
| return std::shared_ptr<Controller>(new Controller(this, executionBuilder, burstBuilder)); |
| } |
| // Create the layout for a Memory object big enough to hold |
| // - every partition boundary TEMPORARY operand and |
| // - buffers required by the control flow implementation. |
| // |
| // TODO: Rethink this approach for managing temporaries. Some |
| // alternatives: |
| // |
| // 1) Adopt a memory layout scheme analogous to stack allocation, |
| // where objects of non-overlapping lifetime can occupy the same |
| // storage. We would still have a single Memory object in this |
| // case. |
| // |
| // 2) Do something like what CpuExecutor does, and do allocations |
| // and deallocations on the fly (during execution) before first |
| // reference and after last reference, respectively. This would |
| // mean having one Memory object per TEMPORARY; or, in a more |
| // complicated implementation, one Memory object per set of |
| // temporaries that have the same lifetime. Note that the Android |
| // system limits the number of shared memory objects, which are |
| // what our Memory objects represent. |
| // |
| uint32_t totalSizeOfTemporaries = 0; |
| auto addTemporaryOfSize = [&totalSizeOfTemporaries](uint32_t size) { |
| totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size); |
| const uint32_t offset = totalSizeOfTemporaries; |
| totalSizeOfTemporaries += size; |
| return offset; |
| }; |
| // This function has two modes of operation: |
| // 1. When lifetime is TEMPORARY_VARIABLE, we allocate memory for |
| // TEMPORARY_VARIABLE source operands, skip SUBGRAPH_OUTPUT source |
| // operands, and panic if we see a source operand of another lifetime. |
| // 2. When lifetime is SUBGRAPH_OUTPUT, we allocate memory for |
| // SUBGRAPH_OUTPUT source operands and panic if we see a source operand |
| // of another lifetime. |
| auto mapTemporary = |
| [executionBuilder, addTemporaryOfSize]( |
| const SourceOperandIndex& sourceOperandIndex, |
| std::map<SourceOperandIndex, uint32_t>* sourceOperandToOffsetOfTemporary, |
| OperandLifeTime lifetime = OperandLifeTime::TEMPORARY_VARIABLE) { |
| CHECK(lifetime == OperandLifeTime::TEMPORARY_VARIABLE || |
| lifetime == OperandLifeTime::SUBGRAPH_OUTPUT); |
| const Operand& sourceOperand = |
| executionBuilder->getSourceOperand(sourceOperandIndex); |
| if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE && |
| sourceOperand.lifetime == OperandLifeTime::SUBGRAPH_OUTPUT) { |
| // See the caller for explanation. |
| return; |
| } |
| CHECK(sourceOperand.lifetime == lifetime); |
| const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand); |
| CHECK_NE(size, 0u); |
| const uint32_t offset = addTemporaryOfSize(size); |
| auto [_, isNew] = |
| sourceOperandToOffsetOfTemporary->emplace(sourceOperandIndex, offset); |
| CHECK(isNew); |
| VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex) |
| << " offset = " << offset; |
| }; |
| std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary; |
| std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2; |
| for (const auto& logicalStep : compound()->mSteps) { |
| if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { |
| // Allocate memory for ExecutionStep temporary outputs that are |
| // inputs to other steps, as determined by |
| // ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs(). |
| // |
| // We don't allocate memory for step model output operands with |
| // source operand lifetime SUBGRAPH_OUTPUT because they will be |
| // - managed by the client (main model outputs), |
| // - assigned a location of another operand (when this step model |
| // output is a branch model output of an IF; see |
| // ExecutionPlan::nextCompound(const IfStep*, ...)), or |
| // - allocated by a WHILE (when this step model output |
| // is a condition or body model output of a WHILE; see the |
| // step->bodyOutputOperands and step->condOutputOperand handling |
| // below). |
| for (const auto& output : step->getTempsAsStepModelOutputs()) { |
| mapTemporary(SourceOperandIndex(step->getSourceModelIndex(), output.first), |
| &sourceOperandToOffsetOfTemporary); |
| } |
| } else if (const IfStep* step = logicalStep->tryIfStep()) { |
            // Allocate memory for all temporary outputs of an IfStep because
            // they are going to be written to by a branch model. We don't
            // optimize away unused output operands of referenced models.
| // |
| // We don't allocate memory for branch output operands because they |
| // use the same location as the corresponding outer output operands, |
| // as established in ExecutionPlan::nextCompound(const IfStep*, ...) |
| // |
| // We don't allocate memory for outer output operands with source |
| // operand lifetime SUBGRAPH_OUTPUT because they will be |
| // - managed by the client (main model outputs), |
| // - assigned a location of another operand (when this IF outer |
| // output is a branch model output of another IF; see |
| // ExecutionPlan::nextCompound(const IfStep*, ...)), or |
| // - allocated by a WHILE (when this IF outer output |
| // is a condition or body model output of a WHILE; see the |
| // step->bodyOutputOperands and step->condOutputOperand handling |
| // below). |
| for (const auto& sourceOperandIndex : step->outerOutputOperands) { |
| mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary); |
| } |
| } else if (const WhileStep* step = logicalStep->tryWhileStep()) { |
            // Allocate memory for all temporary outputs of a WhileStep because
| // they are going to be written to by the WHILE loop. |
| // |
| // We don't allocate memory for outer output operands with source |
| // operand lifetime SUBGRAPH_OUTPUT because they will be |
| // - managed by the client (main model outputs), |
| // - assigned a location of another operand (when this WHILE outer |
| // output is a branch model output of an IF; see |
| // ExecutionPlan::nextCompound(const IfStep*, ...)), or |
| // - allocated by another WHILE (when this WHILE outer output |
| // is a condition or body model output of another WHILE; see the |
| // step->bodyOutputOperands and step->condOutputOperand handling |
| // below). |
| for (const auto& sourceOperandIndex : step->outerOutputOperands) { |
| mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary); |
| } |
| // Allocate memory for body model outputs. Note that we could use |
| // the outer output operand memory instead but we currently don't do |
| // so (b/148206073). |
| for (const auto& sourceOperandIndex : step->bodyOutputOperands) { |
| mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary, |
| OperandLifeTime::SUBGRAPH_OUTPUT); |
| // Allocate another set of temporaries for double buffering. |
| mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary2, |
| OperandLifeTime::SUBGRAPH_OUTPUT); |
| } |
| // Allocate memory for condition model output. |
| // TODO: Share one condition output memory region between all loops. |
| mapTemporary(step->condOutputOperand, &sourceOperandToOffsetOfTemporary, |
| OperandLifeTime::SUBGRAPH_OUTPUT); |
| } else { |
| CHECK(logicalStep->isGoto()); |
| } |
| } |
| // Allocate temporary memory for boundary CONSTANT_COPY operands. |
| for (const auto& [sourceOperandIndex, location] : |
| compound()->mSourceOperandToBoundaryConstantCopy) { |
| const uint32_t offset = addTemporaryOfSize(location.length); |
| sourceOperandToOffsetOfTemporary.emplace(sourceOperandIndex, offset); |
| VLOG(EXECUTION) << "temp (boundary constant): operand " << toString(sourceOperandIndex) |
| << " offset = " << offset; |
| } |
| return std::shared_ptr<Controller>(new Controller( |
| this, executionBuilder, burstBuilder, totalSizeOfTemporaries, |
| std::move(sourceOperandToOffsetOfTemporary), |
| std::move(sourceOperandToOffsetOfTemporary2), compound()->mSourceOperandToInputIndex, |
| compound()->mSourceOperandToOutputIndex, |
| compound()->mSourceOperandToBoundaryConstantCopy, |
| compound()->mSourceOperandToBoundaryConstantReference)); |
| } |
| |
| // TODO: Find a better way to provide this functionality. |
| int ExecutionPlan::fallback(std::shared_ptr<Controller> controller, |
| std::shared_ptr<StepExecutor>* executor) const { |
| *executor = nullptr; |
| |
| VLOG(EXECUTION) << "ExecutionPlan::fallback(" << SHOW_IF_DEBUG(controller << ", " << executor) |
| << "): mNextStepIndex = " << controller->mNextStepIndex; |
| |
| if (controller->mLastStepIndex == Controller::kBadStepIndex) { |
| // We haven't called next(). |
| return ANEURALNETWORKS_OP_FAILED; |
| } |
| |
| if (controller->mNextStepIndex == Controller::kBadStepIndex) { |
| // The last call to next() did not produce an executor. |
| return ANEURALNETWORKS_OP_FAILED; |
| } |
| |
| controller->mNextStepIndex = controller->mLastStepIndex; |
| return next(controller, executor); |
| } |
| |
| ExecutionPlan::Buffer::Buffer(void* pointer, uint32_t size) |
    : mInfo(RunTimePoolInfo::createFromExistingBuffer(static_cast<uint8_t*>(pointer), size)),
| mOffset(0) {} |
| |
| ExecutionPlan::Buffer::Buffer(RunTimePoolInfo info, uint32_t offset) |
| : mInfo(std::move(info)), mOffset(offset) {} |
| |
| void* ExecutionPlan::Buffer::getPointer() const { |
| return mInfo.getBuffer() + mOffset; |
| } |
| |
| uint32_t ExecutionPlan::Buffer::getSize() const { |
| return mInfo.getSize() - mOffset; |
| } |
| |
| void ExecutionPlan::Buffer::flush() const { |
| mInfo.flush(); |
| } |
| |
| std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBufferFromModelArgumentInfo( |
| const ModelArgumentInfo& info, const ExecutionBuilder* executionBuilder) const { |
| switch (info.state()) { |
| case ModelArgumentInfo::POINTER: { |
| return Buffer(info.buffer(), info.length()); |
| } break; |
| case ModelArgumentInfo::MEMORY: { |
| if (std::optional<RunTimePoolInfo> poolInfo = |
| executionBuilder->getRunTimePoolInfo(info.locationAndLength().poolIndex)) { |
| return Buffer(*poolInfo, info.locationAndLength().offset); |
| } else { |
| LOG(ERROR) << "Unable to map operand memory pool"; |
| return std::nullopt; |
| } |
| } break; |
| case ModelArgumentInfo::HAS_NO_VALUE: { |
| LOG(ERROR) << "Attempting to read an operand that has no value"; |
| return std::nullopt; |
| } break; |
| default: { |
| LOG(ERROR) << "Unexpected operand memory state: " << static_cast<int>(info.state()); |
| return std::nullopt; |
| } break; |
| } |
| } |
| |
| std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBuffer( |
| std::shared_ptr<Controller> controller, SourceOperandIndex operandIndex) const { |
| const auto& sourceOperandToOffsetOfTemporary = controller->mSourceOperandToOffsetOfTemporary; |
| const auto& sourceOperandToInputIndex = controller->mSourceOperandToInputIndex; |
| const auto& sourceOperandToOutputIndex = controller->mSourceOperandToOutputIndex; |
| if (auto it = sourceOperandToOffsetOfTemporary.find(operandIndex); |
| it != sourceOperandToOffsetOfTemporary.end()) { |
| const uint32_t offset = it->second; |
| const std::unique_ptr<MemoryAshmem>& memory = controller->mTemporaries; |
| return Buffer(memory->getPointer() + offset, memory->getSize() - offset); |
| } else if (auto it = sourceOperandToInputIndex.find(operandIndex); |
| it != sourceOperandToInputIndex.end()) { |
| const ModelArgumentInfo& info = controller->mExecutionBuilder->getInputInfo(it->second); |
| return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder); |
| } else if (auto it = sourceOperandToOutputIndex.find(operandIndex); |
| it != sourceOperandToOutputIndex.end()) { |
| const ModelArgumentInfo& info = controller->mExecutionBuilder->getOutputInfo(it->second); |
| return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder); |
| } |
| return std::nullopt; |
| } |
| |
| bool ExecutionPlan::readConditionValue(std::shared_ptr<Controller> controller, |
| SourceOperandIndex operandIndex) const { |
| std::optional<ExecutionPlan::Buffer> buffer = getBuffer(controller, operandIndex); |
| CHECK(buffer != std::nullopt) << "Unable to read operand " << toString(operandIndex); |
| bool8 value; |
| CHECK_GE(buffer->getSize(), sizeof(value)); |
| std::memcpy(&value, buffer->getPointer(), sizeof(value)); |
| VLOG(EXECUTION) << "readConditionValue: " << static_cast<int>(value); |
| return value; |
| } |
| |
| int ExecutionPlan::next(std::shared_ptr<Controller> controller, |
| std::shared_ptr<StepExecutor>* executor, |
| std::shared_ptr<ExecutionBurstController>* burstController) const { |
| controller->mLastStepIndex = controller->mNextStepIndex; |
| *executor = nullptr; |
| if (burstController != nullptr) { |
| *burstController = nullptr; |
| } |
| |
| VLOG(EXECUTION) << "ExecutionPlan::next(" << SHOW_IF_DEBUG(controller << ", " << executor) |
| << "): mNextStepIndex = " << controller->mNextStepIndex; |
| |
| if (controller->mNextStepIndex == Controller::kBadStepIndex) { |
| return ANEURALNETWORKS_OP_FAILED; |
| } |
| |
| if (mState == EMPTY) { |
| CHECK_EQ(controller->mNextStepIndex, 0u); // end |
| controller->mNextStepIndex = Controller::kBadStepIndex; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| if (mState == SIMPLE) { |
| if (controller->mNextStepIndex == 0) { |
| // First (and only) step. |
| auto simpleBody = simple(); |
| *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, |
| simpleBody->mModel, simpleBody->mDevice, |
| simpleBody->mPreparedModel); |
| (*executor)->mapInputsAndOutputsTrivially(); |
| if (burstController != nullptr && controller->mBurstBuilder != nullptr) { |
| *burstController = controller->mBurstBuilder->getControllerAt(0); |
| } |
| controller->mNextStepIndex = 1; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| CHECK_EQ(controller->mNextStepIndex, 1u); // end |
| controller->mNextStepIndex = Controller::kBadStepIndex; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| return nextCompound(controller, executor, burstController); |
| } |
| |
| int ExecutionPlan::nextCompound(std::shared_ptr<Controller> controller, |
| std::shared_ptr<StepExecutor>* executor, |
| std::shared_ptr<ExecutionBurstController>* burstController) const { |
| if (controller->mNextStepIndex == Controller::kBadStepIndex) { |
| return ANEURALNETWORKS_OP_FAILED; |
| } |
| |
| auto compoundBody = compound(); |
| if (controller->mNextStepIndex == compoundBody->mSteps.size()) { |
| controller->mNextStepIndex = Controller::kBadStepIndex; // end |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| const auto& logicalStep = compoundBody->mSteps[controller->mNextStepIndex]; |
| if (const IfStep* step = logicalStep->tryIfStep()) { |
| return nextCompound(step, controller, executor, burstController); |
| } else if (const WhileStep* step = logicalStep->tryWhileStep()) { |
| return nextCompound(step, controller, executor, burstController); |
| } else if (const GotoStep* step = logicalStep->tryGotoStep()) { |
| return nextCompound(step, controller, executor, burstController); |
| } else if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { |
| return nextCompound(step, controller, executor, burstController); |
| } else { |
| CHECK(false) << "Unknown step variant"; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| } |
| |
| int ExecutionPlan::nextCompound(const ExecutionStep* step, std::shared_ptr<Controller> controller, |
| std::shared_ptr<StepExecutor>* executor, |
| std::shared_ptr<ExecutionBurstController>* burstController) const { |
| VLOG(EXECUTION) << "next: Step#" << controller->mNextStepIndex << ": execute on " |
| << step->getDevice()->getName(); |
| *executor = |
| std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(), |
| step->getDevice(), step->getPreparedStepModel(), step); |
| step->mapInputsAndOutputs( |
| *executor, controller->mTemporaries.get(), |
| controller->mSourceOperandToOffsetOfTemporary, controller->mSourceOperandToInputIndex, |
| controller->mSourceOperandToOutputIndex, controller->mSourceOperandToConstantReference); |
| if (burstController != nullptr && controller->mBurstBuilder != nullptr) { |
| *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex); |
| } |
| |
| controller->mNextStepIndex++; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
// The first argument is the "source" operand; the second is the "destination".
| void ExecutionPlan::Controller::setInput(const SourceOperandIndex& outerOperand, |
| const SourceOperandIndex& innerOperand) { |
| VLOG(EXECUTION) << "mapping input " << toString(innerOperand) << " from " |
| << toString(outerOperand); |
| #ifdef NN_DEBUGGABLE |
| CHECK_LE(mSourceOperandToOffsetOfTemporary.count(innerOperand) + |
| mSourceOperandToInputIndex.count(innerOperand) + |
| mSourceOperandToOutputIndex.count(innerOperand) + |
| mSourceOperandToConstantReference.count(innerOperand), |
| 1u); |
| #endif |
| mSourceOperandToOffsetOfTemporary.erase(innerOperand); |
| mSourceOperandToInputIndex.erase(innerOperand); |
| mSourceOperandToOutputIndex.erase(innerOperand); |
| mSourceOperandToConstantReference.erase(innerOperand); |
| if (auto it = mSourceOperandToOffsetOfTemporary.find(outerOperand); |
| it != mSourceOperandToOffsetOfTemporary.end()) { |
| mSourceOperandToOffsetOfTemporary.emplace(innerOperand, it->second); |
| } else if (auto it = mSourceOperandToInputIndex.find(outerOperand); |
| it != mSourceOperandToInputIndex.end()) { |
| mSourceOperandToInputIndex.emplace(innerOperand, it->second); |
| } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand); |
| it != mSourceOperandToOutputIndex.end()) { |
| mSourceOperandToOutputIndex.emplace(innerOperand, it->second); |
| } else if (auto it = mSourceOperandToConstantReference.find(outerOperand); |
| it != mSourceOperandToConstantReference.end()) { |
| mSourceOperandToConstantReference.emplace(innerOperand, it->second); |
| } else { |
| CHECK(false) << "Cannot set step model input operand " << toString(innerOperand) |
| << " from operand " << toString(outerOperand); |
| } |
| } |
| |
// The first argument is the "source" operand; the second is the "destination".
| void ExecutionPlan::Controller::setOutput(const SourceOperandIndex& outerOperand, |
| const SourceOperandIndex& innerOperand) { |
| VLOG(EXECUTION) << "mapping output " << toString(innerOperand) << " from " |
| << toString(outerOperand); |
| #ifdef NN_DEBUGGABLE |
| CHECK_LE(mSourceOperandToOffsetOfTemporary.count(innerOperand) + |
| mSourceOperandToOutputIndex.count(innerOperand), |
| 1u); |
| #endif |
| mSourceOperandToOffsetOfTemporary.erase(innerOperand); |
| mSourceOperandToOutputIndex.erase(innerOperand); |
| if (auto it = mSourceOperandToOffsetOfTemporary.find(outerOperand); |
| it != mSourceOperandToOffsetOfTemporary.end()) { |
| mSourceOperandToOffsetOfTemporary.emplace(innerOperand, it->second); |
| } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand); |
| it != mSourceOperandToOutputIndex.end()) { |
| mSourceOperandToOutputIndex.emplace(innerOperand, it->second); |
| } else { |
| CHECK(false) << "Cannot set step model output operand " << toString(innerOperand) |
| << " from operand " << toString(outerOperand); |
| } |
| } |
| |
| int ExecutionPlan::nextCompound(const IfStep* step, std::shared_ptr<Controller> controller, |
| std::shared_ptr<StepExecutor>* executor, |
| std::shared_ptr<ExecutionBurstController>* burstController) const { |
| VLOG(EXECUTION) << "next: " << toString(*step); |
| bool condValue = readConditionValue(controller, step->conditionOperandIndex); |
| controller->mNextStepIndex = condValue ? step->thenStepIndex : step->elseStepIndex; |
| const std::vector<SourceOperandIndex>& branchInputOperands = |
| condValue ? step->thenBranchInputOperands : step->elseBranchInputOperands; |
| const std::vector<SourceOperandIndex>& branchOutputOperands = |
| condValue ? step->thenBranchOutputOperands : step->elseBranchOutputOperands; |
| CHECK_EQ(branchInputOperands.size(), step->outerInputOperands.size()); |
| CHECK_EQ(branchOutputOperands.size(), step->outerOutputOperands.size()); |
| for (uint32_t i = 0, n = step->outerInputOperands.size(); i < n; ++i) { |
| // We have to do this assignment just before executing this step to |
| // accommodate cases when the IF resides within a WHILE condition or |
| // body model and for some j the i-th input of the IF branch model is |
| // - an input of the WHILE condition model (whileStep->condInputOperands[j]), |
| // - an input of the WHILE body model (whileStep->bodyInputOperands[j]), or |
| // - an output of the WHILE body model (whileStep->bodyOutputOperands[j]). |
| // In such cases, the WhileStep modifies the location of |
| // step->outerInputOperands[i] to implement double buffering. |
| controller->setInput(step->outerInputOperands[i], branchInputOperands[i]); |
| } |
| for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) { |
| // We have to do this assignment just before executing this step to |
| // accommodate the case when the IF resides within a WHILE body |
| // model and the i-th output of the IF branch model is an |
| // output of the WHILE body model (whileStep->bodyOutputOperands[j] for |
| // some j). In that case, the WhileStep modifies the location of |
| // step->outerOutputOperands[i] to implement double buffering. |
| controller->setOutput(step->outerOutputOperands[i], branchOutputOperands[i]); |
| } |
| return nextCompound(controller, executor, burstController); |
| } |
| |
| int ExecutionPlan::nextCompound(const WhileStep* step, std::shared_ptr<Controller> controller, |
| std::shared_ptr<StepExecutor>* executor, |
| std::shared_ptr<ExecutionBurstController>* burstController) const { |
| WhileState& state = controller->mWhileState[controller->mNextStepIndex]; |
| if (state.stage == WhileState::EVALUATE_CONDITION) { |
| state.iteration = state.iteration == WhileState::kOutsideLoop ? 0 : state.iteration + 1; |
| VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration |
| << ": evaluating condition"; |
| controller->mNextStepIndex = step->condStepIndex; |
| |
| if (state.iteration == 0) { |
| state.startTime = std::chrono::steady_clock::now(); |
| } |
| |
| // iteration = 0 cond inputs = outer inputs |
| // iteration = 1 cond inputs = body outputs |
| // iteration = 2 cond inputs = body outputs |
| // iteration = 3 cond inputs = ... |
| uint32_t loopBodyOutputCount = step->bodyOutputOperands.size(); |
| CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size()); |
| CHECK_GE(step->condInputOperands.size(), loopBodyOutputCount); |
| for (uint32_t i = 0, n = step->condInputOperands.size(); i < n; ++i) { |
| bool operandIsInputOnly = i >= loopBodyOutputCount; |
| controller->setInput((state.iteration == 0 || operandIsInputOnly) |
| ? step->outerInputOperands[i] |
| : step->bodyOutputOperands[i], |
| step->condInputOperands[i]); |
| } |
| |
| state.stage = WhileState::EVALUATE_BODY; |
| return nextCompound(controller, executor, burstController); |
| } |
| |
| CHECK(state.stage == WhileState::EVALUATE_BODY); |
| bool condValue = readConditionValue(controller, step->condOutputOperand); |
| |
| std::chrono::nanoseconds timeoutDuration( |
| controller->mExecutionBuilder->getLoopTimeoutDuration()); |
| auto duration = std::chrono::steady_clock::now() - state.startTime; |
| if (duration > timeoutDuration) { |
| LOG(ERROR) << "WHILE loop timed out after " |
| << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count() |
| << " ms"; |
| return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT; |
| } |
| |
| if (condValue) { |
| VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration |
| << ": evaluating body"; |
| controller->mNextStepIndex = step->bodyStepIndex; |
| |
| // iteration = 0 body inputs = cond inputs = outer inputs body outputs = tmp1 |
| // iteration = 1 body inputs = cond inputs = tmp1 body outputs = tmp2 |
| // iteration = 2 body inputs = cond inputs = tmp2 body outputs = tmp1 |
| // iteration = 3 body inputs = cond inputs = ... body outputs = ... |
| #ifdef NN_DEBUGGABLE |
| CHECK_GE(step->bodyInputOperands.size(), step->bodyOutputOperands.size()); |
| CHECK_EQ(step->bodyInputOperands.size(), step->outerInputOperands.size()); |
| CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size()); |
| CHECK_GE(step->bodyOutputOperands.size(), step->outerOutputOperands.size()); |
| #endif |
| for (uint32_t i = 0, n = step->bodyInputOperands.size(); i < n; ++i) { |
| controller->setInput(step->condInputOperands[i], step->bodyInputOperands[i]); |
| } |
| if (state.iteration != 0) { |
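| // The body is about to overwrite its output operands, but this iteration's |
| // inputs still alias the outputs of the previous iteration. Swap the primary |
| // and secondary temporary offsets of each body output operand so that the |
| // body writes into the other buffer, implementing the double buffering |
| // shown in the table above. |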
| for (const SourceOperandIndex& outputOperand : step->bodyOutputOperands) { |
| #ifdef NN_DEBUGGABLE |
| CHECK_EQ(controller->mSourceOperandToInputIndex.count(outputOperand), 0u); |
| CHECK_EQ(controller->mSourceOperandToOutputIndex.count(outputOperand), 0u); |
| CHECK_EQ(controller->mSourceOperandToOffsetOfTemporary.count(outputOperand), 1u); |
| CHECK_EQ(controller->mSourceOperandToOffsetOfTemporary2.count(outputOperand), 1u); |
| #endif |
| std::swap(controller->mSourceOperandToOffsetOfTemporary[outputOperand], |
| controller->mSourceOperandToOffsetOfTemporary2[outputOperand]); |
| } |
| } |
| } else { |
| VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration |
| << ": exiting loop"; |
| controller->mNextStepIndex = step->exitStepIndex; |
| |
| // Copy body outputs to outer outputs. |
| // TODO: Use outer outputs instead of tmp2 to avoid copying? |
| CHECK_LE(step->outerOutputOperands.size(), step->bodyOutputOperands.size()); |
| for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) { |
| // condInputOperands[i] points to a body output operand from the |
| // last iteration if we've executed at least one iteration and to a |
| // WHILE operation input operand otherwise. |
| const SourceOperandIndex& innerOperand = step->condInputOperands[i]; |
| const SourceOperandIndex& outerOperand = step->outerOutputOperands[i]; |
| std::optional<Buffer> outerBuffer = getBuffer(controller, outerOperand); |
| if (outerBuffer == std::nullopt) { |
| return ANEURALNETWORKS_OP_FAILED; |
| } |
| const Operand& sourceOperand = |
| controller->mExecutionBuilder->getSourceOperand(outerOperand); |
| const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand); |
| CHECK_NE(size, 0u); |
| std::optional<Buffer> innerBuffer = getBuffer(controller, innerOperand); |
| if (innerBuffer == std::nullopt) { |
| return ANEURALNETWORKS_OP_FAILED; |
| } |
| CHECK_LE(size, innerBuffer->getSize()); |
| CHECK_LE(size, outerBuffer->getSize()); |
| memcpy(outerBuffer->getPointer(), innerBuffer->getPointer(), size); |
| outerBuffer->flush(); |
| } |
| state.iteration = WhileState::kOutsideLoop; |
| } |
| |
| state.stage = WhileState::EVALUATE_CONDITION; |
| return nextCompound(controller, executor, burstController); |
| } |
| |
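| // A GotoStep performs no computation; it simply redirects the controller to |
| // another step index. It is used to stitch together the step sequences |
| // emitted for IF and WHILE operations. |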
| int ExecutionPlan::nextCompound(const GotoStep* step, std::shared_ptr<Controller> controller, |
| std::shared_ptr<StepExecutor>* executor, |
| std::shared_ptr<ExecutionBurstController>* burstController) const { |
| VLOG(EXECUTION) << "next: " << toString(*step); |
| controller->mNextStepIndex = step->gotoStepIndex; |
| return nextCompound(controller, executor, burstController); |
| } |
| |
| void ExecutionPlan::becomeCompoundIfEmpty() { |
| CHECK(mState != SIMPLE); |
| if (mState == EMPTY) { |
| mBody = new CompoundBody(); |
| mState = COMPOUND; |
| } |
| } |
| |
| ExecutionStep* ExecutionPlan::createNewExecutionStep(uint32_t sourceModelIndex, |
| const std::shared_ptr<Device> device) { |
| becomeCompoundIfEmpty(); |
| auto step = std::make_shared<LogicalStep>(std::in_place_type<ExecutionStep>, this, |
| compound()->mSteps.size(), sourceModelIndex, device); |
| compound()->mSteps.push_back(step); |
| return step->executionStep(); |
| } |
| |
| IfStep* ExecutionPlan::createNewIfStep() { |
| becomeCompoundIfEmpty(); |
| auto step = std::make_shared<LogicalStep>(std::in_place_type<IfStep>); |
| step->ifStep()->index = compound()->mSteps.size(); |
| compound()->mSteps.push_back(step); |
| return step->ifStep(); |
| } |
| |
| WhileStep* ExecutionPlan::createNewWhileStep() { |
| becomeCompoundIfEmpty(); |
| auto step = std::make_shared<LogicalStep>(std::in_place_type<WhileStep>); |
| step->whileStep()->index = compound()->mSteps.size(); |
| compound()->mSteps.push_back(step); |
| return step->whileStep(); |
| } |
| |
| GotoStep* ExecutionPlan::createNewGotoStep() { |
| becomeCompoundIfEmpty(); |
| auto step = std::make_shared<LogicalStep>(std::in_place_type<GotoStep>); |
| step->gotoStep()->index = compound()->mSteps.size(); |
| compound()->mSteps.push_back(step); |
| return step->gotoStep(); |
| } |
| |
| void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device, |
| const ModelBuilder* model) { |
| CHECK(mState == EMPTY); |
| mBody = new SimpleBody(device, model, mCacheDir, mToken); |
| mState = SIMPLE; |
| } |
| |
| void ExecutionPlan::recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) { |
| auto [it, isNew] = |
| compound()->mTemporaryToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex); |
| CHECK(isNew) << "Step " << stepIndex << " redefines temporary operand " |
| << toString(sourceOperandIndex) << " already defined by step " << it->second; |
| } |
| |
| void ExecutionPlan::dump() const { |
| if (mBody) { |
| mBody->dump(); |
| } else { |
| VLOG(COMPILATION) << "EMPTY"; |
| } |
| } |
| |
| void ExecutionPlan::reset() { |
| if (mBody) { |
| delete mBody; |
| mBody = nullptr; |
| } |
| mState = EMPTY; |
| } |
| |
| bool ExecutionPlan::isSimpleCpu() const { |
| return isSimple() && simple()->mDevice == DeviceManager::getCpuDevice(); |
| } |
| |
| ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const { |
| switch (mState) { |
| case EMPTY: |
| return Kind::EMPTY; |
| case SIMPLE: |
| nnAssert(mBody); |
| return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR; |
| case COMPOUND: |
| nnAssert(mBody); |
| return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR; |
| default: |
| nnAssert(!"unexpected state"); |
| return Kind::ERROR; |
| } |
| } |
| |
| std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const { |
| return simple()->mDevice; |
| } |
| |
| const std::vector<std::shared_ptr<LogicalStep>>& ExecutionPlan::forTest_compoundGetSteps() const { |
| return compound()->mSteps; |
| } |
| |
| bool ExecutionPlan::forTest_hasStepModelOutputsOfUnknownSize() const { |
| return mBody->hasStepModelOutputsOfUnknownSize(); |
| } |
| |
| const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const { |
| return simple()->mToken.getCacheToken(); |
| } |
| |
| void ExecutionPlan::SimpleBody::dump() const { |
| VLOG(COMPILATION) << "SIMPLE for " << mDevice->getName(); |
| } |
| |
| void ExecutionPlan::CompoundBody::dump() const { |
| for (const auto& step : mSteps) { |
| step->dump(); |
| } |
| } |
| |
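| // In the SIMPLE case there is a single step model identical to the main |
| // model, so a main model input or output maps directly to the same role in |
| // the one prepared model. |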
| void ExecutionPlan::SimpleBody::forEachStepRoleOfInput(uint32_t index, |
| const StepRoleCallback& callback) const { |
| callback(mPreparedModel.get(), IOType::INPUT, index); |
| } |
| |
| void ExecutionPlan::SimpleBody::forEachStepRoleOfOutput(uint32_t index, |
| const StepRoleCallback& callback) const { |
| callback(mPreparedModel.get(), IOType::OUTPUT, index); |
| } |
| |
| // Map an input role of the main model to the input/output roles in the step models: |
| // - An input role of the main model may be used as an input of multiple step models. |
| // - An input role of the main model should not be used as an output of any step model. |
| void ExecutionPlan::CompoundBody::forEachStepRoleOfInput(uint32_t index, |
| const StepRoleCallback& callback) const { |
| for (const auto& logicalStep : mSteps) { |
| if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { |
| // Model input as step model input. |
| const auto& inputMapping = step->getInputIndexStepModelToMainModel(); |
| for (uint32_t i = 0; i < inputMapping.size(); i++) { |
| if (inputMapping[i] == index) { |
| callback(step->getPreparedStepModel().get(), IOType::INPUT, i); |
| } |
| } |
| } |
| } |
| } |
| |
| // Map an output role of the main model to the input/output roles in the step models: |
| // - An output role of the main model may be used as the output of at most one step model. |
| // - An output role of the main model may be used as an input of multiple step models. |
| void ExecutionPlan::CompoundBody::forEachStepRoleOfOutput(uint32_t index, |
| const StepRoleCallback& callback) const { |
| bool found = false; |
| for (const auto& logicalStep : mSteps) { |
| if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { |
| // Model output as step model output. |
| if (!found) { |
| const auto& outputMapping = step->getOutputIndexStepModelToMainModel(); |
| for (uint32_t i = 0; i < outputMapping.size(); i++) { |
| if (outputMapping[i] == index) { |
| callback(step->getPreparedStepModel().get(), IOType::OUTPUT, i); |
| found = true; |
| break; |
| } |
| } |
| } |
| // Model output as step model input. |
| const auto& inputToOutputMapping = step->getOutputsAsStepModelInputsIndexToMainModel(); |
| for (uint32_t i = 0; i < inputToOutputMapping.size(); i++) { |
| if (inputToOutputMapping[i] == index) { |
| callback(step->getPreparedStepModel().get(), IOType::INPUT, i); |
| } |
| } |
| } |
| } |
| } |
| |
| int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices, |
| uint32_t preference, uint32_t priority, |
| const std::optional<Deadline>& deadline, |
| ExecutionPlan* plan) const { |
| uint32_t sourceModelIndex = plan->getSourceModels().addModel(this); |
| NN_RETURN_IF_ERROR(partitionTheWorkInternal(sourceModelIndex, devices, preference, priority, |
| deadline, plan)); |
| int n = plan->finish(preference, priority, deadline); |
| if (VLOG_IS_ON(COMPILATION)) { |
| VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: "; |
| logModelToInfo(makeHidlModel()); |
| plan->dump(); |
| } |
| return n; |
| } |
| |
| int ModelBuilder::partitionTheWorkInternal(uint32_t sourceModelIndex, |
| const std::vector<std::shared_ptr<Device>>& devices, |
| uint32_t preference, uint32_t priority, |
| const std::optional<Deadline>& deadline, |
| ExecutionPlan* plan) const { |
| // This function uses a heuristic approach to partitioning the graph. |
| // It should be good enough for the first release. |
| |
| SourceModels* sourceModels = &plan->getSourceModels(); |
| const size_t deviceCount = devices.size(); |
| const size_t operationCount = mOperations.size(); |
| |
| VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: " |
| << "sourceModelIndex = " << sourceModelIndex << ", " |
| << "deviceCount = " << deviceCount << ", " |
| << "operationCount = " << operationCount; |
| |
| // Figure out where each operation will best execute. |
| // The value of the vector is the index in the devices vector. |
| std::vector<int> bestDeviceForOperation(operationCount); |
| NN_RETURN_IF_ERROR( |
| findBestDeviceForEachOperation(preference, devices, &bestDeviceForOperation)); |
| |
| // A special value produced by findBestDeviceForEachOperation meaning that |
| // this is a control flow operation scheduled for interpreted execution |
| // (see LogicalStep). |
| const int kControlFlow = deviceCount; |
| |
| // If one device will run all the operations, we don't need to split the work. |
| if (sourceModelIndex == kMainModelInSourceModels && |
| std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(), |
| std::not_equal_to<int>()) == bestDeviceForOperation.end()) { |
| const int bestDeviceIndex = bestDeviceForOperation[0]; |
| if (bestDeviceIndex != kControlFlow) { // The model is not a single control flow operation. |
| VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: " |
| << bestDeviceIndex << " = " << devices[bestDeviceIndex]->getName(); |
| plan->becomeSingleStep(devices[bestDeviceIndex], this); |
| return plan->finish(preference, priority, deadline); |
| } |
| } |
| |
| // No easy solution, we need to split the work. |
| |
| // We keep track of the operations that are ready to run for each device. |
| // perDeviceQueue[deviceCount] is for interpreted execution of control flow |
| // (see LogicalStep). |
| std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount + 1); |
| |
| // This helper function enqueues the operation on the appropriate queue. |
| auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) { |
| int deviceIndex = bestDeviceForOperation[operationIndex]; |
| perDeviceQueue[deviceIndex].push(operationIndex); |
| VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto " |
| << deviceIndex; |
| }; |
| |
| // This helper function finds a device that has operations ready to process. |
| // We start by looking at the control flow queue, and then look at the |
| // devices in reverse order (i.e., starting at the end of the devices |
| // vector). Earlier devices have a chance to prepare more of the inputs |
| // required by other devices. This function returns -1 if all queues are |
| // empty. |
| auto findNextDeviceToProcess = [&]() -> int { |
| for (int i = perDeviceQueue.size() - 1; i >= 0; i--) { |
| if (!perDeviceQueue[i].empty()) { |
| return i; |
| } |
| } |
| return -1; |
| }; |
| |
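| // The tracker invokes enqueueOnAppropriateDevice for an operation once all |
| // of that operation's inputs have been computed, so operations enter the |
| // per-device queues in topological order. |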
| OperandTracker tracker(this, enqueueOnAppropriateDevice); |
| // For each iteration of this loop, we'll create an execution step. |
| while (true) { |
| // Find the device we'll do this step for. |
| int deviceIndex = findNextDeviceToProcess(); |
| VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex; |
| if (deviceIndex < 0) { |
| break; |
| } |
| |
| // Assign as much as possible to this device. |
| auto& queue = perDeviceQueue[deviceIndex]; |
| if (deviceIndex != kControlFlow) { |
| ExecutionStep* step = |
| plan->createNewExecutionStep(sourceModelIndex, devices[deviceIndex]); |
| while (!queue.empty()) { |
| uint32_t operationIndex = queue.front(); |
| queue.pop(); |
| int n = step->addOperation(operationIndex); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| LOG(ERROR) << "failed to add operation " << operationIndex << " to step"; |
| return n; |
| } |
| tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice); |
| } |
| } else { |
| while (!queue.empty()) { |
| uint32_t operationIndex = queue.front(); |
| queue.pop(); |
| const Operation& operation = getOperation(operationIndex); |
| if (operation.type == OperationType::IF) { |
| namespace op = operation_if; |
| const Operand& thenOperand = |
| getOperand(operation.inputs[op::kThenModelOperand]); |
| const Operand& elseOperand = |
| getOperand(operation.inputs[op::kElseModelOperand]); |
| const ModelBuilder* thenModel = getReferencedModel(thenOperand); |
| const ModelBuilder* elseModel = getReferencedModel(elseOperand); |
| uint32_t thenModelIndex = sourceModels->addModel(thenModel); |
| uint32_t elseModelIndex = sourceModels->addModel(elseModel); |
| |
| // Emits the following: |
| // Index Step |
| // i if then=(i + 1) else=(j + 1) |
| // ... (then model steps) |
| // j goto k |
| // ... (else model steps) |
| // k (steps after the IF) |
| IfStep* ifStep = plan->createNewIfStep(); |
| ifStep->conditionOperandIndex = SourceOperandIndex( |
| sourceModelIndex, operation.inputs[op::kCondBoolOperand]); |
| ifStep->thenStepIndex = plan->getNextStepIndex(); |
| NN_RETURN_IF_ERROR(thenModel->partitionTheWorkInternal( |
| thenModelIndex, devices, preference, priority, deadline, plan)); |
| GotoStep* afterThenBranch = plan->createNewGotoStep(); |
| ifStep->elseStepIndex = plan->getNextStepIndex(); |
| NN_RETURN_IF_ERROR(elseModel->partitionTheWorkInternal( |
| elseModelIndex, devices, preference, priority, deadline, plan)); |
| afterThenBranch->gotoStepIndex = plan->getNextStepIndex(); |
| |
| // Outer model operands. |
| for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) { |
| ifStep->outerInputOperands.emplace_back(sourceModelIndex, |
| operation.inputs[i]); |
| } |
| for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) { |
| ifStep->outerOutputOperands.emplace_back(sourceModelIndex, |
| operation.outputs[i]); |
| } |
| // Then model operands. |
| for (uint32_t i = 0, n = thenModel->inputCount(); i < n; ++i) { |
| ifStep->thenBranchInputOperands.emplace_back( |
| thenModelIndex, thenModel->getInputOperandIndex(i)); |
| } |
| for (uint32_t i = 0, n = thenModel->outputCount(); i < n; ++i) { |
| ifStep->thenBranchOutputOperands.emplace_back( |
| thenModelIndex, thenModel->getOutputOperandIndex(i)); |
| } |
| // Else model operands. |
| for (uint32_t i = 0, n = elseModel->inputCount(); i < n; ++i) { |
| ifStep->elseBranchInputOperands.emplace_back( |
| elseModelIndex, elseModel->getInputOperandIndex(i)); |
| } |
| for (uint32_t i = 0, n = elseModel->outputCount(); i < n; ++i) { |
| ifStep->elseBranchOutputOperands.emplace_back( |
| elseModelIndex, elseModel->getOutputOperandIndex(i)); |
| } |
| } else if (operation.type == OperationType::WHILE) { |
| namespace op = operation_while; |
| const Operand& condOperand = |
| getOperand(operation.inputs[op::kCondModelOperand]); |
| const Operand& bodyOperand = |
| getOperand(operation.inputs[op::kBodyModelOperand]); |
| const ModelBuilder* condModel = getReferencedModel(condOperand); |
| const ModelBuilder* bodyModel = getReferencedModel(bodyOperand); |
| uint32_t condModelIndex = sourceModels->addModel(condModel); |
| uint32_t bodyModelIndex = sourceModels->addModel(bodyModel); |
| |
| // Emits the following: |
| // Index Step |
| // i while cond=(i + 1) body=(j + 1) exit=(k + 1) |
| // ... (cond model steps) |
| // j goto i |
| // ... (body model steps) |
| // k goto i |
| // ... (steps after the WHILE) |
| // |
| // Note that WhileStep has WhileState associated with it. |
| WhileStep* whileStep = plan->createNewWhileStep(); |
| whileStep->condStepIndex = plan->getNextStepIndex(); |
| NN_RETURN_IF_ERROR(condModel->partitionTheWorkInternal( |
| condModelIndex, devices, preference, priority, deadline, plan)); |
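| // The GOTO emitted after the cond (and, below, the body) model steps |
| // returns control to the WhileStep itself, which consults its WhileState |
| // to decide whether to evaluate the body, re-evaluate the condition, or |
| // exit. |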
| GotoStep* afterCond = plan->createNewGotoStep(); |
| afterCond->gotoStepIndex = whileStep->index; |
| whileStep->bodyStepIndex = plan->getNextStepIndex(); |
| NN_RETURN_IF_ERROR(bodyModel->partitionTheWorkInternal( |
| bodyModelIndex, devices, preference, priority, deadline, plan)); |
| GotoStep* afterBody = plan->createNewGotoStep(); |
| afterBody->gotoStepIndex = whileStep->index; |
| whileStep->exitStepIndex = plan->getNextStepIndex(); |
| |
| // Outer model operands. |
| for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) { |
| whileStep->outerInputOperands.emplace_back(sourceModelIndex, |
| operation.inputs[i]); |
| } |
| for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) { |
| whileStep->outerOutputOperands.emplace_back(sourceModelIndex, |
| operation.outputs[i]); |
| } |
| // Cond model operands. |
| for (uint32_t i = 0, n = condModel->inputCount(); i < n; ++i) { |
| whileStep->condInputOperands.emplace_back( |
| condModelIndex, condModel->getInputOperandIndex(i)); |
| } |
| whileStep->condOutputOperand = |
| SourceOperandIndex(condModelIndex, condModel->getOutputOperandIndex(0)); |
| // Body model operands. |
| for (uint32_t i = 0, n = bodyModel->inputCount(); i < n; ++i) { |
| whileStep->bodyInputOperands.emplace_back( |
| bodyModelIndex, bodyModel->getInputOperandIndex(i)); |
| } |
| for (uint32_t i = 0, n = bodyModel->outputCount(); i < n; ++i) { |
| whileStep->bodyOutputOperands.emplace_back( |
| bodyModelIndex, bodyModel->getOutputOperandIndex(i)); |
| } |
| } else { |
| CHECK(false) << toString(operation.type) << " is not a control flow operation"; |
| } |
| tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice); |
| } |
| } |
| } |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| float ModelBuilder::getPerformance(uint32_t preference, |
| const std::shared_ptr<Device> device) const { |
| // Note that we will call this method multiple times per compilation with |
| // the same arguments if there are nested control flow operations and we |
| // decide to execute the outer operation on the ExecutionPlan::next() |
| // interpreter. |
| // |
| // This is a potential compilation performance problem. To work around it, |
| // the performance value could be cached for the duration of a compilation. |
| float perf = 0; |
| const size_t operationCount = mOperations.size(); |
| for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) { |
| perf += getPerformance(preference, device, operationIndex); |
| } |
| return perf; |
| } |
| |
| float ModelBuilder::getPerformance(uint32_t preference, const std::shared_ptr<Device> device, |
| uint32_t operationIndex) const { |
| auto applyPreference = [preference](const PerformanceInfo& perf) { |
| return preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage : perf.execTime; |
| }; |
| |
| const Operation& operation = getOperation(operationIndex); |
| |
| if (operation.type == OperationType::IF) { |
| namespace op = operation_if; |
| const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]); |
| const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]); |
| const ModelBuilder* thenModel = getReferencedModel(thenOperand); |
| const ModelBuilder* elseModel = getReferencedModel(elseOperand); |
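| // Estimate: exactly one branch runs per IF execution; with no profile |
| // information, assume each branch is equally likely and charge half the |
| // cost of each on top of the IF overhead. |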
| return applyPreference(device->getIfPerformance()) + |
| 0.5 * (thenModel->getPerformance(preference, device) + |
| elseModel->getPerformance(preference, device)); |
| } |
| |
| if (operation.type == OperationType::WHILE) { |
| namespace op = operation_while; |
| const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]); |
| const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]); |
| const ModelBuilder* condModel = getReferencedModel(condOperand); |
| const ModelBuilder* bodyModel = getReferencedModel(bodyOperand); |
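| // Estimate: the iteration count is unknown at compilation time, so charge |
| // the WHILE overhead plus a single evaluation of the condition and body |
| // models. |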
| return applyPreference(device->getWhilePerformance()) + |
| condModel->getPerformance(preference, device) + |
| bodyModel->getPerformance(preference, device); |
| } |
| |
| // TODO: This assumes that the type is dictated by the first operand. This is |
| // currently the case but is not a safe assumption to make in the long term. |
| const uint32_t operandIndex = operation.inputs[0]; |
| const OperandType operandType = mOperands[operandIndex].type; |
| switch (operandType) { |
| case OperandType::FLOAT32: |
| if (mRelaxComputationFloat32toFloat16) { |
| return applyPreference(device->getRelaxedFloat32toFloat16PerformanceScalar()); |
| } |
| break; |
| case OperandType::TENSOR_FLOAT32: |
| if (mRelaxComputationFloat32toFloat16) { |
| return applyPreference(device->getRelaxedFloat32toFloat16PerformanceTensor()); |
| } |
| break; |
| default: |
| break; |
| } |
| |
| return applyPreference(device->getPerformance(operandType)); |
| } |
| |
| namespace { |
| |
| // This class determines whether a given device can execute a given operation. |
| class CanDo { |
| public: |
| CanDo() {} |
| |
| void initialize(const MetaModel& metaModel, std::shared_ptr<Device> device) { |
| mSupportsOperationByIndex = device->getSupportedOperations(metaModel); |
| } |
| |
| bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; } |
| |
| private: |
| std::vector<bool> mSupportsOperationByIndex; |
| }; |
| |
| } // anonymous namespace |
| |
| int ModelBuilder::findBestDeviceForEachOperation( |
| uint32_t preference, const std::vector<std::shared_ptr<Device>>& devices, |
| std::vector<int>* bestDeviceForOperation) const { |
| const MetaModel metaModel(makeHidlModel(), DeviceManager::get()->strictSlicing()); |
| |
| const size_t deviceCount = devices.size(); |
| std::vector<CanDo> canDo(deviceCount); |
| for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) { |
| canDo[deviceIndex].initialize(metaModel, devices[deviceIndex]); |
| } |
| |
| // Figure out the best driver for each operation. |
| const size_t operationCount = mOperations.size(); |
| for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) { |
| const Operation& operation = getOperation(operationIndex); |
| // Find which device, including CPU fallback, gives the best performance for this operation. |
| int bestChoice = -1; |
| float bestPerfVal = 0.0; // Do not check bestPerfVal if bestChoice < 0. |
| for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) { |
| const auto& device = devices[deviceIndex]; |
| if (canDo[deviceIndex].check(operationIndex)) { |
| const float perfVal = getPerformance(preference, device, operationIndex); |
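| // Prefer the device with the lowest estimated cost; on an exact tie, |
| // fall back to preferring the CPU device. |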
| if (bestChoice < 0 || perfVal < bestPerfVal || |
| (perfVal == bestPerfVal && device == DeviceManager::getCpuDevice())) { |
| bestChoice = deviceIndex; |
| bestPerfVal = perfVal; |
| } |
| } else { |
| // Somewhat noisy logging, but this is the only place where a user of |
| // NNAPI can get feedback on why an operation was not run on a specific |
| // device. |
| // |
| // Logs O(operationCount * deviceCount) times, but typically deviceCount is |
| // very small. |
| VLOG(COMPILATION) << "Device " << device->getName() << " can't do operation " |
| << toString(operation.type); |
| } |
| } |
| if (bestChoice < 0) { |
| LOG(ERROR) << "No driver can do operation " << toString(operation.type); |
| return ANEURALNETWORKS_BAD_DATA; |
| } else if (devices[bestChoice] == DeviceManager::getCpuDevice() && |
| (operation.type == OperationType::IF || |
| operation.type == OperationType::WHILE)) { |
| // Run control flow on the ExecutionPlan::next() interpreter and try |
| // to delegate referenced models. |
| const int kControlFlow = deviceCount; |
| (*bestDeviceForOperation)[operationIndex] = kControlFlow; |
| VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" |
| << toString(operation.type) << ") = " << kControlFlow |
| << " (NNAPI interpreter)"; |
| } else { |
| (*bestDeviceForOperation)[operationIndex] = bestChoice; |
| VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" |
| << toString(operation.type) << ") = " << bestChoice << " (" |
| << devices[bestChoice]->getName() << ")"; |
| } |
| } |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| } // namespace nn |
| } // namespace android |