/*
* Copyright (C) 2017 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define LOG_TAG "ExecutionPlan"
#include "ExecutionPlan.h"
#include <cutils/native_handle.h>
#include <fcntl.h>
#include <openssl/sha.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <algorithm>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <queue>
#include <set>
#include <string>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <vector>
#include "BurstBuilder.h"
#include "Callbacks.h"
#include "CompilationBuilder.h"
#include "ExecutionBuilder.h"
#include "ExecutionBurstController.h"
#include "GraphDump.h"
#include "Manager.h"
#include "MetaModel.h"
#include "ModelBuilder.h"
#include "OperationsUtils.h"
#include "TokenHasher.h"
#include "Tracing.h"
#include "TypeManager.h"
#include "Utils.h"
namespace android {
namespace nn {
namespace {
using namespace hal;
// Compiles the model on the device.
// If compilation caching is available, then depending on ExecutionPlan::mState the token may have
// been initialized only from the user-provided token (SIMPLE body), or may already have been
// updated with the indices of the operations to be executed (COMPOUND body). In this function the
// token is further updated with the device name, device version string, and execution preference.
int compile(const Device& device, const ModelBuilder& model, int executionPreference,
int compilationPriority, const OptionalTimePoint& deadline, const std::string& cacheDir,
TokenHasher* token, std::shared_ptr<PreparedModel>* preparedModel) {
CHECK(token != nullptr);
CHECK(preparedModel != nullptr);
*preparedModel = nullptr;
std::optional<CacheToken> cacheToken;
if (device.isCachingSupported() && token->ok() &&
token->updateFromString(device.getName().c_str()) &&
token->updateFromString(device.getVersionString().c_str()) &&
token->update(&executionPreference, sizeof(executionPreference)) && token->finish()) {
cacheToken.emplace(token->getCacheToken());
}
const ModelFactory makeModel = [&model] { return model.makeHidlModel(); };
const ExecutionPreference preference = static_cast<ExecutionPreference>(executionPreference);
const Priority priority = convertToHalPriority(compilationPriority);
const auto [n, returnedPreparedModel] =
device.prepareModel(makeModel, preference, priority, deadline, cacheDir, cacheToken);
*preparedModel = returnedPreparedModel;
return n;
}
typedef std::function<void(uint32_t)> OperationReadyCallback;
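// Copies an operand's extra parameters (per-channel quantization parameters or
// extension data) from fromOperand to the operand at toOperandIndex in the model.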
int copyOperandExtraParams(ModelBuilder& model, uint32_t toOperandIndex,
const Operand& fromOperand) {
if (fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL &&
fromOperand.extraParams.getDiscriminator() ==
Operand::ExtraParams::hidl_discriminator::channelQuant) {
auto& fromChannelQuant = fromOperand.extraParams.channelQuant();
ANeuralNetworksSymmPerChannelQuantParams toChannelQuant = {
.channelDim = fromChannelQuant.channelDim,
.scaleCount = static_cast<uint32_t>(fromChannelQuant.scales.size()),
.scales = fromChannelQuant.scales.data(),
};
return model.setOperandSymmPerChannelQuantParams(toOperandIndex, toChannelQuant);
} else if (isExtensionOperandType(fromOperand.type) &&
fromOperand.extraParams.getDiscriminator() ==
Operand::ExtraParams::hidl_discriminator::extension) {
hidl_vec<uint8_t> extensionData = fromOperand.extraParams.extension();
return model.setOperandExtensionData(toOperandIndex, extensionData.data(),
extensionData.size());
} else if (fromOperand.extraParams.getDiscriminator() !=
Operand::ExtraParams::hidl_discriminator::none ||
fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
LOG(ERROR) << "Type " << toString(fromOperand.type)
<< " has an unexpected extraParams discriminator: "
<< static_cast<int>(fromOperand.extraParams.getDiscriminator());
return ANEURALNETWORKS_BAD_DATA;
} else {
return ANEURALNETWORKS_NO_ERROR;
}
}
// This class tracks whether we know the value of an operand as operations
// are processed.
class OperandTracker {
public:
// Creates the tracker for this model. Figures out which operations can be
// executed right away and calls cb for each of them.
OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
// Marks the specified operation as having been processed. Now that the
// outputs of the operation are known, new operations may become ready to
// run. Calls cb for each of them.
void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);
private:
const ModelBuilder* mModel;
std::multimap<uint32_t, uint32_t> mOperandToOperations;
std::vector<uint32_t> mUnknownInputCount; // For each operation
};
OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb)
: mModel(model) {
const auto& operations = mModel->getOperations();
mUnknownInputCount.resize(operations.size());
for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
const Operation& operation = operations[operationIndex];
uint32_t count = 0;
for (uint32_t operandIndex : operation.inputs) {
auto lifetime = mModel->getOperand(operandIndex).lifetime;
if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
lifetime == OperandLifeTime::SUBGRAPH_OUTPUT) {
count++;
mOperandToOperations.emplace(operandIndex, operationIndex);
}
}
if (count == 0) {
cb(operationIndex);
}
mUnknownInputCount[operationIndex] = count;
}
}
void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
// Mark all its outputs as known.
const Operation& operation = mModel->getOperations()[operationIndex];
for (uint32_t operandIndex : operation.outputs) {
auto range = mOperandToOperations.equal_range(operandIndex);
for (auto i = range.first; i != range.second; i++) {
uint32_t& count = mUnknownInputCount[i->second];
if (--count == 0) {
cb(i->second);
}
}
}
}
} // namespace
ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex,
std::shared_ptr<Device> device)
: mPlan(plan),
mIndex(stepIndex),
mStepModel(),
mDevice(device),
mToken(plan->getCacheToken()) {}
// Adds an operand to the step model if it has not been added already.
// Sets *toOperandIndex to the index of the corresponding operand in the step model.
int ExecutionStep::addOperand(uint32_t sourceOperandIndex, uint32_t* toOperandIndex,
const ModelBuilder& sourceModel, OperandKind kind) {
// Have we added this operand already?
auto i = mOperandMap.find(sourceOperandIndex);
if (i != mOperandMap.end()) {
CHECK(kind == INPUT);
*toOperandIndex = i->second;
return ANEURALNETWORKS_NO_ERROR;
}
// First time we add this operand.
*toOperandIndex = mStepModel.operandCount();
mOperandMap.emplace(sourceOperandIndex, *toOperandIndex);
// Add the operand to the step model.
const Operand& operand = sourceModel.getOperand(sourceOperandIndex);
ANeuralNetworksOperandType type = {
.type = static_cast<int32_t>(operand.type),
.dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
.dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr,
.scale = operand.scale,
.zeroPoint = operand.zeroPoint,
};
int n = mStepModel.addOperand(type);
if (n != ANEURALNETWORKS_NO_ERROR) {
LOG(ERROR) << "Previous error occurred when partitioning the graph";
return n;
}
n = copyOperandExtraParams(mStepModel, *toOperandIndex, operand);
if (n != ANEURALNETWORKS_NO_ERROR) {
LOG(ERROR) << "Error when copying extra parameters to the operand";
return n;
}
// Sets its value.
switch (operand.lifetime) {
case OperandLifeTime::CONSTANT_COPY: {
const uint8_t* data = sourceModel.getPointerToOperandValue(operand.location.offset);
n = mStepModel.setOperandValue(*toOperandIndex, data, operand.location.length);
if (n != ANEURALNETWORKS_NO_ERROR) {
LOG(ERROR) << "Previous error occurred when partitioning the graph";
return n;
}
} break;
case OperandLifeTime::CONSTANT_REFERENCE: {
const Memory* memory = sourceModel.getMemories()[operand.location.poolIndex];
n = mStepModel.setOperandValueFromMemory(
*toOperandIndex, memory, operand.location.offset, operand.location.length);
if (n != ANEURALNETWORKS_NO_ERROR) {
LOG(ERROR) << "Previous error occurred when partitioning the graph";
return n;
}
} break;
case OperandLifeTime::NO_VALUE: {
n = mStepModel.setOperandValue(*toOperandIndex, nullptr, 0);
if (n != ANEURALNETWORKS_NO_ERROR) {
LOG(ERROR) << "Previous error occurred when partitioning the graph";
return n;
}
} break;
case OperandLifeTime::TEMPORARY_VARIABLE: // handled similarly to SUBGRAPH_OUTPUT
if (kind == INPUT) {
// The first time we've seen this operand is as an
// input. That means it must be defined by a
// different partition, and is an input to this one.
mTempsAsStepModelInputs.emplace_back(sourceOperandIndex, *toOperandIndex);
} else {
// The first time we've seen this operand is as an
// output. It may be an input to a different
// partition, so keep track of it.
mPlan->recordTemporaryDef(sourceOperandIndex, mIndex);
}
break;
case OperandLifeTime::SUBGRAPH_INPUT:
mModelInputs.emplace_back(sourceOperandIndex, *toOperandIndex);
break;
case OperandLifeTime::SUBGRAPH_OUTPUT: // handled similarly to TEMPORARY_VARIABLE
if (kind == INPUT) {
// The first time we've seen this operand is as an
// input. That means it must be defined by a
// different partition, and is an input to this one.
mOutputsAsStepModelInputs.emplace_back(sourceOperandIndex, *toOperandIndex);
} else {
// The first time we've seen this operand is as an
// output.
mModelOutputs.emplace_back(sourceOperandIndex, *toOperandIndex);
}
break;
default:
CHECK(false);
break;
}
return ANEURALNETWORKS_NO_ERROR;
}
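// Adds an operation from the source model to the step model, converting its input
// and output operand indexes and folding the operation index into the compilation
// cache token.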
int ExecutionStep::addOperation(int operationIndex, const ModelBuilder& sourceModel) {
const Operation& operation = sourceModel.getOperation(operationIndex);
if (mToken.ok()) {
mToken.update(&operationIndex, sizeof(operationIndex));
}
// Convert the input and output operand indexes.
//
// We expect operations to be added in topological order. Therefore:
//
// - We may not have seen an input if it is a model input, a
// constant, or an operand written by a different partition.
//
// - We should not have seen any outputs.
const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
std::vector<uint32_t> inputs(inputCount);
std::vector<uint32_t> outputs(outputCount);
auto addOperands = [this, &sourceModel](const hidl_vec<uint32_t>& globalOperands,
std::vector<uint32_t>& localOperands,
OperandKind kind) -> int {
const uint32_t operandCount = static_cast<uint32_t>(globalOperands.size());
for (uint32_t i = 0; i < operandCount; i++) {
uint32_t localOperand = ~0U;
int n = addOperand(globalOperands[i], &localOperand, sourceModel, kind);
if (n != ANEURALNETWORKS_NO_ERROR) return n;
localOperands[i] = localOperand;
}
return ANEURALNETWORKS_NO_ERROR;
};
int n;
if ((n = addOperands(operation.inputs, inputs, INPUT)) != ANEURALNETWORKS_NO_ERROR ||
(n = addOperands(operation.outputs, outputs, OUTPUT)) != ANEURALNETWORKS_NO_ERROR) {
return n;
}
return mStepModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
outputCount, outputs.data());
}
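// Maps the step model's inputs and outputs onto the executor. Each step model input
// or output is resolved to an offset within the shared temporary memory, to a main
// model input, or to a main model output.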
void ExecutionStep::mapInputsAndOutputs(
std::shared_ptr<StepExecutor> executor, const Memory* temporaryMemory,
const std::map<uint32_t, uint32_t>& sourceOperandToOffsetOfTemporary,
const std::map<uint32_t, uint32_t>& sourceOperandToInputIndex,
const std::map<uint32_t, uint32_t>& sourceOperandToOutputIndex) const {
auto mapInput = [&](uint32_t sourceOperandIndex, uint32_t stepInputIndex) {
if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
it != sourceOperandToOffsetOfTemporary.end()) {
executor->setInputFromTemporaryMemory(stepInputIndex, temporaryMemory, it->second);
} else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex);
it != sourceOperandToInputIndex.end()) {
executor->mapInput(it->second, stepInputIndex);
} else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
it != sourceOperandToOutputIndex.end()) {
executor->mapOutputToInput(it->second, stepInputIndex);
} else {
CHECK(false) << "Cannot map step input " << stepInputIndex << " from operand "
<< sourceOperandIndex;
}
};
auto mapOutput = [&](uint32_t sourceOperandIndex, uint32_t stepOutputIndex) {
if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
it != sourceOperandToOffsetOfTemporary.end()) {
executor->setOutputFromTemporaryMemory(stepOutputIndex, temporaryMemory, it->second);
} else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
it != sourceOperandToOutputIndex.end()) {
executor->mapOutput(it->second, stepOutputIndex);
} else {
CHECK(false) << "Cannot map step output " << stepOutputIndex << " from operand "
<< sourceOperandIndex;
}
};
for (uint32_t i = 0, n = mStepModelInputs.size(); i < n; ++i) {
mapInput(mStepModelInputs[i].first, i);
}
for (uint32_t i = 0, n = mStepModelOutputs.size(); i < n; ++i) {
mapOutput(mStepModelOutputs[i].first, i);
}
}
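// For each temporary operand that is consumed as a step model input by a step other
// than the one that defines it, records the operand as an output of the defining step.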
void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() {
for (const auto& step : mSteps) {
for (const auto& input : step->getTempsAsStepModelInputs()) {
const uint32_t sourceOperandIndex = input.first;
const auto it = mTemporaryToDefiningStep.find(sourceOperandIndex);
nnAssert(it != mTemporaryToDefiningStep.end());
const uint32_t stepIndex = it->second;
nnAssert(stepIndex < mSteps.size());
mSteps[stepIndex]->recordTempAsStepModelOutput(sourceOperandIndex);
}
}
}
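// Records that a temporary operand already added to this step must also be exposed
// as a step model output.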
void ExecutionStep::recordTempAsStepModelOutput(uint32_t sourceOperandIndex) {
const auto it = mOperandMap.find(sourceOperandIndex);
CHECK(it != mOperandMap.end());
mTempsAsStepModelOutputs.emplace(sourceOperandIndex, it->second);
}
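// Logs the step model's operand remappings (source operand index -> step model
// operand index) at COMPILATION verbosity.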
void ExecutionStep::logStepModel() const {
VLOG(COMPILATION) << "ExecutionStep::finishStepModel, step " << mIndex;
auto logRemapEntry = [](std::string& toLog, const std::pair<uint32_t, uint32_t>& e) {
if (!toLog.empty()) {
toLog += ", ";
}
toLog += "(";
toLog += std::to_string(e.first);
toLog += "->";
toLog += std::to_string(e.second);
toLog += ")";
};
auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) {
std::string toLog;
for (const auto& e : map) {
logRemapEntry(toLog, e);
}
VLOG(COMPILATION) << name << ": " << toLog;
};
auto logRemapSet = [&logRemapEntry](const char* name, const StepModelOutputSetType& set) {
std::string toLog;
for (const auto& e : set) {
logRemapEntry(toLog, e);
}
VLOG(COMPILATION) << name << ": " << toLog;
};
logRemapVector("step model inputs", mStepModelInputs);
logRemapVector("step model outputs", mStepModelOutputs);
logRemapVector("model inputs", mModelInputs);
logRemapVector("model outputs", mModelOutputs);
logRemapVector("temps as step model inputs", mTempsAsStepModelInputs);
logRemapSet("temps as step model outputs", mTempsAsStepModelOutputs);
logRemapVector("outputs as step model inputs", mOutputsAsStepModelInputs);
}
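// Finishes building the step model: flags outputs of unknown size, assembles the
// complete step model input and output lists, builds the index mappings back to the
// main model, and compiles the step model on this step's device.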
int ExecutionStep::finishStepModel(const ModelBuilder* mainModel, bool* hasOutputOfUnknownSize,
int32_t executionPreference, int32_t priority) {
CHECK(mDevice != nullptr);
for (const auto& stepModelOutput : mTempsAsStepModelOutputs) {
const Operand& operand = mStepModel.getOperand(stepModelOutput.second);
if (operand.dimensions.size() == 0) {
*hasOutputOfUnknownSize = true;
} else {
for (uint32_t dimension : operand.dimensions) {
if (dimension == 0) {
*hasOutputOfUnknownSize = true;
break;
}
}
}
if (*hasOutputOfUnknownSize) {
VLOG(COMPILATION) << "StepModelOutput (operand#" << stepModelOutput.first
<< " of source graph) has unknown size: " << toString(operand);
}
}
mStepModel.relaxComputationFloat32toFloat16(mainModel->isComputationFloat32RelaxedToFloat16());
mStepModelInputs.insert(mStepModelInputs.end(), mModelInputs.begin(), mModelInputs.end());
mStepModelInputs.insert(mStepModelInputs.end(), mTempsAsStepModelInputs.begin(),
mTempsAsStepModelInputs.end());
mStepModelInputs.insert(mStepModelInputs.end(), mOutputsAsStepModelInputs.begin(),
mOutputsAsStepModelInputs.end());
mStepModelOutputs.insert(mStepModelOutputs.end(), mModelOutputs.begin(), mModelOutputs.end());
mStepModelOutputs.insert(mStepModelOutputs.end(), mTempsAsStepModelOutputs.begin(),
mTempsAsStepModelOutputs.end());
std::map<uint32_t, uint32_t> mainModelOperandToInputIndex;
for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
mainModelOperandToInputIndex[mainModel->getInputOperandIndex(i)] = i;
}
std::map<uint32_t, uint32_t> mainModelOperandToOutputIndex;
for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
mainModelOperandToOutputIndex[mainModel->getOutputOperandIndex(i)] = i;
}
// mInputIndexStepModelToMainModel is ordered by step model input index and relies on
// mModelInputs being the first inputs, as specified by mStepModelInputs.
mInputIndexStepModelToMainModel.resize(mModelInputs.size());
std::transform(mModelInputs.begin(), mModelInputs.end(),
mInputIndexStepModelToMainModel.begin(),
[&mainModelOperandToInputIndex](auto& e) {
uint32_t sourceOperandIndex = e.first;
return mainModelOperandToInputIndex[sourceOperandIndex];
});
// mOutputIndexStepModelToMainModel is ordered by step model output index and relies on
// mModelOutputs being the first outputs, as specified by mStepModelOutputs.
mOutputIndexStepModelToMainModel.resize(mModelOutputs.size());
std::transform(mModelOutputs.begin(), mModelOutputs.end(),
mOutputIndexStepModelToMainModel.begin(),
[&mainModelOperandToOutputIndex](auto& e) {
uint32_t sourceOperandIndex = e.first;
return mainModelOperandToOutputIndex[sourceOperandIndex];
});
// mOutputsAsStepModelInputsIndexToMainModel is indexed by position within
// mOutputsAsStepModelInputs and maps each such step model input to the corresponding main model
// output index.
mOutputsAsStepModelInputsIndexToMainModel.resize(mOutputsAsStepModelInputs.size());
std::transform(mOutputsAsStepModelInputs.begin(), mOutputsAsStepModelInputs.end(),
mOutputsAsStepModelInputsIndexToMainModel.begin(),
[&mainModelOperandToOutputIndex](auto& e) {
uint32_t sourceOperandIndex = e.first;
return mainModelOperandToOutputIndex[sourceOperandIndex];
});
if (VLOG_IS_ON(COMPILATION)) {
logStepModel();
}
std::vector<uint32_t> inputs(mStepModelInputs.size());
std::vector<uint32_t> outputs(mStepModelOutputs.size());
std::transform(mStepModelInputs.begin(), mStepModelInputs.end(), inputs.begin(),
[](auto& e) { return e.second; });
std::transform(mStepModelOutputs.begin(), mStepModelOutputs.end(), outputs.begin(),
[](auto& e) { return e.second; });
NN_RETURN_IF_ERROR(mStepModel.identifyInputsAndOutputs(inputs.size(), inputs.data(),
outputs.size(), outputs.data()));
NN_RETURN_IF_ERROR(mStepModel.finish());
// TODO: Move compilation elsewhere?
VLOG(COMPILATION) << "ExecutionStep::finishStepModel, compilation on " << mDevice->getName();
return compile(*mDevice, mStepModel, executionPreference, priority, {}, *mPlan->getCacheDir(),
&mToken, &mPreparedStepModel);
}
void ExecutionStep::dump() const {
if (VLOG_IS_ON(COMPILATION)) {
VLOG(COMPILATION) << "ExecutionStep#" << mIndex << " for " << mDevice->getName();
logModelToInfo(mStepModel.makeHidlModel());
}
}
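// Finishes and compiles every step model, then records the mapping from source
// operand index to main model input/output index. Fails if any step model output
// has unknown size.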
int ExecutionPlan::CompoundBody::finish(const ModelBuilder* mainModel, int32_t executionPreference,
int32_t priority, const OptionalTimePoint& deadline) {
CHECK(deadline.getDiscriminator() == OptionalTimePoint::hidl_discriminator::none);
findTempsAsStepModelOutputs();
for (const auto& step : mSteps) {
int n = step->finishStepModel(mainModel, &mHasStepModelOutputOfUnknownSize,
executionPreference, priority);
if (n != ANEURALNETWORKS_NO_ERROR) {
VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- finishStepModel failed";
return n;
}
}
if (mHasStepModelOutputOfUnknownSize) {
VLOG(COMPILATION)
<< "ExecutionPlan::CompoundBody::finish -- mHasStepModelOutputOfUnknownSize";
return ANEURALNETWORKS_OP_FAILED;
}
for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
mSourceOperandToInputIndex[mainModel->getInputOperandIndex(i)] = i;
}
for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
mSourceOperandToOutputIndex[mainModel->getOutputOperandIndex(i)] = i;
}
mSuccessfulFinish = true;
return ANEURALNETWORKS_NO_ERROR;
}
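// Compiles the entire model on the single chosen device.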
int ExecutionPlan::SimpleBody::finish(const ModelBuilder*, int32_t executionPreference,
int32_t priority, const OptionalTimePoint& deadline) {
CHECK(mDevice != nullptr);
VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
const int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheDir,
&mToken, &mPreparedModel);
mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
return n;
}
int ExecutionPlan::finish(const ModelBuilder* mainModel, int32_t executionPreference,
int32_t priority, const OptionalTimePoint& deadline) {
CHECK(mBody != nullptr);
return mBody->finish(mainModel, executionPreference, priority, deadline);
}
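// Creates the controller, allocating a shared memory region large enough to hold
// all temporaries that cross partition boundaries. On allocation failure, the
// controller is marked bad so that next() will fail.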
ExecutionPlan::Controller::Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
const BurstBuilder* burstBuilder,
std::map<uint32_t, uint32_t> sourceOperandToOffsetOfTemporary,
uint32_t totalSizeOfTemporaries)
: mPlan(plan),
mExecutionBuilder(executionBuilder),
mBurstBuilder(burstBuilder),
mSourceOperandToOffsetOfTemporary(std::move(sourceOperandToOffsetOfTemporary)),
mNextStepIndex(0),
mLastStepIndex(kBadStepIndex) {
if (totalSizeOfTemporaries) {
int n;
std::tie(n, mTemporaries) = MemoryAshmem::create(totalSizeOfTemporaries);
if (n != ANEURALNETWORKS_NO_ERROR) {
LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
mNextStepIndex = kBadStepIndex;
}
}
}
// Attempts to create a burst object for each PreparedModel/Partition. If a
// burst controller cannot be made, returns nullptr in its place to indicate
// that the regular execution path should be used. This can occur either
// because the PreparedModel was nullptr (the CPU was the best choice) or
// because the IPreparedModel was of an insufficient version or failed to
// configure the burst.
std::vector<std::shared_ptr<ExecutionBurstController>> ExecutionPlan::makeBursts(
int preference) const {
switch (mState) {
// burst object for each partition in the compound case
case COMPOUND: {
std::vector<std::shared_ptr<ExecutionBurstController>> bursts;
bursts.reserve(compound()->mSteps.size());
for (const auto& step : compound()->mSteps) {
if (const auto preparedModel = step->getPreparedStepModel()) {
const bool preferPowerOverLatency =
(preference == ANEURALNETWORKS_PREFER_LOW_POWER);
bursts.push_back(
preparedModel->configureExecutionBurst(preferPowerOverLatency));
} else {
bursts.push_back(nullptr);
}
}
return bursts;
}
// single burst object for the simple case
case SIMPLE: {
std::vector<std::shared_ptr<ExecutionBurstController>> burst;
auto simpleBody = simple();
if (const auto preparedModel = simpleBody->mPreparedModel) {
const bool preferPowerOverLatency =
(preference == ANEURALNETWORKS_PREFER_LOW_POWER);
burst.push_back(preparedModel->configureExecutionBurst(preferPowerOverLatency));
} else {
burst.push_back(nullptr);
}
return burst;
}
// no burst objects made
default:
return {};
}
}
std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const {
CHECK(isValid());
// Create the layout for a Memory object big enough to hold
// every TEMPORARY in the source model that is live across
// partition boundaries.
//
// TODO: Rethink this approach for managing temporaries. Some
// alternatives:
//
// 1) Adopt a memory layout scheme analogous to stack allocation,
// where objects of non-overlapping lifetime can occupy the same
// storage. We would still have a single Memory object in this
// case.
//
// 2) Do something like what CpuExecutor does, and do allocations
// and deallocations on the fly (during execution) before first
// reference and after last reference, respectively. This would
// mean having one Memory object per TEMPORARY; or, in a more
// complicated implementation, one Memory object per set of
// temporaries that have the same lifetime. Note that the Android
// system limits the number of shared memory objects, which are
// what our Memory objects represent.
//
uint32_t totalSizeOfTemporaries = 0;
std::map<uint32_t, uint32_t> sourceOperandToOffsetOfTemporary;
if (mState == COMPOUND) {
const ModelBuilder* mainModel = executionBuilder->getModel();
for (const auto& step : compound()->mSteps) {
for (const auto& output : step->getTempsAsStepModelOutputs()) {
const uint32_t mainModelOperandIndex = output.first;
const Operand& mainModelOperand = mainModel->getOperand(mainModelOperandIndex);
const uint32_t size = TypeManager::get()->getSizeOfData(mainModelOperand);
totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
sourceOperandToOffsetOfTemporary.emplace(mainModelOperandIndex,
totalSizeOfTemporaries);
totalSizeOfTemporaries += size;
}
}
if (VLOG_IS_ON(EXECUTION)) {
for (const auto& io : sourceOperandToOffsetOfTemporary) {
VLOG(EXECUTION) << "temp: source operand index = " << io.first
<< ", offset = " << io.second;
}
}
}
return std::shared_ptr<Controller>(new Controller(this, executionBuilder, burstBuilder,
std::move(sourceOperandToOffsetOfTemporary),
totalSizeOfTemporaries));
}
// TODO: Find a better way to provide this functionality.
int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
std::shared_ptr<StepExecutor>* executor) const {
*executor = nullptr;
VLOG(EXECUTION) << "ExecutionPlan::fallback(" << controller << ", " << executor
<< "): mNextStepIndex = " << controller->mNextStepIndex;
if (controller->mLastStepIndex == Controller::kBadStepIndex) {
// We haven't called next().
return ANEURALNETWORKS_OP_FAILED;
}
if (controller->mNextStepIndex == Controller::kBadStepIndex) {
// The last call to next() did not produce an executor.
return ANEURALNETWORKS_OP_FAILED;
}
controller->mNextStepIndex = controller->mLastStepIndex;
return next(controller, executor);
}
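// Produces a StepExecutor (and, if requested, a burst controller) for the next step
// of the plan, or a null executor once all steps have been dispatched.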
int ExecutionPlan::next(std::shared_ptr<Controller> controller,
std::shared_ptr<StepExecutor>* executor,
std::shared_ptr<ExecutionBurstController>* burstController) const {
controller->mLastStepIndex = controller->mNextStepIndex;
*executor = nullptr;
if (burstController != nullptr) {
*burstController = nullptr;
}
VLOG(EXECUTION) << "ExecutionPlan::next(" << SHOW_IF_DEBUG(controller << ", " << executor)
<< "): mNextStepIndex = " << controller->mNextStepIndex;
if (controller->mNextStepIndex == Controller::kBadStepIndex) {
return ANEURALNETWORKS_OP_FAILED;
}
if (mState == EMPTY) {
nnAssert(controller->mNextStepIndex == 0); // end
controller->mNextStepIndex = Controller::kBadStepIndex;
return ANEURALNETWORKS_NO_ERROR;
}
if (mState == SIMPLE) {
if (controller->mNextStepIndex == 0) {
// First (and only) step.
auto simpleBody = simple();
*executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder,
simpleBody->mModel, simpleBody->mDevice,
simpleBody->mPreparedModel);
(*executor)->mapInputsAndOutputsTrivially();
if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
*burstController = controller->mBurstBuilder->getControllerAt(0);
}
controller->mNextStepIndex = 1;
return ANEURALNETWORKS_NO_ERROR;
}
nnAssert(controller->mNextStepIndex == 1); // end
controller->mNextStepIndex = Controller::kBadStepIndex;
return ANEURALNETWORKS_NO_ERROR;
}
auto compoundBody = compound();
if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
// end
controller->mNextStepIndex = Controller::kBadStepIndex;
return ANEURALNETWORKS_NO_ERROR;
}
const auto step = compoundBody->mSteps[controller->mNextStepIndex];
*executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
step->getDevice(), step->getPreparedStepModel());
(*executor)->setExecutionStep(step);
step->mapInputsAndOutputs(*executor, controller->mTemporaries.get(),
controller->mSourceOperandToOffsetOfTemporary,
compoundBody->mSourceOperandToInputIndex,
compoundBody->mSourceOperandToOutputIndex);
if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
*burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
}
controller->mNextStepIndex++;
return ANEURALNETWORKS_NO_ERROR;
}
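// Creates a new execution step for the given device, switching the plan from EMPTY
// to COMPOUND if necessary.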
std::shared_ptr<ExecutionStep> ExecutionPlan::createNewStep(const std::shared_ptr<Device> device) {
nnAssert(mState != SIMPLE);
if (mState == EMPTY) {
mBody = new CompoundBody();
mState = COMPOUND;
}
auto& steps = compound()->mSteps;
auto step = std::make_shared<ExecutionStep>(this, steps.size(), device);
steps.push_back(step);
return step;
}
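// Turns an EMPTY plan into a SIMPLE plan that runs the entire model on a single device.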
void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
const ModelBuilder* model) {
nnAssert(mState == EMPTY);
mBody = new SimpleBody(device, model, mCacheDir, mToken);
mState = SIMPLE;
}
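// Records which step defines a temporary operand that may cross a partition
// boundary; each such operand must be defined by exactly one step.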
void ExecutionPlan::recordTemporaryDef(uint32_t sourceOperandIndex, uint32_t stepIndex) {
auto [it, isNew] = compound()->mTemporaryToDefiningStep.emplace(sourceOperandIndex, stepIndex);
CHECK(isNew) << "Step " << stepIndex << " redefines temporary operand "
<< toString(sourceOperandIndex) << " already defined by step " << it->second;
}
void ExecutionPlan::dump() const {
if (mBody) {
mBody->dump();
} else {
VLOG(COMPILATION) << "EMPTY";
}
}
void ExecutionPlan::reset() {
if (mBody) {
delete mBody;
mBody = nullptr;
}
mState = EMPTY;
}
bool ExecutionPlan::isSimpleCpu() const {
return isSimple() && simple()->mDevice == DeviceManager::getCpuDevice();
}
ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
switch (mState) {
case EMPTY:
return Kind::EMPTY;
case SIMPLE:
nnAssert(mBody);
return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
case COMPOUND:
nnAssert(mBody);
return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
default:
nnAssert(!"unexpected state");
return Kind::ERROR;
}
}
std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
return simple()->mDevice;
}
const std::vector<std::shared_ptr<ExecutionStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
return compound()->mSteps;
}
bool ExecutionPlan::forTest_hasStepModelOutputsOfUnknownSize() const {
return mBody->hasStepModelOutputsOfUnknownSize();
}
const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
return simple()->mToken.getCacheToken();
}
void ExecutionPlan::SimpleBody::dump() const {
VLOG(COMPILATION) << "SIMPLE for " << mDevice->getName();
}
void ExecutionPlan::CompoundBody::dump() const {
for (const auto& step : mSteps) {
step->dump();
}
}
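// In the SIMPLE case the whole model is a single step, so an input or output role of
// the parent model maps directly onto the single prepared model.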
void ExecutionPlan::SimpleBody::forEachStepRoleOfInput(uint32_t index,
const StepRoleCallback& callback) const {
callback(mPreparedModel.get(), IOType::INPUT, index);
}
void ExecutionPlan::SimpleBody::forEachStepRoleOfOutput(uint32_t index,
const StepRoleCallback& callback) const {
callback(mPreparedModel.get(), IOType::OUTPUT, index);
}
// Map an input role of the parent model to the input/output roles in the step models:
// - An input role of the parent model may be used as an input of multiple step-models.
// - An input role of the parent model should not be used as an output of any step-model.
void ExecutionPlan::CompoundBody::forEachStepRoleOfInput(uint32_t index,
const StepRoleCallback& callback) const {
for (const auto& step : mSteps) {
// Model input as step-model input.
const auto& inputMapping = step->getInputIndexStepModelToMainModel();
for (uint32_t i = 0; i < inputMapping.size(); i++) {
if (inputMapping[i] == index) {
callback(step->getPreparedStepModel().get(), IOType::INPUT, i);
}
}
}
}
// Map an output role of the parent model to the input/output roles in the step models:
// - An output role of the parent model may be used as an output of only a single step-model.
// - An output role of the parent model may be used as an input of multiple step-models.
void ExecutionPlan::CompoundBody::forEachStepRoleOfOutput(uint32_t index,
const StepRoleCallback& callback) const {
bool found = false;
for (const auto& step : mSteps) {
// Model output as step-model output.
if (!found) {
const auto& outputMapping = step->getOutputIndexStepModelToMainModel();
for (uint32_t i = 0; i < outputMapping.size(); i++) {
if (outputMapping[i] == index) {
callback(step->getPreparedStepModel().get(), IOType::OUTPUT, i);
found = true;
break;
}
}
}
// Model output as step-model input.
const auto& inputToOutputMapping = step->getOutputsAsStepModelInputsIndexToMainModel();
for (uint32_t i = 0; i < inputToOutputMapping.size(); i++) {
if (inputToOutputMapping[i] == index) {
callback(step->getPreparedStepModel().get(), IOType::INPUT, i);
}
}
}
}
int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
uint32_t preference, uint32_t priority,
const OptionalTimePoint& deadline, ExecutionPlan* plan) const {
// This function uses a heuristic approach to partitioning the graph.
// It should be good enough for the first release.
const size_t deviceCount = devices.size();
const size_t operationCount = mOperations.size();
VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: deviceCount = " << deviceCount
<< ", operationCount = " << operationCount;
// Figure out where each operation will best execute.
// The value of the vector is the index in the devices vector.
std::vector<int> bestDeviceForOperation(operationCount);
NN_RETURN_IF_ERROR(
findBestDeviceForEachOperation(preference, devices, &bestDeviceForOperation));
// If one device will run all the operations, we don't need to split the work.
if (std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
const int bestDeviceIndex = bestDeviceForOperation[0];
VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
<< bestDeviceIndex << " = " << devices[bestDeviceIndex]->getName();
plan->becomeSingleStep(devices[bestDeviceIndex], this);
return plan->finish(this, preference, priority, deadline);
}
// No easy solution, we need to split the work.
// We keep track of the operations that are ready to run for each device.
std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount);
// This helper function enqueues the operation on the appropriate queue.
auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
int deviceIndex = bestDeviceForOperation[operationIndex];
perDeviceQueue[deviceIndex].push(operationIndex);
VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
<< deviceIndex;
};
// This helper function finds a device that has operations ready to process.
// We start by looking at the CPU. We do this to try to maximize the
// size of the graph we'll send to non-CPU devices. If the CPU runs first,
// it will have the chance to prepare more of the inputs required by the
// other devices. This function returns -1 if all queues are empty.
auto findNextDeviceToProcess = [&]() -> int {
for (int i = deviceCount - 1; i >= 0; i--) {
if (!perDeviceQueue[i].empty()) {
return i;
}
}
return -1;
};
OperandTracker tracker(this, enqueueOnAppropriateDevice);
// For each iteration of this loop, we'll create an execution step.
while (true) {
// Find the device we'll do this step for.
int deviceIndex = findNextDeviceToProcess();
VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
if (deviceIndex < 0) {
break;
}
// Assign as much as possible to this device.
std::shared_ptr<ExecutionStep> step = plan->createNewStep(devices[deviceIndex]);
auto& queue = perDeviceQueue[deviceIndex];
while (!queue.empty()) {
uint32_t operationIndex = queue.front();
queue.pop();
int n = step->addOperation(operationIndex, *this);
if (n != ANEURALNETWORKS_NO_ERROR) {
LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
return n;
}
tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
}
}
int n = plan->finish(this, preference, priority, deadline);
if (VLOG_IS_ON(COMPILATION)) {
VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: ";
logModelToInfo(makeHidlModel());
plan->dump();
}
return n;
}
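// Returns the device's performance info for the operation, using the relaxed FP16
// figures when float32 computation has been relaxed to float16.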
PerformanceInfo ModelBuilder::getPerformanceInfo(const std::shared_ptr<Device> device,
uint32_t operationIndex) const {
const Operation& operation = getOperation(operationIndex);
// TODO This assumes that the type is dictated by the first operand. This is
// currently the case but is not a safe assumption to make in the long term.
const uint32_t operandIndex = operation.inputs[0];
const OperandType operandType = mOperands[operandIndex].type;
switch (operandType) {
case OperandType::FLOAT32:
if (mRelaxComputationFloat32toFloat16) {
return device->getRelaxedFloat32toFloat16PerformanceScalar();
}
break;
case OperandType::TENSOR_FLOAT32:
if (mRelaxComputationFloat32toFloat16) {
return device->getRelaxedFloat32toFloat16PerformanceTensor();
}
break;
default:
break;
}
return device->getPerformance(operandType);
}
namespace {
// This class determines whether a given device can execute a given operation.
class CanDo {
public:
CanDo() {}
void initialize(const MetaModel& metaModel, std::shared_ptr<Device> device) {
mSupportsOperationByIndex = device->getSupportedOperations(metaModel);
}
bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }
private:
std::vector<bool> mSupportsOperationByIndex;
};
} // anonymous namespace
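// For each operation, chooses the device with the best performance (execution time,
// or power usage when low power is preferred) among the devices that can execute it.
// Ties are broken in favor of the CPU device.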
int ModelBuilder::findBestDeviceForEachOperation(
uint32_t preference, const std::vector<std::shared_ptr<Device>>& devices,
std::vector<int>* bestDeviceForOperation) const {
const MetaModel metaModel(makeHidlModel(), DeviceManager::get()->strictSlicing());
const size_t deviceCount = devices.size();
std::vector<CanDo> canDo(deviceCount);
for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
canDo[deviceIndex].initialize(metaModel, devices[deviceIndex]);
}
// Figure out the best driver for each operation.
const size_t operationCount = mOperations.size();
for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
// Find which device, including CPU fallback, gives the best performance for this operation.
int bestChoice = -1;
float bestPerfVal = 0.0; // Do not check bestPerfVal if bestChoice < 0.
for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
const auto& device = devices[deviceIndex];
if (canDo[deviceIndex].check(operationIndex)) {
const PerformanceInfo perf = getPerformanceInfo(device, operationIndex);
const float perfVal =
(preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage
: perf.execTime);
if (bestChoice < 0 || perfVal < bestPerfVal ||
(perfVal == bestPerfVal && device == DeviceManager::getCpuDevice())) {
bestChoice = deviceIndex;
bestPerfVal = perfVal;
}
} else {
// Somewhat noisy logging, but this is the only place where the user of
// NNAPI can get feedback on why an operation was not run on a
// specific device.
// Logs O(operationCount * deviceCount) times, but
// typically deviceCount is very small.
VLOG(COMPILATION) << "Device " << device->getName() << " can't do operation "
<< toString(getOperation(operationIndex).type);
}
}
if (bestChoice < 0) {
LOG(ERROR) << "No driver can do the op";
return ANEURALNETWORKS_BAD_DATA;
}
(*bestDeviceForOperation)[operationIndex] = bestChoice;
VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
<< toString(getOperation(operationIndex).type) << ") = " << bestChoice
<< " (" << devices[bestChoice]->getName() << ")";
}
return ANEURALNETWORKS_NO_ERROR;
}
} // namespace nn
} // namespace android