Partial fix to allow partitions to have boundary temporaries of unknown size.
The old behavior was that we'd fall back to full model CPU execution at
compilation time; the new behavior is that we'll get ordinary
partitioned compilation and execution.
Limitations:
- Needs more testing and more tests written.
- The initial guess for the size of a boundary temporary is a single
element. Perhaps it would be useful to remember the actual size from
a previous execution.
- Fenced execution punts to unfenced execution (at the NDK API level)
when the plan contains subgraph outputs of unknown size.
- Operands of unknown size at control flow construct boundaries still
force a fallback to full model CPU execution.
Also adds some diagnostic logging.
Test: NeuralNetworksTest_static
Bug: 132458982
Merged-In: I52e7179ff9783d184fd6bfc1c9fefc55972e942a
Change-Id: I52e7179ff9783d184fd6bfc1c9fefc55972e942a
(cherry picked from commit d6183c8db7feb5e2bdf0d2907af01418e7da809e)
diff --git a/runtime/ExecutionBuilder.cpp b/runtime/ExecutionBuilder.cpp
index 0f94e43..e36e564 100644
--- a/runtime/ExecutionBuilder.cpp
+++ b/runtime/ExecutionBuilder.cpp
@@ -20,6 +20,7 @@
#include <algorithm>
#include <limits>
+#include <map>
#include <memory>
#include <mutex>
#include <optional>
@@ -46,6 +47,66 @@
using namespace hal;
+// Partial validation of output shapes returned from the driver, to ensure they
+// conform to a very specific set of rules.
+static bool validateOutputShapesFromDriver(ErrorStatus executionStatus, const ModelBuilder* model,
+ const std::vector<hal::OutputShape>& shapes) {
+ // Enforces the following rules (some of which are from b/154054474):
+ // - shapes vector is empty except in the case of NONE or OUTPUT_INSUFFICIENT_SIZE.
+ // If the vector is not empty, it must have as many entries as the step model has outputs.
+ // - If NONE, then either shapes vector is empty, or every shape is
+ // marked isSufficient and, if a tensor, has known rank.
+ // - If OUTPUT_INSUFFICIENT_SIZE, then the vector is not empty. At least one entry
+ // is marked !isSufficient.
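+// For example, for a step model with two tensor outputs: NONE with an empty
+// shapes vector is accepted, but NONE with a vector of length one (must be
+// empty or of length two) or containing a zero-rank tensor shape is rejected.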
+ switch (executionStatus) {
+ case ErrorStatus::NONE: {
+ NN_RET_CHECK(shapes.size() == 0 || shapes.size() == model->outputCount())
+ << "With execution ErrorStatus " << toString(executionStatus)
+ << " output shapes vector must be empty or of length " << model->outputCount()
+ << " but has length " << shapes.size();
+ NN_RET_CHECK(std::all_of(shapes.begin(), shapes.end(),
+ [](const OutputShape& shape) { return shape.isSufficient; }))
+ << "With execution ErrorStatus " << toString(executionStatus)
+ << " at least one output shape is unexpectedly marked !isSufficient";
+
+ const TypeManager* tm = TypeManager::get();
+ for (uint32_t outputIndex = 0, outputCount = shapes.size(); outputIndex < outputCount;
+ ++outputIndex) {
+ const hal::Operand& outputOperand = model->getOutputOperand(outputIndex);
+ NN_RET_CHECK(!tm->isTensorType(outputOperand.type) ||
+ (shapes[outputIndex].dimensions.size() != 0))
+ << "With execution ErrorStatus " << toString(executionStatus) << " output#"
+ << outputIndex << " shape unexpectedly has zero rank";
+ }
+
+ break;
+ }
+ case ErrorStatus::OUTPUT_INSUFFICIENT_SIZE: {
+ NN_RET_CHECK(shapes.size() == model->outputCount())
+ << "With execution ErrorStatus " << toString(executionStatus)
+ << " output shapes vector must be of length " << model->outputCount()
+ << " but has length " << shapes.size();
+ NN_RET_CHECK(std::any_of(shapes.begin(), shapes.end(),
+ [](const OutputShape& shape) { return !shape.isSufficient; }))
+ << "With execution ErrorStatus " << toString(executionStatus)
+ << " at least one output shape must have been marked !isSufficient";
+ break;
+ }
+ default: {
+ NN_RET_CHECK(shapes.size() == 0)
+ << "With execution ErrorStatus " << toString(executionStatus)
+ << " output shapes vector must be empty but has length " << shapes.size();
+ break;
+ }
+ }
+ return true;
+}
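+// Convenience overload for callers that have an ANEURALNETWORKS_* result code
+// rather than an ErrorStatus.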
+static bool validateOutputShapesFromDriver(int executionResultCode, const ModelBuilder* model,
+ const std::vector<hal::OutputShape>& shapes) {
+ return validateOutputShapesFromDriver(convertResultCodeToErrorStatus(executionResultCode),
+ model, shapes);
+}
+
const Timing kNoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};
static MeasureTiming measureTiming(const ExecutionBuilder* execution) {
@@ -497,7 +558,7 @@
static void asyncStartComputePartitioned(ExecutionBuilder* executionBuilder,
const ExecutionPlan& plan,
std::shared_ptr<ExecutionPlan::Controller> controller,
- bool allowFallback,
+ bool allowCpuFallback,
const std::optional<Deadline>& deadline,
const sp<ExecutionCallback>& executionCallback) {
CHECK(executionBuilder != nullptr);
@@ -505,8 +566,12 @@
std::vector<OutputShape> outputShapes = executionBuilder->getInitialOutputShapes();
Timing timing = kNoTiming;
- // Disallow fallback when the ExecutionPlan is simple on CPU.
- allowFallback &= !plan.isSimpleCpu();
+ // Disallow CPU fallback when the ExecutionPlan is simple on CPU.
+ allowCpuFallback &= !plan.isSimpleCpu();
+
+ // On this iteration, do I need to repeat the previous step because it
+ // reported insufficient size?
+ bool doInsufficientSizeFallback = false;
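+ // (If so, the step executor is re-obtained via plan.fallback() below
+ // rather than plan.next(), so the same step is re-executed with the
+ // newly grown dynamic temporaries.)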
while (true) {
VLOG(EXECUTION) << "looking for next StepExecutor";
@@ -514,13 +579,15 @@
// Get the current step of the execution.
std::shared_ptr<StepExecutor> executor;
std::shared_ptr<ExecutionBurstController> burstController;
- int n = plan.next(controller, &executor, &burstController);
+ int n = doInsufficientSizeFallback ? plan.fallback(controller, &executor, &burstController)
+ : plan.next(controller, &executor, &burstController);
+ doInsufficientSizeFallback = false;
if (n != ANEURALNETWORKS_NO_ERROR) {
// During the interpreted execution of control flow, a loop timeout
// might occur in ExecutionPlan::next().
bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
- if (allowFallback && !missedDeadline) break;
+ if (allowCpuFallback && !missedDeadline) break;
executionCallback->notify(convertResultCodeToErrorStatus(n), {}, kNoTiming);
return;
}
@@ -536,36 +603,57 @@
// Attempt to execute a single step of the execution.
auto [stepN, stepOutputShapes, stepTiming] = executor->compute(deadline, burstController);
- // Update global outputs.
- if (!executor->updateOutputShapes(stepOutputShapes, &outputShapes)) {
+ // Update global outputs and dynamic temporaries.
+ StepExecutor::UpdateOutputShapes updateOutputShapes = {};
+ if (!executor->updateOutputShapes(stepN, stepOutputShapes, &outputShapes,
+ &updateOutputShapes)) {
stepN = ANEURALNETWORKS_OP_FAILED;
}
// If execution was successful, continue to next step.
if (stepN == ANEURALNETWORKS_NO_ERROR) {
- // We only support collection of timing information in the case of a
- // single step, so it's safe to just keep track of the last step's
- // timing information.
- timing = stepTiming;
+ if (updateOutputShapes.zeroSizedInput) {
+ // We'll need to do full model CPU fallback.
+ VLOG(EXECUTION) << "updateOutputShapes.zeroSizedInput";
+ stepN = ANEURALNETWORKS_OP_FAILED;
+ } else {
+ CHECK(executor->areDynamicTemporariesAllocated());
+ // We only support collection of timing information in the case
+ // of a single step, so it's safe to just keep track of the last
+ // step's timing information.
+ timing = stepTiming;
+ continue;
+ }
+ }
+
+ if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
+ VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: " << toString(updateOutputShapes);
+ if (updateOutputShapes.mainOutputInsufficient ||
+ !updateOutputShapes.updatedDynamicTemporary) {
+ // Either:
+ // - At least one main model output is not of sufficient size; or
+ // - we didn't learn anything new about dynamic temporaries.
+ // Neither of these is recoverable, so end execution.
+ const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
+ executionCallback->notify(stepStatus, outputShapes, kNoTiming);
+ return;
+ }
+ // Every main model output is of sufficient size. This implies that
+ // at least one dynamic temporary is not of sufficient size. This
+ // is recoverable.
+ doInsufficientSizeFallback = true;
continue;
}
- // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution.
- if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
- const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
- executionCallback->notify(stepStatus, outputShapes, kNoTiming);
- return;
- }
-
- // If fallback is not allowed and there was an error, end execution.
- if (!allowFallback) {
+ // If CPU fallback is not allowed and there was an error, end execution.
+ if (!allowCpuFallback) {
const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
executionCallback->notify(stepStatus, {}, kNoTiming);
return;
}
// If CPU execution was already attempted, either:
- // (1) perform a full fallback if the plan is not simple, or
+ // (1) perform a full CPU fallback if the plan is not simple, or
// (2) return from the function with an error
if (executorIsCpu) {
if (!plan.isSimple()) break;
@@ -574,42 +662,77 @@
}
// If the code reaches this point, attempt a partial fallback to CPU.
- CHECK(allowFallback);
- auto [fallbackN, fallbackOutputShapes, fallbackTiming, fallbackExecutor] =
- cpuFallbackPartial(plan, controller);
-
- // Update global outputs.
- if (fallbackExecutor != nullptr &&
- !fallbackExecutor->updateOutputShapes(fallbackOutputShapes, &outputShapes)) {
- fallbackN = ANEURALNETWORKS_OP_FAILED;
+ CHECK(allowCpuFallback);
+ if (updateOutputShapes.zeroSizedInput) {
+ // Do not attempt a partial fallback.
+ break;
}
+ while (true) {
+ auto [fallbackN, fallbackOutputShapes, fallbackTiming, fallbackExecutor] =
+ cpuFallbackPartial(plan, controller);
- // If execution was successful, continue to next step.
- if (fallbackN == ANEURALNETWORKS_NO_ERROR) {
- // We only support collection of timing information in the case of a
- // single step, so it's safe to just keep track of the last step's
- // timing information.
- timing = fallbackTiming;
- continue;
- }
+ // Update global outputs and dynamic temporaries.
+ StepExecutor::UpdateOutputShapes fallbackUpdateOutputShapes = {};
+ if (fallbackExecutor != nullptr &&
+ !fallbackExecutor->updateOutputShapes(fallbackN, fallbackOutputShapes,
+ &outputShapes, &fallbackUpdateOutputShapes)) {
+ fallbackN = ANEURALNETWORKS_OP_FAILED;
+ }
- // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution.
- if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
- const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
- executionCallback->notify(fallbackStatus, outputShapes, kNoTiming);
- return;
- }
+ // If execution was successful, continue to next step.
+ if (fallbackN == ANEURALNETWORKS_NO_ERROR) {
+ if (fallbackUpdateOutputShapes.zeroSizedInput) {
+ // We'll need to do full model CPU fallback.
+ VLOG(EXECUTION) << "fallbackUpdateOutputShapes.zeroSizedInput";
+ fallbackN = ANEURALNETWORKS_OP_FAILED;
+ break;
+ }
+ CHECK(fallbackExecutor->areDynamicTemporariesAllocated());
+ // We only support collection of timing information in the case of a
+ // single step, so it's safe to just keep track of the last step's
+ // timing information.
+ timing = fallbackTiming;
+ goto nextStep;
+ }
- // Do not fallback twice if the ExecutionPlan is simple.
- if (plan.isSimple()) {
- const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
- executionCallback->notify(fallbackStatus, {}, kNoTiming);
- return;
+ if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
+ VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: "
+ << toString(fallbackUpdateOutputShapes);
+ if (fallbackUpdateOutputShapes.mainOutputInsufficient ||
+ !fallbackUpdateOutputShapes.updatedDynamicTemporary) {
+ // Either:
+ // - At least one main model output is not of sufficient size; or
+ // - we didn't learn anything new about dynamic temporaries.
+ // Neither of these is recoverable, so end execution.
+ const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
+ executionCallback->notify(fallbackStatus, outputShapes, kNoTiming);
+ return;
+ }
+ // Every main model output is of sufficient size. This implies
+ // that at least one dynamic temporary is not of sufficient
+ // size. This is recoverable.
+ continue;
+ }
+
+ // Do not fallback twice if the ExecutionPlan is simple.
+ if (plan.isSimple()) {
+ const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
+ executionCallback->notify(fallbackStatus, {}, kNoTiming);
+ return;
+ }
+
+ // If the code reaches this point, then there was an error with the
+ // fallback. In this case, attempt full fallback.
+ break;
}
// If the code reaches this point, then there was an error with the
// fallback. In this case, attempt full fallback.
break;
+
+ nextStep:
+ // Bottom of the outer loop
+ continue;
}
// If the code has reached this point, a potentially recoverable error
@@ -623,16 +746,28 @@
// In case of partitioned execution, startComputeFenced call will return the sync
// fence and the fenced compute callback returned from the last partition.
// Any failed partition will result in the whole execution fallback to CPU if
-// allowFallback is set to true.
+// allowCpuFallback is set to true.
static std::tuple<int, int, sp<hal::IFencedExecutionCallback>> startComputeFenced(
ExecutionBuilder* executionBuilder, const ExecutionPlan& plan,
std::shared_ptr<ExecutionPlan::Controller> controller, const std::vector<int>& waitFor,
uint64_t timeoutDurationAfterFence, const std::optional<Deadline>& deadline,
- bool allowFallback) {
+ bool allowCpuFallback) {
+ // We should have detected this earlier in the call chain and fallen back to
+ // non-fenced execution. This is an implementation limitation: In order to
+// support dynamic temporaries in this code, we'd need to implement
+ // something like the following:
+ // - If a partition has outputs of unknown size, execute that partition in a
+ // non fenced fashion, just as if it were scheduled on a driver that does
+ // not support fenced execution.
+ // - Implement something similar to the code in asyncStartComputePartitioned()
+ // that handles a step execution that fails with
+ // ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE.
+ CHECK(!executionBuilder->getCompilation()->hasDynamicTemporaries());
+
CHECK(executionBuilder != nullptr);
VLOG(EXECUTION) << "ExecutionBuilder::computeFenced (from plan, iteratively)";
// Disallow fallback when the ExecutionPlan is simple on CPU.
- allowFallback &= !plan.isSimpleCpu();
+ allowCpuFallback &= !plan.isSimpleCpu();
// Initiate waitForFds, syncFence for the first step.
std::vector<int> waitForFds = waitFor;
@@ -650,7 +785,7 @@
// might occur in ExecutionPlan::next().
bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
- if (allowFallback && !missedDeadline) break;
+ if (allowCpuFallback && !missedDeadline) break;
// Return -1 for the sync fence fd, and nullptr for the callback.
return std::make_tuple(n, -1, nullptr);
}
@@ -686,8 +821,8 @@
if (stepN == ANEURALNETWORKS_NO_ERROR) {
continue;
}
- // If fallback is not allowed and there was an error, end execution.
- if (!allowFallback) {
+ // If CPU fallback is not allowed and there was an error, end execution.
+ if (!allowCpuFallback) {
return std::make_tuple(stepN, -1, nullptr);
}
@@ -767,12 +902,13 @@
}
}
mStarted = true;
- const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
+ const bool allowCpuFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr);
VLOG(EXECUTION) << "ExecutionBuilder::computeFenced";
int result;
- std::tie(result, mSyncFenceFd, mFencedExecutionCallback) = startComputeFenced(
- this, *mPlan, controller, waitFor, timeoutDurationAfterFence, deadline, allowFallback);
+ std::tie(result, mSyncFenceFd, mFencedExecutionCallback) =
+ startComputeFenced(this, *mPlan, controller, waitFor, timeoutDurationAfterFence,
+ deadline, allowCpuFallback);
*syncFence = mSyncFenceFd;
return result;
}
@@ -826,14 +962,14 @@
// asynchronous thread -- take the asynchronous thread logic out of
// CpuPreparedModel::execute() and use it to wrap the plan-based-path.
mStarted = true;
- const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
+ const bool allowCpuFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
std::shared_ptr<ExecutionPlan::Controller> controller =
mPlan->makeController(this, burstBuilder);
if (synchronous) {
VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)";
sp<ExecutionCallback> localSynchronizationCallback = new ExecutionCallback();
localSynchronizationCallback->setOnFinish(wrappedFinish);
- asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline,
+ asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback, deadline,
localSynchronizationCallback);
localSynchronizationCallback->wait();
if (mMeasureTiming) {
@@ -854,13 +990,13 @@
executionCallback->setOnFinish(wrappedFinish);
if (DeviceManager::get()->syncExecRuntime()) {
VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)";
- asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline,
+ asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback, deadline,
executionCallback);
} else {
VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)";
std::thread asyncExecution(
- [this, controller, allowFallback, deadline, executionCallback] {
- asyncStartComputePartitioned(this, *mPlan, controller, allowFallback,
+ [this, controller, allowCpuFallback, deadline, executionCallback] {
+ asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback,
deadline, executionCallback);
});
executionCallback->bindThread(std::move(asyncExecution));
@@ -884,7 +1020,7 @@
}
// Check if the dimensions "to" is updatable by dimensions "from", where "from" must
-// have a higher specification level.
+// have a specification level no lower than that of "to".
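+// For example, {2, 0, 3} (where 0 denotes an unknown extent) is updatable by
+// {2, 4, 3}; but {2, 4, 3} is updatable neither by {2, 5, 3} nor by {2, 0, 3}.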
static bool isUpdatable(const std::vector<uint32_t>& to, const std::vector<uint32_t>& from) {
if (to.size() == 0) return true;
NN_RET_CHECK_EQ(to.size(), from.size());
@@ -894,7 +1030,17 @@
return true;
}
-bool ExecutionBuilder::updateOutputShapes(const std::vector<OutputShape>& outputShapes) {
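+// Detects a zero-sized tensor: a successful execution step reported a
+// sufficient output shape of known rank containing a zero extent
+// (e.g., {4, 0, 2}).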
+static bool isZeroSizedTensor(int executionResultCode, const OutputShape& outputShape) {
+ return (executionResultCode == ANEURALNETWORKS_NO_ERROR) && outputShape.isSufficient &&
+ outputShape.dimensions.size() &&
+ (std::find(outputShape.dimensions.begin(), outputShape.dimensions.end(), uint32_t(0)) !=
+ outputShape.dimensions.end());
+}
+
+bool ExecutionBuilder::updateOutputShapes(ErrorStatus status,
+ const std::vector<OutputShape>& outputShapes) {
+ NN_RET_CHECK(validateOutputShapesFromDriver(status, mModel, outputShapes));
+
if (outputShapes.size() == 0) {
return true;
}
@@ -927,7 +1073,7 @@
CHECK(!mFinishedWithoutSyncFence) << "ExecutionBuilder::finishWithoutSyncFence is called twice";
CHECK(!hasSyncFence())
<< "ExecutionBuilder::finishWithoutSyncFence is called when hasSyncFence()";
- if (!updateOutputShapes(outputShapes) || !updateMemories()) {
+ if (!updateOutputShapes(status, outputShapes) || !updateMemories()) {
status = ErrorStatus::GENERAL_FAILURE;
}
bool success = status == ErrorStatus::NONE;
@@ -951,19 +1097,124 @@
return status;
}
-bool StepExecutor::updateOutputShapes(const std::vector<OutputShape>& from,
- std::vector<OutputShape>* to) {
+std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes) {
+ return "{ .updatedDynamicTemporary = " +
+ std::to_string(updateOutputShapes.updatedDynamicTemporary) +
+ ", .mainOutputInsufficient = " +
+ std::to_string(updateOutputShapes.mainOutputInsufficient) +
+ ", .zeroSizedInput = " + std::to_string(updateOutputShapes.zeroSizedInput) + " }";
+}
+
+bool StepExecutor::updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from,
+ std::vector<OutputShape>* to, UpdateOutputShapes* update) {
+ CHECK(update != nullptr);
+ *update = {.updatedDynamicTemporary = false,
+ .mainOutputInsufficient = false,
+ .zeroSizedInput = false};
+
+ NN_RET_CHECK(validateOutputShapesFromDriver(executionResultCode, mModel, from));
+
if (from.size() == 0) {
return true;
}
+
+ if (VLOG_IS_ON(EXECUTION)) {
+ for (const auto& shape : from) {
+ VLOG(EXECUTION) << "updateOutputShapes: " << toString(shape);
+ }
+ }
+
if (mExecutionStep != nullptr) {
const auto& indexMapping = mExecutionStep->getOutputIndexStepModelToMainModel();
NN_RET_CHECK_LE(indexMapping.size(), from.size());
for (uint32_t i = 0, e = indexMapping.size(); i < e; i++) {
- uint32_t toIndex = indexMapping[i];
+ const uint32_t toIndex = indexMapping[i];
NN_RET_CHECK_GT(to->size(), toIndex);
NN_RET_CHECK(isUpdatable(to->at(toIndex).dimensions, from[i].dimensions));
(*to)[toIndex] = from[i];
+ update->mainOutputInsufficient |= !(*to)[toIndex].isSufficient;
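+ // A zero-sized main model output that also serves as an input to
+ // a downstream step cannot be consumed by the partitioned path;
+ // flagging it here causes the caller to fall back to full model
+ // CPU execution.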
+ if (mExecutionStep->getModelOutputsThatAreDownstreamInputs().count(toIndex) &&
+ isZeroSizedTensor(executionResultCode, from[i])) {
+ update->zeroSizedInput = true;
+ }
+ }
+
+ if (!mDynamicTemporaries->empty()) {
+ // TODO(b/157236079): Instead of computing this here, precompute it in ExecutionStep?
+ std::map<uint32_t, uint32_t> operandIndexStepModelOutputToSourceModelTemp;
+ for (const auto& entry : mExecutionStep->getTempsAsStepModelOutputs()) {
+ operandIndexStepModelOutputToSourceModelTemp.emplace(entry.second, entry.first);
+ }
+
+ const uint32_t sourceModelIndex = mExecutionStep->getSourceModelIndex();
+ for (uint32_t i = 0, e = mModel->outputCount(); i < e; i++) {
+ const uint32_t stepModelOperandIndex = mModel->getOutputOperandIndex(i);
+ const auto it =
+ operandIndexStepModelOutputToSourceModelTemp.find(stepModelOperandIndex);
+ if (it == operandIndexStepModelOutputToSourceModelTemp.end()) {
+ continue;
+ }
+ const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, it->second);
+ VLOG(EXECUTION) << "updateOutputShapes checking to see if output#" << i
+ << " sourceOperandIndex = (" << sourceOperandIndex.first << ", "
+ << sourceOperandIndex.second << ") is a dynamic temporary";
+ // This is a temporary, but it might not be a dynamic temporary.
+ const auto loc = mDynamicTemporaries->lookup(sourceOperandIndex, false);
+ if (loc == std::nullopt) {
+ continue;
+ }
+ NN_RET_CHECK(isUpdatable(*loc->dimensions, from[i].dimensions));
+ bool changedShape = false;
+ const uint32_t actualSize = TypeManager::get()->getSizeOfData(
+ mModel->getOperand(stepModelOperandIndex).type, from[i].dimensions);
+ if (actualSize > 0) {
+ changedShape = mDynamicTemporaries->redeclare(sourceOperandIndex,
+ from[i].dimensions, actualSize);
+ } else if (!from[i].isSufficient) {
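+ // The driver reported insufficient size but could not tell us
+ // the required size (dimensions are not fully specified), so
+ // double the previous length estimate; the overflow check below
+ // keeps the number of retries finite.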
+ NN_RET_CHECK(loc->length < UINT32_MAX / 2)
+ << "output#" << i << " length overflow";
+ changedShape = mDynamicTemporaries->redeclare(
+ sourceOperandIndex, from[i].dimensions, 2 * loc->length);
+ } else {
+ // The combination of not-fully-specified dimensions
+ // and isSufficient means that we have no
+ // information about whether the size of the dynamic
+ // temporary is adequate.
+ VLOG(EXECUTION) << "updateOutputShapes skipping redeclaration for output#" << i;
+ if (executionResultCode == ANEURALNETWORKS_NO_ERROR) {
+ NN_RET_CHECK(isZeroSizedTensor(executionResultCode, from[i]));
+ // This is a zero-sized tensor, and by
+ // definition, any dynamic temporary is an input
+ // to an execution step.
+ update->zeroSizedInput = true;
+ }
+ }
+ if (changedShape) {
+ // TODO: find a better place for this comment.
+ //
+ // isUpdatable(a, b) imposes a partial ordering a <=
+ // b. Every fully specified dimensions vector is an
+ // upper bound of that ordering. Therefore, any
+ // change in dimensions moves towards an upper
+ // bound, and hence there are a finite number of
+ // such changes possible.
+ //
+ // actualSize can only be computed from dimensions
+ // that are an upper bound. Therefore, once
+ // actualSize is computed, it will not change.
+ //
+ // If dimensions are not fully specified, and
+ // estimated size changes, it increases. There is
+ // an upper bound on estimated size to avoid
+ // overflow.
+ //
+ // Therefore, if we retry only when dimensions or
+ // size change, and we stop retrying if we would
+ // otherwise overflow, we should only retry a finite
+ // number of times.
+ update->updatedDynamicTemporary = true;
+ }
+ }
+ mDynamicTemporaries->vlogDump("finished updateOutputShapes");
}
} else {
NN_RET_CHECK_EQ(from.size(), to->size());
@@ -977,19 +1228,26 @@
StepExecutor::StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
std::shared_ptr<Device> device,
- std::shared_ptr<PreparedModel> preparedModel, const ExecutionStep* step)
+ std::shared_ptr<PreparedModel> preparedModel, const ExecutionStep* step,
+ DynamicTemporaries* dynamicTemporaries)
: mExecutionBuilder(executionBuilder),
mExecutionStep(step),
+ mDynamicTemporaries(dynamicTemporaries),
mModel(model),
mDevice(device),
mPreparedModel(preparedModel),
mInputs(model->inputCount()),
mOutputs(model->outputCount()) {
CHECK(mDevice != nullptr);
+ CHECK_EQ(step == nullptr, dynamicTemporaries == nullptr);
VLOG(EXECUTION) << "StepExecutor::StepExecutor with " << mInputs.size() << " inputs and "
<< mOutputs.size() << " outputs";
}
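+// Returns true if this executor has no dynamic temporaries to allocate, or if
+// all of its step's dynamic temporaries have been allocated.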
+bool StepExecutor::areDynamicTemporariesAllocated() const {
+ return !mDynamicTemporaries || mDynamicTemporaries->allocated(mExecutionStep->getIndex());
+}
+
void StepExecutor::mapInputsAndOutputsTrivially() {
mInputs = mExecutionBuilder->mInputs;
mOutputs = mExecutionBuilder->mOutputs;
@@ -1019,33 +1277,56 @@
int StepExecutor::setInputOrOutputFromMemory(const Operand& inputOrOutputOperand,
const Memory* memory, uint32_t offset,
+ const hal::hidl_vec<uint32_t>& dimensions,
+ std::optional<uint32_t> length,
ModelArgumentInfo* inputOrOutputInfo) {
// Should be similar to
// ExecutionBuilder::setInputFromMemory()
// ExecutionBuilder::setOutputFromMemory()
uint32_t poolIndex = mMemories.add(memory);
- uint32_t length = TypeManager::get()->getSizeOfData(inputOrOutputOperand);
+ uint32_t lengthVal = length.value_or(TypeManager::get()->getSizeOfData(inputOrOutputOperand));
CHECK(inputOrOutputInfo->unspecified());
int n;
std::tie(n, *inputOrOutputInfo) =
ModelArgumentInfo::createFromMemory(inputOrOutputOperand,
- /*type=*/nullptr, poolIndex, offset, length);
+ /*type=*/nullptr, poolIndex, offset, lengthVal);
+ if (n == ANEURALNETWORKS_NO_ERROR && dimensions.size()) {
+ CHECK(isUpdatable(inputOrOutputInfo->dimensions(), dimensions));
+ inputOrOutputInfo->dimensions() = dimensions;
+ }
return n;
}
+static std::string toString(const std::vector<uint32_t>& dimensions) {
+ std::string ret = "(";
+ bool wroteOne = false;
+ for (uint32_t dimension : dimensions) {
+ if (wroteOne) {
+ ret += ", ";
+ } else {
+ wroteOne = true;
+ }
+ ret += std::to_string(dimension);
+ }
+ ret += ")";
+ return ret;
+}
+
static void logArguments(const char* kind, const std::vector<ModelArgumentInfo>& args) {
for (unsigned i = 0; i < args.size(); i++) {
const auto& arg = args[i];
std::string prefix = kind + std::string("[") + std::to_string(i) + "] = ";
switch (arg.state()) {
case ModelArgumentInfo::POINTER:
- VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ")";
+ VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ") dim"
+ << toString(arg.dimensions());
break;
case ModelArgumentInfo::MEMORY:
VLOG(EXECUTION) << prefix << "MEMORY("
<< "pool=" << arg.locationAndLength().poolIndex << ", "
- << "off=" << arg.locationAndLength().offset << ")";
+ << "off=" << arg.locationAndLength().offset << ") dim"
+ << toString(arg.dimensions());
break;
case ModelArgumentInfo::HAS_NO_VALUE:
VLOG(EXECUTION) << prefix << "HAS_NO_VALUE";