Partial fix to allow partitions to have boundary temporaries of unknown size.
The old behavior was that we'd fall back to full model CPU execution at
compilation time; the new behavior is that we'll get ordinary
partitioned compilation and execution.
Limitations:
- Needs more testing, and more tests need to be written.
- The initial guess for the size of a boundary temporary is a single
element. Perhaps it would be useful to remember the actual size from
a previous execution.
- Fenced execution punts to unfenced execution (at the NDK API level)
when the plan contains subgraph outputs of unknown size.
- Operands of unknown size at control flow construct boundaries still
cause fallback to full model CPU execution.
Also adds some diagnostic logging.
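As a rough illustration (not part of this change): the kind of model that
previously forced full model CPU fallback at compilation time is one where
an operand of unknown shape joins two operations that the partitioner may
place on different devices. Below is a minimal NDK-level sketch, with all
error checking omitted; a dimension of 0 means "unknown extent".

    #include <android/NeuralNetworks.h>

    // Two ADDs joined by a rank-1 temporary of unknown extent. If the two
    // operations land in different partitions, operand 3 becomes a boundary
    // temporary of unknown size.
    void buildModel(ANeuralNetworksModel* model) {
        const uint32_t known[1] = {4};
        const uint32_t unknown[1] = {0};
        ANeuralNetworksOperandType knownTensor = {.type = ANEURALNETWORKS_TENSOR_FLOAT32,
                                                  .dimensionCount = 1,
                                                  .dimensions = known,
                                                  .scale = 0.0f,
                                                  .zeroPoint = 0};
        ANeuralNetworksOperandType unknownTensor = knownTensor;
        unknownTensor.dimensions = unknown;
        ANeuralNetworksOperandType actType = {.type = ANEURALNETWORKS_INT32,
                                              .dimensionCount = 0,
                                              .dimensions = nullptr,
                                              .scale = 0.0f,
                                              .zeroPoint = 0};

        ANeuralNetworksModel_addOperand(model, &knownTensor);    // 0: input a
        ANeuralNetworksModel_addOperand(model, &knownTensor);    // 1: input b
        ANeuralNetworksModel_addOperand(model, &actType);        // 2: fuse code
        ANeuralNetworksModel_addOperand(model, &unknownTensor);  // 3: temporary
        ANeuralNetworksModel_addOperand(model, &knownTensor);    // 4: output

        const int32_t fuseNone = ANEURALNETWORKS_FUSED_NONE;
        ANeuralNetworksModel_setOperandValue(model, 2, &fuseNone, sizeof(fuseNone));

        const uint32_t add0In[] = {0, 1, 2}, add0Out[] = {3};
        ANeuralNetworksModel_addOperation(model, ANEURALNETWORKS_ADD, 3, add0In, 1, add0Out);
        const uint32_t add1In[] = {3, 1, 2}, add1Out[] = {4};
        ANeuralNetworksModel_addOperation(model, ANEURALNETWORKS_ADD, 3, add1In, 1, add1Out);

        const uint32_t inputs[] = {0, 1}, outputs[] = {4};
        ANeuralNetworksModel_identifyInputsAndOutputs(model, 2, inputs, 1, outputs);
        ANeuralNetworksModel_finish(model);
    }

With this change, compiling such a model yields an ordinary multi-step plan;
previously mHasStepModelOutputOfUnknownSize caused plan finish to fail and
the runtime to fall back to a single-step CPU plan.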
Test: NeuralNetworksTest_static
Bug: 132458982
Merged-In: I52e7179ff9783d184fd6bfc1c9fefc55972e942a
Change-Id: I52e7179ff9783d184fd6bfc1c9fefc55972e942a
(cherry picked from commit d6183c8db7feb5e2bdf0d2907af01418e7da809e)
diff --git a/common/CpuExecutor.cpp b/common/CpuExecutor.cpp
index 9f24775..8d23c0a 100644
--- a/common/CpuExecutor.cpp
+++ b/common/CpuExecutor.cpp
@@ -1914,6 +1914,8 @@
const RunTimeOperandInfo& from = operands[operandIndex];
mOutputShapes[i].dimensions = from.dimensions;
mOutputShapes[i].isSufficient = from.isSufficient();
+ VLOG(EXECUTION) << "CpuExecutor::setOutputShapes: mOutputShapes[" << i
+ << "] = " << toString(mOutputShapes[i]);
}
}
diff --git a/common/OperationsUtils.cpp b/common/OperationsUtils.cpp
index d1814b7..f0bcb0e 100644
--- a/common/OperationsUtils.cpp
+++ b/common/OperationsUtils.cpp
@@ -356,7 +356,7 @@
if (dim1 != dim2 && dim1 != 1 && dim2 != 1) {
LOG(ERROR) << "Dimensions mismatch for broadcast:\n"
<< "First tensor: dimension " << numberOfDims1 - i << " of size " << dim1
- << "\nSecond tensor: dimension " << numberOfDims2 - i << "of size " << dim2;
+ << "\nSecond tensor: dimension " << numberOfDims2 - i << " of size " << dim2;
return false;
}
out->dimensions[maxDims - i] = (dim1 == 1) ? dim2 : dim1;
diff --git a/runtime/CompilationBuilder.cpp b/runtime/CompilationBuilder.cpp
index 8b2a269..051ac88 100644
--- a/runtime/CompilationBuilder.cpp
+++ b/runtime/CompilationBuilder.cpp
@@ -63,7 +63,8 @@
mPlan.setCaching(&mCacheDir, mToken);
}
if (mPartitioning) {
- int n = mModel->partitionTheWork(mDevices, mPreference, mPriority, deadline, &mPlan);
+ int n = mModel->partitionTheWork(mDevices, mPreference, mPriority, deadline, &mPlan,
+ mFailPartitioning);
switch (n) {
case ANEURALNETWORKS_NO_ERROR:
return n;
@@ -96,7 +97,7 @@
VLOG(COMPILATION) << "CompilationBuilder::finish with CPU fallback";
mPlan.reset();
mPlan.becomeSingleStep(DeviceManager::getCpuDevice(), mModel);
- return mPlan.finish(mPreference, mPriority, deadline);
+ return mPlan.finish(mPreference, mPriority, deadline, ANEURALNETWORKS_NO_ERROR);
}
int CompilationBuilder::setPreference(int32_t preference) {
@@ -166,9 +167,9 @@
return ANEURALNETWORKS_NO_ERROR;
}
-int CompilationBuilder::setPartitioning(uint32_t partitioning) {
+int CompilationBuilder::forTest_setPartitioning(uint32_t partitioning) {
if (mFinished) {
- LOG(ERROR) << "ANeuralNetworksCompilation_setPartitioning can't modify after compilation "
+ LOG(ERROR) << "CompilationBuilder::forTest_setPartitioning can't modify after compilation "
"finished";
return ANEURALNETWORKS_BAD_STATE;
}
@@ -177,6 +178,17 @@
return ANEURALNETWORKS_NO_ERROR;
}
+int CompilationBuilder::forTest_failPartitioning(int fail) {
+ if (mFinished) {
+ LOG(ERROR) << "CompilationBuilder::forTest_failPartitioning can't modify after compilation "
+ "finished";
+ return ANEURALNETWORKS_BAD_STATE;
+ }
+
+ mFailPartitioning = fail;
+ return ANEURALNETWORKS_NO_ERROR;
+}
+
int CompilationBuilder::createExecution(ExecutionBuilder** execution) {
if (!mFinished) {
LOG(ERROR) << "ANeuralNetworksExecution_create passed an unfinished compilation";
diff --git a/runtime/CompilationBuilder.h b/runtime/CompilationBuilder.h
index d94fb18..0f2db4d 100644
--- a/runtime/CompilationBuilder.h
+++ b/runtime/CompilationBuilder.h
@@ -47,8 +47,6 @@
int setPreference(int32_t preference);
- int setPartitioning(uint32_t partitioning);
-
int setCaching(const std::string& cacheDir, const uint8_t* token);
int setPriority(int32_t priority);
@@ -66,10 +64,17 @@
int forEachStepRoleOfInput(uint32_t index, const StepRoleCallback& callback) const;
int forEachStepRoleOfOutput(uint32_t index, const StepRoleCallback& callback) const;
- const ExecutionPlan& forTest_getExecutionPlan() const { return mPlan; }
-
bool createdWithExplicitDeviceList() const { return mExplicitDeviceList; }
+ bool hasDynamicTemporaries() const { return mPlan.hasDynamicTemporaries(); }
+
+ // These functions are solely intended for use by unit tests of the
+ // partitioning algorithm.
+ const ExecutionPlan& forTest_getExecutionPlan() const { return mPlan; }
+ int forTest_setPartitioning(uint32_t partitioning);
+ int forTest_failPartitioning(
+ int resultCode); // If not ANEURALNETWORKS_NO_ERROR, then simulate partitioning failure
+
private:
const ModelBuilder* mModel;
@@ -83,6 +88,9 @@
// we can override this later.
uint32_t mPartitioning;
+ // For testing purposes, simulate partitioning failure.
+ int mFailPartitioning = ANEURALNETWORKS_NO_ERROR;
+
// Once the compilation has been finished, we should not allow further
// modifications to the compilation.
bool mFinished = false;
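For orientation, a hypothetical test fragment driving the two forTest_ hooks
above (the compilation object and the DeviceManager partitioning constants
come from the existing runtime test infrastructure; the expected outcomes are
only sketched):

    CompilationBuilder* c = /* obtained from a finished ModelBuilder */;
    c->forTest_setPartitioning(DeviceManager::kPartitioningWithoutFallback);
    c->forTest_failPartitioning(ANEURALNETWORKS_OP_FAILED);
    int n = c->finish();
    // With fallback disallowed, the simulated failure should surface in n.
    // With kPartitioningWithFallback, finish() would instead retry as a
    // single-step CPU plan (see CompilationBuilder.cpp above).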
diff --git a/runtime/ExecutionBuilder.cpp b/runtime/ExecutionBuilder.cpp
index 0f94e43..e36e564 100644
--- a/runtime/ExecutionBuilder.cpp
+++ b/runtime/ExecutionBuilder.cpp
@@ -20,6 +20,7 @@
#include <algorithm>
#include <limits>
+#include <map>
#include <memory>
#include <mutex>
#include <optional>
@@ -46,6 +47,66 @@
using namespace hal;
+// Partial validation of output shapes returned from driver, to ensure they
+// conform to a very specific set of rules.
+static bool validateOutputShapesFromDriver(ErrorStatus executionStatus, const ModelBuilder* model,
+ const std::vector<hal::OutputShape>& shapes) {
+ // Enforces the following rules (some of which are from b/154054474):
+ // - shapes vector is empty except in the case of NONE or OUTPUT_INSUFFICIENT_SIZE.
+ // If the vector is not empty, it must have as many entries as the step model has outputs.
+ // - If NONE, then either shapes vector is empty, or every shape is
+ // marked isSufficient and, if a tensor, has known rank.
+ // - If OUTPUT_INSUFFICIENT_SIZE, then the vector is not empty. At least one entry
+ // is marked !isSufficient.
+ switch (executionStatus) {
+ case ErrorStatus::NONE: {
+ NN_RET_CHECK(shapes.size() == 0 || shapes.size() == model->outputCount())
+ << "With execution ErrorStatus " << toString(executionStatus)
+ << " output shapes vector must be empty or of length " << model->outputCount()
+ << " but has length " << shapes.size();
+ NN_RET_CHECK(std::all_of(shapes.begin(), shapes.end(),
+ [](const OutputShape& shape) { return shape.isSufficient; }))
+ << "With execution ErrorStatus " << toString(executionStatus)
+ << " at least one output shape is unexpectedly marked !isSufficient";
+
+ const TypeManager* tm = TypeManager::get();
+ for (uint32_t outputIndex = 0, outputCount = shapes.size(); outputIndex < outputCount;
+ ++outputIndex) {
+ const hal::Operand& outputOperand = model->getOutputOperand(outputIndex);
+ NN_RET_CHECK(!tm->isTensorType(outputOperand.type) ||
+ (shapes[outputIndex].dimensions.size() != 0))
+ << "With execution ErrorStatus " << toString(executionStatus) << " output#"
+ << outputIndex << " shape unexpectedly has zero rank";
+ }
+
+ break;
+ }
+ case ErrorStatus::OUTPUT_INSUFFICIENT_SIZE: {
+ NN_RET_CHECK(shapes.size() == model->outputCount())
+ << "With execution ErrorStatus " << toString(executionStatus)
+ << " output shapes vector must be of length " << model->outputCount()
+ << " but has length " << shapes.size();
+ NN_RET_CHECK(std::any_of(shapes.begin(), shapes.end(),
+ [](const OutputShape& shape) { return !shape.isSufficient; }))
+ << "With execution ErrorStatus " << toString(executionStatus)
+ << " at least one output shape must have been marked !isSufficient";
+ break;
+ }
+ default: {
+ NN_RET_CHECK(shapes.size() == 0)
+ << "With execution ErrorStatus " << toString(executionStatus)
+ << " output shapes vector must be empty but has length " << shapes.size();
+ break;
+ }
+ }
+ return true;
+}
+static bool validateOutputShapesFromDriver(int executionResultCode, const ModelBuilder* model,
+ const std::vector<hal::OutputShape>& shapes) {
+ return validateOutputShapesFromDriver(convertResultCodeToErrorStatus(executionResultCode),
+ model, shapes);
+}
+
const Timing kNoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};
static MeasureTiming measureTiming(const ExecutionBuilder* execution) {
@@ -497,7 +558,7 @@
static void asyncStartComputePartitioned(ExecutionBuilder* executionBuilder,
const ExecutionPlan& plan,
std::shared_ptr<ExecutionPlan::Controller> controller,
- bool allowFallback,
+ bool allowCpuFallback,
const std::optional<Deadline>& deadline,
const sp<ExecutionCallback>& executionCallback) {
CHECK(executionBuilder != nullptr);
@@ -505,8 +566,12 @@
std::vector<OutputShape> outputShapes = executionBuilder->getInitialOutputShapes();
Timing timing = kNoTiming;
- // Disallow fallback when the ExecutionPlan is simple on CPU.
- allowFallback &= !plan.isSimpleCpu();
+ // Disallow CPU fallback when the ExecutionPlan is simple on CPU.
+ allowCpuFallback &= !plan.isSimpleCpu();
+
+ // On this iteration, do I need to repeat the previous step because it
+ // reported insufficient size?
+ bool doInsufficientSizeFallback = false;
while (true) {
VLOG(EXECUTION) << "looking for next StepExecutor";
@@ -514,13 +579,15 @@
// Get the current step of the execution.
std::shared_ptr<StepExecutor> executor;
std::shared_ptr<ExecutionBurstController> burstController;
- int n = plan.next(controller, &executor, &burstController);
+ int n = doInsufficientSizeFallback ? plan.fallback(controller, &executor, &burstController)
+ : plan.next(controller, &executor, &burstController);
+ doInsufficientSizeFallback = false;
if (n != ANEURALNETWORKS_NO_ERROR) {
// During the interpreted execution of control flow, a loop timeout
// might occur in ExecutionPlan::next().
bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
- if (allowFallback && !missedDeadline) break;
+ if (allowCpuFallback && !missedDeadline) break;
executionCallback->notify(convertResultCodeToErrorStatus(n), {}, kNoTiming);
return;
}
@@ -536,36 +603,57 @@
// Attempt to execute a single step of the execution.
auto [stepN, stepOutputShapes, stepTiming] = executor->compute(deadline, burstController);
- // Update global outputs.
- if (!executor->updateOutputShapes(stepOutputShapes, &outputShapes)) {
+ // Update global outputs and dynamic temporaries.
+ StepExecutor::UpdateOutputShapes updateOutputShapes = {};
+ if (!executor->updateOutputShapes(stepN, stepOutputShapes, &outputShapes,
+ &updateOutputShapes)) {
stepN = ANEURALNETWORKS_OP_FAILED;
}
// If execution was successful, continue to next step.
if (stepN == ANEURALNETWORKS_NO_ERROR) {
- // We only support collection of timing information in the case of a
- // single step, so it's safe to just keep track of the last step's
- // timing information.
- timing = stepTiming;
+ if (updateOutputShapes.zeroSizedInput) {
+ // We'll need to do full model CPU fallback
+ VLOG(EXECUTION) << "updateOutputShapes.zeroSizedInput";
+ stepN = ANEURALNETWORKS_OP_FAILED;
+ } else {
+ CHECK(executor->areDynamicTemporariesAllocated());
+ // We only support collection of timing information in the case
+ // of a single step, so it's safe to just keep track of the last
+ // step's timing information.
+ timing = stepTiming;
+ continue;
+ }
+ }
+
+ if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
+ VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: " << toString(updateOutputShapes);
+ if (updateOutputShapes.mainOutputInsufficient ||
+ !updateOutputShapes.updatedDynamicTemporary) {
+ // Either:
+ // - At least one main model output is not of sufficient size; or
+ // - we didn't learn anything new about dynamic temporaries.
+ // Neither of these is recoverable, so end execution.
+ const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
+ executionCallback->notify(stepStatus, outputShapes, kNoTiming);
+ return;
+ }
+ // Every main model output is of sufficient size. This implies that
+ // at least one dynamic temporary is not of sufficient size. This
+ // is recoverable.
+ doInsufficientSizeFallback = true;
continue;
}
- // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution.
- if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
- const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
- executionCallback->notify(stepStatus, outputShapes, kNoTiming);
- return;
- }
-
- // If fallback is not allowed and there was an error, end execution.
- if (!allowFallback) {
+ // If CPU fallback is not allowed and there was an error, end execution.
+ if (!allowCpuFallback) {
const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
executionCallback->notify(stepStatus, {}, kNoTiming);
return;
}
// If CPU execution was already attempted, either:
- // (1) perform a full fallback if the plan is not simple, or
+ // (1) perform a full CPU fallback if the plan is not simple, or
// (2) return from the function with an error
if (executorIsCpu) {
if (!plan.isSimple()) break;
@@ -574,42 +662,77 @@
}
// If the code reaches this point, attempt a partial fallback to CPU.
- CHECK(allowFallback);
- auto [fallbackN, fallbackOutputShapes, fallbackTiming, fallbackExecutor] =
- cpuFallbackPartial(plan, controller);
-
- // Update global outputs.
- if (fallbackExecutor != nullptr &&
- !fallbackExecutor->updateOutputShapes(fallbackOutputShapes, &outputShapes)) {
- fallbackN = ANEURALNETWORKS_OP_FAILED;
+ CHECK(allowCpuFallback);
+ if (updateOutputShapes.zeroSizedInput) {
+ // Do not attempt a partial fallback.
+ break;
}
+ while (true) {
+ auto [fallbackN, fallbackOutputShapes, fallbackTiming, fallbackExecutor] =
+ cpuFallbackPartial(plan, controller);
- // If execution was successful, continue to next step.
- if (fallbackN == ANEURALNETWORKS_NO_ERROR) {
- // We only support collection of timing information in the case of a
- // single step, so it's safe to just keep track of the last step's
- // timing information.
- timing = fallbackTiming;
- continue;
- }
+ // Update global outputs and dynamic temporaries.
+ StepExecutor::UpdateOutputShapes fallbackUpdateOutputShapes = {};
+ if (fallbackExecutor != nullptr &&
+ !fallbackExecutor->updateOutputShapes(fallbackN, fallbackOutputShapes,
+ &outputShapes, &fallbackUpdateOutputShapes)) {
+ fallbackN = ANEURALNETWORKS_OP_FAILED;
+ }
- // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution.
- if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
- const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
- executionCallback->notify(fallbackStatus, outputShapes, kNoTiming);
- return;
- }
+ // If execution was successful, continue to next step.
+ if (fallbackN == ANEURALNETWORKS_NO_ERROR) {
+ if (fallbackUpdateOutputShapes.zeroSizedInput) {
+ // We'll need to do full model CPU fallback
+ VLOG(EXECUTION) << "fallbackUpdateOutputShapes.zeroSizedInput";
+ fallbackN = ANEURALNETWORKS_OP_FAILED;
+ break;
+ }
+ CHECK(fallbackExecutor->areDynamicTemporariesAllocated());
+ // We only support collection of timing information in the case of a
+ // single step, so it's safe to just keep track of the last step's
+ // timing information.
+ timing = fallbackTiming;
+ goto nextStep;
+ }
- // Do not fallback twice if the ExecutionPlan is simple.
- if (plan.isSimple()) {
- const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
- executionCallback->notify(fallbackStatus, {}, kNoTiming);
- return;
+ if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
+ VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: "
+ << toString(fallbackUpdateOutputShapes);
+ if (fallbackUpdateOutputShapes.mainOutputInsufficient ||
+ !fallbackUpdateOutputShapes.updatedDynamicTemporary) {
+ // Either:
+ // - At least one main model output is not of sufficient size; or
+ // - we didn't learn anything new about dynamic temporaries.
+ // Neither of these is recoverable, so end execution.
+ const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
+ executionCallback->notify(fallbackStatus, outputShapes, kNoTiming);
+ return;
+ }
+ // Every main model output is of sufficient size. This implies
+ // that at least one dynamic temporary is not of sufficient
+ // size. This is recoverable.
+ continue;
+ }
+
+ // Do not fallback twice if the ExecutionPlan is simple.
+ if (plan.isSimple()) {
+ const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
+ executionCallback->notify(fallbackStatus, {}, kNoTiming);
+ return;
+ }
+
+ // If the code reaches this point, then there was an error with the
+ // fallback. In this case, attempt full fallback.
+ break;
}
// If the code reaches this point, then there was an error with the
// fallback. In this case, attempt full fallback.
break;
+
+ nextStep:
+ // Bottom of the outer loop
+ continue;
}
// If the code has reached this point, a potentially recoverable error
@@ -623,16 +746,28 @@
// In case of partitioned execution, startComputeFenced call will return the sync
// fence and the fenced compute callback returned from the last partition.
// Any failed partition will result in the whole execution fallback to CPU if
-// allowFallback is set to true.
+// allowCpuFallback is set to true.
static std::tuple<int, int, sp<hal::IFencedExecutionCallback>> startComputeFenced(
ExecutionBuilder* executionBuilder, const ExecutionPlan& plan,
std::shared_ptr<ExecutionPlan::Controller> controller, const std::vector<int>& waitFor,
uint64_t timeoutDurationAfterFence, const std::optional<Deadline>& deadline,
- bool allowFallback) {
+ bool allowCpuFallback) {
+ // We should have detected this earlier in the call chain and fallen back to
+ // non-fenced execution. This is an implementation limitation: In order to
+ // support dynamic temporaries in this code, we'd need to implement
+ // something like the following:
+ // - If a partition has outputs of unknown size, execute that partition in a
+ // non-fenced fashion, just as if it were scheduled on a driver that does
+ // not support fenced execution.
+ // - Implement something similar to the code in asyncStartComputePartitioned()
+ // that handles a step execution that fails with
+ // ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE.
+ CHECK(!executionBuilder->getCompilation()->hasDynamicTemporaries());
+
CHECK(executionBuilder != nullptr);
VLOG(EXECUTION) << "ExecutionBuilder::computeFenced (from plan, iteratively)";
// Disallow fallback when the ExecutionPlan is simple on CPU.
- allowFallback &= !plan.isSimpleCpu();
+ allowCpuFallback &= !plan.isSimpleCpu();
// Initiate waitForFds, syncFence for the first step.
std::vector<int> waitForFds = waitFor;
@@ -650,7 +785,7 @@
// might occur in ExecutionPlan::next().
bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
- if (allowFallback && !missedDeadline) break;
+ if (allowCpuFallback && !missedDeadline) break;
// Return -1 for the sync fence fd, and nullptr for the callback.
return std::make_tuple(n, -1, nullptr);
}
@@ -686,8 +821,8 @@
if (stepN == ANEURALNETWORKS_NO_ERROR) {
continue;
}
- // If fallback is not allowed and there was an error, end execution.
- if (!allowFallback) {
+ // If CPU fallback is not allowed and there was an error, end execution.
+ if (!allowCpuFallback) {
return std::make_tuple(stepN, -1, nullptr);
}
@@ -767,12 +902,13 @@
}
}
mStarted = true;
- const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
+ const bool allowCpuFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr);
VLOG(EXECUTION) << "ExecutionBuilder::computeFenced";
int result;
- std::tie(result, mSyncFenceFd, mFencedExecutionCallback) = startComputeFenced(
- this, *mPlan, controller, waitFor, timeoutDurationAfterFence, deadline, allowFallback);
+ std::tie(result, mSyncFenceFd, mFencedExecutionCallback) =
+ startComputeFenced(this, *mPlan, controller, waitFor, timeoutDurationAfterFence,
+ deadline, allowCpuFallback);
*syncFence = mSyncFenceFd;
return result;
}
@@ -826,14 +962,14 @@
// asynchronous thread -- take the asynchronous thread logic out of
// CpuPreparedModel::execute() and use it to wrap the plan-based-path.
mStarted = true;
- const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
+ const bool allowCpuFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
std::shared_ptr<ExecutionPlan::Controller> controller =
mPlan->makeController(this, burstBuilder);
if (synchronous) {
VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)";
sp<ExecutionCallback> localSynchronizationCallback = new ExecutionCallback();
localSynchronizationCallback->setOnFinish(wrappedFinish);
- asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline,
+ asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback, deadline,
localSynchronizationCallback);
localSynchronizationCallback->wait();
if (mMeasureTiming) {
@@ -854,13 +990,13 @@
executionCallback->setOnFinish(wrappedFinish);
if (DeviceManager::get()->syncExecRuntime()) {
VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)";
- asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline,
+ asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback, deadline,
executionCallback);
} else {
VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)";
std::thread asyncExecution(
- [this, controller, allowFallback, deadline, executionCallback] {
- asyncStartComputePartitioned(this, *mPlan, controller, allowFallback,
+ [this, controller, allowCpuFallback, deadline, executionCallback] {
+ asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback,
deadline, executionCallback);
});
executionCallback->bindThread(std::move(asyncExecution));
@@ -884,7 +1020,7 @@
}
// Check if the dimensions "to" is updatable by dimensions "from", where "from" must
-// have a higher specification level.
+// have no lower a specification level.
static bool isUpdatable(const std::vector<uint32_t>& to, const std::vector<uint32_t>& from) {
if (to.size() == 0) return true;
NN_RET_CHECK_EQ(to.size(), from.size());
@@ -894,7 +1030,17 @@
return true;
}
-bool ExecutionBuilder::updateOutputShapes(const std::vector<OutputShape>& outputShapes) {
+static bool isZeroSizedTensor(int executionResultCode, const OutputShape& outputShape) {
+ return (executionResultCode == ANEURALNETWORKS_NO_ERROR) && outputShape.isSufficient &&
+ outputShape.dimensions.size() &&
+ (std::find(outputShape.dimensions.begin(), outputShape.dimensions.end(), uint32_t(0)) !=
+ outputShape.dimensions.end());
+}
+
+bool ExecutionBuilder::updateOutputShapes(ErrorStatus status,
+ const std::vector<OutputShape>& outputShapes) {
+ NN_RET_CHECK(validateOutputShapesFromDriver(status, mModel, outputShapes));
+
if (outputShapes.size() == 0) {
return true;
}
@@ -927,7 +1073,7 @@
CHECK(!mFinishedWithoutSyncFence) << "ExecutionBuilder::finishWithoutSyncFence is called twice";
CHECK(!hasSyncFence())
<< "ExecutionBuilder::finishWithoutSyncFence is called when hasSyncFence()";
- if (!updateOutputShapes(outputShapes) || !updateMemories()) {
+ if (!updateOutputShapes(status, outputShapes) || !updateMemories()) {
status = ErrorStatus::GENERAL_FAILURE;
}
bool success = status == ErrorStatus::NONE;
@@ -951,19 +1097,124 @@
return status;
}
-bool StepExecutor::updateOutputShapes(const std::vector<OutputShape>& from,
- std::vector<OutputShape>* to) {
+std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes) {
+ return "{ .updatedDynamicTemporary = " +
+ std::to_string(updateOutputShapes.updatedDynamicTemporary) +
+ ", .mainOutputInsufficient = " +
+ std::to_string(updateOutputShapes.mainOutputInsufficient) + "}";
+}
+
+bool StepExecutor::updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from,
+ std::vector<OutputShape>* to, UpdateOutputShapes* update) {
+ CHECK(update != nullptr);
+ *update = {.updatedDynamicTemporary = false,
+ .mainOutputInsufficient = false,
+ .zeroSizedInput = false};
+
+ NN_RET_CHECK(validateOutputShapesFromDriver(executionResultCode, mModel, from));
+
if (from.size() == 0) {
return true;
}
+
+ if (VLOG_IS_ON(EXECUTION)) {
+ for (const auto& shape : from) {
+ VLOG(EXECUTION) << "updateOutputShapes: " << toString(shape);
+ }
+ }
+
if (mExecutionStep != nullptr) {
const auto& indexMapping = mExecutionStep->getOutputIndexStepModelToMainModel();
NN_RET_CHECK_LE(indexMapping.size(), from.size());
for (uint32_t i = 0, e = indexMapping.size(); i < e; i++) {
- uint32_t toIndex = indexMapping[i];
+ const uint32_t toIndex = indexMapping[i];
NN_RET_CHECK_GT(to->size(), toIndex);
NN_RET_CHECK(isUpdatable(to->at(toIndex).dimensions, from[i].dimensions));
(*to)[toIndex] = from[i];
+ update->mainOutputInsufficient |= !(*to)[toIndex].isSufficient;
+ if (mExecutionStep->getModelOutputsThatAreDownstreamInputs().count(toIndex) &&
+ isZeroSizedTensor(executionResultCode, from[i])) {
+ update->zeroSizedInput = true;
+ }
+ }
+
+ if (!mDynamicTemporaries->empty()) {
+ // TODO(b/157236079): Instead of computing this here, precompute it in ExecutionStep?
+ std::map<uint32_t, uint32_t> operandIndexStepModelOutputToSourceModelTemp;
+ for (const auto& entry : mExecutionStep->getTempsAsStepModelOutputs()) {
+ operandIndexStepModelOutputToSourceModelTemp.emplace(entry.second, entry.first);
+ }
+
+ const uint32_t sourceModelIndex = mExecutionStep->getSourceModelIndex();
+ for (uint32_t i = 0, e = mModel->outputCount(); i < e; i++) {
+ const uint32_t stepModelOperandIndex = mModel->getOutputOperandIndex(i);
+ const auto it =
+ operandIndexStepModelOutputToSourceModelTemp.find(stepModelOperandIndex);
+ if (it == operandIndexStepModelOutputToSourceModelTemp.end()) {
+ continue;
+ }
+ const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, it->second);
+ VLOG(EXECUTION) << "updateOutputShapes checking to see if output#" << i
+ << " sourceOperandIndex = (" << sourceOperandIndex.first << ", "
+ << sourceOperandIndex.second << ") is a dynamic temporary";
+ // This is a temporary, but it might not be a dynamic temporary.
+ const auto loc = mDynamicTemporaries->lookup(sourceOperandIndex, false);
+ if (loc == std::nullopt) {
+ continue;
+ }
+ NN_RET_CHECK(isUpdatable(*loc->dimensions, from[i].dimensions));
+ bool changedShape = false;
+ const uint32_t actualSize = TypeManager::get()->getSizeOfData(
+ mModel->getOperand(stepModelOperandIndex).type, from[i].dimensions);
+ if (actualSize > 0) {
+ changedShape = mDynamicTemporaries->redeclare(sourceOperandIndex,
+ from[i].dimensions, actualSize);
+ } else if (!from[i].isSufficient) {
+ NN_RET_CHECK(loc->length < UINT32_MAX / 2)
+ << "output#" << i << " length overflow";
+ changedShape = mDynamicTemporaries->redeclare(
+ sourceOperandIndex, from[i].dimensions, 2 * loc->length);
+ } else {
+ // The combination of not-fully-specified dimensions
+ // and isSufficient means that we have no
+ // information about whether the size of the dynamic
+ // temporary is adequate.
+ VLOG(EXECUTION) << "updateOutputShapes skipping redeclaration for output#" << i;
+ if (executionResultCode == ANEURALNETWORKS_NO_ERROR) {
+ NN_RET_CHECK(isZeroSizedTensor(executionResultCode, from[i]));
+ // This is a zero-sized tensor, and by
+ // definition, any dynamic temporary is an input
+ // to an execution step.
+ update->zeroSizedInput = true;
+ }
+ }
+ if (changedShape) {
+ // TODO: find a better place for this comment.
+ //
+ // isUpdatable(a, b) imposes a partial ordering a <=
+ // b. Every fully specified dimensions vector is an
+ // upper bound of that ordering. Therefore, any
+ // change in dimensions moves towards an upper
+ // bound, and hence there are a finite number of
+ // such changes possible.
+ //
+ // actualSize can only be computed from dimensions
+ // that are an upper bound. Therefore, once
+ // actualSize is computed, it will not change.
+ //
+ // If dimensions are not fully specified, and
+ // estimated size changes, it increases. There is
+ // an upper bound on estimated size to avoid
+ // overflow.
+ //
+ // Therefore, if we retry only when dimensions or
+ // size change, and we stop retrying if we would
+ // otherwise overflow, we should only retry a finite
+ // number of times.
+ update->updatedDynamicTemporary = true;
+ }
+ }
+ mDynamicTemporaries->vlogDump("finished updateOutputShapes");
}
} else {
NN_RET_CHECK_EQ(from.size(), to->size());
@@ -977,19 +1228,26 @@
StepExecutor::StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
std::shared_ptr<Device> device,
- std::shared_ptr<PreparedModel> preparedModel, const ExecutionStep* step)
+ std::shared_ptr<PreparedModel> preparedModel, const ExecutionStep* step,
+ DynamicTemporaries* dynamicTemporaries)
: mExecutionBuilder(executionBuilder),
mExecutionStep(step),
+ mDynamicTemporaries(dynamicTemporaries),
mModel(model),
mDevice(device),
mPreparedModel(preparedModel),
mInputs(model->inputCount()),
mOutputs(model->outputCount()) {
CHECK(mDevice != nullptr);
+ CHECK_EQ(step == nullptr, dynamicTemporaries == nullptr);
VLOG(EXECUTION) << "StepExecutor::StepExecutor with " << mInputs.size() << " inputs and "
<< mOutputs.size() << " outputs";
}
+bool StepExecutor::areDynamicTemporariesAllocated() const {
+ return !mDynamicTemporaries || mDynamicTemporaries->allocated(mExecutionStep->getIndex());
+}
+
void StepExecutor::mapInputsAndOutputsTrivially() {
mInputs = mExecutionBuilder->mInputs;
mOutputs = mExecutionBuilder->mOutputs;
@@ -1019,33 +1277,56 @@
int StepExecutor::setInputOrOutputFromMemory(const Operand& inputOrOutputOperand,
const Memory* memory, uint32_t offset,
+ const hal::hidl_vec<uint32_t>& dimensions,
+ std::optional<uint32_t> length,
ModelArgumentInfo* inputOrOutputInfo) {
// Should be similar to
// ExecutionBuilder::setInputFromMemory()
// ExecutionBuilder::setOutputFromMemory()
uint32_t poolIndex = mMemories.add(memory);
- uint32_t length = TypeManager::get()->getSizeOfData(inputOrOutputOperand);
+ uint32_t lengthVal = length.value_or(TypeManager::get()->getSizeOfData(inputOrOutputOperand));
CHECK(inputOrOutputInfo->unspecified());
int n;
std::tie(n, *inputOrOutputInfo) =
ModelArgumentInfo::createFromMemory(inputOrOutputOperand,
- /*type=*/nullptr, poolIndex, offset, length);
+ /*type=*/nullptr, poolIndex, offset, lengthVal);
+ if (n == ANEURALNETWORKS_NO_ERROR && dimensions.size()) {
+ CHECK(isUpdatable(inputOrOutputInfo->dimensions(), dimensions));
+ inputOrOutputInfo->dimensions() = dimensions;
+ }
return n;
}
+static std::string toString(std::vector<uint32_t> dimensions) {
+ std::string ret = "(";
+ bool wroteOne = false;
+ for (uint32_t dimension : dimensions) {
+ if (wroteOne) {
+ ret += ", ";
+ } else {
+ wroteOne = true;
+ }
+ ret += std::to_string(dimension);
+ }
+ ret += ")";
+ return ret;
+};
+
static void logArguments(const char* kind, const std::vector<ModelArgumentInfo>& args) {
for (unsigned i = 0; i < args.size(); i++) {
const auto& arg = args[i];
std::string prefix = kind + std::string("[") + std::to_string(i) + "] = ";
switch (arg.state()) {
case ModelArgumentInfo::POINTER:
- VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ")";
+ VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ") dim"
+ << toString(arg.dimensions());
break;
case ModelArgumentInfo::MEMORY:
VLOG(EXECUTION) << prefix << "MEMORY("
<< "pool=" << arg.locationAndLength().poolIndex << ", "
- << "off=" << arg.locationAndLength().offset << ")";
+ << "off=" << arg.locationAndLength().offset << ") dim"
+ << toString(arg.dimensions());
break;
case ModelArgumentInfo::HAS_NO_VALUE:
VLOG(EXECUTION) << prefix << "HAS_NO_VALUE";
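An editorial note on the growth policy in StepExecutor::updateOutputShapes()
above: each retry either adopts a now-final actual size or doubles a bounded
estimate, which is why the retry loop terminates. A self-contained sketch of
that policy follows; the names are hypothetical, and the real code operates
on DynamicTemporaries entries rather than a bare length.

    #include <cstdint>
    #include <optional>

    struct ShapeReport {
        bool isSufficient;                  // driver says the buffer was large enough
        std::optional<uint32_t> exactSize;  // set iff dimensions are fully specified
    };

    // Returns the next length to declare for a dynamic temporary, or nullopt
    // if nothing new was learned (or if doubling would overflow uint32_t).
    std::optional<uint32_t> nextLengthGuess(uint32_t currentLength, const ShapeReport& r) {
        if (r.exactSize) {
            return *r.exactSize;  // dimensions fully specified: size is now final
        }
        if (!r.isSufficient) {
            if (currentLength >= UINT32_MAX / 2) return std::nullopt;  // overflow guard
            return 2 * currentLength;  // insufficient, size unknown: double the guess
        }
        return std::nullopt;  // sufficient but size unknown: nothing learned
    }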
diff --git a/runtime/ExecutionBuilder.h b/runtime/ExecutionBuilder.h
index f61df4c..f9eff4e 100644
--- a/runtime/ExecutionBuilder.h
+++ b/runtime/ExecutionBuilder.h
@@ -19,6 +19,7 @@
#include <atomic>
#include <memory>
+#include <string>
#include <tuple>
#include <utility>
#include <vector>
@@ -38,6 +39,7 @@
class BurstBuilder;
class CompilationBuilder;
class Device;
+class DynamicTemporaries;
class ExecutionBurstController;
class ExecutionPlan;
class ExecutionStep;
@@ -134,7 +136,8 @@
const CompilationBuilder* mCompilation;
// Update output dimensional information from OutputShape to ModelArgumentInfo.
- bool updateOutputShapes(const std::vector<hal::OutputShape>& outputShapes);
+ bool updateOutputShapes(hal::ErrorStatus status,
+ const std::vector<hal::OutputShape>& outputShapes);
bool updateMemories();
@@ -226,9 +229,16 @@
// Contains the output index mapping from the excerpted "step" model to
// main model if the execution has multiple "steps". Must be nullptr
// otherwise.
+ // (step == nullptr) == (dynamicTemporaries == nullptr)
+ // dynamicTemporaries
+ // If the execution has multiple "steps", describes the temporaries
+ // of source models that do not have fully specified types and are outputs
+ // of "step" models. Must be nullptr otherwise.
+ // (step == nullptr) == (dynamicTemporaries == nullptr)
StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
std::shared_ptr<Device> device, std::shared_ptr<PreparedModel> preparedModel,
- const ExecutionStep* step = nullptr);
+ const ExecutionStep* step = nullptr,
+ DynamicTemporaries* dynamicTemporaries = nullptr);
// Map inputs and outputs from ExecutionBuilder to StepExecutor,
// in the case where we have a single-"step" execution (i.e., the executor
@@ -236,8 +246,17 @@
void mapInputsAndOutputsTrivially();
// Update output shapes with shapes returned from execution.
- bool updateOutputShapes(const std::vector<hal::OutputShape>& from,
- std::vector<hal::OutputShape>* to);
+ struct UpdateOutputShapes {
+ // These fields are meaningless unless updateOutputShapes() returns true
+ bool updatedDynamicTemporary; // did shape (dimensions, size) information change for at
+ // least one dynamic temporary?
+ bool mainOutputInsufficient; // is at least one main model output written by this execution
+ // marked !isSufficient?
+ bool zeroSizedInput; // is at least one output of this execution step a zero-sized tensor
+ // that needs to be read by some other step of the same execution?
+ };
+ bool updateOutputShapes(int executionResultCode, const std::vector<hal::OutputShape>& from,
+ std::vector<hal::OutputShape>* to, UpdateOutputShapes* update);
// Map inputs and outputs from ExecutionBuilder to StepExecutor,
// one at a time. Note that these are input/output indexes, not
@@ -252,15 +271,23 @@
mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mInputs[executorIndex]);
}
- // The input or output is assumed to have the size of the
- // corresponding operand.
- int setInputFromMemory(uint32_t inputIndex, const Memory* memory, uint32_t offset) {
+ // If no length is provided, the input or output is assumed to have the length
+ // of the operand. dimensions must either have zero rank or must be
+ // consistent with and at least as well specified as operand dimensions
+ // (i.e., either rank must match, or operand rank must be zero; and for each
+ // individual dimension, either dimension must match, or operand dimension
+ // must be zero).
+ int setInputFromMemory(uint32_t inputIndex, const Memory* memory, uint32_t offset,
+ const hal::hidl_vec<uint32_t>& dimensions = {},
+ std::optional<uint32_t> length = std::nullopt) {
return setInputOrOutputFromMemory(mModel->getInputOperand(inputIndex), memory, offset,
- &mInputs.at(inputIndex));
+ dimensions, length, &mInputs.at(inputIndex));
}
- int setOutputFromMemory(uint32_t outputIndex, const Memory* memory, uint32_t offset) {
+ int setOutputFromMemory(uint32_t outputIndex, const Memory* memory, uint32_t offset,
+ const hal::hidl_vec<uint32_t>& dimensions = {},
+ std::optional<uint32_t> length = std::nullopt) {
return setInputOrOutputFromMemory(mModel->getOutputOperand(outputIndex), memory, offset,
- &mOutputs.at(outputIndex));
+ dimensions, length, &mOutputs.at(outputIndex));
}
// Executes using the (driver, preparedModel) specified at construction time.
@@ -280,12 +307,24 @@
const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
const std::optional<Deadline>& deadline);
+ // Do the dynamic temporaries defined by this step have valid allocations?
+ // (true if there are no dynamic temporaries defined by this step.)
+ bool areDynamicTemporariesAllocated() const;
+
private:
void mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
ModelArgumentInfo* executorInputOrOutput);
+ // If no length is provided, the input or output is assumed to have the length
+ // of the corresponding operand. dimensions must either have zero rank or
+ // must be consistent with and at least as well specified as operand
+ // dimensions (i.e., either rank must match, or operand rank must be zero;
+ // and for each individual dimension, either dimension must match, or
+ // operand dimension must be zero).
int setInputOrOutputFromMemory(const hal::Operand& inputOrOutputOperand, const Memory* memory,
- uint32_t offset, ModelArgumentInfo* inputOrOutputInfo);
+ uint32_t offset, const hal::hidl_vec<uint32_t>& dimensions,
+ std::optional<uint32_t> length,
+ ModelArgumentInfo* inputOrOutputInfo);
std::tuple<int, std::vector<hal::OutputShape>, hal::Timing> computeWithMemories(
const std::optional<Deadline>& deadline, const std::vector<const Memory*>& memories,
@@ -295,7 +334,10 @@
ExecutionBuilder* mExecutionBuilder;
// describes the single execution step
- const ExecutionStep* mExecutionStep = nullptr;
+ const ExecutionStep* mExecutionStep;
+
+ // describes the dynamic temporaries
+ DynamicTemporaries* mDynamicTemporaries;
// model to be executed on the executor, in both original and
// compiled forms; and device on which to execute it
@@ -318,6 +360,8 @@
MemoryTracker mMemories;
};
+std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes);
+
} // namespace nn
} // namespace android
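An editorial note before the ExecutionPlan.cpp changes: they introduce the
DynamicTemporaries helper, whose lifecycle, pieced together from the methods
in this change, looks roughly as follows (the operand indices, dimensions,
and sizes here are made up):

    DynamicTemporaries dyn;
    const SourceOperandIndex op(/*sourceModelIndex=*/0, /*operandIndex=*/7);
    // Controller-creation time: declare() each dynamic temporary with a
    // deliberately small initial size guess, then freeze the set.
    dyn.declare(op, /*stepIndex=*/2, /*initialDimensions=*/{0}, /*initialLength=*/4);
    dyn.endDeclarations();

    // Execution time, before running step 2: lay out and back its temporaries.
    dyn.allocate(/*stepIndex=*/2);
    if (auto loc = dyn.lookup(op)) {
        // Bind loc->memory + loc->offset (loc->length bytes) as a step output.
    }

    // When the step reports better shape information, redeclare. A true
    // return means the shape changed; if the length grew, the old allocation
    // was invalidated, so allocate() must run again before the step is
    // retried (see ExecutionPlan::fallback()).
    if (dyn.redeclare(op, /*newDimensions=*/{3, 5}, /*newLength=*/60)) {
        dyn.allocate(/*stepIndex=*/2);
    }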
diff --git a/runtime/ExecutionPlan.cpp b/runtime/ExecutionPlan.cpp
index da0c003..97bfacd 100644
--- a/runtime/ExecutionPlan.cpp
+++ b/runtime/ExecutionPlan.cpp
@@ -180,8 +180,165 @@
}
}
+uint32_t addTemporaryOfSize(uint32_t* totalSizeOfTemporaries, uint32_t size) {
+ // TODO: what about overflow?
+ *totalSizeOfTemporaries += alignBytesNeeded(*totalSizeOfTemporaries, size);
+ const uint32_t offset = *totalSizeOfTemporaries;
+ *totalSizeOfTemporaries += size;
+ return offset;
+};
+
+std::string toString(SourceOperandIndex sourceOperandIndex) {
+ return "(" + std::to_string(sourceOperandIndex.first) + ", " +
+ std::to_string(sourceOperandIndex.second) + ")";
+};
+
+std::string toString(hidl_vec<uint32_t> dimensions) {
+ std::string ret = "(";
+ bool wroteOne = false;
+ for (uint32_t dimension : dimensions) {
+ if (wroteOne) {
+ ret += ", ";
+ } else {
+ wroteOne = true;
+ }
+ ret += std::to_string(dimension);
+ }
+ ret += ")";
+ return ret;
+};
+
} // namespace
+void DynamicTemporaries::vlogDump(const char* context) const {
+ if (empty()) {
+ return;
+ }
+ if (context) {
+ VLOG(EXECUTION) << "DynamicTemporaries: \"" << context << "\"";
+ }
+ for (const auto& temp : mSourceOperandToTemporary) {
+ VLOG(EXECUTION) << "DynamicTemporaries: sourceOperandIndex = " << toString(temp.first)
+ << ", stepIndex = " << temp.second.stepIndex
+ << ", offset = " << temp.second.offset
+ << ", dimensions = " << toString(temp.second.dimensions)
+ << ", length = " << temp.second.length;
+ }
+}
+
+void DynamicTemporaries::declare(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex,
+ const hidl_vec<uint32_t>& initialDimensions,
+ uint32_t initialLength) {
+ VLOG(EXECUTION) << "DynamicTemporaries::declare(sourceOperandIndex = "
+ << toString(sourceOperandIndex) << ", stepIndex = " << stepIndex
+ << ", initialDimensions = " << toString(initialDimensions)
+ << ", initialLength = " << initialLength << ")";
+ CHECK(!mDeclared);
+ CHECK_GT(initialLength, 0u);
+ auto [_, isNew] = mSourceOperandToTemporary.emplace(
+ sourceOperandIndex,
+ InternalLocationAndShape{stepIndex, 0, initialDimensions, initialLength});
+ CHECK(isNew);
+ mStepIndexToSourceOperandIndexes[stepIndex].emplace_back(sourceOperandIndex);
+}
+
+bool DynamicTemporaries::redeclare(SourceOperandIndex sourceOperandIndex,
+ const hidl_vec<uint32_t>& newDimensions, uint32_t newLength) {
+ auto createAndLogResult = [sourceOperandIndex, &newDimensions, newLength](bool changedShape) {
+ VLOG(EXECUTION) << "DynamicTemporaries::redeclare(sourceOperandIndex = "
+ << toString(sourceOperandIndex)
+ << ", newDimensions = " << toString(newDimensions)
+ << ", newLength = " << newLength << ") -> " << toString(changedShape);
+ return changedShape;
+ };
+
+ CHECK(mDeclared);
+ CHECK_GT(newLength, 0u);
+
+ InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
+ if (temp.length == newLength && temp.dimensions == newDimensions) {
+ return createAndLogResult(false);
+ }
+ if (temp.length < newLength) {
+ // Otherwise allocation remains valid, even if it may be suboptimal
+ // (because it uses more space than needed). Use case: Don't force
+ // client to allocate again just because the client reported more
+ // accurate shape information.
+ mAllocatedStepIndexes.erase(temp.stepIndex);
+ }
+ temp.length = newLength;
+ temp.dimensions = newDimensions;
+ return createAndLogResult(true);
+}
+
+int DynamicTemporaries::allocate(uint32_t stepIndex) {
+ VLOG(EXECUTION) << "DynamicTemporaries::allocate(stepIndex = " << stepIndex << ")";
+
+ CHECK(mDeclared);
+
+ const auto sourceOperandIndexesI = mStepIndexToSourceOperandIndexes.find(stepIndex);
+ if (sourceOperandIndexesI == mStepIndexToSourceOperandIndexes.end()) {
+ return ANEURALNETWORKS_NO_ERROR;
+ }
+
+ // perform layout
+ uint32_t newSize = 0;
+ for (const auto sourceOperandIndex : sourceOperandIndexesI->second) {
+ InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
+ temp.offset = addTemporaryOfSize(&newSize, temp.length);
+ }
+
+ // perform (re-)allocation
+ // TODO: Today we may shrink the allocation in order to avoid wasting memory. Is this important
+ // to conserve memory, or do we waste time reallocating?
+ const double kWaste = 0.2 /* arbitrary */; // Willing to waste space to avoid
+ // deallocation/reallocation overhead
+ auto& memory = mStepIndexToMemory[stepIndex];
+ const uint32_t oldSize = (memory ? memory->getSize() : 0);
+ if ((oldSize >= newSize) && (oldSize <= newSize * (1 + kWaste))) {
+ // Suitable allocation already exists; nothing to do
+ } else {
+ int n;
+ std::tie(n, memory) = MemoryAshmem::create(newSize);
+ if (n != ANEURALNETWORKS_NO_ERROR) {
+ LOG(ERROR) << "Failed to allocate dynamic temporaries of size " << newSize
+ << " for step " << stepIndex;
+ mAllocatedStepIndexes.erase(stepIndex);
+ return n;
+ }
+ }
+
+ mAllocatedStepIndexes.insert(stepIndex);
+ return ANEURALNETWORKS_NO_ERROR;
+}
+
+bool DynamicTemporaries::allocated(uint32_t stepIndex) const {
+ return (mStepIndexToSourceOperandIndexes.find(stepIndex) ==
+ mStepIndexToSourceOperandIndexes.end()) ||
+ mAllocatedStepIndexes.count(stepIndex);
+}
+
+std::optional<DynamicTemporaries::LocationAndShape> DynamicTemporaries::lookup(
+ SourceOperandIndex sourceOperandIndex, bool mustBeAllocated) const {
+ CHECK(mDeclared);
+ if (auto it = mSourceOperandToTemporary.find(sourceOperandIndex);
+ it != mSourceOperandToTemporary.end()) {
+ const InternalLocationAndShape& temp = it->second;
+ const bool isAllocated = allocated(temp.stepIndex);
+ if (mustBeAllocated) {
+ CHECK(isAllocated) << "Source operand " << toString(sourceOperandIndex)
+ << " must be allocated";
+ }
+ if (isAllocated) {
+ return LocationAndShape{mStepIndexToMemory.at(temp.stepIndex).get(), temp.offset,
+ &temp.dimensions, temp.length};
+ } else {
+ return LocationAndShape{nullptr, ~uint32_t(0), &temp.dimensions, temp.length};
+ }
+ }
+ return std::nullopt;
+}
+
ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex, uint32_t sourceModelIndex,
std::shared_ptr<Device> device)
: mPlan(plan),
@@ -283,6 +440,10 @@
// The first time we've seen this operand is as an
// output.
mModelOutputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
+ // It may be an input to a different partition, so keep track of
+ // it.
+ mPlan->recordOutputDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
+ mIndex);
}
} break;
case OperandLifeTime::SUBGRAPH: {
@@ -338,6 +499,7 @@
void ExecutionStep::mapInputsAndOutputs(
std::shared_ptr<StepExecutor> executor, const Memory* temporaryMemory,
const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOffsetOfTemporary,
+ const DynamicTemporaries& dynamicTemporaries,
const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex,
const std::map<SourceOperandIndex, ConstantReferenceLocation>&
@@ -347,6 +509,9 @@
if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
it != sourceOperandToOffsetOfTemporary.end()) {
executor->setInputFromMemory(stepInputIndex, temporaryMemory, it->second);
+ } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
+ executor->setInputFromMemory(stepInputIndex, loc->memory, loc->offset, *loc->dimensions,
+ loc->length);
} else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex);
it != sourceOperandToInputIndex.end()) {
executor->mapInput(it->second, stepInputIndex);
@@ -368,6 +533,9 @@
if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
it != sourceOperandToOffsetOfTemporary.end()) {
executor->setOutputFromMemory(stepOutputIndex, temporaryMemory, it->second);
+ } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
+ executor->setOutputFromMemory(stepOutputIndex, loc->memory, loc->offset,
+ *loc->dimensions, loc->length);
} else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
it != sourceOperandToOutputIndex.end()) {
executor->mapOutput(it->second, stepOutputIndex);
@@ -384,6 +552,32 @@
}
}
+void ExecutionPlan::CompoundBody::findModelOutputsThatAreDownstreamInputs() {
+ auto declareModelOutputIsDownstreamInput =
+ [this](const SourceOperandIndex& sourceOperandIndex) {
+ const auto it = mOutputToDefiningExecutionStep.find(sourceOperandIndex);
+ CHECK(it != mOutputToDefiningExecutionStep.end());
+ uint32_t stepIndex = it->second;
+ CHECK_LT(stepIndex, mSteps.size());
+ VLOG(COMPILATION)
+ << "ExecutionStep(" << stepIndex
+ << ")->declareModelOutputIsDownstreamInput(mSourceOperandToOutputIndex.at"
+ << toString(sourceOperandIndex) << ")";
+ CHECK(mSourceOperandToOutputIndex.find(sourceOperandIndex) !=
+ mSourceOperandToOutputIndex.end());
+ mSteps[stepIndex]->executionStep()->declareModelOutputIsDownstreamInput(
+ mSourceOperandToOutputIndex.at(sourceOperandIndex));
+ };
+ for (const auto& logicalStep : mSteps) {
+ if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
+ for (const auto& output : step->getOutputsAsStepModelInputs()) {
+ SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), output.first);
+ declareModelOutputIsDownstreamInput(sourceOperandIndex);
+ }
+ }
+ }
+}
+
void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() {
auto recordAsOutputIfTemporary = [this](const SourceOperandIndex& sourceOperandIndex) {
const auto it = mTemporaryToDefiningExecutionStep.find(sourceOperandIndex);
@@ -418,6 +612,17 @@
}
}
+void ExecutionStep::declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex) {
+ VLOG(COMPILATION) << "ExecutionStep(" << mIndex << ")::declareModelOutputIsDownstreamInput("
+ << mainModelOutputIndex << ")";
+ const auto it = std::find(mOutputIndexStepModelToMainModel.begin(),
+ mOutputIndexStepModelToMainModel.end(), mainModelOutputIndex);
+ CHECK(it != mOutputIndexStepModelToMainModel.end());
+ const uint32_t stepModelOutputIndex = it - mOutputIndexStepModelToMainModel.begin();
+ CHECK(stepModelOutputIndex < mModelOutputs.size());
+ mModelOutputsThatAreDownstreamInputs.insert(stepModelOutputIndex);
+}
+
void ExecutionStep::recordTempAsStepModelOutput(uint32_t stepOperandIndex) {
const auto it = mOperandMap.find(stepOperandIndex);
CHECK(it != mOperandMap.end());
@@ -610,7 +815,8 @@
int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels,
int32_t executionPreference, int32_t priority,
- const std::optional<Deadline>& deadline) {
+ const std::optional<Deadline>& deadline,
+ int simulateFailureResultCode) {
CHECK(!mSuccessfulFinish);
CHECK(!deadline.has_value());
const ModelBuilder* mainModel = sourceModels->getModel(kMainModelInSourceModels);
@@ -629,8 +835,8 @@
findTempsAsStepModelOutputs();
for (const auto& logicalStep : mSteps) {
if (ExecutionStep* step = logicalStep->tryExecutionStep()) {
- int n = step->finishStepModel(mainModel, &mHasStepModelOutputOfUnknownSize,
- executionPreference, priority);
+ int n = step->finishStepModel(mainModel, &mHasDynamicTemporaries, executionPreference,
+ priority);
if (n != ANEURALNETWORKS_NO_ERROR) {
VLOG(COMPILATION)
<< "ExecutionPlan::CompoundBody::finish -- finishStepModel failed";
@@ -657,10 +863,11 @@
CHECK(logicalStep->isGoto());
}
}
- if (mHasStepModelOutputOfUnknownSize) {
- VLOG(COMPILATION)
- << "ExecutionPlan::CompoundBody::finish -- mHasStepModelOutputOfUnknownSize";
- return ANEURALNETWORKS_OP_FAILED;
+
+ if (simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) {
+ VLOG(COMPILATION) << "ExecutionPlan::CompoundeBody::finish: simulating failure, ResultCode "
+ << simulateFailureResultCode;
+ return simulateFailureResultCode;
}
for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
@@ -673,6 +880,7 @@
}
findControlFlowBoundaryConstants(sourceModels);
+ findModelOutputsThatAreDownstreamInputs();
mSuccessfulFinish = true;
return ANEURALNETWORKS_NO_ERROR;
@@ -713,25 +921,32 @@
}
int ExecutionPlan::SimpleBody::finish(const SourceModels*, int32_t executionPreference,
- int32_t priority, const std::optional<Deadline>& deadline) {
+ int32_t priority, const std::optional<Deadline>& deadline,
+ int simulateFailureResultCode) {
CHECK(!mSuccessfulFinish);
CHECK(mDevice != nullptr);
VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
- const int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheDir,
- &mToken, &mPreparedModel);
+ int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheDir, &mToken,
+ &mPreparedModel);
+ if (n == ANEURALNETWORKS_NO_ERROR && simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) {
+ VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish: simulating failure, ResultCode "
+ << simulateFailureResultCode;
+ n = simulateFailureResultCode;
+ }
mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
return n;
}
int ExecutionPlan::finish(int32_t executionPreference, int32_t priority,
- const std::optional<Deadline>& deadline) {
+ const std::optional<Deadline>& deadline, int simulateFailureResultCode) {
CHECK(mBody != nullptr);
- return mBody->finish(&getSourceModels(), executionPreference, priority, deadline);
+ return mBody->finish(&getSourceModels(), executionPreference, priority, deadline,
+ simulateFailureResultCode);
}
ExecutionPlan::Controller::Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
const BurstBuilder* burstBuilder)
- : Controller(plan, executionBuilder, burstBuilder, 0, {}, {}, {}, {}, {}, {}) {}
+ : Controller(plan, executionBuilder, burstBuilder, 0, {}, {}, {}, {}, {}, {}, {}) {}
ExecutionPlan::Controller::Controller(
const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
@@ -741,7 +956,8 @@
std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex,
std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex,
const std::map<SourceOperandIndex, ConstantCopyLocation>& sourceOperandToConstantCopy,
- std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference)
+ std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference,
+ DynamicTemporaries dynamicTemporaries)
: mPlan(plan),
mExecutionBuilder(executionBuilder),
mBurstBuilder(burstBuilder),
@@ -750,6 +966,7 @@
mSourceOperandToInputIndex(std::move(sourceOperandToInputIndex)),
mSourceOperandToOutputIndex(std::move(sourceOperandToOutputIndex)),
mSourceOperandToConstantReference(std::move(sourceOperandToConstantReference)),
+ mDynamicTemporaries(std::move(dynamicTemporaries)),
mNextStepIndex(0),
mFallbackNextStepIndex(kBadStepIndex),
mLastStepSyncFd(-1) {
@@ -823,7 +1040,7 @@
return std::shared_ptr<Controller>(new Controller(this, executionBuilder, burstBuilder));
}
// Create the layout for a Memory object big enough to hold
- // - every partition boundary TEMPORARY operand and
+ // - every partition boundary TEMPORARY operand that is not a dynamic temporary, and
// - buffers required by the control flow implementation.
//
// TODO: Rethink this approach for managing temporaries. Some
@@ -844,21 +1061,17 @@
// what our Memory objects represent.
//
uint32_t totalSizeOfTemporaries = 0;
- auto addTemporaryOfSize = [&totalSizeOfTemporaries](uint32_t size) {
- totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
- const uint32_t offset = totalSizeOfTemporaries;
- totalSizeOfTemporaries += size;
- return offset;
- };
// This function has two modes of operation:
// 1. When lifetime is TEMPORARY_VARIABLE, we allocate memory for
- // TEMPORARY_VARIABLE source operands, skip SUBGRAPH_OUTPUT source
- // operands, and panic if we see a source operand of another lifetime.
+ // TEMPORARY_VARIABLE source operands that are not dynamic temporaries,
+ // skip TEMPORARY_VARIABLE source operands that are dynamic temporaries,
+ // skip SUBGRAPH_OUTPUT source operands, and panic if we see a source
+ // operand of another lifetime.
// 2. When lifetime is SUBGRAPH_OUTPUT, we allocate memory for
// SUBGRAPH_OUTPUT source operands and panic if we see a source operand
// of another lifetime.
auto mapTemporary =
- [executionBuilder, addTemporaryOfSize](
+ [executionBuilder, &totalSizeOfTemporaries](
const SourceOperandIndex& sourceOperandIndex,
std::map<SourceOperandIndex, uint32_t>* sourceOperandToOffsetOfTemporary,
OperandLifeTime lifetime = OperandLifeTime::TEMPORARY_VARIABLE) {
@@ -873,13 +1086,19 @@
}
CHECK(sourceOperand.lifetime == lifetime);
const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
- CHECK_NE(size, 0u);
- const uint32_t offset = addTemporaryOfSize(size);
- auto [_, isNew] =
- sourceOperandToOffsetOfTemporary->emplace(sourceOperandIndex, offset);
- CHECK(isNew);
- VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex)
- << " offset = " << offset;
+ if (size != 0u) {
+ const uint32_t offset = addTemporaryOfSize(&totalSizeOfTemporaries, size);
+ auto [_, isNew] =
+ sourceOperandToOffsetOfTemporary->emplace(sourceOperandIndex, offset);
+ CHECK(isNew);
+ VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex)
+ << " offset = " << offset;
+ } else {
+ // Unknown size, hence dynamic temporary. The mapping will
+ // be established elsewhere (DynamicTemporaries::allocate()).
+ CHECK(lifetime == OperandLifeTime::TEMPORARY_VARIABLE);
+ CHECK(sourceOperand.lifetime == OperandLifeTime::TEMPORARY_VARIABLE);
+ }
};
std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary;
std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2;
@@ -963,24 +1182,53 @@
// Allocate temporary memory for boundary CONSTANT_COPY operands.
for (const auto& [sourceOperandIndex, location] :
compound()->mSourceOperandToBoundaryConstantCopy) {
- const uint32_t offset = addTemporaryOfSize(location.length);
+ const uint32_t offset = addTemporaryOfSize(&totalSizeOfTemporaries, location.length);
sourceOperandToOffsetOfTemporary.emplace(sourceOperandIndex, offset);
VLOG(EXECUTION) << "temp (boundary constant): operand " << toString(sourceOperandIndex)
<< " offset = " << offset;
}
+ // Collect dynamic temporaries.
+ // TODO(b/157236079): Move some or all of this work to compilation time?
+ DynamicTemporaries dynamicTemporaries;
+ const TypeManager* typeManager = TypeManager::get();
+ for (const auto& logicalStep : compound()->mSteps) {
+ if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
+ const uint32_t stepIndex = step->getIndex();
+ const uint32_t sourceModelIndex = step->getSourceModelIndex();
+ for (const auto& entry : step->getTempsAsStepModelOutputs()) {
+ const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, entry.first);
+ const auto& sourceOperand = getSourceOperand(sourceOperandIndex);
+ if (hasUnknownSize(sourceOperand)) {
+ CHECK(typeManager->isTensorType(sourceOperand.type));
+ // TODO: For now we guess an initial size equal to the size
+ // of a single element, which is likely too small and may
+ // force a reallocation once the actual size is known.
+ const uint32_t size = typeManager->getSizeOfData(sourceOperand.type, {1});
+ dynamicTemporaries.declare(sourceOperandIndex, stepIndex,
+ sourceOperand.dimensions, size);
+ }
+ }
+ }
+ }
+ dynamicTemporaries.endDeclarations();
+ dynamicTemporaries.vlogDump("finished declarations");
+
return std::shared_ptr<Controller>(new Controller(
this, executionBuilder, burstBuilder, totalSizeOfTemporaries,
std::move(sourceOperandToOffsetOfTemporary),
std::move(sourceOperandToOffsetOfTemporary2), compound()->mSourceOperandToInputIndex,
compound()->mSourceOperandToOutputIndex,
compound()->mSourceOperandToBoundaryConstantCopy,
- compound()->mSourceOperandToBoundaryConstantReference));
+ compound()->mSourceOperandToBoundaryConstantReference, std::move(dynamicTemporaries)));
}
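To make the initial guess concrete (a hypothetical operand, not one from this change): a dynamic temporary declared as TENSOR_FLOAT32 with dimensions {0, 0} starts with

    const uint32_t size = typeManager->getSizeOfData(sourceOperand.type, {1});  // 4 bytes: one float

If the defining step later reports actual dimensions of, say, {2, 3}, redeclare() raises the length to 24 bytes, and the next allocate() for that step re-backs the temporary (contents are not preserved across reallocation).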
// TODO: Find a better way to provide this functionality.
int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
- std::shared_ptr<StepExecutor>* executor) const {
+ std::shared_ptr<StepExecutor>* executor,
+ std::shared_ptr<ExecutionBurstController>* burstController) const {
*executor = nullptr;
+ if (burstController != nullptr) {
+ *burstController = nullptr;
+ }
VLOG(EXECUTION) << "ExecutionPlan::fallback(" << SHOW_IF_DEBUG(controller << ", " << executor)
<< "): mFallbackNextStepIndex = " << controller->mFallbackNextStepIndex;
@@ -996,7 +1244,7 @@
}
controller->mNextStepIndex = controller->mFallbackNextStepIndex;
- return next(controller, executor);
+ return next(controller, executor, burstController);
}
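fallback() now mirrors next()'s signature: it clears *burstController up front and lets next() repopulate it for the retried step, so a burst execution that falls back partway gets a burst controller consistent with the step actually being re-run. Callers that do not use bursts rely on the nullptr default argument (see the header change below).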
ExecutionPlan::Buffer::Buffer(void* pointer, uint32_t size)
@@ -1169,13 +1417,19 @@
std::shared_ptr<ExecutionBurstController>* burstController) const {
VLOG(EXECUTION) << "next: Step#" << controller->mNextStepIndex << ": execute on "
<< step->getDevice()->getName();
- *executor =
- std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
- step->getDevice(), step->getPreparedStepModel(), step);
+
+ NN_RETURN_IF_ERROR(controller->mDynamicTemporaries.allocate(step->getIndex()));
+ controller->mDynamicTemporaries.vlogDump("finished allocating for a step");
+
+ *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
+ step->getDevice(), step->getPreparedStepModel(),
+ step, &controller->mDynamicTemporaries);
+
step->mapInputsAndOutputs(
*executor, controller->mTemporaries.get(),
- controller->mSourceOperandToOffsetOfTemporary, controller->mSourceOperandToInputIndex,
- controller->mSourceOperandToOutputIndex, controller->mSourceOperandToConstantReference);
+ controller->mSourceOperandToOffsetOfTemporary, controller->mDynamicTemporaries,
+ controller->mSourceOperandToInputIndex, controller->mSourceOperandToOutputIndex,
+ controller->mSourceOperandToConstantReference);
if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
*burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
}
@@ -1473,6 +1727,13 @@
mState = SIMPLE;
}
+void ExecutionPlan::recordOutputDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
+ auto [it, isNew] =
+ compound()->mOutputToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
+ CHECK(isNew) << "Step " << stepIndex << " redefines output operand "
+ << toString(sourceOperandIndex) << " already defined by step " << it->second;
+}
+
void ExecutionPlan::recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
auto [it, isNew] =
compound()->mTemporaryToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
@@ -1524,8 +1785,8 @@
return compound()->mSteps;
}
-bool ExecutionPlan::forTest_hasStepModelOutputsOfUnknownSize() const {
- return mBody->hasStepModelOutputsOfUnknownSize();
+bool ExecutionPlan::hasDynamicTemporaries() const {
+ return mBody->hasDynamicTemporaries();
}
const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
@@ -1602,12 +1863,12 @@
int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
uint32_t preference, uint32_t priority,
- const std::optional<Deadline>& deadline,
- ExecutionPlan* plan) const {
+ const std::optional<Deadline>& deadline, ExecutionPlan* plan,
+ int simulateFailureResultCode) const {
uint32_t sourceModelIndex = plan->getSourceModels().addModel(this);
NN_RETURN_IF_ERROR(partitionTheWorkInternal(sourceModelIndex, devices, preference, priority,
deadline, plan));
- int n = plan->finish(preference, priority, deadline);
+ int n = plan->finish(preference, priority, deadline, simulateFailureResultCode);
if (VLOG_IS_ON(COMPILATION)) {
VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: ";
logModelToInfo(makeHidlModel());
@@ -1668,12 +1929,24 @@
// (see LogicalStep).
std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount + 1);
+ // This helper function produces a device name.
+ auto deviceName = [&devices, kControlFlowInterpreter,
+ deviceCount](int deviceIndex) -> std::string {
+ if (deviceIndex == kControlFlowInterpreter) {
+ return "NNAPI";
+ } else if (deviceIndex < 0 || size_t(deviceIndex) >= deviceCount) {
+ return "{unknown}";
+ } else {
+ return devices.at(deviceIndex)->getName();
+ }
+ };
+
// This helper function enqueues the operation on the appropriate queue.
auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
int deviceIndex = bestDeviceForOperation[operationIndex];
perDeviceQueue[deviceIndex].push(operationIndex);
VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
- << deviceIndex;
+ << deviceIndex << " (" << deviceName(deviceIndex) << ")";
};
// This helper function finds a device that has operations ready to process.
@@ -1692,11 +1965,14 @@
};
OperandTracker tracker(this, enqueueOnAppropriateDevice);
- // For each iteration of this loop, we'll create an execution step.
+ // For each iteration of this loop, we'll create either an execution step or
+ // an interpreted control flow construct (including nested execution steps
+ // and interpreted control flow constructs).
while (true) {
// Find the device we'll do this step for.
int deviceIndex = findNextDeviceToProcess();
- VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
+ VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex << " ("
+ << deviceName(deviceIndex) << ")";
if (deviceIndex < 0) {
break;
}
@@ -2050,13 +2326,14 @@
const int kControlFlowInterpreter = deviceCount;
(*bestDeviceForOperation)[operationIndex] = kControlFlowInterpreter;
VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
- << toString(operation.type) << ") = -1"
+ << toString(operation.type) << ":" << operationIndex << ") = -1"
<< " (NNAPI)";
} else {
(*bestDeviceForOperation)[operationIndex] = bestChoice;
VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
- << toString(operation.type) << ") = " << bestChoice << " ("
- << devices[bestChoice]->getName() << ")";
+ << toString(operation.type) << ":" << operationIndex
+ << ") = " << bestChoice << " (" << devices[bestChoice]->getName()
+ << ")";
}
}
return ANEURALNETWORKS_NO_ERROR;
diff --git a/runtime/ExecutionPlan.h b/runtime/ExecutionPlan.h
index d1e7d94..3b6beb6 100644
--- a/runtime/ExecutionPlan.h
+++ b/runtime/ExecutionPlan.h
@@ -22,6 +22,7 @@
#include <android-base/logging.h>
#include <openssl/sha.h>
+#include <algorithm>
#include <chrono>
#include <map>
#include <memory>
@@ -80,6 +81,13 @@
// output of a partition. For ExecutionStep, the inputs and outputs of the
// step model are boundary operands; for IfStep and WhileStep, the inputs and
// outputs of the corresponding operation are boundary operands.
+// - A partition boundary static temporary is a partition boundary
+// operand which is of lifetime TEMPORARY_VARIABLE in the source model and
+// whose dimensions are fully specified.
+// - A partition boundary dynamic temporary is a partition boundary
+// operand which is of lifetime TEMPORARY_VARIABLE in the source model and
+// whose dimensions are not fully specified.
+// - A main execution is the execution of a main model.
//
// Referenced models can be sources of partition boundary operands. For example,
// this happens when a referenced model is partitioned into one or more
@@ -105,6 +113,107 @@
std::vector<const ModelBuilder*> mModels;
};
+// Represents all partition boundary dynamic temporaries for a particular main
+// execution.
+//
+// Usage pattern:
+// - declare() every partition boundary dynamic temporary.
+// - endDeclarations(). After this point, lookup() is permitted.
+// - Before executing an ExecutionStep, call allocate().
+// - After executing an ExecutionStep, call redeclare() for every partition
+// boundary dynamic temporary for which we've learned or guessed more about
+// the dimensions or length.
+//
+// Each partition boundary dynamic temporary has a location assigned by
+// allocate() for its defining step (see declare() and allocate()). That
+// location remains valid until redeclare() increases the length of some
+// temporary in its defining step or allocate() is called again for that step.
+class DynamicTemporaries {
+ DISALLOW_COPY_AND_ASSIGN(DynamicTemporaries);
+
+ public:
+ DynamicTemporaries() = default;
+ DynamicTemporaries(DynamicTemporaries&&) = default;
+ DynamicTemporaries& operator=(DynamicTemporaries&&) = default;
+
+ // Declare a dynamic temporary. stepIndex is the step that defines the
+ // temporary (i.e., in which the temporary appears as an operation output
+ // operand). initialDimensions and initialLength indicate what we know or
+ // (in the case of length) guess about those properties.
+ void declare(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex,
+ const hal::hidl_vec<uint32_t>& initialDimensions, uint32_t initialLength);
+
+ // Indicate that we've finished declaring all dynamic temporaries.
+ void endDeclarations() {
+ CHECK(!mDeclared);
+ mDeclared = true;
+ }
+
+ // Redeclare a dynamic temporary, indicating what we've learned about it.
+ // This may invalidate the location of temporaries defined by its step.
+ // Returns true if dimensions or length changed, false otherwise.
+ bool redeclare(SourceOperandIndex sourceOperandIndex,
+ const hal::hidl_vec<uint32_t>& newDimensions, uint32_t newLength);
+
+ // Ensure that all dynamic temporaries defined by the specified step have
+ // locations. The return value is a ResultCode (e.g.,
+ // ANEURALNETWORKS_NO_ERROR).
+ //
+ // Even if dynamic temporaries have already been allocated for this step,
+ // this call may reallocate them. A reallocation is not guaranteed to
+ // preserve location (LocationAndShape.memory, LocationAndShape.offset) or
+ // contents of temporaries.
+ int allocate(uint32_t stepIndex);
+
+ // Do the dynamic temporaries defined by this step have valid allocations?
+ // (Will be true if there are no dynamic temporaries defined by this step.)
+ bool allocated(uint32_t stepIndex) const;
+
+ // Dump information to VLOG(EXECUTION).
+ void vlogDump(const char* context = nullptr) const;
+
+ // If the specified operand is a dynamic temporary, return location and
+ // shape information; otherwise, return std::nullopt.
+ //
+ // If the temporary exists but does not have a valid allocation, then:
+ // - If mustBeAllocated == true, then trigger a failed CHECK().
+ // - If mustBeAllocated == false, then memory == nullptr and offset == ~0.
+ struct LocationAndShape {
+ const Memory* memory;
+ uint32_t offset;
+ const hal::hidl_vec<uint32_t>* dimensions;
+ uint32_t length;
+ };
+ std::optional<LocationAndShape> lookup(SourceOperandIndex sourceOperandIndex,
+ bool mustBeAllocated = true) const;
+
+ // Have any dynamic temporaries been declared?
+ bool empty() const { return mSourceOperandToTemporary.empty(); }
+
+ private:
+ // The same as LocationAndShape, except the base of the location is
+ // represented not by memory but by the defining stepIndex.
+ struct InternalLocationAndShape {
+ uint32_t stepIndex;
+ uint32_t offset;
+ hal::hidl_vec<uint32_t> dimensions;
+ uint32_t length;
+ };
+ std::map<SourceOperandIndex, InternalLocationAndShape> mSourceOperandToTemporary;
+
+ // Every dynamic temporary defined at a given stepIndex.
+ std::map<uint32_t, std::vector<SourceOperandIndex>> mStepIndexToSourceOperandIndexes;
+
+ // Backing memory for the dynamic temporaries defined at a given stepIndex.
+ std::map<uint32_t, std::unique_ptr<MemoryAshmem>> mStepIndexToMemory;
+
+ // For a given defining stepIndex, we consider either all its dynamic
+ // temporaries to be allocated (have valid locations) or none of them to be.
+ std::set<uint32_t> mAllocatedStepIndexes;
+
+ // Has endDeclarations() been called?
+ bool mDeclared = false;
+};
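A minimal usage sketch of the lifecycle described above; the operand index, step index, and learned dimensions are hypothetical, and error handling is elided:

    DynamicTemporaries temps;
    // Partitioning determined that step 2 defines this boundary operand of
    // unknown shape; guess one element (4 bytes for TENSOR_FLOAT32).
    const SourceOperandIndex sourceOperandIndex(/*sourceModelIndex=*/0, /*operandIndex=*/7);
    temps.declare(sourceOperandIndex, /*stepIndex=*/2, /*initialDimensions=*/{0, 0},
                  /*initialLength=*/4);
    temps.endDeclarations();

    // Before executing step 2, give its dynamic temporaries locations.
    NN_RETURN_IF_ERROR(temps.allocate(/*stepIndex=*/2));
    CHECK(temps.allocated(2));

    // After executing step 2 we learn the real shape; growing the length
    // invalidates step 2's locations until the next allocate(2), which in
    // turn does not preserve contents (so the defining step must re-run).
    if (temps.redeclare(sourceOperandIndex, /*newDimensions=*/{2, 3}, /*newLength=*/24)) {
        NN_RETURN_IF_ERROR(temps.allocate(2));
    }

    // A consuming step maps the operand to memory.
    if (auto loc = temps.lookup(sourceOperandIndex)) {
        // loc->memory, loc->offset, *loc->dimensions, loc->length
    }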
+
// An excerpt of a source model to be run by a specific device.
class ExecutionStep {
public:
@@ -137,8 +246,14 @@
return mOutputsAsStepModelInputsIndexToMainModel;
}
+ const std::set<uint32_t>& getModelOutputsThatAreDownstreamInputs() const {
+ return mModelOutputsThatAreDownstreamInputs;
+ }
+
+ uint32_t getIndex() const { return mIndex; }
uint32_t getSourceModelIndex() const { return mSourceModelIndex; }
+ void declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex);
void recordTempAsStepModelOutput(uint32_t stepOperandIndex);
// If this step has a step model output of unknown size, sets
@@ -158,8 +273,11 @@
// This method only reads map entries for which the first element of
// SourceOperandIndex is mSourceModelIndex.
void mapInputsAndOutputs(
- std::shared_ptr<StepExecutor> stepExecutor, const Memory* temporaryMemory,
- const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOffsetOfTemporary,
+ std::shared_ptr<StepExecutor> stepExecutor,
+ const Memory* temporaryMemory, // for static temporaries
+ const std::map<SourceOperandIndex, uint32_t>&
+ sourceOperandToOffsetOfTemporary, // for static temporaries
+ const DynamicTemporaries& dynamicTemporaries,
const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex,
const std::map<SourceOperandIndex, ConstantReferenceLocation>&
@@ -192,6 +310,7 @@
// model, the memory should be mapped using
// ExecutionPlan::CompoundBody::mSourceOperandToInputIndex,
// ExecutionPlan::Controller::mSourceOperandToOffsetOfTemporary, or
+ // ExecutionPlan::Controller::mDynamicTemporaries, or
// ExecutionPlan::CompoundBody::mSourceOperandToOutputIndex.
RemapVectorType mStepModelInputs;
// All outputs of this step model:
@@ -199,11 +318,12 @@
//
// Depending on whether the source operand is an output of the main model,
// the memory should be mapped using
- // ExecutionPlan::CompoundBody::mSourceOperandToOutputIndex or
- // ExecutionPlan::Controller::mSourceOperandToOffsetOfTemporary.
+ // ExecutionPlan::CompoundBody::mSourceOperandToOutputIndex,
+ // ExecutionPlan::Controller::mSourceOperandToOffsetOfTemporary, or
+ // ExecutionPlan::Controller::mDynamicTemporaries.
//
- // mOutputIndexStepModelToMainModel relies on mModelOutputs being a prefix of
- // mStepModelOutputs.
+ // mOutputIndexStepModelToMainModel and declareModelOutputIsDownstreamInput()
+ // rely on mModelOutputs being a prefix of mStepModelOutputs.
RemapVectorType mStepModelOutputs;
// Inputs of main model that are also inputs of this step model:
// (main model operand index, step model operand index)
@@ -247,6 +367,10 @@
// mOutputsAsStepModelInputs[i].first
std::vector<uint32_t> mOutputsAsStepModelInputsIndexToMainModel;
+ // Step model output indexes (not operand indexes) that are outputs of the
+ // main model and are also used as inputs to some other partition.
+ std::set<uint32_t> mModelOutputsThatAreDownstreamInputs;
+
// The compilation caching token.
TokenHasher mToken;
};
@@ -417,8 +541,8 @@
ExecutionPlan() {}
~ExecutionPlan() { delete mBody; }
- // Controller is part of the interface to a mechanism for performing an
- // execution in N steps.
+ // Controller is part of the interface to a mechanism for performing a
+ // main execution in N steps.
//
// The value of N may not be known beforehand if the model contains WHILE
// loops. See LogicalStep.
@@ -445,15 +569,20 @@
const BurstBuilder* burstBuilder);
// A constructor for mState == COMPOUND.
Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
- const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries,
+ const BurstBuilder* burstBuilder,
+
+ // static temporaries
+ uint32_t totalSizeOfTemporaries,
std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary,
std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2,
+
std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex,
std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex,
const std::map<SourceOperandIndex, ConstantCopyLocation>&
sourceOperandToConstantCopy,
std::map<SourceOperandIndex, ConstantReferenceLocation>
- sourceOperandToConstantReference);
+ sourceOperandToConstantReference,
+ DynamicTemporaries dynamicTemporaries);
// Sets the location of innerOperand to be the same as the location of outerOperand.
void setInput(const SourceOperandIndex& outerOperand,
@@ -467,7 +596,7 @@
// does not generate a sync fence.
int waitForLastStepSyncFence() const;
- const ExecutionPlan* mPlan;
+ [[maybe_unused]] const ExecutionPlan* mPlan;
ExecutionBuilder* mExecutionBuilder;
const BurstBuilder* mBurstBuilder;
// Map from source operand index to an offset into mTemporaries used
@@ -496,7 +625,12 @@
// Map from source operand index to a constant reference location.
// Used for WHILE loop operand initializers that are constant references.
std::map<SourceOperandIndex, ConstantReferenceLocation> mSourceOperandToConstantReference;
+
+ // static temporaries
std::unique_ptr<MemoryAshmem> mTemporaries;
+
+ DynamicTemporaries mDynamicTemporaries;
+
// Index of the next step to be processed by ExecutionPlan::next().
size_t mNextStepIndex;
// The value to reset mNextStepIndex to for partial CPU fallback.
@@ -521,8 +655,8 @@
int syncFdOfLastStep = -1) const;
// Create the same executor as the last one created by next().
- int fallback(std::shared_ptr<Controller> controller,
- std::shared_ptr<StepExecutor>* executor) const;
+ int fallback(std::shared_ptr<Controller> controller, std::shared_ptr<StepExecutor>* executor,
+ std::shared_ptr<ExecutionBurstController>* burstController = nullptr) const;
ExecutionStep* createNewExecutionStep(uint32_t sourceModelIndex,
const std::shared_ptr<Device> device);
@@ -535,9 +669,11 @@
void becomeSingleStep(const std::shared_ptr<Device> device, const ModelBuilder* model);
+ // simulateFailureResultCode == ANEURALNETWORKS_NO_ERROR means behave normally.
int finish(int32_t executionPreference, int32_t priority,
- const std::optional<Deadline>& deadline);
+ const std::optional<Deadline>& deadline, int simulateFailureResultCode);
+ void recordOutputDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex);
void recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex);
void dump() const;
@@ -568,6 +704,8 @@
SourceModels& getSourceModels() { return mSourceModels; }
const SourceModels& getSourceModels() const { return mSourceModels; }
+ bool hasDynamicTemporaries() const;
+
// These functions are solely intended for use by unit tests of
// the partitioning algorithm.
enum class Kind {
@@ -579,14 +717,19 @@
Kind forTest_getKind() const;
std::shared_ptr<const Device> forTest_simpleGetDevice() const;
const std::vector<std::shared_ptr<LogicalStep>>& forTest_compoundGetSteps() const;
- bool forTest_hasStepModelOutputsOfUnknownSize() const;
const uint8_t* forTest_simpleGetCacheToken() const;
private:
// Becomes a new COMPOUND step if mState == EMPTY, otherwise does nothing.
// Illegal to call when mState == SIMPLE.
void becomeCompoundIfEmpty();
- void findTempsAsStepModelOutputs();
+
+ const hal::Operand& getSourceOperand(
+ const std::pair<uint32_t, uint32_t>& sourceOperandIndex) const {
+ return getSourceModels()
+ .getModel(sourceOperandIndex.first)
+ ->getOperand(sourceOperandIndex.second);
+ }
class Buffer {
public:
@@ -631,8 +774,9 @@
virtual ~Body() {}
virtual void dump() const = 0;
virtual int finish(const SourceModels* sourceModels, int32_t executionPreference,
- int32_t priority, const std::optional<Deadline>& deadline) = 0;
- virtual bool hasStepModelOutputsOfUnknownSize() const = 0;
+ int32_t priority, const std::optional<Deadline>& deadline,
+ int simulateFailureResultCode) = 0;
+ virtual bool hasDynamicTemporaries() const = 0;
virtual void forEachStepRoleOfInput(uint32_t index,
const StepRoleCallback& callback) const = 0;
virtual void forEachStepRoleOfOutput(uint32_t index,
@@ -647,8 +791,8 @@
void dump() const override;
int finish(const SourceModels* sourceModels, int32_t executionPreference, int32_t priority,
- const std::optional<Deadline>& deadline) override;
- bool hasStepModelOutputsOfUnknownSize() const override { return false; }
+ const std::optional<Deadline>& deadline, int simulateFailureResultCode) override;
+ bool hasDynamicTemporaries() const override { return false; }
void forEachStepRoleOfInput(uint32_t index,
const StepRoleCallback& callback) const override;
void forEachStepRoleOfOutput(uint32_t index,
@@ -665,10 +809,8 @@
struct CompoundBody : Body {
void dump() const override;
int finish(const SourceModels* sourceModels, int32_t executionPreference, int32_t priority,
- const std::optional<Deadline>& deadline) override;
- bool hasStepModelOutputsOfUnknownSize() const override {
- return mHasStepModelOutputOfUnknownSize;
- }
+ const std::optional<Deadline>& deadline, int simulateFailureResultCode) override;
+ bool hasDynamicTemporaries() const override { return mHasDynamicTemporaries; }
void forEachStepRoleOfInput(uint32_t index,
const StepRoleCallback& callback) const override;
void forEachStepRoleOfOutput(uint32_t index,
@@ -681,6 +823,12 @@
std::vector<std::shared_ptr<LogicalStep>> mSteps;
// Map from source operand index to defining ExecutionStep index.
+ // Used for all (and only) SUBGRAPH_OUTPUTs that are defined by
+ // ExecutionSteps. Those defined by IfSteps and WhileSteps are not in
+ // the map.
+ std::map<SourceOperandIndex, uint32_t> mOutputToDefiningExecutionStep;
+
+ // Map from source operand index to defining ExecutionStep index.
// Used for all (and only) TEMPORARY_VARIABLEs that are defined by
// ExecutionSteps. Those defined by IfSteps and WhileSteps are not in
// the map.
@@ -708,11 +856,13 @@
std::map<SourceOperandIndex, ConstantReferenceLocation>
mSourceOperandToBoundaryConstantReference;
- bool mHasStepModelOutputOfUnknownSize = false;
+ bool mHasDynamicTemporaries = false;
private:
void findTempsAsStepModelOutputs();
+ void findModelOutputsThatAreDownstreamInputs();
+
// Constant values that are inputs to IF and WHILE operations and lie on
// a partition boundary ("control flow boundary constants") require
// special treatment. We need to be able to dynamically associate those
@@ -758,6 +908,7 @@
// Pointers to compilation caching information in CompilationBuilder.
const std::string* mCacheDir = nullptr;
const uint8_t* mToken = nullptr;
+
SourceModels mSourceModels;
};
diff --git a/runtime/Manager.cpp b/runtime/Manager.cpp
index 6b80d20..78d7c36 100644
--- a/runtime/Manager.cpp
+++ b/runtime/Manager.cpp
@@ -405,7 +405,7 @@
}
if (n != ANEURALNETWORKS_NO_ERROR) {
- VLOG(EXECUTION) << "**Execution failed**";
+ VLOG(EXECUTION) << "**Execution failed** (ResultCode = " << n << ")";
return {n, std::move(outputShapes), timing};
}
diff --git a/runtime/ModelBuilder.h b/runtime/ModelBuilder.h
index 94baab7..2de68b3 100644
--- a/runtime/ModelBuilder.h
+++ b/runtime/ModelBuilder.h
@@ -126,9 +126,11 @@
return getReferencedModel(operand.location.offset);
}
+ // simulateFailureResultCode == ANEURALNETWORKS_NO_ERROR means behave normally.
int partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices, uint32_t preference,
uint32_t priority, const std::optional<Deadline>& deadline,
- ExecutionPlan* plan) const;
+ ExecutionPlan* plan,
+ int simulateFailureResultCode = ANEURALNETWORKS_NO_ERROR) const;
private:
// TODO(b/132322449): move partitionTheWork, findBestDeviceForEachOperation,
diff --git a/runtime/NeuralNetworks.cpp b/runtime/NeuralNetworks.cpp
index 5d3dae4..f5206c8 100644
--- a/runtime/NeuralNetworks.cpp
+++ b/runtime/NeuralNetworks.cpp
@@ -1543,6 +1543,26 @@
waitForList.push_back(syncFenceFd);
}
}
+
+ if (r->getCompilation()->hasDynamicTemporaries()) {
+ // The current implementation of fenced execution does not support
+ // dynamic temporaries. Fall back to non-fenced execution.
+ LOG(INFO) << "ANeuralNetworksExecution_startComputeWithDependencies falling back"
+ << " to ANeuralNetworksExecution_startCompute"
+ << " because of boundary operands of unknown size";
+ for (int syncFenceFd : waitForList) {
+ if (syncFenceFd > 0) {
+ auto w = syncWait(syncFenceFd, -1);
+ if (w != FenceState::SIGNALED) {
+ VLOG(EXECUTION) << "syncWait failed, fd: " << syncFenceFd;
+ *event = nullptr;
+ return ANEURALNETWORKS_OP_FAILED;
+ }
+ }
+ }
+ return ANeuralNetworksExecution_startCompute(execution, event);
+ }
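Because ANeuralNetworksExecution_startCompute has no notion of wait-for dependencies, the fallback first drains every input sync fence synchronously (syncWait with a timeout of -1, i.e., wait indefinitely) and fails with ANEURALNETWORKS_OP_FAILED if any fence does not signal, so the non-fenced execution starts under the same ordering guarantees the fenced call promised.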
+
int syncFenceToSignal = -1;
int n = r->computeFenced(waitForList, duration, &syncFenceToSignal);
std::unique_ptr<SyncFenceEvent> e =
diff --git a/runtime/VersionedInterfaces.cpp b/runtime/VersionedInterfaces.cpp
index 7139b83..ccb29dc 100644
--- a/runtime/VersionedInterfaces.cpp
+++ b/runtime/VersionedInterfaces.cpp
@@ -638,6 +638,7 @@
LOG(ERROR) << "IDevice::getVersionString returned the error " << toString(versionStatus);
return std::nullopt;
}
+ VLOG(MANAGER) << "Version " << versionString;
const int32_t type = getTypeFunction(device);
if (type == -1) {
diff --git a/runtime/include/NeuralNetworksOEM.h b/runtime/include/NeuralNetworksOEM.h
index 54a5dfe..e184d52 100644
--- a/runtime/include/NeuralNetworksOEM.h
+++ b/runtime/include/NeuralNetworksOEM.h
@@ -55,9 +55,10 @@
}; // extends OperandCode
/**
- * If a model contains an {@link ANEURALNETWORKS_OEM_OPERATION}, then
- * either the model must contain only a single operation, or every
- * tensor operand type in the model must be fully specified.
+ * Before API level 30, if a model contains an
+ * {@link ANEURALNETWORKS_OEM_OPERATION}, then either the model must contain
+ * only a single operation, or every tensor operand type in the model must be
+ * fully specified.
*/
enum {
/**
diff --git a/runtime/test/TestExecution.cpp b/runtime/test/TestExecution.cpp
index 66bef6b..480b0ef 100644
--- a/runtime/test/TestExecution.cpp
+++ b/runtime/test/TestExecution.cpp
@@ -576,7 +576,7 @@
// fall back to CPU. (If we allow CPU fallback, then when our
// TestDriver reports an execution failure, we'll re-execute
// on CPU, and will not see the failure.)
- c->setPartitioning(DeviceManager::kPartitioningWithoutFallback);
+ c->forTest_setPartitioning(DeviceManager::kPartitioningWithoutFallback);
mCompilation = reinterpret_cast<ANeuralNetworksCompilation*>(c);
}
};
diff --git a/runtime/test/TestGenerated.cpp b/runtime/test/TestGenerated.cpp
index 70b0e6f..6b96004 100644
--- a/runtime/test/TestGenerated.cpp
+++ b/runtime/test/TestGenerated.cpp
@@ -265,6 +265,7 @@
// Check output dimensions.
for (uint32_t i = 0; i < testModel.main.outputIndexes.size(); i++) {
+ SCOPED_TRACE("Output index: " + std::to_string(i));
const auto& output = testModel.main.operands[testModel.main.outputIndexes[i]];
if (output.isIgnored) continue;
std::vector<uint32_t> actualDimensions;
diff --git a/runtime/test/TestPartitioning.cpp b/runtime/test/TestPartitioning.cpp
index 45dabe3..c58b1a4 100644
--- a/runtime/test/TestPartitioning.cpp
+++ b/runtime/test/TestPartitioning.cpp
@@ -888,7 +888,13 @@
}
Result setPartitioning(uint32_t partitioning) {
- return static_cast<Result>(builder()->setPartitioning(partitioning));
+ return static_cast<Result>(builder()->forTest_setPartitioning(partitioning));
+ }
+
+ // Simulate recoverable partitioning failure.
+ Result failPartitioning() {
+ return static_cast<Result>(
+ builder()->forTest_failPartitioning(static_cast<int>(Result::OP_FAILED)));
}
using WrapperCompilation::finish;
@@ -1790,10 +1796,6 @@
model.finish();
ASSERT_TRUE(model.isValid());
- // We expect that we cannot successfully partition, because we
- // have an intermediate operand (opnd2) without dimensions, and
- // this is not currently handled.
-
// One device that can and should execute operation 0.
const auto devices = makeDevices({{"hw", 0.5, (1 << 0)}});
@@ -1803,32 +1805,31 @@
// didn't actually do any partitioning.
PartitioningCompilation cPNo(&model, devices);
ASSERT_EQ(cPNo.setPartitioning(DeviceManager::kPartitioningNo), Result::NO_ERROR);
+ ASSERT_EQ(cPNo.failPartitioning(), Result::NO_ERROR);
ASSERT_EQ(cPNo.finish(), Result::NO_ERROR);
ASSERT_EQ(cPNo.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
ASSERT_EQ(cPNo.getExecutionPlan().forTest_simpleGetDevice(), DeviceManager::getCpuDevice());
- // Test kPartitioningWithFallback. We should attempt
- // partitioning, reach the end of the partitioning process (so we
- // have an unsuccessful execution plan), discover the dimensionless
- // intermediate operand, then fallback to CPU with a SIMPLE plan, and
- // finally return success.
- // No need to compare the original model to the model from the plan -- we
- // didn't actually do any partitioning.
+ // Test kPartitioningWithFallback. We should attempt partitioning, simulate
+ // a recoverable failure, then fallback to CPU with a SIMPLE plan, and
+ // finally return success. No need to compare the original model to the
+ // model from the plan -- we didn't actually do any partitioning.
PartitioningCompilation cPWithFallback(&model, devices);
ASSERT_EQ(cPWithFallback.setPartitioning(DeviceManager::kPartitioningWithFallback),
Result::NO_ERROR);
+ ASSERT_EQ(cPWithFallback.failPartitioning(), Result::NO_ERROR);
ASSERT_EQ(cPWithFallback.finish(), Result::NO_ERROR);
ASSERT_EQ(cPWithFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
ASSERT_EQ(cPWithFallback.getExecutionPlan().forTest_simpleGetDevice(),
DeviceManager::getCpuDevice());
- // Test kPartitioningWithoutFallback. We should attempt
- // partitioning, and fail.
+ // Test kPartitioningWithoutFallback. We should attempt partitioning,
+ // simulate a recoverable failure, and fail.
PartitioningCompilation cPWithoutFallback(&model, devices);
ASSERT_EQ(cPWithoutFallback.setPartitioning(DeviceManager::kPartitioningWithoutFallback),
Result::NO_ERROR);
+ ASSERT_EQ(cPWithoutFallback.failPartitioning(), Result::NO_ERROR);
ASSERT_EQ(cPWithoutFallback.finish(), Result::OP_FAILED);
- ASSERT_TRUE(cPWithoutFallback.getExecutionPlan().forTest_hasStepModelOutputsOfUnknownSize());
ASSERT_EQ(cPWithoutFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::ERROR);
}
diff --git a/runtime/test/TestPartitioningRandom.cpp b/runtime/test/TestPartitioningRandom.cpp
index 968625e..d94ec9f 100644
--- a/runtime/test/TestPartitioningRandom.cpp
+++ b/runtime/test/TestPartitioningRandom.cpp
@@ -220,7 +220,7 @@
using WrapperCompilation::finish;
Result setPartitioning(uint32_t partitioning) {
- return static_cast<Result>(builder()->setPartitioning(partitioning));
+ return static_cast<Result>(builder()->forTest_setPartitioning(partitioning));
}
const ExecutionPlan& getExecutionPlan() const { return builder()->forTest_getExecutionPlan(); }
@@ -751,7 +751,14 @@
const unsigned problemSize = 1 + randUInt(kMaxProblemSize);
const WrapperOperandType problemType(WrapperType::TENSOR_FLOAT32, {problemSize, problemSize});
- const WrapperOperandType unknownDimensionsType(WrapperType::TENSOR_FLOAT32, {0, 0});
+ const WrapperOperandType unknownDimensionsTypes[] = {
+ {WrapperType::TENSOR_FLOAT32, {}},
+ {WrapperType::TENSOR_FLOAT32, {0, 0}},
+ {WrapperType::TENSOR_FLOAT32, {0, problemSize}},
+ {WrapperType::TENSOR_FLOAT32, {problemSize, 0}},
+ };
+ const unsigned kUnknownDimensionsTypesCount =
+ sizeof(unknownDimensionsTypes) / sizeof(unknownDimensionsTypes[0]);
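The four variants cover a tensor of unknown rank ({}), both dimensions unknown, and each single dimension unknown, so random models now exercise partially specified shapes in addition to the fully unspecified {0, 0} case used previously.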
static const WrapperOperandType activationFunctionType(WrapperType::INT32, {});
@@ -803,11 +810,6 @@
// operations).
unsigned rootOperationCount = 0;
- // Track if we added operands with unknown dimensions. In this case,
- // partitioned compilation will fail if such an operand is read in a
- // different partition than it is written.
- bool hasUnknownDimensions = false;
-
// Generate operations.
for (unsigned i = 0; i < numOperations; i++) {
const unsigned operationPatternIndex = randUInt(std::size(kOperationPatterns));
@@ -995,19 +997,18 @@
// OUTPUTS /////////////////////////////////////////////////////////////////////////////////
std::vector<uint32_t> operationOutputs(operationPattern.mNumOutputs);
- std::generate(operationOutputs.begin(), operationOutputs.end(),
- [&model, &problemType, &unknownDimensionsType, &hasUnknownDimensions,
- allowUnknownDimensions, this] {
- // 3% unknowns causes ~35% of partitionings to fail
- // (determined by commenting out the fallback code,
- // running tests and noting number of failures).
- if (allowUnknownDimensions && randFrac() < 0.03) {
- hasUnknownDimensions = true;
- return model.addOperand(&unknownDimensionsType);
- } else {
- return model.addOperand(&problemType);
- }
- });
+ std::generate(
+ operationOutputs.begin(), operationOutputs.end(),
+ [&model, &problemType, &unknownDimensionsTypes, allowUnknownDimensions, this] {
+ // Before the fix for http://b/132458982, 3% unknowns
+ // caused ~35% of partitionings to fail.
+ if (allowUnknownDimensions && randFrac() < 0.03) {
+ return model.addOperand(
+ &unknownDimensionsTypes[randUInt(kUnknownDimensionsTypesCount)]);
+ } else {
+ return model.addOperand(&problemType);
+ }
+ });
// OPERATION ///////////////////////////////////////////////////////////////////////////////
@@ -1157,37 +1158,18 @@
// CPU fallback device
devices.push_back(DeviceManager::getCpuDevice());
- // Partitioned compilation.
- // For test cases without unknown intermediate operand sizes we require the
- // partitioning to succeed without CPU fallback. With unknown sizes we
- // retry with a fallback if the non-fallback partitioning fails and require
- // the fallback to succeed.
- TestCompilation cNoFallback(&model, devices);
- TestCompilation cWithFallback(&model, devices);
- TestCompilation* c2 = nullptr;
- ASSERT_EQ(cNoFallback.setPartitioning(DeviceManager::kPartitioningWithoutFallback),
- Result::NO_ERROR);
- auto compilationResult = cNoFallback.finish();
- if (hasUnknownDimensions && compilationResult == Result::OP_FAILED &&
- cNoFallback.getExecutionPlan().forTest_hasStepModelOutputsOfUnknownSize()) {
- ASSERT_EQ(cWithFallback.setPartitioning(DeviceManager::kPartitioningWithFallback),
- Result::NO_ERROR);
- ASSERT_EQ(cWithFallback.finish(), Result::NO_ERROR);
- ASSERT_EQ(cWithFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
- ASSERT_EQ(cWithFallback.getExecutionPlan().forTest_simpleGetDevice(),
- DeviceManager::getCpuDevice());
- c2 = &cWithFallback;
- } else {
- ASSERT_EQ(compilationResult, Result::NO_ERROR);
- c2 = &cNoFallback;
- }
+ // Partitioned compilation. We require the partitioning to succeed without
+ // CPU fallback.
+ TestCompilation c2(&model, devices);
+ ASSERT_EQ(c2.setPartitioning(DeviceManager::kPartitioningWithoutFallback), Result::NO_ERROR);
+ ASSERT_EQ(c2.finish(), Result::NO_ERROR);
#ifdef VERBOSE
{
std::cout << "signatures = " << signatures.size() << ", devices = " << devices.size()
<< std::endl;
// TODO: When dumping steps, include non-ExecutionSteps.
- const ExecutionPlan& plan = c2->getExecutionPlan();
+ const ExecutionPlan& plan = c2.getExecutionPlan();
switch (plan.forTest_getKind()) {
case ExecutionPlan::Kind::SIMPLE:
std::cout << "plan: simple" << std::endl;
@@ -1376,7 +1358,7 @@
}
// Partitioned execution.
- WrapperExecution e2(c2);
+ WrapperExecution e2(&c2);
ASSERT_NO_FATAL_FAILURE(prepareForExecution(&e2));
ASSERT_EQ(e2.compute(), Result::NO_ERROR);