Add CPU fallback and remove non-plan driver path.
When we do ExecutionPlan-controlled execution, and
there is a failure at some stage of the execution,
we fall back to cpu execution for that one stage (if
possible) or for the entire model (otherwise).
When we don't do ExecutionPlan-controlled execution,
we no longer attempt to find a suitable driver for
full-model execution -- instead, we execute the full
model on the cpu. (Rationale: This avoids having
to implement cpu fallback outside ExecutionPlan-
controlled execution.)
Bug: 63905942
Test: mma (userdebug)
ml/nn/runtime/tests (userdebug)
(with debug.nn.partition.test 1,
no new failures; hand-insert
errors in the runtime in a few
places to exercise some of the
fallback paths, and verified that
behavior is as expected)
Change-Id: I9ec8fba632f94e04840a786575d338d06fd2dcd3
diff --git a/runtime/ExecutionBuilder.cpp b/runtime/ExecutionBuilder.cpp
index ad9f96c..d43f6a3 100644
--- a/runtime/ExecutionBuilder.cpp
+++ b/runtime/ExecutionBuilder.cpp
@@ -182,31 +182,110 @@
length);
}
-static void asyncStartComputePartitioned(const ExecutionPlan* plan,
+// Attempt synchronous execution of full model on CPU.
+// Ensure that executionCallback->notify() is called.
+static void cpuFallbackFull(const ExecutionBuilder* executionBuilder,
+ const sp<ExecutionCallback>& executionCallback) {
+ LOG(DEBUG) << "cpuFallbackFull";
+ StepExecutor executor(executionBuilder, executionBuilder->getModel(),
+ nullptr /* no IDevice, so CPU */,
+ nullptr /* no IPreparedModel */);
+ executor.mapInputsAndOutputsTrivially();
+ sp<ExecutionCallback> fallbackCallback;
+ if (executor.startCompute(&fallbackCallback) != ANEURALNETWORKS_NO_ERROR) {
+ executionCallback->notify(ErrorStatus::GENERAL_FAILURE);
+ return;
+ }
+ fallbackCallback->wait();
+ executionCallback->notify(fallbackCallback->getStatus());
+}
+
+// Attempt synchronous execution on CPU.
+// (1) First, attempt to execute this step on CPU. If successful,
+// return true. (Do not call executionCallback->notify().)
+// (2) If unsuccessful, attempt to execute the full model on CPU,
+// ensure that executionCallback->notify() is called, and return
+// false.
+static bool cpuFallbackPartial(const ExecutionBuilder* executionBuilder,
+ const ExecutionPlan* plan,
+ std::shared_ptr<ExecutionPlan::Controller> controller,
+ const sp<ExecutionCallback>& executionCallback) {
+ LOG(DEBUG) << "cpuFallbackPartial";
+ std::shared_ptr<StepExecutor> executor;
+ int n = plan->fallback(controller, &executor);
+ if (n != ANEURALNETWORKS_NO_ERROR || executor->isCpu()) {
+ cpuFallbackFull(executionBuilder, executionCallback);
+ return false;
+ }
+ sp<ExecutionCallback> fallbackCallback;
+ if (executor->startComputeOnCpu(&fallbackCallback) != ANEURALNETWORKS_NO_ERROR) {
+ cpuFallbackFull(executionBuilder, executionCallback);
+ return false;
+ }
+ fallbackCallback->wait();
+ if (fallbackCallback->getStatus() != ErrorStatus::NONE) {
+ cpuFallbackFull(executionBuilder, executionCallback);
+ return false;
+ }
+ return true;
+}
+
+static void asyncStartComputePartitioned(const ExecutionBuilder* executionBuilder,
+ const ExecutionPlan* plan,
std::shared_ptr<ExecutionPlan::Controller> controller,
- const sp<IExecutionCallback>& executionCallback) {
+ bool allowFallback,
+ const sp<ExecutionCallback>& executionCallback) {
LOG(DEBUG) << "ExecutionBuilder::startCompute (from plan, iteratively)";
while (true) {
std::shared_ptr<StepExecutor> executor;
LOG(DEBUG) << "looking for next StepExecutor";
int n = plan->next(controller, &executor);
- if (n != ANEURALNETWORKS_NO_ERROR || executor == nullptr) {
- executionCallback->notify(
- n == ANEURALNETWORKS_NO_ERROR ? ErrorStatus::NONE : ErrorStatus::GENERAL_FAILURE);
+ if (n != ANEURALNETWORKS_NO_ERROR) {
+ if (allowFallback) {
+ cpuFallbackFull(executionBuilder, executionCallback);
+ } else {
+ executionCallback->notify(ErrorStatus::GENERAL_FAILURE);
+ }
+ return;
+ }
+ if (executor == nullptr) {
+ executionCallback->notify(ErrorStatus::NONE);
return;
}
sp<ExecutionCallback> stepCallback;
n = executor->startCompute(&stepCallback);
if (n != ANEURALNETWORKS_NO_ERROR) {
- executionCallback->notify(ErrorStatus::GENERAL_FAILURE);
- return;
+ if (allowFallback) {
+ if (cpuFallbackPartial(executionBuilder, plan, controller, executionCallback)) {
+ // Successfully executed one step on CPU.
+ continue;
+ } else {
+ // Either successfully executed entire plan on
+ // CPU, or tried and failed to do so.
+ return;
+ }
+ } else {
+ executionCallback->notify(ErrorStatus::GENERAL_FAILURE);
+ return;
+ }
}
stepCallback->wait();
ErrorStatus status = stepCallback->getStatus();
if (status != ErrorStatus::NONE) {
- executionCallback->notify(status);
- return;
+ if (allowFallback) {
+ if (cpuFallbackPartial(executionBuilder, plan, controller, executionCallback)) {
+ // Successfully executed one step on CPU.
+ continue;
+ } else {
+ // Either successfully executed entire plan on
+ // CPU, or tried and failed to do so.
+ return;
+ }
+ } else {
+ executionCallback->notify(status);
+ return;
+ }
}
}
}
@@ -230,53 +309,69 @@
}
}
- // TODO: Remove the non-plan-based path once we've fully integrated ExecutionPlan
- // with the compilation and execution phases of the NN API? Or retain that path
- // as a fallback in the case of partitioning failure?
- //
- // TODO: Entire plan-based-path should run in an asynchronous thread --
- // take the asynchronous thread logic out of startComputeOnCpu() and use
- // it to wrap the plan-based-path.
- const uint32_t partitioning = DeviceManager::get()->getPartitioning();
- if (partitioning > 0) {
- std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this);
- if (controller == nullptr) {
- if (!DeviceManager::partitioningAllowsFallback(partitioning)) {
- return ANEURALNETWORKS_OP_FAILED;
+#ifndef DISABLE_PARTITIONED_EXECUTION
+ {
+ // TODO: Remove the non-plan-based path once we've fully integrated ExecutionPlan
+ // with the compilation and execution phases of the NN API? Or retain that path
+ // as a fallback in the case of partitioning failure?
+ //
+ // TODO: Entire plan-based-path should run in an asynchronous thread --
+ // take the asynchronous thread logic out of startComputeOnCpu() and use
+ // it to wrap the plan-based-path.
+ const uint32_t partitioning = DeviceManager::get()->getPartitioning();
+ if (partitioning > 0) {
+ const bool allowFallback = DeviceManager::partitioningAllowsFallback(partitioning);
+ std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this);
+ if (controller == nullptr) {
+ if (!allowFallback) {
+ return ANEURALNETWORKS_OP_FAILED;
+ }
+ } else {
+ // TODO: use a thread pool
+
+ // Prepare the callback for asynchronous execution.
+ // sp<ExecutionCallback> object is returned when the
+ // execution has been successfully launched, otherwise a
+ // nullptr is returned. The executionCallback is
+ // abstracted in the NN API as an "event".
+ sp<ExecutionCallback> executionCallback = new ExecutionCallback();
+ std::thread thread(asyncStartComputePartitioned, this, mPlan, controller,
+ allowFallback,
+ executionCallback);
+ executionCallback->bind_thread(std::move(thread));
+ *synchronizationCallback = executionCallback;
+ return ANEURALNETWORKS_NO_ERROR;
}
- } else {
- // TODO: use a thread pool
-
- // Prepare the callback for asynchronous execution.
- // sp<ExecutionCallback> object is returned when the
- // execution has been successfully launched, otherwise a
- // nullptr is returned. The executionCallback is
- // abstracted in the NN API as an "event".
- sp<ExecutionCallback> executionCallback = new ExecutionCallback();
- std::thread thread(asyncStartComputePartitioned, mPlan, controller, executionCallback);
- executionCallback->bind_thread(std::move(thread));
- *synchronizationCallback = executionCallback;
- return ANEURALNETWORKS_NO_ERROR;
}
}
-
- // Find a driver that can handle all the operations.
- Model hidlModel;
- mModel->setHidlModel(&hidlModel);
- const std::vector<std::shared_ptr<Device>>& devices = DeviceManager::get()->getDrivers();
- for (const auto& device : devices) {
- hidl_vec<bool> supports;
- LOG(DEBUG) << "Checking " << device->getName();
- device->getSupportedOperations(hidlModel, &supports);
- if (std::find(supports.begin(), supports.end(), false) == supports.end()) {
- LOG(DEBUG) << "ExecutionBuilder::startCompute (without plan) on " << device->getName();
- StepExecutor executor(this, mModel, device->getInterface(),
- nullptr /* no IPreparedModel, so compile */);
- executor.mapInputsAndOutputsTrivially();
- return executor.startCompute(synchronizationCallback);
+#else
+ {
+ // Find a driver that can handle all the operations.
+ // TODO: Does not handle CPU fallback (which is tricky because
+ // StepExecutor::startCompute() is designed as
+ // asynchronous).
+ // TODO: Does not actually behave asynchronously (because
+ // StepExecutor::startCompute() isn't actually asynchronous
+ // on a device as opposed to a CPU).
+ Model hidlModel;
+ mModel->setHidlModel(&hidlModel);
+ const std::vector<std::shared_ptr<Device>>& devices = DeviceManager::get()->getDrivers();
+ for (const auto& device : devices) {
+ hidl_vec<bool> supports;
+ LOG(DEBUG) << "Checking " << device->getName();
+ device->getSupportedOperations(hidlModel, &supports);
+ if (std::find(supports.begin(), supports.end(), false) == supports.end()) {
+ LOG(DEBUG) << "ExecutionBuilder::startCompute (without plan) on " << device->getName();
+ StepExecutor executor(this, mModel, device->getInterface(),
+ nullptr /* no IPreparedModel, so compile */);
+ executor.mapInputsAndOutputsTrivially();
+ return executor.startCompute(synchronizationCallback);
+ }
}
}
- // If none can, run on the CPU.
+#endif // DISABLE_PARTITIONED_EXECUTION
+
+ // Run on the CPU.
LOG(DEBUG) << "ExecutionBuilder::startCompute (without plan) on CPU";
StepExecutor executor(this, mModel,
nullptr /* no IDevice, so CPU */,
diff --git a/runtime/ExecutionBuilder.h b/runtime/ExecutionBuilder.h
index 88e3759..8fd1f01 100644
--- a/runtime/ExecutionBuilder.h
+++ b/runtime/ExecutionBuilder.h
@@ -147,12 +147,18 @@
&mOutputs.at(outputIndex));
}
+ // Executes using the (driver, preparedModel) specified at construction time.
int startCompute(sp<ExecutionCallback>* synchronizationCallback);
+ // Executes using the CPU, regardless of the (driver,
+ // preparedModel) specified at construction time.
+ int startComputeOnCpu(sp<ExecutionCallback>* synchronizationCallback);
+
+ bool isCpu() const { return mDriver == nullptr; }
+
private:
int allocatePointerArgumentsToPool(std::vector<ModelArgumentInfo>* args, Memory* memory);
int startComputeOnDevice(sp<ExecutionCallback>* synchronizationCallback);
- int startComputeOnCpu(sp<ExecutionCallback>* synchronizationCallback);
void mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
ModelArgumentInfo* executorInputOrOutput);
diff --git a/runtime/ExecutionPlan.cpp b/runtime/ExecutionPlan.cpp
index c969fe9..8369f8f 100644
--- a/runtime/ExecutionPlan.cpp
+++ b/runtime/ExecutionPlan.cpp
@@ -468,6 +468,29 @@
totalSizeOfTemporaries));
}
+
+// TODO: Find a better way to provide this functionality.
+int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
+ std::shared_ptr<StepExecutor>* executor) const {
+ *executor = nullptr;
+
+ LOG(DEBUG) << "ExecutionPlan::fallback(" << controller << ", " << executor
+ << "): mNextStepIndex = " << controller->mNextStepIndex;
+
+ if (controller->mNextStepIndex == 0) {
+ // We haven't called next().
+ return ANEURALNETWORKS_OP_FAILED;
+ }
+
+ if (controller->mNextStepIndex == Controller::kBadStepIndex) {
+ // The last call to next() did not produce an executor.
+ return ANEURALNETWORKS_OP_FAILED;
+ }
+
+ --controller->mNextStepIndex;
+ return next(controller, executor);
+}
+
int ExecutionPlan::next(std::shared_ptr<Controller> controller,
std::shared_ptr<StepExecutor>* executor) const {
*executor = nullptr;
diff --git a/runtime/ExecutionPlan.h b/runtime/ExecutionPlan.h
index 980445d..0cc6051 100644
--- a/runtime/ExecutionPlan.h
+++ b/runtime/ExecutionPlan.h
@@ -172,6 +172,9 @@
int next(std::shared_ptr<Controller> controller, std::shared_ptr<StepExecutor>* executor) const;
+ // Create the same executor as the last one created by next().
+ int fallback(std::shared_ptr<Controller> controller, std::shared_ptr<StepExecutor>* executor) const;
+
std::shared_ptr<ExecutionStep> createNewStep(const std::shared_ptr<Device> device);
void becomeSingleStep(const std::shared_ptr<Device> device,