Partial fix to allow partitions to have boundary temporaries of unknown size.

The old behavior was that we'd fall back to full model CPU execution at
compilation time; the new behavior is that we'll get ordinary
partitioned compilation and execution.

Limitations:
- Needs more testing and more tests written.
- The initial guess for the size of a boundary temporary is a single
  element.  Perhaps it would be useful to remember the actual size from
  a previous execution.
- Fenced execution punts to unfenced execution (at the NDK API level)
  when the plan contains subgraph outputs of unknown size.
- Operands of unknown size at control flow construct boundaries still
  force a fallback to full model CPU execution.

Also adds some diagnostic logging.
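
For reference, a minimal sketch of the existing NDK-level protocol for a
main model output of unknown size (the helper name and retry policy here
are hypothetical, not part of this change).  Internally, the runtime now
applies analogous grow-and-retry handling to boundary temporaries
between partitions instead of punting to full model CPU execution:

    #include <android/NeuralNetworks.h>

    #include <vector>

    // Run the execution once.  On ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE,
    // query the actual dimensions of output #0 so the caller can reallocate
    // its buffer and retry with a fresh execution.
    int computeOnce(ANeuralNetworksExecution* execution, std::vector<uint32_t>* actualDims) {
        int n = ANeuralNetworksExecution_compute(execution);
        if (n == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
            uint32_t rank = 0;
            if (ANeuralNetworksExecution_getOutputOperandRank(execution, 0, &rank) ==
                ANEURALNETWORKS_NO_ERROR) {
                actualDims->resize(rank);
                ANeuralNetworksExecution_getOutputOperandDimensions(execution, 0,
                                                                    actualDims->data());
            }
        }
        return n;
    }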

Test: NeuralNetworksTest_static

Bug: 132458982

Merged-In: I52e7179ff9783d184fd6bfc1c9fefc55972e942a
Change-Id: I52e7179ff9783d184fd6bfc1c9fefc55972e942a
(cherry picked from commit d6183c8db7feb5e2bdf0d2907af01418e7da809e)
diff --git a/common/CpuExecutor.cpp b/common/CpuExecutor.cpp
index 9f24775..8d23c0a 100644
--- a/common/CpuExecutor.cpp
+++ b/common/CpuExecutor.cpp
@@ -1914,6 +1914,8 @@
         const RunTimeOperandInfo& from = operands[operandIndex];
         mOutputShapes[i].dimensions = from.dimensions;
         mOutputShapes[i].isSufficient = from.isSufficient();
+        VLOG(EXECUTION) << "CpuExecutor::setOutputShapes: mOutputShapes[" << i
+                        << "] = " << toString(mOutputShapes[i]);
     }
 }
 
diff --git a/common/OperationsUtils.cpp b/common/OperationsUtils.cpp
index d1814b7..f0bcb0e 100644
--- a/common/OperationsUtils.cpp
+++ b/common/OperationsUtils.cpp
@@ -356,7 +356,7 @@
         if (dim1 != dim2 && dim1 != 1 && dim2 != 1) {
             LOG(ERROR) << "Dimensions mismatch for broadcast:\n"
                        << "First tensor: dimension " << numberOfDims1 - i << " of size " << dim1
-                       << "\nSecond tensor: dimension " << numberOfDims2 - i << "of size " << dim2;
+                       << "\nSecond tensor: dimension " << numberOfDims2 - i << " of size " << dim2;
             return false;
         }
         out->dimensions[maxDims - i] = (dim1 == 1) ? dim2 : dim1;
diff --git a/runtime/CompilationBuilder.cpp b/runtime/CompilationBuilder.cpp
index 8b2a269..051ac88 100644
--- a/runtime/CompilationBuilder.cpp
+++ b/runtime/CompilationBuilder.cpp
@@ -63,7 +63,8 @@
         mPlan.setCaching(&mCacheDir, mToken);
     }
     if (mPartitioning) {
-        int n = mModel->partitionTheWork(mDevices, mPreference, mPriority, deadline, &mPlan);
+        int n = mModel->partitionTheWork(mDevices, mPreference, mPriority, deadline, &mPlan,
+                                         mFailPartitioning);
         switch (n) {
             case ANEURALNETWORKS_NO_ERROR:
                 return n;
@@ -96,7 +97,7 @@
     VLOG(COMPILATION) << "CompilationBuilder::finish with CPU fallback";
     mPlan.reset();
     mPlan.becomeSingleStep(DeviceManager::getCpuDevice(), mModel);
-    return mPlan.finish(mPreference, mPriority, deadline);
+    return mPlan.finish(mPreference, mPriority, deadline, ANEURALNETWORKS_NO_ERROR);
 }
 
 int CompilationBuilder::setPreference(int32_t preference) {
@@ -166,9 +167,9 @@
     return ANEURALNETWORKS_NO_ERROR;
 }
 
-int CompilationBuilder::setPartitioning(uint32_t partitioning) {
+int CompilationBuilder::forTest_setPartitioning(uint32_t partitioning) {
     if (mFinished) {
-        LOG(ERROR) << "ANeuralNetworksCompilation_setPartitioning can't modify after compilation "
+        LOG(ERROR) << "CompilationBuilder::forTest_setPartitioning can't modify after compilation "
                       "finished";
         return ANEURALNETWORKS_BAD_STATE;
     }
@@ -177,6 +178,17 @@
     return ANEURALNETWORKS_NO_ERROR;
 }
 
+int CompilationBuilder::forTest_failPartitioning(int fail) {
+    if (mFinished) {
+        LOG(ERROR) << "CompilationBuilder::forTest_failPartitioning can't modify after compilation "
+                      "finished";
+        return ANEURALNETWORKS_BAD_STATE;
+    }
+
+    mFailPartitioning = fail;
+    return ANEURALNETWORKS_NO_ERROR;
+}
+
 int CompilationBuilder::createExecution(ExecutionBuilder** execution) {
     if (!mFinished) {
         LOG(ERROR) << "ANeuralNetworksExecution_create passed an unfinished compilation";
diff --git a/runtime/CompilationBuilder.h b/runtime/CompilationBuilder.h
index d94fb18..0f2db4d 100644
--- a/runtime/CompilationBuilder.h
+++ b/runtime/CompilationBuilder.h
@@ -47,8 +47,6 @@
 
     int setPreference(int32_t preference);
 
-    int setPartitioning(uint32_t partitioning);
-
     int setCaching(const std::string& cacheDir, const uint8_t* token);
 
     int setPriority(int32_t priority);
@@ -66,10 +64,17 @@
     int forEachStepRoleOfInput(uint32_t index, const StepRoleCallback& callback) const;
     int forEachStepRoleOfOutput(uint32_t index, const StepRoleCallback& callback) const;
 
-    const ExecutionPlan& forTest_getExecutionPlan() const { return mPlan; }
-
     bool createdWithExplicitDeviceList() const { return mExplicitDeviceList; }
 
+    bool hasDynamicTemporaries() const { return mPlan.hasDynamicTemporaries(); }
+
+    // These functions are solely intended for use by unit tests of the
+    // partitioning algorithm.
+    const ExecutionPlan& forTest_getExecutionPlan() const { return mPlan; }
+    int forTest_setPartitioning(uint32_t partitioning);
+    int forTest_failPartitioning(
+            int resultCode);  // If not ANEURALNETWORKS_NO_ERROR, then simulate partitioning failure
+
    private:
     const ModelBuilder* mModel;
 
@@ -83,6 +88,9 @@
     // we can override this later.
     uint32_t mPartitioning;
 
+    // For testing purposes, simulate partitioning failure.
+    int mFailPartitioning = ANEURALNETWORKS_NO_ERROR;
+
     // Once the compilation has been finished, we should not allow further
     // modifications to the compilation.
     bool mFinished = false;
diff --git a/runtime/ExecutionBuilder.cpp b/runtime/ExecutionBuilder.cpp
index 0f94e43..e36e564 100644
--- a/runtime/ExecutionBuilder.cpp
+++ b/runtime/ExecutionBuilder.cpp
@@ -20,6 +20,7 @@
 
 #include <algorithm>
 #include <limits>
+#include <map>
 #include <memory>
 #include <mutex>
 #include <optional>
@@ -46,6 +47,66 @@
 
 using namespace hal;
 
+// Partial validation of output shapes returned from the driver, to ensure they
+// conform to a very specific set of rules.
+static bool validateOutputShapesFromDriver(ErrorStatus executionStatus, const ModelBuilder* model,
+                                           const std::vector<hal::OutputShape>& shapes) {
+    // Enforces the following rules (some of which are from b/154054474):
+    // - shapes vector is empty except in the case of NONE or OUTPUT_INSUFFICIENT_SIZE.
+    //   If the vector is not empty, it must have as many entries as the step model has outputs.
+    // - If NONE, then either shapes vector is empty, or every shape is
+    //   marked isSufficient and, if a tensor, has known rank.
+    // - If OUTPUT_INSUFFICIENT_SIZE, then the vector is not empty.  At least one entry
+    //   is marked !isSufficient.
+    switch (executionStatus) {
+        case ErrorStatus::NONE: {
+            NN_RET_CHECK(shapes.size() == 0 || shapes.size() == model->outputCount())
+                    << "With execution ErrorStatus " << toString(executionStatus)
+                    << " output shapes vector must be empty or of length " << model->outputCount()
+                    << " but has length " << shapes.size();
+            NN_RET_CHECK(std::all_of(shapes.begin(), shapes.end(),
+                                     [](const OutputShape& shape) { return shape.isSufficient; }))
+                    << "With execution ErrorStatus " << toString(executionStatus)
+                    << " at least one output shape is unexpectedly marked !isSufficient";
+
+            const TypeManager* tm = TypeManager::get();
+            for (uint32_t outputIndex = 0, outputCount = shapes.size(); outputIndex < outputCount;
+                 ++outputIndex) {
+                const hal::Operand& outputOperand = model->getOutputOperand(outputIndex);
+                NN_RET_CHECK(!tm->isTensorType(outputOperand.type) ||
+                             (shapes[outputIndex].dimensions.size() != 0))
+                        << "With execution ErrorStatus " << toString(executionStatus) << " output#"
+                        << outputIndex << " shape unexpectedly has zero rank";
+            }
+
+            break;
+        }
+        case ErrorStatus::OUTPUT_INSUFFICIENT_SIZE: {
+            NN_RET_CHECK(shapes.size() == model->outputCount())
+                    << "With execution ErrorStatus " << toString(executionStatus)
+                    << " output shapes vector must be of length " << model->outputCount()
+                    << " but has length " << shapes.size();
+            NN_RET_CHECK(std::any_of(shapes.begin(), shapes.end(),
+                                     [](const OutputShape& shape) { return !shape.isSufficient; }))
+                    << "With execution ErrorStatus " << toString(executionStatus)
+                    << " at least one output shape must have been marked !isSufficient";
+            break;
+        }
+        default: {
+            NN_RET_CHECK(shapes.size() == 0)
+                    << "With execution ErrorStatus " << toString(executionStatus)
+                    << " output shapes vector must be empty but has length " << shapes.size();
+            break;
+        }
+    }
+    return true;
+}
+static bool validateOutputShapesFromDriver(int executionResultCode, const ModelBuilder* model,
+                                           const std::vector<hal::OutputShape>& shapes) {
+    return validateOutputShapesFromDriver(convertResultCodeToErrorStatus(executionResultCode),
+                                          model, shapes);
+}
+
 const Timing kNoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};
 
 static MeasureTiming measureTiming(const ExecutionBuilder* execution) {
@@ -497,7 +558,7 @@
 static void asyncStartComputePartitioned(ExecutionBuilder* executionBuilder,
                                          const ExecutionPlan& plan,
                                          std::shared_ptr<ExecutionPlan::Controller> controller,
-                                         bool allowFallback,
+                                         bool allowCpuFallback,
                                          const std::optional<Deadline>& deadline,
                                          const sp<ExecutionCallback>& executionCallback) {
     CHECK(executionBuilder != nullptr);
@@ -505,8 +566,12 @@
 
     std::vector<OutputShape> outputShapes = executionBuilder->getInitialOutputShapes();
     Timing timing = kNoTiming;
-    // Disallow fallback when the ExecutionPlan is simple on CPU.
-    allowFallback &= !plan.isSimpleCpu();
+    // Disallow CPU fallback when the ExecutionPlan is simple on CPU.
+    allowCpuFallback &= !plan.isSimpleCpu();
+
+    // On this iteration, do I need to repeat the previous step because it
+    // reported insufficient size?
+    bool doInsufficientSizeFallback = false;
 
     while (true) {
         VLOG(EXECUTION) << "looking for next StepExecutor";
@@ -514,13 +579,15 @@
         // Get the current step of the execution.
         std::shared_ptr<StepExecutor> executor;
         std::shared_ptr<ExecutionBurstController> burstController;
-        int n = plan.next(controller, &executor, &burstController);
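+        // If the previous step failed with OUTPUT_INSUFFICIENT_SIZE on a
+        // dynamic temporary (which has since been redeclared with a larger
+        // size), repeat that step via fallback() rather than advancing with
+        // next().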
+        int n = doInsufficientSizeFallback ? plan.fallback(controller, &executor, &burstController)
+                                           : plan.next(controller, &executor, &burstController);
+        doInsufficientSizeFallback = false;
         if (n != ANEURALNETWORKS_NO_ERROR) {
             // During the interpreted execution of control flow, a loop timeout
             // might occur in ExecutionPlan::next().
             bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
                                   n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
-            if (allowFallback && !missedDeadline) break;
+            if (allowCpuFallback && !missedDeadline) break;
             executionCallback->notify(convertResultCodeToErrorStatus(n), {}, kNoTiming);
             return;
         }
@@ -536,36 +603,57 @@
         // Attempt to execute a single step of the execution.
         auto [stepN, stepOutputShapes, stepTiming] = executor->compute(deadline, burstController);
 
-        // Update global outputs.
-        if (!executor->updateOutputShapes(stepOutputShapes, &outputShapes)) {
+        // Update global outputs and dynamic temporaries.
+        StepExecutor::UpdateOutputShapes updateOutputShapes = {};
+        if (!executor->updateOutputShapes(stepN, stepOutputShapes, &outputShapes,
+                                          &updateOutputShapes)) {
             stepN = ANEURALNETWORKS_OP_FAILED;
         }
 
         // If execution was successful, continue to next step.
         if (stepN == ANEURALNETWORKS_NO_ERROR) {
-            // We only support collection of timing information in the case of a
-            // single step, so it's safe to just keep track of the last step's
-            // timing information.
-            timing = stepTiming;
+            if (updateOutputShapes.zeroSizedInput) {
+                // We'll need to do full model CPU fallback
+                VLOG(EXECUTION) << "updateOutputShapes.zeroSizedInput";
+                stepN = ANEURALNETWORKS_OP_FAILED;
+            } else {
+                CHECK(executor->areDynamicTemporariesAllocated());
+                // We only support collection of timing information in the case
+                // of a single step, so it's safe to just keep track of the last
+                // step's timing information.
+                timing = stepTiming;
+                continue;
+            }
+        }
+
+        if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
+            VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: " << toString(updateOutputShapes);
+            if (updateOutputShapes.mainOutputInsufficient ||
+                !updateOutputShapes.updatedDynamicTemporary) {
+                // Either:
+                // - At least one main model output is not of sufficient size; or
+                // - we didn't learn anything new about dynamic temporaries.
+                // Neither of these is recoverable, so end execution.
+                const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
+                executionCallback->notify(stepStatus, outputShapes, kNoTiming);
+                return;
+            }
+            // Every main model output is of sufficient size.  This implies that
+            // at least one dynamic temporary is not of sufficient size.  This
+            // is recoverable.
+            doInsufficientSizeFallback = true;
             continue;
         }
 
-        // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution.
-        if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
-            const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
-            executionCallback->notify(stepStatus, outputShapes, kNoTiming);
-            return;
-        }
-
-        // If fallback is not allowed and there was an error, end execution.
-        if (!allowFallback) {
+        // If CPU fallback is not allowed and there was an error, end execution.
+        if (!allowCpuFallback) {
             const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
             executionCallback->notify(stepStatus, {}, kNoTiming);
             return;
         }
 
         // If CPU execution was already attempted, either:
-        // (1) perform a full fallback if the plan is not simple, or
+        // (1) perform a full CPU fallback if the plan is not simple, or
         // (2) return from the function with an error
         if (executorIsCpu) {
             if (!plan.isSimple()) break;
@@ -574,42 +662,77 @@
         }
 
         // If the code reaches this point, attempt a partial fallback to CPU.
-        CHECK(allowFallback);
-        auto [fallbackN, fallbackOutputShapes, fallbackTiming, fallbackExecutor] =
-                cpuFallbackPartial(plan, controller);
-
-        // Update global outputs.
-        if (fallbackExecutor != nullptr &&
-            !fallbackExecutor->updateOutputShapes(fallbackOutputShapes, &outputShapes)) {
-            fallbackN = ANEURALNETWORKS_OP_FAILED;
+        CHECK(allowCpuFallback);
+        if (updateOutputShapes.zeroSizedInput) {
+            // Do not attempt a partial fallback.
+            break;
         }
+        while (true) {
+            auto [fallbackN, fallbackOutputShapes, fallbackTiming, fallbackExecutor] =
+                    cpuFallbackPartial(plan, controller);
 
-        // If execution was successful, continue to next step.
-        if (fallbackN == ANEURALNETWORKS_NO_ERROR) {
-            // We only support collection of timing information in the case of a
-            // single step, so it's safe to just keep track of the last step's
-            // timing information.
-            timing = fallbackTiming;
-            continue;
-        }
+            // Update global outputs and dynamic temporaries.
+            StepExecutor::UpdateOutputShapes fallbackUpdateOutputShapes = {};
+            if (fallbackExecutor != nullptr &&
+                !fallbackExecutor->updateOutputShapes(fallbackN, fallbackOutputShapes,
+                                                      &outputShapes, &fallbackUpdateOutputShapes)) {
+                fallbackN = ANEURALNETWORKS_OP_FAILED;
+            }
 
-        // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution.
-        if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
-            const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
-            executionCallback->notify(fallbackStatus, outputShapes, kNoTiming);
-            return;
-        }
+            // If execution was successful, continue to next step.
+            if (fallbackN == ANEURALNETWORKS_NO_ERROR) {
+                if (fallbackUpdateOutputShapes.zeroSizedInput) {
+                    // We'll need to do full model CPU fallback
+                    VLOG(EXECUTION) << "fallbackUpdateOutputShapes.zeroSizedInput";
+                    fallbackN = ANEURALNETWORKS_OP_FAILED;
+                    break;
+                }
+                CHECK(fallbackExecutor->areDynamicTemporariesAllocated());
+                // We only support collection of timing information in the case of a
+                // single step, so it's safe to just keep track of the last step's
+                // timing information.
+                timing = fallbackTiming;
+                goto nextStep;
+            }
 
-        // Do not fallback twice if the ExecutionPlan is simple.
-        if (plan.isSimple()) {
-            const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
-            executionCallback->notify(fallbackStatus, {}, kNoTiming);
-            return;
+            if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
+                VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: "
+                                << toString(fallbackUpdateOutputShapes);
+                if (fallbackUpdateOutputShapes.mainOutputInsufficient ||
+                    !fallbackUpdateOutputShapes.updatedDynamicTemporary) {
+                    // Either:
+                    // - At least one main model output is not of sufficient size; or
+                    // - we didn't learn anything new about dynamic temporaries.
+                    // Neither of these is recoverable, so end execution.
+                    const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
+                    executionCallback->notify(fallbackStatus, outputShapes, kNoTiming);
+                    return;
+                }
+                // Every main model output is of sufficient size.  This implies
+                // that at least one dynamic temporary is not of sufficient
+                // size.  This is recoverable.
+                continue;
+            }
+
+            // Do not fallback twice if the ExecutionPlan is simple.
+            if (plan.isSimple()) {
+                const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
+                executionCallback->notify(fallbackStatus, {}, kNoTiming);
+                return;
+            }
+
+            // If the code reaches this point, then there was an error with the
+            // fallback. In this case, attempt full fallback.
+            break;
         }
 
         // If the code reaches this point, then there was an error with the
         // fallback. In this case, attempt full fallback.
         break;
+
+    nextStep:
+        // Bottom of the outer loop
+        continue;
     }
 
     // If the code has reached this point, a potentially recoverable error
@@ -623,16 +746,28 @@
 // In case of partitioned execution, startComputeFenced call will return the sync
 // fence and the fenced compute callback returned from the last partition.
 // Any failed partition will result in the whole execution fallback to CPU if
-// allowFallback is set to true.
+// allowCpuFallback is set to true.
 static std::tuple<int, int, sp<hal::IFencedExecutionCallback>> startComputeFenced(
         ExecutionBuilder* executionBuilder, const ExecutionPlan& plan,
         std::shared_ptr<ExecutionPlan::Controller> controller, const std::vector<int>& waitFor,
         uint64_t timeoutDurationAfterFence, const std::optional<Deadline>& deadline,
-        bool allowFallback) {
+        bool allowCpuFallback) {
+    // We should have detected this earlier in the call chain and fallen back to
+    // non-fenced execution.  This is an implementation limitation: In order to
+    // support dynamic temporaries in this code, we'd need to implement
+    // something like the following:
+    // - If a partition has outputs of unknown size, execute that partition in a
+    //   non fenced fashion, just as if it were scheduled on a driver that does
+    //   not support fenced execution.
+    // - Implement something similar to the code in asyncStartComputePartitioned()
+    //   that handles a step execution that fails with
+    //   ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE.
+    CHECK(!executionBuilder->getCompilation()->hasDynamicTemporaries());
+
     CHECK(executionBuilder != nullptr);
     VLOG(EXECUTION) << "ExecutionBuilder::computeFenced (from plan, iteratively)";
     // Disallow fallback when the ExecutionPlan is simple on CPU.
-    allowFallback &= !plan.isSimpleCpu();
+    allowCpuFallback &= !plan.isSimpleCpu();
 
     // Initiate waitForFds, syncFence for the first step.
     std::vector<int> waitForFds = waitFor;
@@ -650,7 +785,7 @@
             // might occur in ExecutionPlan::next().
             bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
                                   n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
-            if (allowFallback && !missedDeadline) break;
+            if (allowCpuFallback && !missedDeadline) break;
             // Return -1 for the sync fence fd, and nullptr for the callback.
             return std::make_tuple(n, -1, nullptr);
         }
@@ -686,8 +821,8 @@
         if (stepN == ANEURALNETWORKS_NO_ERROR) {
             continue;
         }
-        // If fallback is not allowed and there was an error, end execution.
-        if (!allowFallback) {
+        // If CPU fallback is not allowed and there was an error, end execution.
+        if (!allowCpuFallback) {
             return std::make_tuple(stepN, -1, nullptr);
         }
 
@@ -767,12 +902,13 @@
         }
     }
     mStarted = true;
-    const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
+    const bool allowCpuFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
     std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr);
     VLOG(EXECUTION) << "ExecutionBuilder::computeFenced";
     int result;
-    std::tie(result, mSyncFenceFd, mFencedExecutionCallback) = startComputeFenced(
-            this, *mPlan, controller, waitFor, timeoutDurationAfterFence, deadline, allowFallback);
+    std::tie(result, mSyncFenceFd, mFencedExecutionCallback) =
+            startComputeFenced(this, *mPlan, controller, waitFor, timeoutDurationAfterFence,
+                               deadline, allowCpuFallback);
     *syncFence = mSyncFenceFd;
     return result;
 }
@@ -826,14 +962,14 @@
     // asynchronous thread -- take the asynchronous thread logic out of
     // CpuPreparedModel::execute() and use it to wrap the plan-based-path.
     mStarted = true;
-    const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
+    const bool allowCpuFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
     std::shared_ptr<ExecutionPlan::Controller> controller =
             mPlan->makeController(this, burstBuilder);
     if (synchronous) {
         VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)";
         sp<ExecutionCallback> localSynchronizationCallback = new ExecutionCallback();
         localSynchronizationCallback->setOnFinish(wrappedFinish);
-        asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline,
+        asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback, deadline,
                                      localSynchronizationCallback);
         localSynchronizationCallback->wait();
         if (mMeasureTiming) {
@@ -854,13 +990,13 @@
         executionCallback->setOnFinish(wrappedFinish);
         if (DeviceManager::get()->syncExecRuntime()) {
             VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)";
-            asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline,
+            asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback, deadline,
                                          executionCallback);
         } else {
             VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)";
             std::thread asyncExecution(
-                    [this, controller, allowFallback, deadline, executionCallback] {
-                        asyncStartComputePartitioned(this, *mPlan, controller, allowFallback,
+                    [this, controller, allowCpuFallback, deadline, executionCallback] {
+                        asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback,
                                                      deadline, executionCallback);
                     });
             executionCallback->bindThread(std::move(asyncExecution));
@@ -884,7 +1020,7 @@
 }
 
 // Check if the dimensions "to" is updatable by dimensions "from", where "from" must
-// have a higher specification level.
+// have no lower a specification level.
 static bool isUpdatable(const std::vector<uint32_t>& to, const std::vector<uint32_t>& from) {
     if (to.size() == 0) return true;
     NN_RET_CHECK_EQ(to.size(), from.size());
@@ -894,7 +1030,17 @@
     return true;
 }
 
-bool ExecutionBuilder::updateOutputShapes(const std::vector<OutputShape>& outputShapes) {
+static bool isZeroSizedTensor(int executionResultCode, const OutputShape& outputShape) {
+    return (executionResultCode == ANEURALNETWORKS_NO_ERROR) && outputShape.isSufficient &&
+           outputShape.dimensions.size() &&
+           (std::find(outputShape.dimensions.begin(), outputShape.dimensions.end(), uint32_t(0)) !=
+            outputShape.dimensions.end());
+}
+
+bool ExecutionBuilder::updateOutputShapes(ErrorStatus status,
+                                          const std::vector<OutputShape>& outputShapes) {
+    NN_RET_CHECK(validateOutputShapesFromDriver(status, mModel, outputShapes));
+
     if (outputShapes.size() == 0) {
         return true;
     }
@@ -927,7 +1073,7 @@
     CHECK(!mFinishedWithoutSyncFence) << "ExecutionBuilder::finishWithoutSyncFence is called twice";
     CHECK(!hasSyncFence())
             << "ExecutionBuilder::finishWithoutSyncFence is called when hasSyncFence()";
-    if (!updateOutputShapes(outputShapes) || !updateMemories()) {
+    if (!updateOutputShapes(status, outputShapes) || !updateMemories()) {
         status = ErrorStatus::GENERAL_FAILURE;
     }
     bool success = status == ErrorStatus::NONE;
@@ -951,19 +1097,124 @@
     return status;
 }
 
-bool StepExecutor::updateOutputShapes(const std::vector<OutputShape>& from,
-                                      std::vector<OutputShape>* to) {
+std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes) {
+    return "{ .updatedDynamicTemporary = " +
+           std::to_string(updateOutputShapes.updatedDynamicTemporary) +
+           ", .mainOutputInsufficient = " +
+           std::to_string(updateOutputShapes.mainOutputInsufficient) + "}";
+}
+
+bool StepExecutor::updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from,
+                                      std::vector<OutputShape>* to, UpdateOutputShapes* update) {
+    CHECK(update != nullptr);
+    *update = {.updatedDynamicTemporary = false,
+               .mainOutputInsufficient = false,
+               .zeroSizedInput = false};
+
+    NN_RET_CHECK(validateOutputShapesFromDriver(executionResultCode, mModel, from));
+
     if (from.size() == 0) {
         return true;
     }
+
+    if (VLOG_IS_ON(EXECUTION)) {
+        for (const auto& shape : from) {
+            VLOG(EXECUTION) << "updateOutputShapes: " << toString(shape);
+        }
+    }
+
     if (mExecutionStep != nullptr) {
         const auto& indexMapping = mExecutionStep->getOutputIndexStepModelToMainModel();
         NN_RET_CHECK_LE(indexMapping.size(), from.size());
         for (uint32_t i = 0, e = indexMapping.size(); i < e; i++) {
-            uint32_t toIndex = indexMapping[i];
+            const uint32_t toIndex = indexMapping[i];
             NN_RET_CHECK_GT(to->size(), toIndex);
             NN_RET_CHECK(isUpdatable(to->at(toIndex).dimensions, from[i].dimensions));
             (*to)[toIndex] = from[i];
+            update->mainOutputInsufficient |= !(*to)[toIndex].isSufficient;
+            if (mExecutionStep->getModelOutputsThatAreDownstreamInputs().count(toIndex) &&
+                isZeroSizedTensor(executionResultCode, from[i])) {
+                update->zeroSizedInput = true;
+            }
+        }
+
+        if (!mDynamicTemporaries->empty()) {
+            // TODO(b/157236079): Instead of computing this here, precompute it in ExecutionStep?
+            std::map<uint32_t, uint32_t> operandIndexStepModelOutputToSourceModelTemp;
+            for (const auto& entry : mExecutionStep->getTempsAsStepModelOutputs()) {
+                operandIndexStepModelOutputToSourceModelTemp.emplace(entry.second, entry.first);
+            }
+
+            const uint32_t sourceModelIndex = mExecutionStep->getSourceModelIndex();
+            for (uint32_t i = 0, e = mModel->outputCount(); i < e; i++) {
+                const uint32_t stepModelOperandIndex = mModel->getOutputOperandIndex(i);
+                const auto it =
+                        operandIndexStepModelOutputToSourceModelTemp.find(stepModelOperandIndex);
+                if (it == operandIndexStepModelOutputToSourceModelTemp.end()) {
+                    continue;
+                }
+                const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, it->second);
+                VLOG(EXECUTION) << "updateOutputShapes checking to see if output#" << i
+                                << " sourceOperandIndex = (" << sourceOperandIndex.first << ", "
+                                << sourceOperandIndex.second << ") is a dynamic temporary";
+                // This is a temporary, but it might not be a dynamic temporary.
+                const auto loc = mDynamicTemporaries->lookup(sourceOperandIndex, false);
+                if (loc == std::nullopt) {
+                    continue;
+                }
+                NN_RET_CHECK(isUpdatable(*loc->dimensions, from[i].dimensions));
+                bool changedShape = false;
+                const uint32_t actualSize = TypeManager::get()->getSizeOfData(
+                        mModel->getOperand(stepModelOperandIndex).type, from[i].dimensions);
+                if (actualSize > 0) {
+                    changedShape = mDynamicTemporaries->redeclare(sourceOperandIndex,
+                                                                  from[i].dimensions, actualSize);
+                } else if (!from[i].isSufficient) {
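+                    // The driver reports the temporary as too small, but its
+                    // actual size cannot be computed from the reported
+                    // dimensions, so double the previous size guess; the check
+                    // below keeps this growth bounded.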
+                    NN_RET_CHECK(loc->length < UINT32_MAX / 2)
+                            << "output#" << i << " length overflow";
+                    changedShape = mDynamicTemporaries->redeclare(
+                            sourceOperandIndex, from[i].dimensions, 2 * loc->length);
+                } else {
+                    // The combination of not-fully-specified dimensions
+                    // and isSufficient being true means that we have no
+                    // information about whether the size of the dynamic
+                    // temporary is adequate.
+                    VLOG(EXECUTION) << "updateOutputShapes skipping redeclaration for output#" << i;
+                    if (executionResultCode == ANEURALNETWORKS_NO_ERROR) {
+                        NN_RET_CHECK(isZeroSizedTensor(executionResultCode, from[i]));
+                        // This is a zero-sized tensor, and by
+                        // definition, any dynamic temporary is an input
+                        // to an execution step.
+                        update->zeroSizedInput = true;
+                    }
+                }
+                if (changedShape) {
+                    // TODO: find a better place for this comment.
+                    //
+                    // isUpdatable(a, b) imposes a partial ordering a <=
+                    // b.  Every fully specified dimensions vector is an
+                    // upper bound of that ordering.  Therefore, any
+                    // change in dimensions moves towards an upper
+                    // bound, and hence there are a finite number of
+                    // such changes possible.
+                    //
+                    // actualSize can only be computed from dimensions
+                    // that are an upper bound.  Therefore, once
+                    // actualSize is computed, it will not change.
+                    //
+                    // If dimensions are not fully specified, and
+                    // estimated size changes, it increases.  There is
+                    // an upper bound on estimated size to avoid
+                    // overflow.
+                    //
+                    // Therefore, if we retry only when dimensions or
+                    // size change, and we stop retrying if we would
+                    // otherwise overflow, we should only retry a finite
+                    // number of times.
+                    update->updatedDynamicTemporary = true;
+                }
+            }
+            mDynamicTemporaries->vlogDump("finished updateOutputShapes");
         }
     } else {
         NN_RET_CHECK_EQ(from.size(), to->size());
@@ -977,19 +1228,26 @@
 
 StepExecutor::StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
                            std::shared_ptr<Device> device,
-                           std::shared_ptr<PreparedModel> preparedModel, const ExecutionStep* step)
+                           std::shared_ptr<PreparedModel> preparedModel, const ExecutionStep* step,
+                           DynamicTemporaries* dynamicTemporaries)
     : mExecutionBuilder(executionBuilder),
       mExecutionStep(step),
+      mDynamicTemporaries(dynamicTemporaries),
       mModel(model),
       mDevice(device),
       mPreparedModel(preparedModel),
       mInputs(model->inputCount()),
       mOutputs(model->outputCount()) {
     CHECK(mDevice != nullptr);
+    CHECK_EQ(step == nullptr, dynamicTemporaries == nullptr);
     VLOG(EXECUTION) << "StepExecutor::StepExecutor with " << mInputs.size() << " inputs and "
                     << mOutputs.size() << " outputs";
 }
 
+bool StepExecutor::areDynamicTemporariesAllocated() const {
+    return !mDynamicTemporaries || mDynamicTemporaries->allocated(mExecutionStep->getIndex());
+}
+
 void StepExecutor::mapInputsAndOutputsTrivially() {
     mInputs = mExecutionBuilder->mInputs;
     mOutputs = mExecutionBuilder->mOutputs;
@@ -1019,33 +1277,56 @@
 
 int StepExecutor::setInputOrOutputFromMemory(const Operand& inputOrOutputOperand,
                                              const Memory* memory, uint32_t offset,
+                                             const hal::hidl_vec<uint32_t>& dimensions,
+                                             std::optional<uint32_t> length,
                                              ModelArgumentInfo* inputOrOutputInfo) {
     // Should be similar to
     //     ExecutionBuilder::setInputFromMemory()
     //     ExecutionBuilder::setOutputFromMemory()
 
     uint32_t poolIndex = mMemories.add(memory);
-    uint32_t length = TypeManager::get()->getSizeOfData(inputOrOutputOperand);
+    uint32_t lengthVal = length.value_or(TypeManager::get()->getSizeOfData(inputOrOutputOperand));
     CHECK(inputOrOutputInfo->unspecified());
     int n;
     std::tie(n, *inputOrOutputInfo) =
             ModelArgumentInfo::createFromMemory(inputOrOutputOperand,
-                                                /*type=*/nullptr, poolIndex, offset, length);
+                                                /*type=*/nullptr, poolIndex, offset, lengthVal);
+    if (n == ANEURALNETWORKS_NO_ERROR && dimensions.size()) {
+        CHECK(isUpdatable(inputOrOutputInfo->dimensions(), dimensions));
+        inputOrOutputInfo->dimensions() = dimensions;
+    }
     return n;
 }
 
+static std::string toString(std::vector<uint32_t> dimensions) {
+    std::string ret = "(";
+    bool wroteOne = false;
+    for (uint32_t dimension : dimensions) {
+        if (wroteOne) {
+            ret += ", ";
+        } else {
+            wroteOne = true;
+        }
+        ret += std::to_string(dimension);
+    }
+    ret += ")";
+    return ret;
+}
+
 static void logArguments(const char* kind, const std::vector<ModelArgumentInfo>& args) {
     for (unsigned i = 0; i < args.size(); i++) {
         const auto& arg = args[i];
         std::string prefix = kind + std::string("[") + std::to_string(i) + "] = ";
         switch (arg.state()) {
             case ModelArgumentInfo::POINTER:
-                VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ")";
+                VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ") dim"
+                                << toString(arg.dimensions());
                 break;
             case ModelArgumentInfo::MEMORY:
                 VLOG(EXECUTION) << prefix << "MEMORY("
                                 << "pool=" << arg.locationAndLength().poolIndex << ", "
-                                << "off=" << arg.locationAndLength().offset << ")";
+                                << "off=" << arg.locationAndLength().offset << ") dim"
+                                << toString(arg.dimensions());
                 break;
             case ModelArgumentInfo::HAS_NO_VALUE:
                 VLOG(EXECUTION) << prefix << "HAS_NO_VALUE";
diff --git a/runtime/ExecutionBuilder.h b/runtime/ExecutionBuilder.h
index f61df4c..f9eff4e 100644
--- a/runtime/ExecutionBuilder.h
+++ b/runtime/ExecutionBuilder.h
@@ -19,6 +19,7 @@
 
 #include <atomic>
 #include <memory>
+#include <string>
 #include <tuple>
 #include <utility>
 #include <vector>
@@ -38,6 +39,7 @@
 class BurstBuilder;
 class CompilationBuilder;
 class Device;
+class DynamicTemporaries;
 class ExecutionBurstController;
 class ExecutionPlan;
 class ExecutionStep;
@@ -134,7 +136,8 @@
     const CompilationBuilder* mCompilation;
 
     // Update output dimensional information from OutputShape to ModelArgumentInfo.
-    bool updateOutputShapes(const std::vector<hal::OutputShape>& outputShapes);
+    bool updateOutputShapes(hal::ErrorStatus status,
+                            const std::vector<hal::OutputShape>& outputShapes);
 
     bool updateMemories();
 
@@ -226,9 +229,16 @@
     //     Contains the output index mapping from the excerpted "step" model to
     //     main model if the execution has multiple "steps". Must be nullptr
     //     otherwise.
+    //     (step == nullptr) == (dynamicTemporaries == nullptr)
+    // dynamicTemporaries
+    //     If the execution has multiple "steps", describes the temporaries
+    //     of source models that do not have fully specified types and are outputs
+    //     of "step" models. Must be nullptr otherwise.
+    //     (step == nullptr) == (dynamicTemporaries == nullptr)
     StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
                  std::shared_ptr<Device> device, std::shared_ptr<PreparedModel> preparedModel,
-                 const ExecutionStep* step = nullptr);
+                 const ExecutionStep* step = nullptr,
+                 DynamicTemporaries* dynamicTemporaries = nullptr);
 
     // Map inputs and outputs from ExecutionBuilder to StepExecutor,
     // in the case where we have a single-"step" execution (i.e., the executor
@@ -236,8 +246,17 @@
     void mapInputsAndOutputsTrivially();
 
     // Update output shapes with shapes returned from execution.
-    bool updateOutputShapes(const std::vector<hal::OutputShape>& from,
-                            std::vector<hal::OutputShape>* to);
+    struct UpdateOutputShapes {
+        // These fields are meaningless unless updateOutputShapes() returns true
+        bool updatedDynamicTemporary;  // did shape (dimensions, size) information change for at
+                                       // least one dynamic temporary?
+        bool mainOutputInsufficient;  // is at least one main model output written by this execution
+                                      // marked !isSufficient?
+        bool zeroSizedInput;  // is at least one output of this execution step a zero-sized tensor
+                              // that needs to be read by some other step of the same execution?
+    };
+    bool updateOutputShapes(int executionResultCode, const std::vector<hal::OutputShape>& from,
+                            std::vector<hal::OutputShape>* to, UpdateOutputShapes* update);
 
     // Map inputs and outputs from ExecutionBuilder to StepExecutor,
     // one at a time.  Note that these are input/output indexes, not
@@ -252,15 +271,23 @@
         mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mInputs[executorIndex]);
     }
 
-    // The input or output is assumed to have the size of the
-    // corresponding operand.
-    int setInputFromMemory(uint32_t inputIndex, const Memory* memory, uint32_t offset) {
+    // If no length is provided, the input or output is assumed to have the length
+    // of the operand.  dimensions must either have zero rank or must be
+    // consistent with and at least as well specified as operand dimensions
+    // (i.e., either rank must match, or operand rank must be zero; and for each
+    // individual dimension, either dimension must match, or operand dimension
+    // must be zero).
+    int setInputFromMemory(uint32_t inputIndex, const Memory* memory, uint32_t offset,
+                           const hal::hidl_vec<uint32_t>& dimensions = {},
+                           std::optional<uint32_t> length = std::nullopt) {
         return setInputOrOutputFromMemory(mModel->getInputOperand(inputIndex), memory, offset,
-                                          &mInputs.at(inputIndex));
+                                          dimensions, length, &mInputs.at(inputIndex));
     }
-    int setOutputFromMemory(uint32_t outputIndex, const Memory* memory, uint32_t offset) {
+    int setOutputFromMemory(uint32_t outputIndex, const Memory* memory, uint32_t offset,
+                            const hal::hidl_vec<uint32_t>& dimensions = {},
+                            std::optional<uint32_t> length = std::nullopt) {
         return setInputOrOutputFromMemory(mModel->getOutputOperand(outputIndex), memory, offset,
-                                          &mOutputs.at(outputIndex));
+                                          dimensions, length, &mOutputs.at(outputIndex));
     }
 
     // Executes using the (driver, preparedModel) specified at construction time.
@@ -280,12 +307,24 @@
             const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
             const std::optional<Deadline>& deadline);
 
+    // Do the dynamic temporaries defined by this step have valid allocations?
+    // (true if there are no dynamic temporaries defined by this step.)
+    bool areDynamicTemporariesAllocated() const;
+
    private:
     void mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
                           ModelArgumentInfo* executorInputOrOutput);
 
+    // If no length is provided, the input or output is assumed to have the length
+    // of the corresponding operand.  dimensions must either have zero rank or
+    // must be consistent with and at least as well specified as operand
+    // dimensions (i.e., either rank must match, or operand rank must be zero;
+    // and for each individual dimension, either dimension must match, or
+    // operand dimension must be zero).
     int setInputOrOutputFromMemory(const hal::Operand& inputOrOutputOperand, const Memory* memory,
-                                   uint32_t offset, ModelArgumentInfo* inputOrOutputInfo);
+                                   uint32_t offset, const hal::hidl_vec<uint32_t>& dimensions,
+                                   std::optional<uint32_t> length,
+                                   ModelArgumentInfo* inputOrOutputInfo);
 
     std::tuple<int, std::vector<hal::OutputShape>, hal::Timing> computeWithMemories(
             const std::optional<Deadline>& deadline, const std::vector<const Memory*>& memories,
@@ -295,7 +334,10 @@
     ExecutionBuilder* mExecutionBuilder;
 
     // describes the single execution step
-    const ExecutionStep* mExecutionStep = nullptr;
+    const ExecutionStep* mExecutionStep;
+
+    // describes the dynamic temporaries
+    DynamicTemporaries* mDynamicTemporaries;
 
     // model to be executed on the executor, in both original and
     // compiled forms; and device on which to execute it
@@ -318,6 +360,8 @@
     MemoryTracker mMemories;
 };
 
+std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes);
+
 }  // namespace nn
 }  // namespace android
 
diff --git a/runtime/ExecutionPlan.cpp b/runtime/ExecutionPlan.cpp
index da0c003..97bfacd 100644
--- a/runtime/ExecutionPlan.cpp
+++ b/runtime/ExecutionPlan.cpp
@@ -180,8 +180,165 @@
     }
 }
 
+uint32_t addTemporaryOfSize(uint32_t* totalSizeOfTemporaries, uint32_t size) {
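+    // Reserve "size" bytes at a suitably aligned offset within a region whose
+    // running total size is *totalSizeOfTemporaries, and return the offset of
+    // the reserved bytes.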
+    // TODO: what about overflow?
+    *totalSizeOfTemporaries += alignBytesNeeded(*totalSizeOfTemporaries, size);
+    const uint32_t offset = *totalSizeOfTemporaries;
+    *totalSizeOfTemporaries += size;
+    return offset;
+}
+
+std::string toString(SourceOperandIndex sourceOperandIndex) {
+    return "(" + std::to_string(sourceOperandIndex.first) + ", " +
+           std::to_string(sourceOperandIndex.second) + ")";
+}
+
+std::string toString(hidl_vec<uint32_t> dimensions) {
+    std::string ret = "(";
+    bool wroteOne = false;
+    for (uint32_t dimension : dimensions) {
+        if (wroteOne) {
+            ret += ", ";
+        } else {
+            wroteOne = true;
+        }
+        ret += std::to_string(dimension);
+    }
+    ret += ")";
+    return ret;
+}
+
 }  // namespace
 
+void DynamicTemporaries::vlogDump(const char* context) const {
+    if (empty()) {
+        return;
+    }
+    if (context) {
+        VLOG(EXECUTION) << "DynamicTemporaries: \"" << context << "\"";
+    }
+    for (const auto& temp : mSourceOperandToTemporary) {
+        VLOG(EXECUTION) << "DynamicTemporaries: sourceOperandIndex = " << toString(temp.first)
+                        << ", stepIndex = " << temp.second.stepIndex
+                        << ", offset = " << temp.second.offset
+                        << ", dimensions = " << toString(temp.second.dimensions)
+                        << ", length = " << temp.second.length;
+    }
+}
+
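+// Lifecycle: declare() registers a dynamic temporary with an initial size
+// guess; allocate() (re-)allocates backing memory for all temporaries defined
+// by a step; redeclare() updates a temporary's shape and size when an
+// execution reports better information; lookup() finds a temporary's current
+// location.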
+void DynamicTemporaries::declare(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex,
+                                 const hidl_vec<uint32_t>& initialDimensions,
+                                 uint32_t initialLength) {
+    VLOG(EXECUTION) << "DynamicTemporaries::declare(sourceOperandIndex = "
+                    << toString(sourceOperandIndex) << ", stepIndex = " << stepIndex
+                    << ", initialDimensions = " << toString(initialDimensions)
+                    << ", initialLength = " << initialLength << ")";
+    CHECK(!mDeclared);
+    CHECK_GT(initialLength, 0u);
+    auto [_, isNew] = mSourceOperandToTemporary.emplace(
+            sourceOperandIndex,
+            InternalLocationAndShape{stepIndex, 0, initialDimensions, initialLength});
+    CHECK(isNew);
+    mStepIndexToSourceOperandIndexes[stepIndex].emplace_back(sourceOperandIndex);
+}
+
+bool DynamicTemporaries::redeclare(SourceOperandIndex sourceOperandIndex,
+                                   const hidl_vec<uint32_t>& newDimensions, uint32_t newLength) {
+    auto createAndLogResult = [sourceOperandIndex, &newDimensions, newLength](bool changedShape) {
+        VLOG(EXECUTION) << "DynamicTemporaries::redeclare(sourceOperandIndex = "
+                        << toString(sourceOperandIndex)
+                        << ", newDimensions = " << toString(newDimensions)
+                        << ", newLength = " << newLength << ") -> " << toString(changedShape);
+        return changedShape;
+    };
+
+    CHECK(mDeclared);
+    CHECK_GT(newLength, 0u);
+
+    InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
+    if (temp.length == newLength && temp.dimensions == newDimensions) {
+        return createAndLogResult(false);
+    }
+    if (temp.length < newLength) {
+        // Otherwise allocation remains valid, even if it may be suboptimal
+        // (because it uses more space than needed).  Use case: Don't force
+        // client to allocate again just because the client reported more
+        // accurate shape information.
+        mAllocatedStepIndexes.erase(temp.stepIndex);
+    }
+    temp.length = newLength;
+    temp.dimensions = newDimensions;
+    return createAndLogResult(true);
+}
+
+int DynamicTemporaries::allocate(uint32_t stepIndex) {
+    VLOG(EXECUTION) << "DynamicTemporaries::allocate(stepIndex = " << stepIndex << ")";
+
+    CHECK(mDeclared);
+
+    const auto sourceOperandIndexesI = mStepIndexToSourceOperandIndexes.find(stepIndex);
+    if (sourceOperandIndexesI == mStepIndexToSourceOperandIndexes.end()) {
+        return ANEURALNETWORKS_NO_ERROR;
+    }
+
+    // perform layout
+    uint32_t newSize = 0;
+    for (const auto sourceOperandIndex : sourceOperandIndexesI->second) {
+        InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
+        temp.offset = addTemporaryOfSize(&newSize, temp.length);
+    }
+
+    // perform (re-)allocation
+    // TODO: Today we may shrink the allocation in order to avoid wasting memory.  Is this important
+    //       to conserve memory, or do we waste time reallocating?
+    const double kWaste = 0.2 /* arbitrary */;  // Willing to waste space to avoid
+                                                // deallocation/reallocation overhead
+    auto& memory = mStepIndexToMemory[stepIndex];
+    const uint32_t oldSize = (memory ? memory->getSize() : 0);
+    if ((oldSize >= newSize) && (oldSize <= newSize * (1 + kWaste))) {
+        // Suitable allocation already exists; nothing to do
+    } else {
+        int n;
+        std::tie(n, memory) = MemoryAshmem::create(newSize);
+        if (n != ANEURALNETWORKS_NO_ERROR) {
+            LOG(ERROR) << "Failed to allocate dynamic temporaries of size " << newSize
+                       << " for step " << stepIndex;
+            mAllocatedStepIndexes.erase(stepIndex);
+            return n;
+        }
+    }
+
+    mAllocatedStepIndexes.insert(stepIndex);
+    return ANEURALNETWORKS_NO_ERROR;
+}
+
+bool DynamicTemporaries::allocated(uint32_t stepIndex) const {
+    return (mStepIndexToSourceOperandIndexes.find(stepIndex) ==
+            mStepIndexToSourceOperandIndexes.end()) ||
+           mAllocatedStepIndexes.count(stepIndex);
+}
+
+std::optional<DynamicTemporaries::LocationAndShape> DynamicTemporaries::lookup(
+        SourceOperandIndex sourceOperandIndex, bool mustBeAllocated) const {
+    CHECK(mDeclared);
+    if (auto it = mSourceOperandToTemporary.find(sourceOperandIndex);
+        it != mSourceOperandToTemporary.end()) {
+        const InternalLocationAndShape& temp = it->second;
+        const bool isAllocated = allocated(temp.stepIndex);
+        if (mustBeAllocated) {
+            CHECK(isAllocated) << "Source operand " << toString(sourceOperandIndex)
+                               << " must be allocated";
+        }
+        if (isAllocated) {
+            return LocationAndShape{mStepIndexToMemory.at(temp.stepIndex).get(), temp.offset,
+                                    &temp.dimensions, temp.length};
+        } else {
+            return LocationAndShape{nullptr, ~uint32_t(0), &temp.dimensions, temp.length};
+        }
+    }
+    return std::nullopt;
+}
+
 ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex, uint32_t sourceModelIndex,
                              std::shared_ptr<Device> device)
     : mPlan(plan),
@@ -283,6 +440,10 @@
                 // The first time we've seen this operand is as an
                 // output.
                 mModelOutputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
+                // It may be an input to a different partition, so keep track of
+                // it.
+                mPlan->recordOutputDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
+                                       mIndex);
             }
         } break;
         case OperandLifeTime::SUBGRAPH: {
@@ -338,6 +499,7 @@
 void ExecutionStep::mapInputsAndOutputs(
         std::shared_ptr<StepExecutor> executor, const Memory* temporaryMemory,
         const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOffsetOfTemporary,
+        const DynamicTemporaries& dynamicTemporaries,
         const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
         const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex,
         const std::map<SourceOperandIndex, ConstantReferenceLocation>&
@@ -347,6 +509,9 @@
         if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
             it != sourceOperandToOffsetOfTemporary.end()) {
             executor->setInputFromMemory(stepInputIndex, temporaryMemory, it->second);
+        } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
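+            // This step input is a dynamic temporary: its memory, offset,
+            // dimensions, and length are managed by DynamicTemporaries and may
+            // be refined as executions report shape information.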
+            executor->setInputFromMemory(stepInputIndex, loc->memory, loc->offset, *loc->dimensions,
+                                         loc->length);
         } else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex);
                    it != sourceOperandToInputIndex.end()) {
             executor->mapInput(it->second, stepInputIndex);
@@ -368,6 +533,9 @@
         if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
             it != sourceOperandToOffsetOfTemporary.end()) {
             executor->setOutputFromMemory(stepOutputIndex, temporaryMemory, it->second);
+        } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
+            executor->setOutputFromMemory(stepOutputIndex, loc->memory, loc->offset,
+                                          *loc->dimensions, loc->length);
         } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                    it != sourceOperandToOutputIndex.end()) {
             executor->mapOutput(it->second, stepOutputIndex);
@@ -384,6 +552,32 @@
     }
 }
 
+void ExecutionPlan::CompoundBody::findModelOutputsThatAreDownstreamInputs() {
+    auto declareModelOutputIsDownstreamInput =
+            [this](const SourceOperandIndex& sourceOperandIndex) {
+                const auto it = mOutputToDefiningExecutionStep.find(sourceOperandIndex);
+                CHECK(it != mOutputToDefiningExecutionStep.end());
+                uint32_t stepIndex = it->second;
+                CHECK_LT(stepIndex, mSteps.size());
+                VLOG(COMPILATION)
+                        << "ExecutionStep(" << stepIndex
+                        << ")->declareModelOutputIsDownstreamInput(mSourceOperandToOutputIndex.at"
+                        << toString(sourceOperandIndex) << ")";
+                CHECK(mSourceOperandToOutputIndex.find(sourceOperandIndex) !=
+                      mSourceOperandToOutputIndex.end());
+                mSteps[stepIndex]->executionStep()->declareModelOutputIsDownstreamInput(
+                        mSourceOperandToOutputIndex.at(sourceOperandIndex));
+            };
+    for (const auto& logicalStep : mSteps) {
+        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
+            for (const auto& output : step->getOutputsAsStepModelInputs()) {
+                SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), output.first);
+                declareModelOutputIsDownstreamInput(sourceOperandIndex);
+            }
+        }
+    }
+}
+
 void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() {
     auto recordAsOutputIfTemporary = [this](const SourceOperandIndex& sourceOperandIndex) {
         const auto it = mTemporaryToDefiningExecutionStep.find(sourceOperandIndex);
@@ -418,6 +612,17 @@
     }
 }
 
+void ExecutionStep::declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex) {
+    VLOG(COMPILATION) << "ExecutionStep(" << mIndex << ")::declareModelOutputIsDownstreamInput("
+                      << mainModelOutputIndex << ")";
+    const auto it = std::find(mOutputIndexStepModelToMainModel.begin(),
+                              mOutputIndexStepModelToMainModel.end(), mainModelOutputIndex);
+    CHECK(it != mOutputIndexStepModelToMainModel.end());
+    const uint32_t stepModelOutputIndex = it - mOutputIndexStepModelToMainModel.begin();
+    CHECK(stepModelOutputIndex < mModelOutputs.size());
+    mModelOutputsThatAreDownstreamInputs.insert(stepModelOutputIndex);
+}
+
 void ExecutionStep::recordTempAsStepModelOutput(uint32_t stepOperandIndex) {
     const auto it = mOperandMap.find(stepOperandIndex);
     CHECK(it != mOperandMap.end());
@@ -610,7 +815,8 @@
 
 int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels,
                                         int32_t executionPreference, int32_t priority,
-                                        const std::optional<Deadline>& deadline) {
+                                        const std::optional<Deadline>& deadline,
+                                        int simulateFailureResultCode) {
     CHECK(!mSuccessfulFinish);
     CHECK(!deadline.has_value());
     const ModelBuilder* mainModel = sourceModels->getModel(kMainModelInSourceModels);
@@ -629,8 +835,8 @@
     findTempsAsStepModelOutputs();
     for (const auto& logicalStep : mSteps) {
         if (ExecutionStep* step = logicalStep->tryExecutionStep()) {
-            int n = step->finishStepModel(mainModel, &mHasStepModelOutputOfUnknownSize,
-                                          executionPreference, priority);
+            int n = step->finishStepModel(mainModel, &mHasDynamicTemporaries, executionPreference,
+                                          priority);
             if (n != ANEURALNETWORKS_NO_ERROR) {
                 VLOG(COMPILATION)
                         << "ExecutionPlan::CompoundBody::finish -- finishStepModel failed";
@@ -657,10 +863,11 @@
             CHECK(logicalStep->isGoto());
         }
     }
-    if (mHasStepModelOutputOfUnknownSize) {
-        VLOG(COMPILATION)
-                << "ExecutionPlan::CompoundBody::finish -- mHasStepModelOutputOfUnknownSize";
-        return ANEURALNETWORKS_OP_FAILED;
+
+    if (simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) {
+        VLOG(COMPILATION) << "ExecutionPlan::CompoundeBody::finish: simulating failure, ResultCode "
+                          << simulateFailureResultCode;
+        return simulateFailureResultCode;
     }
 
     for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
@@ -673,6 +880,7 @@
     }
 
     findControlFlowBoundaryConstants(sourceModels);
+    findModelOutputsThatAreDownstreamInputs();
 
     mSuccessfulFinish = true;
     return ANEURALNETWORKS_NO_ERROR;
@@ -713,25 +921,32 @@
 }
 
 int ExecutionPlan::SimpleBody::finish(const SourceModels*, int32_t executionPreference,
-                                      int32_t priority, const std::optional<Deadline>& deadline) {
+                                      int32_t priority, const std::optional<Deadline>& deadline,
+                                      int simulateFailureResultCode) {
     CHECK(!mSuccessfulFinish);
     CHECK(mDevice != nullptr);
     VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
-    const int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheDir,
-                          &mToken, &mPreparedModel);
+    int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheDir, &mToken,
+                    &mPreparedModel);
+    if (n == ANEURALNETWORKS_NO_ERROR && simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) {
+        VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish: simulating failure, ResultCode "
+                          << simulateFailureResultCode;
+        n = simulateFailureResultCode;
+    }
     mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
     return n;
 }
 
 int ExecutionPlan::finish(int32_t executionPreference, int32_t priority,
-                          const std::optional<Deadline>& deadline) {
+                          const std::optional<Deadline>& deadline, int simulateFailureResultCode) {
     CHECK(mBody != nullptr);
-    return mBody->finish(&getSourceModels(), executionPreference, priority, deadline);
+    return mBody->finish(&getSourceModels(), executionPreference, priority, deadline,
+                         simulateFailureResultCode);
 }
 
 ExecutionPlan::Controller::Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
                                       const BurstBuilder* burstBuilder)
-    : Controller(plan, executionBuilder, burstBuilder, 0, {}, {}, {}, {}, {}, {}) {}
+    : Controller(plan, executionBuilder, burstBuilder, 0, {}, {}, {}, {}, {}, {}, {}) {}
 
 ExecutionPlan::Controller::Controller(
         const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
@@ -741,7 +956,8 @@
         std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex,
         std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex,
         const std::map<SourceOperandIndex, ConstantCopyLocation>& sourceOperandToConstantCopy,
-        std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference)
+        std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference,
+        DynamicTemporaries dynamicTemporaries)
     : mPlan(plan),
       mExecutionBuilder(executionBuilder),
       mBurstBuilder(burstBuilder),
@@ -750,6 +966,7 @@
       mSourceOperandToInputIndex(std::move(sourceOperandToInputIndex)),
       mSourceOperandToOutputIndex(std::move(sourceOperandToOutputIndex)),
       mSourceOperandToConstantReference(std::move(sourceOperandToConstantReference)),
+      mDynamicTemporaries(std::move(dynamicTemporaries)),
       mNextStepIndex(0),
       mFallbackNextStepIndex(kBadStepIndex),
       mLastStepSyncFd(-1) {
@@ -823,7 +1040,7 @@
         return std::shared_ptr<Controller>(new Controller(this, executionBuilder, burstBuilder));
     }
     // Create the layout for a Memory object big enough to hold
-    // - every partition boundary TEMPORARY operand and
+    // - every partition boundary TEMPORARY operand that is not a dynamic temporary, and
     // - buffers required by the control flow implementation.
     //
     // TODO: Rethink this approach for managing temporaries.  Some
@@ -844,21 +1061,17 @@
     // what our Memory objects represent.
     //
     uint32_t totalSizeOfTemporaries = 0;
-    auto addTemporaryOfSize = [&totalSizeOfTemporaries](uint32_t size) {
-        totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
-        const uint32_t offset = totalSizeOfTemporaries;
-        totalSizeOfTemporaries += size;
-        return offset;
-    };
     // This function has two modes of operation:
     // 1. When lifetime is TEMPORARY_VARIABLE, we allocate memory for
-    //    TEMPORARY_VARIABLE source operands, skip SUBGRAPH_OUTPUT source
-    //    operands, and panic if we see a source operand of another lifetime.
+    //    TEMPORARY_VARIABLE source operands that are not dynamic temporaries,
+    //    skip TEMPORARY_VARIABLE source operands that are dynamic temporaries,
+    //    skip SUBGRAPH_OUTPUT source operands, and panic if we see a source
+    //    operand of another lifetime.
     // 2. When lifetime is SUBGRAPH_OUTPUT, we allocate memory for
     //    SUBGRAPH_OUTPUT source operands and panic if we see a source operand
     //    of another lifetime.
     auto mapTemporary =
-            [executionBuilder, addTemporaryOfSize](
+            [executionBuilder, &totalSizeOfTemporaries](
                     const SourceOperandIndex& sourceOperandIndex,
                     std::map<SourceOperandIndex, uint32_t>* sourceOperandToOffsetOfTemporary,
                     OperandLifeTime lifetime = OperandLifeTime::TEMPORARY_VARIABLE) {
@@ -873,13 +1086,19 @@
                 }
                 CHECK(sourceOperand.lifetime == lifetime);
                 const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
-                CHECK_NE(size, 0u);
-                const uint32_t offset = addTemporaryOfSize(size);
-                auto [_, isNew] =
-                        sourceOperandToOffsetOfTemporary->emplace(sourceOperandIndex, offset);
-                CHECK(isNew);
-                VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex)
-                                << " offset = " << offset;
+                if (size != 0u) {
+                    const uint32_t offset = addTemporaryOfSize(&totalSizeOfTemporaries, size);
+                    auto [_, isNew] =
+                            sourceOperandToOffsetOfTemporary->emplace(sourceOperandIndex, offset);
+                    CHECK(isNew);
+                    VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex)
+                                    << " offset = " << offset;
+                } else {
+                    // Unknown size, hence dynamic temporary.  The mapping will
+                    // be established elsewhere (DynamicTemporaries::allocate()).
+                    CHECK(lifetime == OperandLifeTime::TEMPORARY_VARIABLE);
+                    CHECK(sourceOperand.lifetime == OperandLifeTime::TEMPORARY_VARIABLE);
+                }
             };
     std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary;
     std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2;
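As an aside, the arena-packing scheme behind the addTemporaryOfSize() calls in this
function can be sketched standalone.  This is a minimal sketch: alignBytesNeededSketch
is a hypothetical stand-in for the runtime's alignBytesNeeded() helper, whose exact
alignment policy may differ.

    #include <cstddef>
    #include <cstdint>

    // Hypothetical stand-in for alignBytesNeeded(): padding needed to bring
    // `index` up to a natural alignment chosen from the object's size.
    static uint32_t alignBytesNeededSketch(uint32_t index, size_t length) {
        uint32_t alignment = 1;
        if (length >= 8) {
            alignment = 8;
        } else if (length >= 4) {
            alignment = 4;
        } else if (length >= 2) {
            alignment = 2;
        }
        const uint32_t misalignment = index % alignment;
        return misalignment == 0 ? 0 : alignment - misalignment;
    }

    // Mirrors addTemporaryOfSize(): append a temporary to the shared arena at
    // the next suitably aligned offset and return that offset.
    static uint32_t addTemporaryOfSizeSketch(uint32_t* totalSizeOfTemporaries,
                                             uint32_t size) {
        *totalSizeOfTemporaries +=
                alignBytesNeededSketch(*totalSizeOfTemporaries, size);
        const uint32_t offset = *totalSizeOfTemporaries;
        *totalSizeOfTemporaries += size;
        return offset;
    }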
@@ -963,24 +1182,53 @@
     // Allocate temporary memory for boundary CONSTANT_COPY operands.
     for (const auto& [sourceOperandIndex, location] :
          compound()->mSourceOperandToBoundaryConstantCopy) {
-        const uint32_t offset = addTemporaryOfSize(location.length);
+        const uint32_t offset = addTemporaryOfSize(&totalSizeOfTemporaries, location.length);
         sourceOperandToOffsetOfTemporary.emplace(sourceOperandIndex, offset);
         VLOG(EXECUTION) << "temp (boundary constant): operand " << toString(sourceOperandIndex)
                         << " offset = " << offset;
     }
+    // Collect dynamic temporaries.
+    // TODO(b/157236079): Move some or all of this work to compilation time?
+    DynamicTemporaries dynamicTemporaries;
+    const TypeManager* typeManager = TypeManager::get();
+    for (const auto& logicalStep : compound()->mSteps) {
+        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
+            const uint32_t stepIndex = step->getIndex();
+            const uint32_t sourceModelIndex = step->getSourceModelIndex();
+            for (const auto& entry : step->getTempsAsStepModelOutputs()) {
+                const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, entry.first);
+                const auto& sourceOperand = getSourceOperand(sourceOperandIndex);
+                if (hasUnknownSize(sourceOperand)) {
+                    CHECK(typeManager->isTensorType(sourceOperand.type));
+                    // TODO: For now we guess an initial size equal to element
+                    // size, which is overly conservative.
+                    const uint32_t size = typeManager->getSizeOfData(sourceOperand.type, {1});
+                    dynamicTemporaries.declare(sourceOperandIndex, stepIndex,
+                                               sourceOperand.dimensions, size);
+                }
+            }
+        }
+    }
+    dynamicTemporaries.endDeclarations();
+    dynamicTemporaries.vlogDump("finished declarations");
+
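The single-element initial guess can be illustrated standalone.  The helper below is a
hypothetical mirror of what getSizeOfData() computes for non-extension tensor types
(element size times the product of the dimensions); it is not the runtime's
implementation.

    #include <cstdint>
    #include <functional>
    #include <numeric>
    #include <vector>

    // Hypothetical mirror of getSizeOfData() for non-extension tensor types.
    uint32_t tensorSizeSketch(uint32_t elementSize, const std::vector<uint32_t>& dims) {
        return std::accumulate(dims.begin(), dims.end(), elementSize,
                               std::multiplies<uint32_t>());
    }

    // tensorSizeSketch(4, {1})    == 4  -- the initial single-element guess
    // tensorSizeSketch(4, {4, 4}) == 64 -- a size learned after execution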
     return std::shared_ptr<Controller>(new Controller(
             this, executionBuilder, burstBuilder, totalSizeOfTemporaries,
             std::move(sourceOperandToOffsetOfTemporary),
             std::move(sourceOperandToOffsetOfTemporary2), compound()->mSourceOperandToInputIndex,
             compound()->mSourceOperandToOutputIndex,
             compound()->mSourceOperandToBoundaryConstantCopy,
-            compound()->mSourceOperandToBoundaryConstantReference));
+            compound()->mSourceOperandToBoundaryConstantReference, std::move(dynamicTemporaries)));
 }
 
 // TODO: Find a better way to provide this functionality.
 int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
-                            std::shared_ptr<StepExecutor>* executor) const {
+                            std::shared_ptr<StepExecutor>* executor,
+                            std::shared_ptr<ExecutionBurstController>* burstController) const {
     *executor = nullptr;
+    if (burstController != nullptr) {
+        *burstController = nullptr;
+    }
 
     VLOG(EXECUTION) << "ExecutionPlan::fallback(" << SHOW_IF_DEBUG(controller << ", " << executor)
                     << "): mFallbackNextStepIndex = " << controller->mFallbackNextStepIndex;
@@ -996,7 +1244,7 @@
     }
 
     controller->mNextStepIndex = controller->mFallbackNextStepIndex;
-    return next(controller, executor);
+    return next(controller, executor, burstController);
 }
 
 ExecutionPlan::Buffer::Buffer(void* pointer, uint32_t size)
@@ -1169,13 +1417,19 @@
                                 std::shared_ptr<ExecutionBurstController>* burstController) const {
     VLOG(EXECUTION) << "next: Step#" << controller->mNextStepIndex << ": execute on "
                     << step->getDevice()->getName();
-    *executor =
-            std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
-                                           step->getDevice(), step->getPreparedStepModel(), step);
+
+    NN_RETURN_IF_ERROR(controller->mDynamicTemporaries.allocate(step->getIndex()));
+    controller->mDynamicTemporaries.vlogDump("finished allocating for a step");
+
+    *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
+                                               step->getDevice(), step->getPreparedStepModel(),
+                                               step, &controller->mDynamicTemporaries);
+
     step->mapInputsAndOutputs(
             *executor, controller->mTemporaries.get(),
-            controller->mSourceOperandToOffsetOfTemporary, controller->mSourceOperandToInputIndex,
-            controller->mSourceOperandToOutputIndex, controller->mSourceOperandToConstantReference);
+            controller->mSourceOperandToOffsetOfTemporary, controller->mDynamicTemporaries,
+            controller->mSourceOperandToInputIndex, controller->mSourceOperandToOutputIndex,
+            controller->mSourceOperandToConstantReference);
     if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
         *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
     }
@@ -1473,6 +1727,13 @@
     mState = SIMPLE;
 }
 
+void ExecutionPlan::recordOutputDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
+    auto [it, isNew] =
+            compound()->mOutputToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
+    CHECK(isNew) << "Step " << stepIndex << " redefines output operand "
+                 << toString(sourceOperandIndex) << " already defined by step " << it->second;
+}
+
 void ExecutionPlan::recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
     auto [it, isNew] =
             compound()->mTemporaryToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
@@ -1524,8 +1785,8 @@
     return compound()->mSteps;
 }
 
-bool ExecutionPlan::forTest_hasStepModelOutputsOfUnknownSize() const {
-    return mBody->hasStepModelOutputsOfUnknownSize();
+bool ExecutionPlan::hasDynamicTemporaries() const {
+    return mBody->hasDynamicTemporaries();
 }
 
 const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
@@ -1602,12 +1863,12 @@
 
 int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
                                    uint32_t preference, uint32_t priority,
-                                   const std::optional<Deadline>& deadline,
-                                   ExecutionPlan* plan) const {
+                                   const std::optional<Deadline>& deadline, ExecutionPlan* plan,
+                                   int simulateFailureResultCode) const {
     uint32_t sourceModelIndex = plan->getSourceModels().addModel(this);
     NN_RETURN_IF_ERROR(partitionTheWorkInternal(sourceModelIndex, devices, preference, priority,
                                                 deadline, plan));
-    int n = plan->finish(preference, priority, deadline);
+    int n = plan->finish(preference, priority, deadline, simulateFailureResultCode);
     if (VLOG_IS_ON(COMPILATION)) {
         VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: ";
         logModelToInfo(makeHidlModel());
@@ -1668,12 +1929,24 @@
     // (see LogicalStep).
     std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount + 1);
 
+    // This helper function maps a device index to a device name for logging.
+    auto deviceName = [&devices, kControlFlowInterpreter,
+                       deviceCount](int deviceIndex) -> std::string {
+        if (deviceIndex == kControlFlowInterpreter) {
+            return "NNAPI";
+        } else if (deviceIndex < 0 || size_t(deviceIndex) >= deviceCount) {
+            return "{unknown}";
+        } else {
+            return devices.at(deviceIndex)->getName();
+        }
+    };
+
     // This helper function enqueues the operation on the appropriate queue.
     auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
         int deviceIndex = bestDeviceForOperation[operationIndex];
         perDeviceQueue[deviceIndex].push(operationIndex);
         VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
-                          << deviceIndex;
+                          << deviceIndex << " (" << deviceName(deviceIndex) << ")";
     };
 
     // This helper function finds a device that has operations ready to process.
@@ -1692,11 +1965,14 @@
     };
 
     OperandTracker tracker(this, enqueueOnAppropriateDevice);
-    // For each iteration of this loop, we'll create an execution step.
+    // For each iteration of this loop, we'll create either an execution step or
+    // an interpreted control flow construct (including nested execution steps
+    // and interpreted control flow constructs).
     while (true) {
         // Find the device we'll do this step for.
         int deviceIndex = findNextDeviceToProcess();
-        VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
+        VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex << " ("
+                          << deviceName(deviceIndex) << ")";
         if (deviceIndex < 0) {
             break;
         }
@@ -2050,13 +2326,14 @@
             const int kControlFlowInterpreter = deviceCount;
             (*bestDeviceForOperation)[operationIndex] = kControlFlowInterpreter;
             VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
-                              << toString(operation.type) << ") = -1"
+                              << toString(operation.type) << ":" << operationIndex << ") = -1"
                               << " (NNAPI)";
         } else {
             (*bestDeviceForOperation)[operationIndex] = bestChoice;
             VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
-                              << toString(operation.type) << ") = " << bestChoice << " ("
-                              << devices[bestChoice]->getName() << ")";
+                              << toString(operation.type) << ":" << operationIndex
+                              << ") = " << bestChoice << " (" << devices[bestChoice]->getName()
+                              << ")";
         }
     }
     return ANEURALNETWORKS_NO_ERROR;
diff --git a/runtime/ExecutionPlan.h b/runtime/ExecutionPlan.h
index d1e7d94..3b6beb6 100644
--- a/runtime/ExecutionPlan.h
+++ b/runtime/ExecutionPlan.h
@@ -22,6 +22,7 @@
 #include <android-base/logging.h>
 #include <openssl/sha.h>
 
+#include <algorithm>
 #include <chrono>
 #include <map>
 #include <memory>
@@ -80,6 +81,13 @@
 //   output of a partition. For ExecutionStep, the inputs and outputs of the
 //   step model are boundary operands; for IfStep and WhileStep, the inputs and
 //   outputs of the corresponding operation are boundary operands.
+// - A partition boundary static temporary is a partition boundary
+//   operand which is of lifetime TEMPORARY_VARIABLE in the source model and
+//   whose dimensions are fully specified.
+// - A partition boundary dynamic temporary is a partition boundary
+//   operand which is of lifetime TEMPORARY_VARIABLE in the source model and
+//   whose dimensions are not fully specified.
+// - A main execution is the execution of a main model.
 //
 // Referenced models can be sources of partition boundary operands. For example,
 // this happens when a referenced model is partitioned into one or more
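To make "partition boundary dynamic temporary" concrete: the hypothetical NDK sketch
below builds a model whose intermediate operand `mid` has unknown dimensions.  If the
two ADD operations end up in different partitions, `mid` becomes exactly such a
temporary.  Operand indexes and shapes are illustrative, and error handling is elided.

    #include <android/NeuralNetworks.h>

    ANeuralNetworksModel* model = nullptr;
    ANeuralNetworksModel_create(&model);

    const uint32_t knownDims[] = {2, 2};
    const uint32_t unknownDims[] = {0, 0};  // 0 means "dimension of unknown size"
    ANeuralNetworksOperandType tensorKnown = {.type = ANEURALNETWORKS_TENSOR_FLOAT32,
                                              .dimensionCount = 2,
                                              .dimensions = knownDims,
                                              .scale = 0.0f,
                                              .zeroPoint = 0};
    ANeuralNetworksOperandType tensorUnknown = tensorKnown;
    tensorUnknown.dimensions = unknownDims;
    ANeuralNetworksOperandType scalarInt32 = {.type = ANEURALNETWORKS_INT32,
                                              .dimensionCount = 0,
                                              .dimensions = nullptr,
                                              .scale = 0.0f,
                                              .zeroPoint = 0};

    ANeuralNetworksModel_addOperand(model, &tensorKnown);    // 0: input a
    ANeuralNetworksModel_addOperand(model, &tensorKnown);    // 1: input b
    ANeuralNetworksModel_addOperand(model, &scalarInt32);    // 2: fused activation
    ANeuralNetworksModel_addOperand(model, &tensorUnknown);  // 3: mid
    ANeuralNetworksModel_addOperand(model, &tensorKnown);    // 4: output

    const int32_t noActivation = ANEURALNETWORKS_FUSED_NONE;
    ANeuralNetworksModel_setOperandValue(model, 2, &noActivation, sizeof(noActivation));

    const uint32_t add0In[] = {0, 1, 2}, add0Out[] = {3};
    ANeuralNetworksModel_addOperation(model, ANEURALNETWORKS_ADD, 3, add0In, 1, add0Out);
    const uint32_t add1In[] = {3, 1, 2}, add1Out[] = {4};
    ANeuralNetworksModel_addOperation(model, ANEURALNETWORKS_ADD, 3, add1In, 1, add1Out);

    const uint32_t modelIn[] = {0, 1}, modelOut[] = {4};
    ANeuralNetworksModel_identifyInputsAndOutputs(model, 2, modelIn, 1, modelOut);
    ANeuralNetworksModel_finish(model);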
@@ -105,6 +113,107 @@
     std::vector<const ModelBuilder*> mModels;
 };
 
+// Represents all partition boundary dynamic temporaries for a particular main
+// execution.
+//
+// Usage pattern:
+// - declare() every partition boundary dynamic temporary.
+// - endDeclarations().  After this point, lookup() is permitted.
+// - Before executing an ExecutionStep, call allocate().
+// - After executing an ExecutionStep, call redeclare() for every partition
+//   boundary dynamic temporary for which we've learned or guessed more about
+//   the dimensions or length.
+//
+// Each partition boundary temporary has a location assigned by allocate() for
+// its defining step (see declare() and allocate()).  That location remains
+// valid until redeclare() increases the length of some temporary in its defining
+// step or allocate() is called again for its defining step.
+class DynamicTemporaries {
+    DISALLOW_COPY_AND_ASSIGN(DynamicTemporaries);
+
+   public:
+    DynamicTemporaries() = default;
+    DynamicTemporaries(DynamicTemporaries&&) = default;
+    DynamicTemporaries& operator=(DynamicTemporaries&&) = default;
+
+    // Declare a dynamic temporary.  stepIndex is the step that defines the
+    // temporary (i.e., in which the temporary appears as an operation output
+    // operand).  initialDimensions and initialLength indicate what we know or
+    // (in the case of length) guess about those properties.
+    void declare(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex,
+                 const hal::hidl_vec<uint32_t>& initialDimensions, uint32_t initialLength);
+
+    // Indicate that we've finished declaring all dynamic temporaries.
+    void endDeclarations() {
+        CHECK(!mDeclared);
+        mDeclared = true;
+    }
+
+    // Redeclare a dynamic temporary, indicating what we've learned about it.
+    // This may invalidate the location of temporaries defined by its step.
+    // Returns true if dimensions or length changed, false otherwise.
+    bool redeclare(SourceOperandIndex sourceOperandIndex,
+                   const hal::hidl_vec<uint32_t>& newDimensions, uint32_t newLength);
+
+    // Ensure that all dynamic temporaries defined by the specified step have
+    // locations.  The return value is a ResultCode (e.g.,
+    // ANEURALNETWORKS_NO_ERROR).
+    //
+    // Even if dynamic temporaries have already been allocated for this step,
+    // this call may reallocate them.  A reallocation is not guaranteed to
+    // preserve location (LocationAndShape.memory, LocationAndShape.offset) or
+    // contents of temporaries.
+    int allocate(uint32_t stepIndex);
+
+    // Do the dynamic temporaries defined by this step have valid allocations?
+    // (Will be true if there are no dynamic temporaries defined by this step.)
+    bool allocated(uint32_t stepIndex) const;
+
+    // Dump information to VLOG(EXECUTION).
+    void vlogDump(const char* context = nullptr) const;
+
+    // If the specified operand is a dynamic temporary, return location and
+    // shape information; otherwise, return std::nullopt.
+    //
+    // If the temporary exists but does not have a valid allocation, then:
+    //  - If mustBeAllocated == true, then trigger a failed CHECK().
+    //  - If mustBeAllocated == false, then memory == nullptr and offset == ~0.
+    struct LocationAndShape {
+        const Memory* memory;
+        uint32_t offset;
+        const hal::hidl_vec<uint32_t>* dimensions;
+        uint32_t length;
+    };
+    std::optional<LocationAndShape> lookup(SourceOperandIndex sourceOperandIndex,
+                                           bool mustBeAllocated = true) const;
+
+    // Have any dynamic temporaries been declared?
+    bool empty() const { return mSourceOperandToTemporary.empty(); }
+
+   private:
+    // The same as LocationAndShape, except the base of the location is
+    // represented not by memory but by defining stepIndex.
+    struct InternalLocationAndShape {
+        uint32_t stepIndex;
+        uint32_t offset;
+        hal::hidl_vec<uint32_t> dimensions;
+        uint32_t length;
+    };
+    std::map<SourceOperandIndex, InternalLocationAndShape> mSourceOperandToTemporary;
+
+    // Every dynamic temporary defined at a given stepIndex.
+    std::map<uint32_t, std::vector<SourceOperandIndex>> mStepIndexToSourceOperandIndexes;
+
+    std::map<uint32_t, std::unique_ptr<MemoryAshmem>> mStepIndexToMemory;
+
+    // For a given defining stepIndex, we consider either all its dynamic
+    // temporaries to be allocated (have valid locations) or none of them to be.
+    std::set<uint32_t> mAllocatedStepIndexes;
+
+    // Has endDeclarations() been called?
+    bool mDeclared = false;
+};
+
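To make the usage pattern concrete, here is a minimal lifecycle sketch, assuming the
declarations above are in scope.  The step index, operand index, dimensions, and
lengths are hypothetical, and error handling is reduced to CHECKs.

    // Hypothetical: one dynamic temporary, defined by step 2 of the plan as
    // operand 7 of source model 0, with element type TENSOR_FLOAT32.
    DynamicTemporaries temps;
    temps.declare(SourceOperandIndex(0, 7), /*stepIndex=*/2,
                  /*initialDimensions=*/{0, 0},
                  /*initialLength=*/sizeof(float));  // single-element guess
    temps.endDeclarations();

    // Before executing step 2, give its dynamic temporaries backing memory.
    CHECK_EQ(temps.allocate(2), ANEURALNETWORKS_NO_ERROR);

    // After executing step 2, record what was learned about the temporary.  If
    // anything changed, conservatively reallocate (a length increase
    // invalidates the step's existing locations).
    if (temps.redeclare(SourceOperandIndex(0, 7), /*newDimensions=*/{4, 4},
                        /*newLength=*/4 * 4 * sizeof(float))) {
        CHECK_EQ(temps.allocate(2), ANEURALNETWORKS_NO_ERROR);
    }

    // A consuming step locates the temporary through lookup().
    if (auto loc = temps.lookup(SourceOperandIndex(0, 7))) {
        // loc->memory, loc->offset, loc->dimensions, and loc->length describe
        // the current location and shape.
    }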
 // An excerpt of a source model to be run by a specific device.
 class ExecutionStep {
    public:
@@ -137,8 +246,14 @@
         return mOutputsAsStepModelInputsIndexToMainModel;
     }
 
+    const std::set<uint32_t>& getModelOutputsThatAreDownstreamInputs() const {
+        return mModelOutputsThatAreDownstreamInputs;
+    }
+
+    uint32_t getIndex() const { return mIndex; }
     uint32_t getSourceModelIndex() const { return mSourceModelIndex; }
 
+    void declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex);
     void recordTempAsStepModelOutput(uint32_t stepOperandIndex);
 
     // If this step has a step model output of unknown size, sets
@@ -158,8 +273,11 @@
     // This method only reads map entries for which the first element of
     // SourceOperandIndex is mSourceModelIndex.
     void mapInputsAndOutputs(
-            std::shared_ptr<StepExecutor> stepExecutor, const Memory* temporaryMemory,
-            const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOffsetOfTemporary,
+            std::shared_ptr<StepExecutor> stepExecutor,
+            const Memory* temporaryMemory,  // for static temporaries
+            const std::map<SourceOperandIndex, uint32_t>&
+                    sourceOperandToOffsetOfTemporary,  // for static temporaries
+            const DynamicTemporaries& dynamicTemporaries,
             const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
             const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex,
             const std::map<SourceOperandIndex, ConstantReferenceLocation>&
@@ -192,6 +310,7 @@
     // model, the memory should be mapped using
     // ExecutionPlan::CompoundBody::mSourceOperandToInputIndex,
     // ExecutionPlan::Controller::mSourceOperandToOffsetOfTemporary, or
+    // ExecutionPlan::Controller::mDynamicTemporaries, or
     // ExecutionPlan::CompoundBody::mSourceOperandToOutputIndex.
     RemapVectorType mStepModelInputs;
     // All outputs of this step model:
@@ -199,11 +318,12 @@
     //
     // Depending on whether the source operand is an output of the main model,
     // the memory should be mapped using
-    // ExecutionPlan::CompoundBody::mSourceOperandToOutputIndex or
-    // ExecutionPlan::Controller::mSourceOperandToOffsetOfTemporary.
+    // ExecutionPlan::CompoundBody::mSourceOperandToOutputIndex,
+    // ExecutionPlan::Controller::mSourceOperandToOffsetOfTemporary, or
+    // ExecutionPlan::Controller::mDynamicTemporaries.
     //
-    // mOutputIndexStepModelToMainModel relies on mModelOutputs being a prefix of
-    // mStepModelOutputs.
+    // mOutputIndexStepModelToMainModel and declareModelOutputIsDownstreamInput()
+    // rely on mModelOutputs being a prefix of mStepModelOutputs.
     RemapVectorType mStepModelOutputs;
     // Inputs of main model that are also inputs of this step model:
     //     (main model operand index, step model operand index)
@@ -247,6 +367,10 @@
     //     mOutputsAsStepModelInputs[i].first
     std::vector<uint32_t> mOutputsAsStepModelInputsIndexToMainModel;
 
+    // Step model output indexes (not operand indexes) that are outputs of the
+    // main model used as inputs to some other partition.
+    std::set<uint32_t> mModelOutputsThatAreDownstreamInputs;
+
     // The compilation caching token.
     TokenHasher mToken;
 };
@@ -417,8 +541,8 @@
     ExecutionPlan() {}
     ~ExecutionPlan() { delete mBody; }
 
-    // Controller is part of the interface to a mechanism for performing an
-    // execution in N steps.
+    // Controller is part of the interface to a mechanism for performing a
+    // main execution in N steps.
     //
     // The value of N may not be known beforehand if the model contains WHILE
     // loops. See LogicalStep.
@@ -445,15 +569,20 @@
                    const BurstBuilder* burstBuilder);
         // A constructor for mState == COMPOUND.
         Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
-                   const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries,
+                   const BurstBuilder* burstBuilder,
+
+                   // static temporaries
+                   uint32_t totalSizeOfTemporaries,
                    std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary,
                    std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2,
+
                    std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex,
                    std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex,
                    const std::map<SourceOperandIndex, ConstantCopyLocation>&
                            sourceOperandToConstantCopy,
                    std::map<SourceOperandIndex, ConstantReferenceLocation>
-                           sourceOperandToConstantReference);
+                           sourceOperandToConstantReference,
+                   DynamicTemporaries dynamicTemporaries);
 
         // Sets the location of innerOperand to be the same as the location of outerOperand.
         void setInput(const SourceOperandIndex& outerOperand,
@@ -467,7 +596,7 @@
         // does not generate a sync fence.
         int waitForLastStepSyncFence() const;
 
-        const ExecutionPlan* mPlan;
+        [[maybe_unused]] const ExecutionPlan* mPlan;
         ExecutionBuilder* mExecutionBuilder;
         const BurstBuilder* mBurstBuilder;
         // Map from source operand index to an offset into mTemporaries used
@@ -496,7 +625,12 @@
         // Map from source operand index to a constant reference location.
         // Used for WHILE loop operand initializers that are constant references.
         std::map<SourceOperandIndex, ConstantReferenceLocation> mSourceOperandToConstantReference;
+
+        // static temporaries
         std::unique_ptr<MemoryAshmem> mTemporaries;
+
+        DynamicTemporaries mDynamicTemporaries;
+
         // Index of the next step to be processed by ExecutionPlan::next().
         size_t mNextStepIndex;
         // The value to reset mNextStepIndex to for partial CPU fallback.
@@ -521,8 +655,8 @@
              int syncFdOfLastStep = -1) const;
 
     // Create the same executor as the last one created by next().
-    int fallback(std::shared_ptr<Controller> controller,
-                 std::shared_ptr<StepExecutor>* executor) const;
+    int fallback(std::shared_ptr<Controller> controller, std::shared_ptr<StepExecutor>* executor,
+                 std::shared_ptr<ExecutionBurstController>* burstController = nullptr) const;
 
     ExecutionStep* createNewExecutionStep(uint32_t sourceModelIndex,
                                           const std::shared_ptr<Device> device);
@@ -535,9 +669,11 @@
 
     void becomeSingleStep(const std::shared_ptr<Device> device, const ModelBuilder* model);
 
+    // simulateFailureResultCode == ANEURALNETWORKS_NO_ERROR means behave normally.
     int finish(int32_t executionPreference, int32_t priority,
-               const std::optional<Deadline>& deadline);
+               const std::optional<Deadline>& deadline, int simulateFailureResultCode);
 
+    void recordOutputDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex);
     void recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex);
 
     void dump() const;
@@ -568,6 +704,8 @@
     SourceModels& getSourceModels() { return mSourceModels; }
     const SourceModels& getSourceModels() const { return mSourceModels; }
 
+    bool hasDynamicTemporaries() const;
+
     // These functions are solely intended for use by unit tests of
     // the partitioning algorithm.
     enum class Kind {
@@ -579,14 +717,19 @@
     Kind forTest_getKind() const;
     std::shared_ptr<const Device> forTest_simpleGetDevice() const;
     const std::vector<std::shared_ptr<LogicalStep>>& forTest_compoundGetSteps() const;
-    bool forTest_hasStepModelOutputsOfUnknownSize() const;
     const uint8_t* forTest_simpleGetCacheToken() const;
 
    private:
     // Becomes a new COMPOUND step if mState == EMPTY, otherwise does nothing.
     // Illegal to call for when mState == SIMPLE.
     void becomeCompoundIfEmpty();
-    void findTempsAsStepModelOutputs();
+
+    const hal::Operand& getSourceOperand(
+            const std::pair<uint32_t, uint32_t>& sourceOperandIndex) const {
+        return getSourceModels()
+                .getModel(sourceOperandIndex.first)
+                ->getOperand(sourceOperandIndex.second);
+    }
 
     class Buffer {
        public:
@@ -631,8 +774,9 @@
         virtual ~Body() {}
         virtual void dump() const = 0;
         virtual int finish(const SourceModels* sourceModels, int32_t executionPreference,
-                           int32_t priority, const std::optional<Deadline>& deadline) = 0;
-        virtual bool hasStepModelOutputsOfUnknownSize() const = 0;
+                           int32_t priority, const std::optional<Deadline>& deadline,
+                           int simulateFailureResultCode) = 0;
+        virtual bool hasDynamicTemporaries() const = 0;
         virtual void forEachStepRoleOfInput(uint32_t index,
                                             const StepRoleCallback& callback) const = 0;
         virtual void forEachStepRoleOfOutput(uint32_t index,
@@ -647,8 +791,8 @@
 
         void dump() const override;
         int finish(const SourceModels* sourceModels, int32_t executionPreference, int32_t priority,
-                   const std::optional<Deadline>& deadline) override;
-        bool hasStepModelOutputsOfUnknownSize() const override { return false; }
+                   const std::optional<Deadline>& deadline, int simulateFailureResultCode) override;
+        bool hasDynamicTemporaries() const override { return false; }
         void forEachStepRoleOfInput(uint32_t index,
                                     const StepRoleCallback& callback) const override;
         void forEachStepRoleOfOutput(uint32_t index,
@@ -665,10 +809,8 @@
     struct CompoundBody : Body {
         void dump() const override;
         int finish(const SourceModels* sourceModels, int32_t executionPreference, int32_t priority,
-                   const std::optional<Deadline>& deadline) override;
-        bool hasStepModelOutputsOfUnknownSize() const override {
-            return mHasStepModelOutputOfUnknownSize;
-        }
+                   const std::optional<Deadline>& deadline, int simulateFailureResultCode) override;
+        bool hasDynamicTemporaries() const override { return mHasDynamicTemporaries; }
         void forEachStepRoleOfInput(uint32_t index,
                                     const StepRoleCallback& callback) const override;
         void forEachStepRoleOfOutput(uint32_t index,
@@ -681,6 +823,12 @@
         std::vector<std::shared_ptr<LogicalStep>> mSteps;
 
         // Map from source operand index to defining ExecutionStep index.
+        // Used for all (and only) SUBGRAPH_OUTPUTs that are defined by
+        // ExecutionSteps. Those defined by IfSteps and WhileSteps are not in
+        // the map.
+        std::map<SourceOperandIndex, uint32_t> mOutputToDefiningExecutionStep;
+
+        // Map from source operand index to defining ExecutionStep index.
         // Used for all (and only) TEMPORARY_VARIABLEs that are defined by
         // ExecutionSteps. Those defined by IfSteps and WhileSteps are not in
         // the map.
@@ -708,11 +856,13 @@
         std::map<SourceOperandIndex, ConstantReferenceLocation>
                 mSourceOperandToBoundaryConstantReference;
 
-        bool mHasStepModelOutputOfUnknownSize = false;
+        bool mHasDynamicTemporaries = false;
 
        private:
         void findTempsAsStepModelOutputs();
 
+        void findModelOutputsThatAreDownstreamInputs();
+
         // Constant values that are inputs to IF and WHILE operations and lie on
         // a partition boundary ("control flow boundary constants") require
         // special treatment. We need to be able to dynamically associate those
@@ -758,6 +908,7 @@
     // Pointers to compilation caching information in CompilationBuilder.
     const std::string* mCacheDir = nullptr;
     const uint8_t* mToken = nullptr;
+
     SourceModels mSourceModels;
 };
 
diff --git a/runtime/Manager.cpp b/runtime/Manager.cpp
index 6b80d20..78d7c36 100644
--- a/runtime/Manager.cpp
+++ b/runtime/Manager.cpp
@@ -405,7 +405,7 @@
     }
 
     if (n != ANEURALNETWORKS_NO_ERROR) {
-        VLOG(EXECUTION) << "**Execution failed**";
+        VLOG(EXECUTION) << "**Execution failed** (ResultCode = " << n << ")";
         return {n, std::move(outputShapes), timing};
     }
 
diff --git a/runtime/ModelBuilder.h b/runtime/ModelBuilder.h
index 94baab7..2de68b3 100644
--- a/runtime/ModelBuilder.h
+++ b/runtime/ModelBuilder.h
@@ -126,9 +126,11 @@
         return getReferencedModel(operand.location.offset);
     }
 
+    // simulateFailureResultCode == ANEURALNETWORKS_NO_ERROR means behave normally.
     int partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices, uint32_t preference,
                          uint32_t priority, const std::optional<Deadline>& deadline,
-                         ExecutionPlan* plan) const;
+                         ExecutionPlan* plan,
+                         int simulateFailureResultCode = ANEURALNETWORKS_NO_ERROR) const;
 
    private:
     // TODO(b/132322449): move partitionTheWork, findBestDeviceForEachOperation,
diff --git a/runtime/NeuralNetworks.cpp b/runtime/NeuralNetworks.cpp
index 5d3dae4..f5206c8 100644
--- a/runtime/NeuralNetworks.cpp
+++ b/runtime/NeuralNetworks.cpp
@@ -1543,6 +1543,26 @@
             waitForList.push_back(syncFenceFd);
         }
     }
+
+    if (r->getCompilation()->hasDynamicTemporaries()) {
+        // The current implementation of fenced execution does not support
+        // dynamic temporaries.  Fall back to unfenced execution.
+        LOG(INFO) << "ANeuralNetworksExecution_startComputeWithDependencies falling back"
+                  << " to ANeuralNetworksExecution_startCompute"
+                  << " because of boundary operands of unknown size";
+        for (int syncFenceFd : waitForList) {
+            if (syncFenceFd > 0) {
+                auto w = syncWait(syncFenceFd, -1);
+                if (w != FenceState::SIGNALED) {
+                    VLOG(EXECUTION) << "syncWait failed, fd: " << syncFenceFd;
+                    *event = nullptr;
+                    return ANEURALNETWORKS_OP_FAILED;
+                }
+            }
+        }
+        return ANeuralNetworksExecution_startCompute(execution, event);
+    }
+
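From the caller's perspective this fallback is transparent: the returned event behaves
the same whether the execution ran fenced or unfenced.  A minimal caller-side sketch
(error handling elided; `execution` is assumed to be a fully prepared
ANeuralNetworksExecution):

    ANeuralNetworksEvent* event = nullptr;
    int n = ANeuralNetworksExecution_startComputeWithDependencies(
            execution, /*dependencies=*/nullptr, /*num_dependencies=*/0,
            /*duration=*/0, &event);
    if (n == ANEURALNETWORKS_NO_ERROR) {
        ANeuralNetworksEvent_wait(event);  // same contract on both paths
        ANeuralNetworksEvent_free(event);
    }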
     int syncFenceToSignal = -1;
     int n = r->computeFenced(waitForList, duration, &syncFenceToSignal);
     std::unique_ptr<SyncFenceEvent> e =
diff --git a/runtime/VersionedInterfaces.cpp b/runtime/VersionedInterfaces.cpp
index 7139b83..ccb29dc 100644
--- a/runtime/VersionedInterfaces.cpp
+++ b/runtime/VersionedInterfaces.cpp
@@ -638,6 +638,7 @@
         LOG(ERROR) << "IDevice::getVersionString returned the error " << toString(versionStatus);
         return std::nullopt;
     }
+    VLOG(MANAGER) << "Version " << versionString;
 
     const int32_t type = getTypeFunction(device);
     if (type == -1) {
diff --git a/runtime/include/NeuralNetworksOEM.h b/runtime/include/NeuralNetworksOEM.h
index 54a5dfe..e184d52 100644
--- a/runtime/include/NeuralNetworksOEM.h
+++ b/runtime/include/NeuralNetworksOEM.h
@@ -55,9 +55,10 @@
 };  // extends OperandCode
 
 /**
- * If a model contains an {@link ANEURALNETWORKS_OEM_OPERATION}, then
- * either the model must contain only a single operation, or every
- * tensor operand type in the model must be fully specified.
+ * Before API level 30, if a model contains an
+ * {@link ANEURALNETWORKS_OEM_OPERATION}, then either the model must contain
+ * only a single operation, or every tensor operand type in the model must be
+ * fully specified.
  */
 enum {
     /**
diff --git a/runtime/test/TestExecution.cpp b/runtime/test/TestExecution.cpp
index 66bef6b..480b0ef 100644
--- a/runtime/test/TestExecution.cpp
+++ b/runtime/test/TestExecution.cpp
@@ -576,7 +576,7 @@
         // fall back to CPU.  (If we allow CPU fallback, then when our
         // TestDriver reports an execution failure, we'll re-execute
         // on CPU, and will not see the failure.)
-        c->setPartitioning(DeviceManager::kPartitioningWithoutFallback);
+        c->forTest_setPartitioning(DeviceManager::kPartitioningWithoutFallback);
         mCompilation = reinterpret_cast<ANeuralNetworksCompilation*>(c);
     }
 };
diff --git a/runtime/test/TestGenerated.cpp b/runtime/test/TestGenerated.cpp
index 70b0e6f..6b96004 100644
--- a/runtime/test/TestGenerated.cpp
+++ b/runtime/test/TestGenerated.cpp
@@ -265,6 +265,7 @@
 
         // Check output dimensions.
         for (uint32_t i = 0; i < testModel.main.outputIndexes.size(); i++) {
+            SCOPED_TRACE("Output index: " + std::to_string(i));
             const auto& output = testModel.main.operands[testModel.main.outputIndexes[i]];
             if (output.isIgnored) continue;
             std::vector<uint32_t> actualDimensions;
diff --git a/runtime/test/TestPartitioning.cpp b/runtime/test/TestPartitioning.cpp
index 45dabe3..c58b1a4 100644
--- a/runtime/test/TestPartitioning.cpp
+++ b/runtime/test/TestPartitioning.cpp
@@ -888,7 +888,13 @@
     }
 
     Result setPartitioning(uint32_t partitioning) {
-        return static_cast<Result>(builder()->setPartitioning(partitioning));
+        return static_cast<Result>(builder()->forTest_setPartitioning(partitioning));
+    }
+
+    // Simulate recoverable partitioning failure.
+    Result failPartitioning() {
+        return static_cast<Result>(
+                builder()->forTest_failPartitioning(static_cast<int>(Result::OP_FAILED)));
     }
 
     using WrapperCompilation::finish;
@@ -1790,10 +1796,6 @@
     model.finish();
     ASSERT_TRUE(model.isValid());
 
-    // We expect that we cannot successfully partition, because we
-    // have an intermediate operand (opnd2) without dimensions, and
-    // this is not currently handled.
-
     // One device that can and should execute operation 0.
     const auto devices = makeDevices({{"hw", 0.5, (1 << 0)}});
 
@@ -1803,32 +1805,31 @@
     // didn't actually do any partitioning.
     PartitioningCompilation cPNo(&model, devices);
     ASSERT_EQ(cPNo.setPartitioning(DeviceManager::kPartitioningNo), Result::NO_ERROR);
+    ASSERT_EQ(cPNo.failPartitioning(), Result::NO_ERROR);
     ASSERT_EQ(cPNo.finish(), Result::NO_ERROR);
     ASSERT_EQ(cPNo.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
     ASSERT_EQ(cPNo.getExecutionPlan().forTest_simpleGetDevice(), DeviceManager::getCpuDevice());
 
-    // Test kPartitioningWithFallback.  We should attempt
-    // partitioning, reach the end of the partitioning process (so we
-    // have an unsuccessful execution plan), discover the dimensionless
-    // intermediate operand, then fallback to CPU with a SIMPLE plan, and
-    // finally return success.
-    // No need to compare the original model to the model from the plan -- we
-    // didn't actually do any partitioning.
+    // Test kPartitioningWithFallback.  We should attempt partitioning, simulate
+    // a recoverable failure, then fallback to CPU with a SIMPLE plan, and
+    // finally return success.  No need to compare the original model to the
+    // model from the plan -- we didn't actually do any partitioning.
     PartitioningCompilation cPWithFallback(&model, devices);
     ASSERT_EQ(cPWithFallback.setPartitioning(DeviceManager::kPartitioningWithFallback),
               Result::NO_ERROR);
+    ASSERT_EQ(cPWithFallback.failPartitioning(), Result::NO_ERROR);
     ASSERT_EQ(cPWithFallback.finish(), Result::NO_ERROR);
     ASSERT_EQ(cPWithFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
     ASSERT_EQ(cPWithFallback.getExecutionPlan().forTest_simpleGetDevice(),
               DeviceManager::getCpuDevice());
 
-    // Test kPartitioningWithoutFallback.  We should attempt
-    // partitioning, and fail.
+    // Test kPartitioningWithoutFallback.  We should attempt partitioning,
+    // simulate a recoverable failure, and fail.
     PartitioningCompilation cPWithoutFallback(&model, devices);
     ASSERT_EQ(cPWithoutFallback.setPartitioning(DeviceManager::kPartitioningWithoutFallback),
               Result::NO_ERROR);
+    ASSERT_EQ(cPWithoutFallback.failPartitioning(), Result::NO_ERROR);
     ASSERT_EQ(cPWithoutFallback.finish(), Result::OP_FAILED);
-    ASSERT_TRUE(cPWithoutFallback.getExecutionPlan().forTest_hasStepModelOutputsOfUnknownSize());
     ASSERT_EQ(cPWithoutFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::ERROR);
 }
 
diff --git a/runtime/test/TestPartitioningRandom.cpp b/runtime/test/TestPartitioningRandom.cpp
index 968625e..d94ec9f 100644
--- a/runtime/test/TestPartitioningRandom.cpp
+++ b/runtime/test/TestPartitioningRandom.cpp
@@ -220,7 +220,7 @@
     using WrapperCompilation::finish;
 
     Result setPartitioning(uint32_t partitioning) {
-        return static_cast<Result>(builder()->setPartitioning(partitioning));
+        return static_cast<Result>(builder()->forTest_setPartitioning(partitioning));
     }
 
     const ExecutionPlan& getExecutionPlan() const { return builder()->forTest_getExecutionPlan(); }
@@ -751,7 +751,14 @@
 
     const unsigned problemSize = 1 + randUInt(kMaxProblemSize);
     const WrapperOperandType problemType(WrapperType::TENSOR_FLOAT32, {problemSize, problemSize});
-    const WrapperOperandType unknownDimensionsType(WrapperType::TENSOR_FLOAT32, {0, 0});
+    const WrapperOperandType unknownDimensionsTypes[] = {
+            {WrapperType::TENSOR_FLOAT32, {}},                // rank unknown
+            {WrapperType::TENSOR_FLOAT32, {0, 0}},            // both dimensions unknown
+            {WrapperType::TENSOR_FLOAT32, {0, problemSize}},  // first dimension unknown
+            {WrapperType::TENSOR_FLOAT32, {problemSize, 0}},  // second dimension unknown
+    };
+    const unsigned kUnknownDimensionsTypesCount =
+            sizeof(unknownDimensionsTypes) / sizeof(unknownDimensionsTypes[0]);
 
     static const WrapperOperandType activationFunctionType(WrapperType::INT32, {});
 
@@ -803,11 +810,6 @@
     // operations).
     unsigned rootOperationCount = 0;
 
-    // Track if we added operands with unknown dimensions. In this case,
-    // partitioned compilation will fail if such an operand is read in a
-    // different partition than it is written.
-    bool hasUnknownDimensions = false;
-
     // Generate operations.
     for (unsigned i = 0; i < numOperations; i++) {
         const unsigned operationPatternIndex = randUInt(std::size(kOperationPatterns));
@@ -995,19 +997,18 @@
         // OUTPUTS /////////////////////////////////////////////////////////////////////////////////
 
         std::vector<uint32_t> operationOutputs(operationPattern.mNumOutputs);
-        std::generate(operationOutputs.begin(), operationOutputs.end(),
-                      [&model, &problemType, &unknownDimensionsType, &hasUnknownDimensions,
-                       allowUnknownDimensions, this] {
-                          // 3% unknowns causes ~35% of partitionings to fail
-                          // (determined by commenting out the fallback code,
-                          // running tests and noting number of failures).
-                          if (allowUnknownDimensions && randFrac() < 0.03) {
-                              hasUnknownDimensions = true;
-                              return model.addOperand(&unknownDimensionsType);
-                          } else {
-                              return model.addOperand(&problemType);
-                          }
-                      });
+        std::generate(
+                operationOutputs.begin(), operationOutputs.end(),
+                [&model, &problemType, &unknownDimensionsTypes, allowUnknownDimensions, this] {
+                    // Before the fix for http://b/132458982, 3% unknowns
+                    // caused ~35% of partitionings to fail.
+                    if (allowUnknownDimensions && randFrac() < 0.03) {
+                        return model.addOperand(
+                                &unknownDimensionsTypes[randUInt(kUnknownDimensionsTypesCount)]);
+                    } else {
+                        return model.addOperand(&problemType);
+                    }
+                });
 
         // OPERATION ///////////////////////////////////////////////////////////////////////////////
 
@@ -1157,37 +1158,18 @@
     // CPU fallback device
     devices.push_back(DeviceManager::getCpuDevice());
 
-    // Partitioned compilation.
-    // For test cases without unknown intermediate operand sizes we require the
-    // partitioning to succeed without CPU fallback. With unknown sizes we
-    // retry with a fallback if the non-fallback partitioning fails and require
-    // the fallback to succeed.
-    TestCompilation cNoFallback(&model, devices);
-    TestCompilation cWithFallback(&model, devices);
-    TestCompilation* c2 = nullptr;
-    ASSERT_EQ(cNoFallback.setPartitioning(DeviceManager::kPartitioningWithoutFallback),
-              Result::NO_ERROR);
-    auto compilationResult = cNoFallback.finish();
-    if (hasUnknownDimensions && compilationResult == Result::OP_FAILED &&
-        cNoFallback.getExecutionPlan().forTest_hasStepModelOutputsOfUnknownSize()) {
-        ASSERT_EQ(cWithFallback.setPartitioning(DeviceManager::kPartitioningWithFallback),
-                  Result::NO_ERROR);
-        ASSERT_EQ(cWithFallback.finish(), Result::NO_ERROR);
-        ASSERT_EQ(cWithFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
-        ASSERT_EQ(cWithFallback.getExecutionPlan().forTest_simpleGetDevice(),
-                  DeviceManager::getCpuDevice());
-        c2 = &cWithFallback;
-    } else {
-        ASSERT_EQ(compilationResult, Result::NO_ERROR);
-        c2 = &cNoFallback;
-    }
+    // Partitioned compilation.  We require the partitioning to succeed without
+    // CPU fallback.
+    TestCompilation c2(&model, devices);
+    ASSERT_EQ(c2.setPartitioning(DeviceManager::kPartitioningWithoutFallback), Result::NO_ERROR);
+    ASSERT_EQ(c2.finish(), Result::NO_ERROR);
 
 #ifdef VERBOSE
     {
         std::cout << "signatures = " << signatures.size() << ", devices = " << devices.size()
                   << std::endl;
         // TODO: When dumping steps, include non-ExecutionSteps.
-        const ExecutionPlan& plan = c2->getExecutionPlan();
+        const ExecutionPlan& plan = c2.getExecutionPlan();
         switch (plan.forTest_getKind()) {
             case ExecutionPlan::Kind::SIMPLE:
                 std::cout << "plan: simple" << std::endl;
@@ -1376,7 +1358,7 @@
     }
 
     // Partitioned execution.
-    WrapperExecution e2(c2);
+    WrapperExecution e2(&c2);
     ASSERT_NO_FATAL_FAILURE(prepareForExecution(&e2));
     ASSERT_EQ(e2.compute(), Result::NO_ERROR);