model output of unspecified shape as partition input should not force CPU fallback

When execution of one partition writes a model output, and that output's
shape is not fully specified, and the execution succeeds, and that model
output is the input of another partition, execution of that second
partition fails. This results in CPU fallback, except that an OEM or
extension operation results in execution failure being reported at the
NDK level.

The reason execution of the second partition fails is that although
shape information is computed by execution of the first partition, that
shape information is not propagated to the execution of the second
partition: When we try to execute the second partition, both the model
input operand and the request input argument have unspecified shape,
causing a validation failure.

The fix is that in ExecutionStep::mapInputsAndOutputs() when we
initialize the second partition's input from the first partition's
output, we copy the dimensions from the first partition's output to the
second partition's input.

Test: NeuralNetworksTest_static (PartitioningTest, DynamicTemporariesTest, RandomPartitioningTest)

Bug: 168657259
Merged-In: I388f14feb2e55d1632958a07e2f2b5cecb9328d8
Change-Id: I388f14feb2e55d1632958a07e2f2b5cecb9328d8
(cherry picked from commit bf459f7b742bfbb3bc06948014d73d6fd41591ab)
diff --git a/runtime/ExecutionBuilder.cpp b/runtime/ExecutionBuilder.cpp
index 120ff99..8b6b817 100644
--- a/runtime/ExecutionBuilder.cpp
+++ b/runtime/ExecutionBuilder.cpp
@@ -544,7 +544,7 @@
 
     // Get fallback executor.
     std::shared_ptr<StepExecutor> executor;
-    int n1 = plan.fallback(controller, &executor);
+    int n1 = plan.fallback(controller, &executor, nullptr, nullptr);
     if (n1 != ANEURALNETWORKS_NO_ERROR) {
         return {n1, {}, kNoTiming, nullptr};
     }
@@ -579,8 +579,9 @@
         // Get the current step of the execution.
         std::shared_ptr<StepExecutor> executor;
         std::shared_ptr<ExecutionBurstController> burstController;
-        int n = doInsufficientSizeFallback ? plan.fallback(controller, &executor, &burstController)
-                                           : plan.next(controller, &executor, &burstController);
+        int n = doInsufficientSizeFallback
+                        ? plan.fallback(controller, &executor, &burstController, &outputShapes)
+                        : plan.next(controller, &executor, &burstController, &outputShapes);
         doInsufficientSizeFallback = false;
         if (n != ANEURALNETWORKS_NO_ERROR) {
             // During the interpreted execution of control flow, a loop timeout
@@ -779,7 +780,7 @@
 
         // Get the current step of the execution.
         std::shared_ptr<StepExecutor> executor;
-        int n = plan.next(controller, &executor, nullptr, syncFence);
+        int n = plan.next(controller, &executor, nullptr, nullptr, syncFence);
         if (n != ANEURALNETWORKS_NO_ERROR) {
             // During the interpreted execution of control flow, a loop timeout
             // might occur in ExecutionPlan::next().
@@ -1259,17 +1260,28 @@
 }
 
 void StepExecutor::mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
-                                    ModelArgumentInfo* executorInputOrOutput) {
+                                    ModelArgumentInfo* executorInputOrOutput,
+                                    const hidl_vec<uint32_t>* builderDimensions) {
+    auto updateDimensions = [executorInputOrOutput, builderDimensions] {
+        if (!builderDimensions) {
+            return;
+        }
+        executorInputOrOutput->dimensions() = *builderDimensions;
+    };
+
     *executorInputOrOutput = builderInputOrOutput;
     switch (executorInputOrOutput->state()) {
         default:
             CHECK(false) << "unexpected ModelArgumentInfo::state";
             break;
         case ModelArgumentInfo::HAS_NO_VALUE:
-        case ModelArgumentInfo::POINTER:
         case ModelArgumentInfo::UNSPECIFIED:
             break;
+        case ModelArgumentInfo::POINTER:
+            updateDimensions();
+            break;
         case ModelArgumentInfo::MEMORY: {
+            updateDimensions();
             const uint32_t builderPoolIndex = builderInputOrOutput.locationAndLength().poolIndex;
             const Memory* memory = mExecutionBuilder->mMemories[builderPoolIndex];
             const uint32_t executorPoolIndex = mMemories.add(memory);
diff --git a/runtime/ExecutionBuilder.h b/runtime/ExecutionBuilder.h
index f9eff4e..2540f23 100644
--- a/runtime/ExecutionBuilder.h
+++ b/runtime/ExecutionBuilder.h
@@ -261,14 +261,19 @@
     // Map inputs and outputs from ExecutionBuilder to StepExecutor,
     // one at a time.  Note that these are input/output indexes, not
     // operand indexes.
+    //
+    // For mapOutputToInput(), outputDimensions may be nullptr if the input
+    // operand has fully specified dimensions.
     void mapInput(uint32_t builderIndex, uint32_t executorIndex) {
         mapInputOrOutput(mExecutionBuilder->mInputs[builderIndex], &mInputs[executorIndex]);
     }
     void mapOutput(uint32_t builderIndex, uint32_t executorIndex) {
         mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mOutputs[executorIndex]);
     }
-    void mapOutputToInput(uint32_t builderIndex, uint32_t executorIndex) {
-        mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mInputs[executorIndex]);
+    void mapOutputToInput(uint32_t builderIndex, uint32_t executorIndex,
+                          const hal::hidl_vec<uint32_t>* outputDimensions) {
+        mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mInputs[executorIndex],
+                         outputDimensions);
     }
 
     // If no length is provided, the input or output is assumed to have the length
@@ -312,8 +317,11 @@
     bool areDynamicTemporariesAllocated() const;
 
    private:
+    // builderDimensions may be nullptr if executorInputOrOutput has fully
+    // specified dimensions.
     void mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
-                          ModelArgumentInfo* executorInputOrOutput);
+                          ModelArgumentInfo* executorInputOrOutput,
+                          const hal::hidl_vec<uint32_t>* builderDimensions = nullptr);
 
     // If no length is provided, the input or output is assumed to have the length
     // of the corresponding operand.  dimensions must either have zero rank or
diff --git a/runtime/ExecutionPlan.cpp b/runtime/ExecutionPlan.cpp
index caf263c..c3aa61f 100644
--- a/runtime/ExecutionPlan.cpp
+++ b/runtime/ExecutionPlan.cpp
@@ -497,7 +497,8 @@
 }
 
 void ExecutionStep::mapInputsAndOutputs(
-        std::shared_ptr<StepExecutor> executor, const Memory* temporaryMemory,
+        std::shared_ptr<StepExecutor> executor,
+        const std::vector<hal::OutputShape>* mainModelOutputShapes, const Memory* temporaryMemory,
         const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOffsetOfTemporary,
         const DynamicTemporaries& dynamicTemporaries,
         const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
@@ -517,7 +518,10 @@
             executor->mapInput(it->second, stepInputIndex);
         } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                    it != sourceOperandToOutputIndex.end()) {
-            executor->mapOutputToInput(it->second, stepInputIndex);
+            executor->mapOutputToInput(it->second, stepInputIndex,
+                                       mainModelOutputShapes
+                                               ? &mainModelOutputShapes->at(it->second).dimensions
+                                               : nullptr);
         } else if (auto it = sourceOperandToConstantReference.find(sourceOperandIndex);
                    it != sourceOperandToConstantReference.end()) {
             // Constant partition boundary operand. This could be an IF branch
@@ -1216,7 +1220,8 @@
 // TODO: Find a better way to provide this functionality.
 int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
                             std::shared_ptr<StepExecutor>* executor,
-                            std::shared_ptr<ExecutionBurstController>* burstController) const {
+                            std::shared_ptr<ExecutionBurstController>* burstController,
+                            const std::vector<OutputShape>* mainModelOutputShapes) const {
     *executor = nullptr;
     if (burstController != nullptr) {
         *burstController = nullptr;
@@ -1236,7 +1241,7 @@
     }
 
     controller->mNextStepIndex = controller->mFallbackNextStepIndex;
-    return next(controller, executor, burstController);
+    return next(controller, executor, burstController, mainModelOutputShapes);
 }
 
 ExecutionPlan::Buffer::Buffer(void* pointer, uint32_t size)
@@ -1332,6 +1337,7 @@
 int ExecutionPlan::next(std::shared_ptr<Controller> controller,
                         std::shared_ptr<StepExecutor>* executor,
                         std::shared_ptr<ExecutionBurstController>* burstController,
+                        const std::vector<OutputShape>* mainModelOutputShapes,
                         int syncFdOfLastStep) const {
     controller->mLastStepSyncFd = syncFdOfLastStep;
     *executor = nullptr;
@@ -1373,12 +1379,13 @@
         return ANEURALNETWORKS_NO_ERROR;
     }
 
-    return nextCompound(controller, executor, burstController);
+    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
 }
 
 int ExecutionPlan::nextCompound(std::shared_ptr<Controller> controller,
                                 std::shared_ptr<StepExecutor>* executor,
-                                std::shared_ptr<ExecutionBurstController>* burstController) const {
+                                std::shared_ptr<ExecutionBurstController>* burstController,
+                                const std::vector<OutputShape>* mainModelOutputShapes) const {
     if (controller->mNextStepIndex == Controller::kBadStepIndex) {
         return ANEURALNETWORKS_OP_FAILED;
     }
@@ -1391,13 +1398,13 @@
 
     const auto& logicalStep = compoundBody->mSteps[controller->mNextStepIndex];
     if (const IfStep* step = logicalStep->tryIfStep()) {
-        return nextCompound(step, controller, executor, burstController);
+        return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
     } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
-        return nextCompound(step, controller, executor, burstController);
+        return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
     } else if (const GotoStep* step = logicalStep->tryGotoStep()) {
-        return nextCompound(step, controller, executor, burstController);
+        return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
     } else if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
-        return nextCompound(step, controller, executor, burstController);
+        return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
     } else {
         CHECK(false) << "Unknown step variant";
         return ANEURALNETWORKS_BAD_STATE;
@@ -1406,7 +1413,8 @@
 
 int ExecutionPlan::nextCompound(const ExecutionStep* step, std::shared_ptr<Controller> controller,
                                 std::shared_ptr<StepExecutor>* executor,
-                                std::shared_ptr<ExecutionBurstController>* burstController) const {
+                                std::shared_ptr<ExecutionBurstController>* burstController,
+                                const std::vector<OutputShape>* mainModelOutputShapes) const {
     VLOG(EXECUTION) << "next: Step#" << controller->mNextStepIndex << ": execute on "
                     << step->getDevice()->getName();
 
@@ -1418,7 +1426,7 @@
                                                step, &controller->mDynamicTemporaries);
 
     step->mapInputsAndOutputs(
-            *executor, controller->mTemporaries.get(),
+            *executor, mainModelOutputShapes, controller->mTemporaries.get(),
             controller->mSourceOperandToOffsetOfTemporary, controller->mDynamicTemporaries,
             controller->mSourceOperandToInputIndex, controller->mSourceOperandToOutputIndex,
             controller->mSourceOperandToConstantReference);
@@ -1505,7 +1513,8 @@
 
 int ExecutionPlan::nextCompound(const IfStep* step, std::shared_ptr<Controller> controller,
                                 std::shared_ptr<StepExecutor>* executor,
-                                std::shared_ptr<ExecutionBurstController>* burstController) const {
+                                std::shared_ptr<ExecutionBurstController>* burstController,
+                                const std::vector<OutputShape>* mainModelOutputShapes) const {
     VLOG(EXECUTION) << "next: " << toString(*step);
     // If the last step has a sync fence, wait for it to signal before reading the condition value.
     // This is safe because the steps are serialized when doing fenced compute.
@@ -1539,12 +1548,13 @@
         // step->outerOutputOperands[i] to implement double buffering.
         controller->setOutput(step->outerOutputOperands[i], branchOutputOperands[i]);
     }
-    return nextCompound(controller, executor, burstController);
+    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
 }
 
 int ExecutionPlan::nextCompound(const WhileStep* step, std::shared_ptr<Controller> controller,
                                 std::shared_ptr<StepExecutor>* executor,
-                                std::shared_ptr<ExecutionBurstController>* burstController) const {
+                                std::shared_ptr<ExecutionBurstController>* burstController,
+                                const std::vector<OutputShape>* mainModelOutputShapes) const {
     WhileState& state = controller->mWhileState[controller->mNextStepIndex];
     if (state.stage == WhileState::EVALUATE_CONDITION) {
         state.iteration = state.iteration == WhileState::kOutsideLoop ? 0 : state.iteration + 1;
@@ -1572,7 +1582,7 @@
         }
 
         state.stage = WhileState::EVALUATE_BODY;
-        return nextCompound(controller, executor, burstController);
+        return nextCompound(controller, executor, burstController, mainModelOutputShapes);
     }
 
     CHECK(state.stage == WhileState::EVALUATE_BODY);
@@ -1660,15 +1670,16 @@
     }
 
     state.stage = WhileState::EVALUATE_CONDITION;
-    return nextCompound(controller, executor, burstController);
+    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
 }
 
 int ExecutionPlan::nextCompound(const GotoStep* step, std::shared_ptr<Controller> controller,
                                 std::shared_ptr<StepExecutor>* executor,
-                                std::shared_ptr<ExecutionBurstController>* burstController) const {
+                                std::shared_ptr<ExecutionBurstController>* burstController,
+                                const std::vector<OutputShape>* mainModelOutputShapes) const {
     VLOG(EXECUTION) << "next: " << toString(*step);
     controller->mNextStepIndex = step->gotoStepIndex;
-    return nextCompound(controller, executor, burstController);
+    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
 }
 
 void ExecutionPlan::becomeCompoundIfEmpty() {
diff --git a/runtime/ExecutionPlan.h b/runtime/ExecutionPlan.h
index 1a1de1b..740912d 100644
--- a/runtime/ExecutionPlan.h
+++ b/runtime/ExecutionPlan.h
@@ -273,8 +273,12 @@
     //
     // This method only reads map entries for which the first element of
     // SourceOperandIndex is mSourceModelIndex.
+    //
+    // mainModelOutputShapes may be nullptr if the only main model outputs that are
+    //     inputs of this step are of fully specified shape.
     void mapInputsAndOutputs(
             std::shared_ptr<StepExecutor> stepExecutor,
+            const std::vector<hal::OutputShape>* mainModelOutputShapes,
             const Memory* temporaryMemory,  // for static temporaries
             const std::map<SourceOperandIndex, uint32_t>&
                     sourceOperandToOffsetOfTemporary,  // for static temporaries
@@ -650,14 +654,19 @@
     // Sets up a new StepExecutor and burstController (if applicable) if there
     // is a step to execute. See ExecutionPlan::Controller.
     // Handles control flow. See LogicalStep.
+    // burstController is nullptr if we are not to do burst execution.
+    // mainModelOutputShapes may be nullptr if the only main model outputs that are step model
+    //     inputs are of fully specified shape.
     // syncFdOfLastStep is the sync fence fd generated by the most recently processed step.
     int next(std::shared_ptr<Controller> controller, std::shared_ptr<StepExecutor>* executor,
-             std::shared_ptr<ExecutionBurstController>* burstController = nullptr,
+             std::shared_ptr<ExecutionBurstController>* burstController,
+             const std::vector<hal::OutputShape>* mainModelOutputShapes,
              int syncFdOfLastStep = -1) const;
 
     // Create the same executor as the last one created by next().
     int fallback(std::shared_ptr<Controller> controller, std::shared_ptr<StepExecutor>* executor,
-                 std::shared_ptr<ExecutionBurstController>* burstController = nullptr) const;
+                 std::shared_ptr<ExecutionBurstController>* burstController,
+                 const std::vector<hal::OutputShape>* mainModelOutputShapes) const;
 
     ExecutionStep* createNewExecutionStep(uint32_t sourceModelIndex,
                                           const std::shared_ptr<Device> device);
@@ -760,19 +769,24 @@
     // Handles control flow. See LogicalStep.
     int nextCompound(std::shared_ptr<Controller> controller,
                      std::shared_ptr<StepExecutor>* executor,
-                     std::shared_ptr<ExecutionBurstController>* burstController) const;
+                     std::shared_ptr<ExecutionBurstController>* burstController,
+                     const std::vector<hal::OutputShape>* mainModelOutputShapes) const;
     int nextCompound(const ExecutionStep* step, std::shared_ptr<Controller> controller,
                      std::shared_ptr<StepExecutor>* executor,
-                     std::shared_ptr<ExecutionBurstController>* burstController) const;
+                     std::shared_ptr<ExecutionBurstController>* burstController,
+                     const std::vector<hal::OutputShape>* mainModelOutputShapes) const;
     int nextCompound(const IfStep* step, std::shared_ptr<Controller> controller,
                      std::shared_ptr<StepExecutor>* executor,
-                     std::shared_ptr<ExecutionBurstController>* burstController) const;
+                     std::shared_ptr<ExecutionBurstController>* burstController,
+                     const std::vector<hal::OutputShape>* mainModelOutputShapes) const;
     int nextCompound(const WhileStep* step, std::shared_ptr<Controller> controller,
                      std::shared_ptr<StepExecutor>* executor,
-                     std::shared_ptr<ExecutionBurstController>* burstController) const;
+                     std::shared_ptr<ExecutionBurstController>* burstController,
+                     const std::vector<hal::OutputShape>* mainModelOutputShapes) const;
     int nextCompound(const GotoStep* step, std::shared_ptr<Controller> controller,
                      std::shared_ptr<StepExecutor>* executor,
-                     std::shared_ptr<ExecutionBurstController>* burstController) const;
+                     std::shared_ptr<ExecutionBurstController>* burstController,
+                     const std::vector<hal::OutputShape>* mainModelOutputShapes) const;
 
     struct Body {
         virtual ~Body() {}
diff --git a/runtime/test/TestPartitioning.cpp b/runtime/test/TestPartitioning.cpp
index 8b94649..d85717c 100644
--- a/runtime/test/TestPartitioning.cpp
+++ b/runtime/test/TestPartitioning.cpp
@@ -2265,8 +2265,6 @@
     }
 }
 
-#if 0
-// TODO: enable this test once b/168657259 is fixed
 TEST_F(DynamicTemporariesTest, ModelOutputsSufficientSize) {
     // The purpose of this test is to confirm that the partitioner and the
     // runtime can handle a model output of unspecified dimensions but
@@ -2280,7 +2278,6 @@
     ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(true, true));
 }
 
-// TODO: enable this test once b/168657259 is fixed
 TEST_F(DynamicTemporariesTest, DynamicTemporariesUnspecifiedOutputs) {
     // The purpose of this test is to confirm that the partitioner can produce
     // dynamic temporaries and that the runtime can handle them properly.  Note
@@ -2290,7 +2287,6 @@
     ASSERT_NO_FATAL_FAILURE(compileModelAndComparePlan());
     ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(true, true));
 }
-#endif
 
 TEST_F(DynamicTemporariesTest, DynamicTemporariesSpecifiedOutputs) {
     // The purpose of this test is to confirm that the partitioner can produce
@@ -2338,7 +2334,6 @@
     ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(false, true));
 }
 
-#if 0
 // TODO: enable this test once b/168657259 is fixed
 TEST_F(DynamicTemporariesTest, ModelOutput4InsufficientSizeWithoutDynamicTemporary) {
     // The purpose of this test is to confirm that the runtime can detect a
@@ -2351,7 +2346,6 @@
     ASSERT_NO_FATAL_FAILURE(compileModelAndComparePlan());
     ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(true, false));
 }
-#endif
 
 // Test token rehashing during the compilation step.
 class CacheTest : public PartitioningTest {