More testing for partitions with boundary temporaries of unknown size.

PartitioningTest:
- Add verification that no existing tests have boundary temporaries of
  unknown size.
- Add new tests:
  - Model output with unspecified shape and sufficient size.
    Disabled because of b/168657259.
  - Boundary temporary of unknown size, model outputs of unspecified
    shape and sufficient size.  Disabled because of b/168657259.
  - Boundary temporary of unknown size, model outputs of specified
    shape.
  - Boundary temporary of unknown size, model output with insufficient size.
  - Model output with insufficient size.
- Verify that all tests have the expected
  modelOutputsThatAreDownstreamInputs set: most tests have an empty set,
  two old tests have a one-member set, and the four new tests each have
  a one-member set (sometimes a model output of unknown size).
- "#ifdef VERBOSE" now produces more information when a comparison fails.

RandomPartitioningTest:
- Some ASYNC testing now becomes FENCED testing,
  to ensure that boundary temporaries of unknown size are handled
  properly (we expect, but do not confirm, that this will happen via
  fallback to ASYNC).  This potentially changes all test cases (because
  of its effect on pseudorandom number sequences).  Note that all
  attempts in these tests to test handling of boundary temporaries of
  unknown size are vulnerable to compile-time shape propagation (an
  optimization that does not occur today): If such propagation happens,
  then we may not actually have boundary temporaries of unknown size,
  and so we will not be testing what we want to test.
- Under "#ifdef TRACE_DYNTEMP", generate some debugging output that
  may be useful to determine test coverage for support of boundary
  temporaries of unknown size.

Also:
- Improvements to Execution wrapper: New methods getComputeMode() and
  compute(ComputeMode).  Change TestExecution.cpp and
  TestGenerated.cpp to use the new methods.
- Improve logging to better distinguish burst execution from ordinary
  synchronous execution.
- Make some validation failure messages more verbose.

Test: NeuralNetworksTest_static (PartitioningTest, DynamicTemporariesTest, RandomPartitioningTest)
Test: Confirm that Execution wrapper changes work as expected by
      eyeballing VLOG(EXECUTION) output for NeuralNetworksTest_static
      Flavor/ExecutionTest13.Wait/0 256 and for *Generated*add 256

Bug: 132458982
Merged-In: I11e0e5bba9a8c6762881127a8aaf10b8cef9db3d
Change-Id: I11e0e5bba9a8c6762881127a8aaf10b8cef9db3d
(cherry picked from commit c7766b0113918f4bac0c53ad74264df3d1723f7d)
diff --git a/common/ValidateHal.cpp b/common/ValidateHal.cpp
index 6470fbc..46f9b2f 100644
--- a/common/ValidateHal.cpp
+++ b/common/ValidateHal.cpp
@@ -782,8 +782,9 @@
                     // extension operand type.
                     if (!isExtensionOperandType(operand.type) &&
                         !nonExtensionOperandTypeIsScalar(static_cast<int>(operand.type))) {
-                        NN_RET_CHECK_GT(modelRank, 0) << "Model has unknown rank but the request "
-                                                         "does not specify the rank.";
+                        NN_RET_CHECK_GT(modelRank, 0)
+                                << "Model " << type << " " << requestArgumentIndex
+                                << " has unknown rank but the request does not specify the rank.";
                     }
                     // Validate that all the dimensions are specified in the model.
                     for (size_t i = 0; i < modelRank; i++) {
diff --git a/runtime/ExecutionBuilder.cpp b/runtime/ExecutionBuilder.cpp
index e36e564..120ff99 100644
--- a/runtime/ExecutionBuilder.cpp
+++ b/runtime/ExecutionBuilder.cpp
@@ -966,7 +966,11 @@
     std::shared_ptr<ExecutionPlan::Controller> controller =
             mPlan->makeController(this, burstBuilder);
     if (synchronous) {
-        VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)";
+        if (burstBuilder) {
+            VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API, burst)";
+        } else {
+            VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)";
+        }
         sp<ExecutionCallback> localSynchronizationCallback = new ExecutionCallback();
         localSynchronizationCallback->setOnFinish(wrappedFinish);
         asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback, deadline,
diff --git a/runtime/ExecutionPlan.cpp b/runtime/ExecutionPlan.cpp
index 97bfacd..caf263c 100644
--- a/runtime/ExecutionPlan.cpp
+++ b/runtime/ExecutionPlan.cpp
@@ -1191,24 +1191,16 @@
     // TODO(b/157236079): Move some or all of this work to compilation time?
     DynamicTemporaries dynamicTemporaries;
     const TypeManager* typeManager = TypeManager::get();
-    for (const auto& logicalStep : compound()->mSteps) {
-        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
-            const uint32_t stepIndex = step->getIndex();
-            const uint32_t sourceModelIndex = step->getSourceModelIndex();
-            for (const auto& entry : step->getTempsAsStepModelOutputs()) {
-                const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, entry.first);
-                const auto& sourceOperand = getSourceOperand(sourceOperandIndex);
-                if (hasUnknownSize(sourceOperand)) {
-                    CHECK(typeManager->isTensorType(sourceOperand.type));
-                    // TODO: For now we guess an initial size equal to element
-                    // size, which is overly conservative.
-                    const uint32_t size = typeManager->getSizeOfData(sourceOperand.type, {1});
-                    dynamicTemporaries.declare(sourceOperandIndex, stepIndex,
-                                               sourceOperand.dimensions, size);
-                }
-            }
-        }
-    }
+    forEachDynamicTemporary([typeManager, &dynamicTemporaries](
+                                    SourceOperandIndex sourceOperandIndex,
+                                    const Operand& sourceOperand, uint32_t definingStepIndex) {
+        CHECK(typeManager->isTensorType(sourceOperand.type));
+        // TODO: For now we guess an initial size equal to element
+        // size, which is overly conservative.
+        const uint32_t size = typeManager->getSizeOfData(sourceOperand.type, {1});
+        dynamicTemporaries.declare(sourceOperandIndex, definingStepIndex, sourceOperand.dimensions,
+                                   size);
+    });
     dynamicTemporaries.endDeclarations();
     dynamicTemporaries.vlogDump("finished declarations");
 
@@ -1785,6 +1777,15 @@
     return compound()->mSteps;
 }
 
+std::set<uint32_t> ExecutionPlan::forTest_flatGetDynamicTemporaries() const {
+    CHECK_EQ(getSourceModels().size(), size_t(1));
+    std::set<uint32_t> ret;
+    forEachDynamicTemporary([&ret](SourceOperandIndex dynTemp, const Operand&, uint32_t) {
+        ret.insert(dynTemp.second);
+    });
+    return ret;
+}
+
 bool ExecutionPlan::hasDynamicTemporaries() const {
     return mBody->hasDynamicTemporaries();
 }
@@ -1861,6 +1862,28 @@
     }
 }
 
+void ExecutionPlan::forEachDynamicTemporary(
+        const std::function<void(SourceOperandIndex, const Operand&, uint32_t definingStepIndex)>&
+                fn) const {
+    if (mState != COMPOUND) {
+        return;
+    }
+
+    for (const auto& logicalStep : compound()->mSteps) {
+        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
+            const uint32_t stepIndex = step->getIndex();
+            const uint32_t sourceModelIndex = step->getSourceModelIndex();
+            for (const auto& entry : step->getTempsAsStepModelOutputs()) {
+                const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, entry.first);
+                const auto& sourceOperand = getSourceOperand(sourceOperandIndex);
+                if (hasUnknownSize(sourceOperand)) {
+                    fn(sourceOperandIndex, sourceOperand, stepIndex);
+                }
+            }
+        }
+    }
+}
+
 int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
                                    uint32_t preference, uint32_t priority,
                                    const std::optional<Deadline>& deadline, ExecutionPlan* plan,
diff --git a/runtime/ExecutionPlan.h b/runtime/ExecutionPlan.h
index 3b6beb6..1a1de1b 100644
--- a/runtime/ExecutionPlan.h
+++ b/runtime/ExecutionPlan.h
@@ -24,6 +24,7 @@
 
 #include <algorithm>
 #include <chrono>
+#include <functional>
 #include <map>
 #include <memory>
 #include <ostream>
@@ -717,6 +718,9 @@
     Kind forTest_getKind() const;
     std::shared_ptr<const Device> forTest_simpleGetDevice() const;
     const std::vector<std::shared_ptr<LogicalStep>>& forTest_compoundGetSteps() const;
+    //     The "flat" in the name signifies that this method requires that the
+    //     model not contain any control flow operations.
+    std::set<uint32_t> forTest_flatGetDynamicTemporaries() const;
     const uint8_t* forTest_simpleGetCacheToken() const;
 
    private:
@@ -905,6 +909,9 @@
         return static_cast<const CompoundBody*>(mBody);
     }
 
+    void forEachDynamicTemporary(const std::function<void(SourceOperandIndex, const hal::Operand&,
+                                                          uint32_t definingStepIndex)>&) const;
+
     // Pointers to compilation caching information in CompilationBuilder.
     const std::string* mCacheDir = nullptr;
     const uint8_t* mToken = nullptr;
diff --git a/runtime/test/TestExecution.cpp b/runtime/test/TestExecution.cpp
index 480b0ef..3441f9f 100644
--- a/runtime/test/TestExecution.cpp
+++ b/runtime/test/TestExecution.cpp
@@ -16,8 +16,6 @@
 
 #include <gtest/gtest.h>
 
-#include <android-base/scopeguard.h>
-
 #include <algorithm>
 #include <atomic>
 #include <cassert>
@@ -757,19 +755,15 @@
         SCOPED_TRACE("burstCompute");
 
         // TODO: If a burst API is added to nn::test_wrapper (e.g.,
-        // Execution::burstCompute()), then use that, rather than using
-        // Execution::setComputeMode() to make Execution::compute() use burst
-        // functionality.
-
-        auto oldComputeMode =
-                WrapperExecution::setComputeMode(WrapperExecution::ComputeMode::BURST);
-        base::ScopeGuard restore(
-                [oldComputeMode] { WrapperExecution::setComputeMode(oldComputeMode); });
+        // Execution::burstCompute()), then use that, rather than
+        // Execution::compute(WrapperExecution::ComputeMode::BURST).
 
         WrapperExecution execution(&mCompilation);
         ASSERT_NO_FATAL_FAILURE(setInputOutput(&execution));
         TestPreparedModelLatest::pauseExecutions(true);
-        std::thread run([this, &execution] { EXPECT_EQ(execution.compute(), kExpectResult); });
+        std::thread run([this, &execution] {
+            EXPECT_EQ(execution.compute(WrapperExecution::ComputeMode::BURST), kExpectResult);
+        });
         getDimensionsWhileRunning(execution);
         TestPreparedModelLatest::pauseExecutions(false);
         run.join();
diff --git a/runtime/test/TestGenerated.cpp b/runtime/test/TestGenerated.cpp
index 6b96004..d7e4d75 100644
--- a/runtime/test/TestGenerated.cpp
+++ b/runtime/test/TestGenerated.cpp
@@ -82,6 +82,7 @@
     bool mExpectFailure = false;
     bool mTestQuantizationCoupling = false;
     bool mTestDeviceMemory = false;
+    Execution::ComputeMode mComputeMode = Execution::getComputeMode();
 };
 
 int GeneratedTests::mVndkVersion = __ANDROID_API_FUTURE__;
@@ -138,13 +139,14 @@
     }
 }
 
-static void computeWithPtrs(const TestModel& testModel, Execution* execution, Result* result,
+static void computeWithPtrs(const TestModel& testModel, Execution* execution,
+                            Execution::ComputeMode computeMode, Result* result,
                             std::vector<TestBuffer>* outputs) {
     {
         NNTRACE_APP(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "computeWithPtrs example");
         createRequest(testModel, execution, outputs);
     }
-    *result = execution->compute();
+    *result = execution->compute(computeMode);
 }
 
 static ANeuralNetworksMemory* createDeviceMemoryForInput(const Compilation& compilation,
@@ -175,8 +177,8 @@
 
 // Set result = Result::NO_ERROR and outputs = {} if the test should be skipped.
 static void computeWithDeviceMemories(const Compilation& compilation, const TestModel& testModel,
-                                      Execution* execution, Result* result,
-                                      std::vector<TestBuffer>* outputs) {
+                                      Execution* execution, Execution::ComputeMode computeMode,
+                                      Result* result, std::vector<TestBuffer>* outputs) {
     ASSERT_NE(execution, nullptr);
     ASSERT_NE(result, nullptr);
     ASSERT_NE(outputs, nullptr);
@@ -218,7 +220,7 @@
         }
     }
 
-    *result = execution->compute();
+    *result = execution->compute(computeMode);
 
     // Copy out output results.
     for (uint32_t i = 0; i < testModel.main.outputIndexes.size(); i++) {
@@ -245,9 +247,10 @@
     std::vector<TestBuffer> outputs;
 
     if (mTestDeviceMemory) {
-        computeWithDeviceMemories(compilation, testModel, &execution, &result, &outputs);
+        computeWithDeviceMemories(compilation, testModel, &execution, mComputeMode, &result,
+                                  &outputs);
     } else {
-        computeWithPtrs(testModel, &execution, &result, &outputs);
+        computeWithPtrs(testModel, &execution, mComputeMode, &result, &outputs);
     }
 
     if (result == Result::NO_ERROR && outputs.empty()) {
@@ -388,21 +391,18 @@
 
 #ifdef NNTEST_COMPUTE_MODE
 TEST_P(GeneratedTests, Sync) {
-    const auto oldComputeMode = Execution::setComputeMode(Execution::ComputeMode::SYNC);
+    mComputeMode = Execution::ComputeMode::SYNC;
     execute(testModel);
-    Execution::setComputeMode(oldComputeMode);
 }
 
 TEST_P(GeneratedTests, Async) {
-    const auto oldComputeMode = Execution::setComputeMode(Execution::ComputeMode::ASYNC);
+    mComputeMode = Execution::ComputeMode::ASYNC;
     execute(testModel);
-    Execution::setComputeMode(oldComputeMode);
 }
 
 TEST_P(GeneratedTests, Burst) {
-    const auto oldComputeMode = Execution::setComputeMode(Execution::ComputeMode::BURST);
+    mComputeMode = Execution::ComputeMode::BURST;
     execute(testModel);
-    Execution::setComputeMode(oldComputeMode);
 }
 #else
 TEST_P(GeneratedTests, Test) {
@@ -427,9 +427,8 @@
 }
 
 TEST_P(FencedComputeTest, Test) {
-    const auto oldComputeMode = Execution::setComputeMode(Execution::ComputeMode::FENCED);
+    mComputeMode = Execution::ComputeMode::FENCED;
     execute(testModel);
-    Execution::setComputeMode(oldComputeMode);
 }
 
 INSTANTIATE_GENERATED_TEST(GeneratedTests,
diff --git a/runtime/test/TestNeuralNetworksWrapper.h b/runtime/test/TestNeuralNetworksWrapper.h
index ae40121..d89854b 100644
--- a/runtime/test/TestNeuralNetworksWrapper.h
+++ b/runtime/test/TestNeuralNetworksWrapper.h
@@ -409,8 +409,23 @@
         return result;
     }
 
-    Result compute() {
-        switch (mComputeMode) {
+    // By default, compute() uses the synchronous API. Either an argument or
+    // setComputeMode() can be used to change the behavior of compute() to
+    // either:
+    // - use the asynchronous or fenced API and then wait for computation to complete
+    // or
+    // - use the burst API
+    // Returns the previous ComputeMode.
+    enum class ComputeMode { SYNC, ASYNC, BURST, FENCED };
+    static ComputeMode setComputeMode(ComputeMode mode) {
+        ComputeMode oldComputeMode = mComputeMode;
+        mComputeMode = mode;
+        return oldComputeMode;
+    }
+    static ComputeMode getComputeMode() { return mComputeMode; }
+
+    Result compute(ComputeMode computeMode = mComputeMode) {
+        switch (computeMode) {
             case ComputeMode::SYNC: {
                 return static_cast<Result>(ANeuralNetworksExecution_compute(mExecution));
             }
@@ -455,19 +470,6 @@
         return Result::BAD_DATA;
     }
 
-    // By default, compute() uses the synchronous API. setComputeMode() can be
-    // used to change the behavior of compute() to either:
-    // - use the asynchronous API and then wait for computation to complete
-    // or
-    // - use the burst API
-    // Returns the previous ComputeMode.
-    enum class ComputeMode { SYNC, ASYNC, BURST, FENCED };
-    static ComputeMode setComputeMode(ComputeMode mode) {
-        ComputeMode oldComputeMode = mComputeMode;
-        mComputeMode = mode;
-        return oldComputeMode;
-    }
-
     Result getOutputOperandDimensions(uint32_t index, std::vector<uint32_t>* dimensions) {
         uint32_t rank = 0;
         Result result = static_cast<Result>(
diff --git a/runtime/test/TestPartitioning.cpp b/runtime/test/TestPartitioning.cpp
index c58b1a4..8b94649 100644
--- a/runtime/test/TestPartitioning.cpp
+++ b/runtime/test/TestPartitioning.cpp
@@ -19,6 +19,7 @@
 #include <algorithm>
 #include <filesystem>
 #include <functional>
+#include <iostream>
 #include <map>
 #include <memory>
 #include <queue>
@@ -160,16 +161,12 @@
 using Result = ::android::nn::test_wrapper::Result;
 using SampleDriver = ::android::nn::sample_driver::SampleDriver;
 using WrapperCompilation = ::android::nn::test_wrapper::Compilation;
+using WrapperExecution = ::android::nn::test_wrapper::Execution;
 using WrapperModel = ::android::nn::test_wrapper::Model;
 using WrapperOperandType = ::android::nn::test_wrapper::OperandType;
 using WrapperSymmPerChannelQuantParams = ::android::nn::test_wrapper::SymmPerChannelQuantParams;
 using WrapperType = ::android::nn::test_wrapper::Type;
 
-template <typename T>
-using MQDescriptorSync = ::android::hardware::MQDescriptorSync<T>;
-
-constexpr Timing kBadTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};
-
 Capabilities makeCapabilities(float perf) {
     PerformanceInfo perfInfo = {.execTime = perf, .powerUsage = perf};
     return {.relaxedFloat32toFloat16PerformanceScalar = perfInfo,
@@ -309,52 +306,6 @@
 // operation kind K corresponds to the bit (1 << K).  The other operations are
 // represented by a set of OperationType.
 class PartitioningDriver : public SampleDriver {
-   private:
-    // Placeholder class -- a prepared model must not be nullptr.
-    class PartitioningPreparedModel : public IPreparedModel {
-       public:
-        Return<V1_0::ErrorStatus> execute(const V1_0::Request&,
-                                          const sp<V1_0::IExecutionCallback>&) override {
-            return V1_0::ErrorStatus::DEVICE_UNAVAILABLE;
-        }
-        Return<V1_0::ErrorStatus> execute_1_2(const V1_0::Request&, MeasureTiming,
-                                              const sp<V1_2::IExecutionCallback>&) override {
-            return V1_0::ErrorStatus::DEVICE_UNAVAILABLE;
-        }
-        Return<V1_3::ErrorStatus> execute_1_3(const V1_3::Request&, MeasureTiming,
-                                              const OptionalTimePoint&,
-                                              const OptionalTimeoutDuration&,
-                                              const sp<V1_3::IExecutionCallback>&) override {
-            return V1_3::ErrorStatus::DEVICE_UNAVAILABLE;
-        }
-        Return<void> executeSynchronously(const V1_0::Request&, MeasureTiming,
-                                          executeSynchronously_cb cb) override {
-            cb(V1_0::ErrorStatus::DEVICE_UNAVAILABLE, {}, kBadTiming);
-            return Void();
-        }
-        Return<void> executeSynchronously_1_3(const V1_3::Request&, MeasureTiming,
-                                              const OptionalTimePoint&,
-                                              const OptionalTimeoutDuration&,
-                                              executeSynchronously_1_3_cb cb) override {
-            cb(V1_3::ErrorStatus::DEVICE_UNAVAILABLE, {}, kBadTiming);
-            return Void();
-        }
-        Return<void> configureExecutionBurst(
-                const sp<V1_2::IBurstCallback>& /*callback*/,
-                const MQDescriptorSync<V1_2::FmqRequestDatum>& /*requestChannel*/,
-                const MQDescriptorSync<V1_2::FmqResultDatum>& /*resultChannel*/,
-                configureExecutionBurst_cb cb) override {
-            cb(V1_0::ErrorStatus::DEVICE_UNAVAILABLE, nullptr);
-            return Void();
-        }
-        Return<void> executeFenced(const Request&, const hidl_vec<hidl_handle>&, MeasureTiming,
-                                   const OptionalTimePoint&, const OptionalTimeoutDuration&,
-                                   const OptionalTimeoutDuration&, executeFenced_cb cb) {
-            cb(ErrorStatus::DEVICE_UNAVAILABLE, hidl_handle(nullptr), nullptr);
-            return Void();
-        }
-    };
-
    public:
     enum OEM {
         OEMNo,          // rejected by getSupportedOperations and prepareModel
@@ -372,9 +323,11 @@
           mOEM(oem),
           mOperationTypes(std::move(operationTypes)) {
         CHECK_EQ(mOperationTypes.count(OperationType::OEM_OPERATION), size_t(0));
-        std::for_each(mOperationTypes.begin(), mOperationTypes.end(), [](OperationType type) {
-            CHECK_EQ(operationToFirstEncoding.count(type), size_t(0));
-        });
+        if (operationMask) {
+            std::for_each(mOperationTypes.begin(), mOperationTypes.end(), [](OperationType type) {
+                CHECK_EQ(operationToFirstEncoding.count(type), size_t(0));
+            });
+        }
     }
     ~PartitioningDriver() override {}
 
@@ -384,20 +337,38 @@
     }
 
     Return<V1_3::ErrorStatus> prepareModel_1_3(
-            const Model& model, ExecutionPreference, Priority, const OptionalTimePoint&,
-            const hidl_vec<hidl_handle>&, const hidl_vec<hidl_handle>&, const CacheToken&,
-            const sp<V1_3::IPreparedModelCallback>& cb) override {
-        V1_3::ErrorStatus status = V1_3::ErrorStatus::NONE;
-        if (mOEM != OEMYes) {
+            const Model& model, ExecutionPreference preference, Priority priority,
+            const OptionalTimePoint& deadline, const hidl_vec<hidl_handle>& modelCache,
+            const hidl_vec<hidl_handle>& dataCache, const CacheToken& token,
+            const sp<V1_3::IPreparedModelCallback>& callback) override {
+        if (mOEM == OEMIndecisive) {
             for (const auto& operation : model.main.operations) {
                 if (operation.type == OperationType::OEM_OPERATION) {
-                    status = V1_3::ErrorStatus::INVALID_ARGUMENT;
-                    break;
+                    callback->notify_1_3(V1_3::ErrorStatus::INVALID_ARGUMENT, nullptr);
+                    return V1_3::ErrorStatus::INVALID_ARGUMENT;
                 }
             }
         }
-        cb->notify_1_3(status, new PartitioningPreparedModel);
-        return status;
+
+        // NOTE: We verify that all operations in the model are supported.
+        V1_3::ErrorStatus outStatus = V1_3::ErrorStatus::INVALID_ARGUMENT;
+        auto ret = getSupportedOperations_1_3(
+                model, [&outStatus](V1_3::ErrorStatus inStatus,
+                                    const hidl_vec<bool>& supportedOperations) {
+                    if (inStatus == V1_3::ErrorStatus::NONE) {
+                        if (std::all_of(supportedOperations.begin(), supportedOperations.end(),
+                                        [](bool v) { return v; })) {
+                            outStatus = V1_3::ErrorStatus::NONE;
+                        }
+                    }
+                });
+        if (ret.isOk() && (outStatus == V1_3::ErrorStatus::NONE)) {
+            return SampleDriver::prepareModel_1_3(model, preference, priority, deadline, modelCache,
+                                                  dataCache, token, callback);
+        } else {
+            callback->notify_1_3(V1_3::ErrorStatus::INVALID_ARGUMENT, nullptr);
+            return V1_3::ErrorStatus::INVALID_ARGUMENT;
+        }
     }
 
     Return<DeviceStatus> getStatus() override { return DeviceStatus::AVAILABLE; }
@@ -422,13 +393,6 @@
         return Void();
     }
 
-    Return<V1_0::ErrorStatus> prepareModelFromCache(
-            const hidl_vec<hidl_handle>&, const hidl_vec<hidl_handle>&, const CacheToken&,
-            const sp<V1_2::IPreparedModelCallback>& callback) override {
-        callback->notify_1_2(V1_0::ErrorStatus::NONE, new PartitioningPreparedModel);
-        return V1_0::ErrorStatus::NONE;
-    }
-
    private:
     std::vector<bool> getSupportedOperationsForSubgraph(const Model& model,
                                                         const Subgraph& subgraph) {
@@ -624,10 +588,40 @@
     const sp<V1_3::IDevice> mLatestDriver;
 };
 
-enum class Dimensioned { NO, YES };
+enum class Dimensioned {
+    NO,     // either a scalar, or a tensor of either unspecified rank (usually)
+            // or specified rank but with no specified dimensions (where
+            // specifically stated)
+    YES_1,  // tensor of shape { 1 }
+    YES_2,  // tensor of shape { 2 }
+    YES = YES_1
+};
+
+std::vector<uint32_t> dimensions(Dimensioned dimensioned) {
+    switch (dimensioned) {
+        default:
+            EXPECT_TRUE(false) << "Unknown value";
+            FALLTHROUGH_INTENDED;
+        case Dimensioned::NO:
+            return {};
+        case Dimensioned::YES_1:
+            return {1};
+        case Dimensioned::YES_2:
+            return {2};
+    }
+}
 
 std::string toString(Dimensioned dimensioned) {
-    return dimensioned == Dimensioned::NO ? "NO" : "YES";
+    switch (dimensioned) {
+        default:
+            return "<Unknown value>";
+        case Dimensioned::NO:
+            return "NO";
+        case Dimensioned::YES_1:
+            return "YES_1";
+        case Dimensioned::YES_2:
+            return "YES_2";
+    }
 }
 
 // This class adds some simple abstractions and utilities on top of
@@ -642,12 +636,24 @@
     using WrapperModel::identifyInputsAndOutputs;
     using WrapperModel::isValid;
     using WrapperModel::relaxComputationFloat32toFloat16;
+    using WrapperModel::setOperandValue;
 
     // Create a tensor operand of the specified type, and return the
     // corresponding operand index.
+    uint32_t addIntOperand(Dimensioned dimensioned = Dimensioned::YES) {
+        return addOperand(WrapperType::TENSOR_INT32, dimensioned);
+    }
+    uint32_t addIntScalarOperand(std::optional<int> v = std::nullopt) {
+        uint32_t opnd = addOperand(WrapperType::INT32);
+        if (v.has_value()) {
+            setOperandValue(opnd, &v.value());
+        }
+        return opnd;
+    }
     uint32_t addFloatOperand(Dimensioned dimensioned = Dimensioned::YES) {
         return addOperand(WrapperType::TENSOR_FLOAT32, dimensioned);
     }
+    uint32_t addFloatScalarOperand() { return addOperand(WrapperType::FLOAT32); }
     uint32_t addQuantOperand(Dimensioned dimensioned = Dimensioned::YES) {
         return addOperand(WrapperType::TENSOR_QUANT8_ASYMM, dimensioned);
     }
@@ -658,14 +664,6 @@
     // Create an operand of the specified type, and return the corresponding
     // operand index.
     uint32_t addOperand(WrapperType wrapperType, Dimensioned dimensioned = Dimensioned::YES) {
-        auto dimensions = [dimensioned]() -> std::vector<uint32_t> {
-            if (dimensioned == Dimensioned::YES) {
-                return {1};
-            } else {
-                return {};
-            }
-        };
-
         switch (static_cast<int>(wrapperType)) {
             case ANEURALNETWORKS_BOOL:
             case ANEURALNETWORKS_FLOAT16:
@@ -680,7 +678,7 @@
             case ANEURALNETWORKS_TENSOR_FLOAT16:
             case ANEURALNETWORKS_TENSOR_FLOAT32:
             case ANEURALNETWORKS_TENSOR_OEM_BYTE:
-                return addOperand(WrapperOperandType{wrapperType, dimensions()});
+                return addOperand(WrapperOperandType{wrapperType, dimensions(dimensioned)});
 
             case ANEURALNETWORKS_TENSOR_INT32:
             case ANEURALNETWORKS_TENSOR_QUANT8_ASYMM:
@@ -688,10 +686,10 @@
             case ANEURALNETWORKS_TENSOR_QUANT8_SYMM:
             case ANEURALNETWORKS_TENSOR_QUANT16_ASYMM:
             case ANEURALNETWORKS_TENSOR_QUANT16_SYMM:
-                return addOperand(WrapperOperandType{wrapperType, dimensions(), 1.0f});
+                return addOperand(WrapperOperandType{wrapperType, dimensions(dimensioned), 1.0f});
 
             case ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL:
-                return addOperand(WrapperOperandType{wrapperType, dimensions(),
+                return addOperand(WrapperOperandType{wrapperType, dimensions(dimensioned),
                                                      WrapperSymmPerChannelQuantParams({1.0f}, 0)});
 
             default:
@@ -862,11 +860,21 @@
 
     // Create an operand of the same type as the specified operand,
     // and return the operand index of the new operand.
+    //
+    // If a tensor, the new operand will have the same rank as the specified
+    // operand.  If dimensioned == Dimensioned::NO, then all dimensions of a new
+    // tensor operand will be unspecified.  If dimensioned != Dimensioned::NO,
+    // then all dimensions of a new tensor operand will have the implied value
+    // (e.g., YES_1 means each dimension will have the value "1").
     uint32_t addOperandOfSameType(uint32_t operand, Dimensioned dimensioned = Dimensioned::YES) {
         WrapperOperandType type = mWrapperOperandType.at(operand);
+
+        const auto d = dimensions(dimensioned);
+        EXPECT_TRUE(d.size() <= 1);
         for (auto& dimension : type.dimensions) {
-            dimension = (dimensioned == Dimensioned::YES);
+            dimension = (dimensioned == Dimensioned::NO ? 0 : d[0]);
         }
+
         mWrapperOperandType.push_back(type);
         return WrapperModel::addOperand(&type);
     }
@@ -932,6 +940,7 @@
 
 class PartitioningTest : public ::testing::Test {
    protected:
+    using DynamicTemporariesType = decltype(ExecutionPlan().forTest_flatGetDynamicTemporaries());
     using RemapVectorType = ExecutionStep::RemapVectorType;
     using StepModelOutputSetType = ExecutionStep::StepModelOutputSetType;
 
@@ -1276,6 +1285,12 @@
             uint32_t outputA = modelA->getOutputOperandIndex(i);
             uint32_t outputB = modelB->getOutputOperandIndex(i);
             if (!compare(modelA->getOperand(outputA), modelB->getOperand(outputB))) {
+#ifdef VERBOSE
+                std::cout << "modelA.output[" << i << "] = operand[" << outputA
+                          << "] = " << toString(modelA->getOperand(outputA)) << std::endl;
+                std::cout << "modelB.output[" << i << "] = operand[" << outputB
+                          << "] = " << toString(modelB->getOperand(outputB)) << std::endl;
+#endif
                 RETURN_FALSE();
             }
             equivalentOperandsAToB[outputA] = outputB;
@@ -1353,6 +1368,12 @@
                 }
                 // We haven't identified an equivalent operand for inputA.
                 if (!compare(modelA->getOperand(inputA), modelB->getOperand(inputB))) {
+#ifdef VERBOSE
+                    std::cout << "modelA.input[" << i << "] = operand[" << inputA
+                              << "] = " << toString(modelA->getOperand(inputA)) << std::endl;
+                    std::cout << "modelB.input[" << i << "] = operand[" << inputB
+                              << "] = " << toString(modelB->getOperand(inputB)) << std::endl;
+#endif
                     RETURN_FALSE();
                 }
                 equivalentOperandsAToB[inputA] = inputB;
@@ -1398,7 +1419,8 @@
                  std::shared_ptr<Device> device, const RemapVectorType& modelInputs,
                  const RemapVectorType& modelOutputs, const RemapVectorType& tempsAsStepModelInputs,
                  const StepModelOutputSetType& tempsAsStepModelOutputs,
-                 const RemapVectorType& outputsAsStepModelInputs) {
+                 const RemapVectorType& outputsAsStepModelInputs,
+                 const std::set<uint32_t>& modelOutputsThatAreDownstreamInputs) {
         ASSERT_TRUE(logicalStep->isExecution());
         const ExecutionStep* step = logicalStep->executionStep();
         std::map<uint32_t, uint32_t> inputsAndOutputsModelToStep;
@@ -1416,6 +1438,8 @@
         ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep,
                                         step->getOutputsAsStepModelInputs(),
                                         outputsAsStepModelInputs));
+        ASSERT_TRUE(modelOutputsThatAreDownstreamInputs ==
+                    step->getModelOutputsThatAreDownstreamInputs());
     }
 
    private:
@@ -1461,6 +1485,7 @@
     ASSERT_EQ(model.partitionTheWork(devicesA, ExecutePreference::PREFER_LOW_POWER,
                                      ExecutePriority::DEFAULT, {}, &planA),
               ANEURALNETWORKS_NO_ERROR);
+    EXPECT_TRUE(planA.forTest_flatGetDynamicTemporaries().empty());
     ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
     ASSERT_NE(planA.forTest_simpleGetDevice().get(), nullptr);
     ASSERT_EQ(planA.forTest_simpleGetDevice()->getName(), "good");
@@ -1473,6 +1498,7 @@
     ASSERT_EQ(model.partitionTheWork(devicesC, ExecutePreference::PREFER_LOW_POWER,
                                      ExecutePriority::DEFAULT, {}, &planC),
               ANEURALNETWORKS_NO_ERROR);
+    EXPECT_TRUE(planC.forTest_flatGetDynamicTemporaries().empty());
     ASSERT_EQ(planC.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
     ASSERT_EQ(planC.forTest_simpleGetDevice(), DeviceManager::getCpuDevice());
 
@@ -1485,6 +1511,7 @@
     ASSERT_EQ(model.partitionTheWork(devicesB, ExecutePreference::PREFER_LOW_POWER,
                                      ExecutePriority::DEFAULT, {}, &planB),
               ANEURALNETWORKS_NO_ERROR);
+    EXPECT_TRUE(planB.forTest_flatGetDynamicTemporaries().empty());
     ASSERT_EQ(planB.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
     const auto& stepsB = planB.forTest_compoundGetSteps();
     ASSERT_EQ(stepsB.size(), size_t(2));
@@ -1504,7 +1531,8 @@
                         RemapVectorType{},                                    // modelOutputs
                         RemapVectorType{},                         // tempsAsStepModelInputs
                         StepModelOutputSetType{{opnd2, b0Opnd2}},  // tempsAsStepModelOutputs
-                        RemapVectorType{}));                       // outputsAsStepModelInputs;
+                        RemapVectorType{},                         // outputsAsStepModelInputs
+                        {}));  // modelOutputsThatAreDownstreamInputs
     }
     {
         // Build a model to compare against the step model from stepsB[1].
@@ -1526,7 +1554,8 @@
                 RemapVectorType{{opnd4, b1Opnd4}},  // modelOutputs
                 RemapVectorType{{opnd2, b1Opnd2}},  // tempsAsStepModelInputs
                 StepModelOutputSetType{},           // tempsAsStepModelOutputs
-                RemapVectorType{}));                // outputsAsStepModelInputs
+                RemapVectorType{},                  // outputsAsStepModelInputs
+                {}));                               // modelOutputsThatAreDownstreamInputs
     }
 }
 
@@ -1554,6 +1583,7 @@
     ASSERT_EQ(model.partitionTheWork(devicesA, ExecutePreference::PREFER_LOW_POWER,
                                      ExecutePriority::DEFAULT, {}, &planA),
               ANEURALNETWORKS_NO_ERROR);
+    EXPECT_TRUE(planA.forTest_flatGetDynamicTemporaries().empty());
     ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
     ASSERT_NE(planA.forTest_simpleGetDevice().get(), nullptr);
     ASSERT_EQ(planA.forTest_simpleGetDevice()->getName(), "V1_3");
@@ -1568,6 +1598,7 @@
     ASSERT_EQ(model.partitionTheWork(devicesB, ExecutePreference::PREFER_LOW_POWER,
                                      ExecutePriority::DEFAULT, {}, &planB),
               ANEURALNETWORKS_NO_ERROR);
+    EXPECT_TRUE(planB.forTest_flatGetDynamicTemporaries().empty());
     ASSERT_EQ(planB.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
     const auto& stepsB = planB.forTest_compoundGetSteps();
     ASSERT_EQ(stepsB.size(), size_t(4));
@@ -1587,7 +1618,8 @@
                         RemapVectorType{{opnd4, b0Opnd2}},                    // modelOutputs
                         RemapVectorType{},         // tempsAsStepModelInputs
                         StepModelOutputSetType{},  // tempsAsStepModelOutputs
-                        RemapVectorType{}));       // outputsAsStepModelInputs
+                        RemapVectorType{},         // outputsAsStepModelInputs
+                        {}));                      // modelOutputsThatAreDownstreamInputs
     }
     {
         // Build a model to compare against the step model from stepsB[1].
@@ -1600,13 +1632,16 @@
         modelB1.finish();
         ASSERT_TRUE(modelB1.isValid());
 
+        // Note that this also serves as an important test that we can detect
+        // modelOutputsThatAreDownstreamInputs.
         ASSERT_NO_FATAL_FAILURE(
                 compare(stepsB[1], &modelB1, devicesB[0],
                         RemapVectorType{{opnd0, b1Opnd0}, {opnd1, b1Opnd1}},  // modelInputs
                         RemapVectorType{{opnd2, b1Opnd2}},                    // modelOutputs
                         RemapVectorType{},                         // tempsAsStepModelInputs
                         StepModelOutputSetType{{opnd3, b1Opnd3}},  // tempsAsStepModelOutputs
-                        RemapVectorType{}));                       // outputsAsStepModelInputs
+                        RemapVectorType{},                         // outputsAsStepModelInputs
+                        {0u}));  // modelOutputsThatAreDownstreamInputs
     }
     {
         // Build a model to compare against the step model from stepsB[2].
@@ -1623,9 +1658,10 @@
         ASSERT_NO_FATAL_FAILURE(
                 compare(stepsB[2], &modelB2, devicesB[3], RemapVectorType{},  // modelInputs
                         RemapVectorType{{opnd6, b2Opnd1}},                    // modelOutputs
-                        RemapVectorType{},                    // tempsAsStepModelInputs
-                        StepModelOutputSetType{},             // tempsAsStepModelOutputs
-                        RemapVectorType{{opnd2, b2Opnd0}}));  // outputsAsStepModelInputs
+                        RemapVectorType{},                  // tempsAsStepModelInputs
+                        StepModelOutputSetType{},           // tempsAsStepModelOutputs
+                        RemapVectorType{{opnd2, b2Opnd0}},  // outputsAsStepModelInputs
+                        {}));                               // modelOutputsThatAreDownstreamInputs
     }
     {
         // Build a model to compare against the step model from stepsB[3].
@@ -1646,9 +1682,10 @@
         ASSERT_NO_FATAL_FAILURE(
                 compare(stepsB[3], &modelB3, devicesB[2], RemapVectorType{},  // modelInputs
                         RemapVectorType{{opnd5, b3Opnd2}},                    // modelOutputs
-                        RemapVectorType{{opnd3, b3Opnd1}},    // tempsAsStepModelInputs
-                        StepModelOutputSetType{},             // tempsAsStepModelOutputs
-                        RemapVectorType{{opnd2, b3Opnd0}}));  // outputsAsStepModelInputs
+                        RemapVectorType{{opnd3, b3Opnd1}},  // tempsAsStepModelInputs
+                        StepModelOutputSetType{},           // tempsAsStepModelOutputs
+                        RemapVectorType{{opnd2, b3Opnd0}},  // outputsAsStepModelInputs
+                        {}));                               // modelOutputsThatAreDownstreamInputs
     }
 
     // TODO: Make sure this still works when we have multiple devices
@@ -1676,6 +1713,7 @@
     ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER,
                                      ExecutePriority::DEFAULT, {}, &plan),
               ANEURALNETWORKS_NO_ERROR);
+    EXPECT_TRUE(plan.forTest_flatGetDynamicTemporaries().empty());
     ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
     ASSERT_NE(plan.forTest_simpleGetDevice().get(), nullptr);
     ASSERT_EQ(plan.forTest_simpleGetDevice()->getName(), "V1_3");
@@ -1715,6 +1753,7 @@
     ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER,
                                      ExecutePriority::DEFAULT, {}, &plan),
               ANEURALNETWORKS_NO_ERROR);
+    EXPECT_TRUE(plan.forTest_flatGetDynamicTemporaries().empty());
     ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
     const auto& steps = plan.forTest_compoundGetSteps();
     ASSERT_EQ(steps.size(), size_t(3));
@@ -1738,7 +1777,8 @@
                         RemapVectorType{},  // tempsAsStepModelInputs
                         StepModelOutputSetType{{opnd2, m0Opnd2},
                                                {opnd3, m0Opnd3}},  // tempsAsStepModelOutputs
-                        RemapVectorType{}));                       // outputsAsStepModelInputs
+                        RemapVectorType{},                         // outputsAsStepModelInputs
+                        {}));  // modelOutputsThatAreDownstreamInputs
     }
     {
         const auto& step1 = steps[1];
@@ -1760,7 +1800,8 @@
                 RemapVectorType{{opnd4, m1Opnd4}},                    // modelOutputs
                 RemapVectorType{{opnd3, m1Opnd3}, {opnd2, m1Opnd2}},  // tempsAsStepModelInputs
                 StepModelOutputSetType{{opnd5, m1Opnd5}},             // tempsAsStepModelOutputs
-                RemapVectorType{}));                                  // outputsAsStepModelInputs
+                RemapVectorType{},                                    // outputsAsStepModelInputs
+                {}));  // modelOutputsThatAreDownstreamInputs
     }
     {
         const auto& step2 = steps[2];
@@ -1781,7 +1822,8 @@
                 RemapVectorType{{opnd8, m2Opnd8}},                              // modelOutputs
                 RemapVectorType{{opnd3, m2Opnd3}, {opnd5, m2Opnd5}},  // tempsAsStepModelInputs
                 StepModelOutputSetType{},                             // tempsAsStepModelOutputs
-                RemapVectorType{}));                                  // outputsAsStepModelInputs
+                RemapVectorType{},                                    // outputsAsStepModelInputs
+                {}));  // modelOutputsThatAreDownstreamInputs
     }
 }
 
@@ -1855,6 +1897,7 @@
     ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER,
                                      ExecutePriority::DEFAULT, {}, &plan),
               ANEURALNETWORKS_NO_ERROR);
+    EXPECT_TRUE(plan.forTest_flatGetDynamicTemporaries().empty());
     ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
     const auto& steps = plan.forTest_compoundGetSteps();
     ASSERT_EQ(steps.size(), size_t(2));
@@ -1873,7 +1916,8 @@
                         RemapVectorType{{opnd2, m0Opnd2}},                    // modelOutputs
                         RemapVectorType{},         // tempsAsStepModelInputs
                         StepModelOutputSetType{},  // tempsAsStepModelOutputs
-                        RemapVectorType{}));       // outputsAsStepModelInputs
+                        RemapVectorType{},         // outputsAsStepModelInputs
+                        {0u}));                    // modelOutputsThatAreDownstreamInputs
     }
     {
         // Build a model to compare against the step model from steps[1].
@@ -1888,8 +1932,9 @@
                 compare(steps[1], &model1, devices[1], RemapVectorType{},  // modelInputs
                         RemapVectorType{{opnd3, m1Opnd3}},                 // modelOutputs
                         RemapVectorType{},                                 // tempsAsStepModelInputs
-                        StepModelOutputSetType{},             // tempsAsStepModelOutputs
-                        RemapVectorType{{opnd2, m1Opnd2}}));  // outputsAsStepModelInputs
+                        StepModelOutputSetType{},           // tempsAsStepModelOutputs
+                        RemapVectorType{{opnd2, m1Opnd2}},  // outputsAsStepModelInputs
+                        {}));                               // modelOutputsThatAreDownstreamInputs
     }
 }
 
@@ -1956,6 +2001,7 @@
         ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER,
                                          ExecutePriority::DEFAULT, {}, &plan),
                   ANEURALNETWORKS_NO_ERROR);
+        EXPECT_TRUE(plan.forTest_flatGetDynamicTemporaries().empty());
         ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
         ASSERT_EQ(plan.forTest_simpleGetDevice()->getName(), expectDevice);
     };
@@ -2009,6 +2055,7 @@
             ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER,
                                              ExecutePriority::DEFAULT, {}, &plan),
                       ANEURALNETWORKS_NO_ERROR);
+            EXPECT_TRUE(plan.forTest_flatGetDynamicTemporaries().empty());
             ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
             ASSERT_EQ(plan.forTest_simpleGetDevice()->getName(), "good");
         }
@@ -2028,6 +2075,7 @@
             ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER,
                                              ExecutePriority::DEFAULT, {}, &plan),
                       ANEURALNETWORKS_NO_ERROR);
+            EXPECT_TRUE(plan.forTest_flatGetDynamicTemporaries().empty());
             ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
             ASSERT_EQ(plan.forTest_simpleGetDevice()->getName(), "base");
         }
@@ -2043,6 +2091,268 @@
     }
 }
 
+// Test dynamic temporaries and related parts of the partitioning implementation.
+//
+// opnd0 = model input                   // fill shape
+// opnd1 = constant                      // fill value
+// opnd2 = FILL(opnd0, opnd1)            // model output
+// opnd3 = FILL(opnd0, opnd1)
+// opnd4 = ADD(opnd2, opnd3, FUSED_NONE) // model output
+class DynamicTemporariesTest : public PartitioningTest {
+   protected:
+    // Call these functions in sequence in order to perform the test.
+    // Call to declareOutputDimensions() can be omitted (see the default values below).
+    void declareOutputDimensions(bool opnd2ModelAndPartitionOutputSpecified,
+                                 bool opnd3PartitionOutputSpecified,
+                                 bool opnd4ModelOutputSpecified);
+    void makeModelAndValidate();
+    void compileModelAndComparePlan();
+    void executeCompilationAndCompareOutput(bool opnd2ModelOutputBigEnough,
+                                            bool opnd4ModelOutputBigEnough);
+
+    // set by declareOutputDimensions()
+    bool mOpnd2ModelAndPartitionOutputSpecified = false;
+    bool mOpnd3PartitionOutputSpecified = false;
+    bool mOpnd4ModelOutputSpecified = false;
+
+    // created by makeModelAndValidate()
+    std::optional<PartitioningModel> mModel;
+    std::vector<uint32_t> mOpnds;
+
+    // created by compileModelAndComparePlan()
+    std::optional<PartitioningCompilation> mCompilation;
+
+    static Dimensioned dimensioned(bool specified) {
+        return specified ? Dimensioned::YES_2 : Dimensioned::NO;
+    }
+
+    static constexpr float kFillValue = 3.0f;
+};
+
+void DynamicTemporariesTest::declareOutputDimensions(bool opnd2ModelAndPartitionOutputSpecified,
+                                                     bool opnd3PartitionOutputSpecified,
+                                                     bool opnd4ModelOutputSpecified) {
+    ASSERT_FALSE(mModel.has_value());
+    mOpnd2ModelAndPartitionOutputSpecified = opnd2ModelAndPartitionOutputSpecified;
+    mOpnd3PartitionOutputSpecified = opnd3PartitionOutputSpecified;
+    mOpnd4ModelOutputSpecified = opnd4ModelOutputSpecified;
+}
+
+void DynamicTemporariesTest::makeModelAndValidate() {
+    ASSERT_FALSE(mModel.has_value());
+    mModel = PartitioningModel();
+
+    uint32_t opndActivation = mModel->addIntScalarOperand(ANEURALNETWORKS_FUSED_NONE);
+
+    uint32_t opnd0 = mModel->addIntOperand(Dimensioned::NO);  // desired output tensor shape
+    uint32_t opnd1 = mModel->addFloatScalarOperand();         // fill value
+    mModel->setOperandValue(opnd1, &kFillValue, sizeof(kFillValue));
+    uint32_t opnd2 = mModel->addExplicitOperationXTo1(
+            ANEURALNETWORKS_FILL, {opnd0, opnd1}, WrapperType::TENSOR_FLOAT32,
+            dimensioned(mOpnd2ModelAndPartitionOutputSpecified));
+    uint32_t opnd3 = mModel->addExplicitOperationXTo1(ANEURALNETWORKS_FILL, {opnd0, opnd1},
+                                                      WrapperType::TENSOR_FLOAT32,
+                                                      dimensioned(mOpnd3PartitionOutputSpecified));
+    uint32_t opnd4 = mModel->addExplicitOperationXTo1(
+            ANEURALNETWORKS_ADD, {opnd2, opnd3, opndActivation}, WrapperType::TENSOR_FLOAT32,
+            dimensioned(mOpnd4ModelOutputSpecified));
+    mModel->identifyInputsAndOutputs({opnd0}, {opnd2, opnd4});
+    mModel->finish();
+    ASSERT_TRUE(mModel->isValid());
+
+    mOpnds = {opnd0, opnd1, opnd2, opnd3, opnd4};
+}
+
+void DynamicTemporariesTest::compileModelAndComparePlan() {
+    ASSERT_TRUE(mModel.has_value());
+    ASSERT_TRUE(!mCompilation.has_value());
+
+    auto devices = makeDevices({{"fill", 0.9, 0U, PartitioningDriver::OEMNo, {OperationType::FILL}},
+                                {"add", 0.9, 0U, PartitioningDriver::OEMNo, {OperationType::ADD}}});
+
+    mCompilation = PartitioningCompilation(&mModel.value(), devices);
+    ASSERT_EQ(mCompilation->setPartitioning(DeviceManager::kPartitioningWithoutFallback),
+              Result::NO_ERROR);
+    ASSERT_EQ(mCompilation->finish(), Result::NO_ERROR);
+    const ExecutionPlan& planA = mCompilation->getExecutionPlan();
+    EXPECT_TRUE(planA.forTest_flatGetDynamicTemporaries() ==
+                (mOpnd3PartitionOutputSpecified ? DynamicTemporariesType{}
+                                                : DynamicTemporariesType{mOpnds[3]}));
+    ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
+    const auto& stepsA = planA.forTest_compoundGetSteps();
+    ASSERT_EQ(stepsA.size(), size_t(2));
+    {
+        // Build a model to compare against the step model from stepsA[0].
+        PartitioningModel modelA0;
+        uint32_t a0Opnd0 = modelA0.addIntOperand(Dimensioned::NO);
+        uint32_t a0Opnd1 = modelA0.addFloatScalarOperand();
+        modelA0.setOperandValue(a0Opnd1, &kFillValue, sizeof(kFillValue));
+        uint32_t a0Opnd2 = modelA0.addExplicitOperationXTo1(
+                ANEURALNETWORKS_FILL, {a0Opnd0, a0Opnd1}, WrapperType::TENSOR_FLOAT32,
+                dimensioned(mOpnd3PartitionOutputSpecified));
+        uint32_t a0Opnd3 = modelA0.addExplicitOperationXTo1(
+                ANEURALNETWORKS_FILL, {a0Opnd0, a0Opnd1}, WrapperType::TENSOR_FLOAT32,
+                dimensioned(mOpnd2ModelAndPartitionOutputSpecified));
+        modelA0.identifyInputsAndOutputs({a0Opnd0}, {a0Opnd3, a0Opnd2});
+        modelA0.finish();
+        ASSERT_TRUE(modelA0.isValid());
+
+        ASSERT_NO_FATAL_FAILURE(
+                compare(stepsA[0], &modelA0, devices[0],
+                        RemapVectorType{{mOpnds[0], a0Opnd0}},         // modelInputs
+                        RemapVectorType{{mOpnds[2], a0Opnd3}},         // modelOutputs
+                        RemapVectorType{},                             // tempsAsStepModelInputs
+                        StepModelOutputSetType{{mOpnds[3], a0Opnd2}},  // tempsAsStepModelOutputs
+                        RemapVectorType{},                             // outputsAsStepModelInputs
+                        {0u}));  // modelOutputsThatAreDownstreamInputs
+    }
+    {
+        // Build a model to compare against the step model from stepsA[1].
+        PartitioningModel modelA1;
+        uint32_t a1Opnd2 =
+                modelA1.addFloatOperand(dimensioned(mOpnd2ModelAndPartitionOutputSpecified));
+        uint32_t a1Opnd3 = modelA1.addFloatOperand(dimensioned(mOpnd3PartitionOutputSpecified));
+        uint32_t a1Opnd4 = modelA1.addOperation2To1V1_0(0, a1Opnd2, a1Opnd3,
+                                                        dimensioned(mOpnd4ModelOutputSpecified));
+        modelA1.identifyInputsAndOutputs({a1Opnd3, a1Opnd2}, {a1Opnd4});
+        modelA1.finish();
+        ASSERT_TRUE(modelA1.isValid());
+
+        ASSERT_NO_FATAL_FAILURE(
+                compare(stepsA[1], &modelA1, devices[1], RemapVectorType{},  // modelInputs
+                        RemapVectorType{{mOpnds[4], a1Opnd4}},               // modelOutputs
+                        RemapVectorType{{mOpnds[3], a1Opnd3}},  // tempsAsStepModelInputs
+                        StepModelOutputSetType{},               // tempsAsStepModelOutputs
+                        RemapVectorType{{mOpnds[2], a1Opnd2}},  // outputsAsStepModelInputs
+                        {}));  // modelOutputsThatAreDownstreamInputs
+    }
+}
+
+void DynamicTemporariesTest::executeCompilationAndCompareOutput(bool opnd2ModelOutputBigEnough,
+                                                                bool opnd4ModelOutputBigEnough) {
+    ASSERT_TRUE(opnd2ModelOutputBigEnough || !mOpnd2ModelAndPartitionOutputSpecified);
+    ASSERT_TRUE(opnd4ModelOutputBigEnough || !mOpnd4ModelOutputSpecified);
+
+    ASSERT_TRUE(mCompilation.has_value());
+    WrapperExecution e(&mCompilation.value());
+
+    WrapperOperandType shapeType(WrapperType::TENSOR_INT32, {1});
+    const int shape[1] = {2};
+    e.setInput(0, &shape, &shapeType.operandType);
+
+    auto setOutput = [&e](uint32_t index, float* buffer, bool bigEnough, bool specified) {
+        const uint32_t elts = bigEnough ? 2 : 1;
+        std::fill(buffer, buffer + elts, 0.0f);
+        using DimsType = std::vector<uint32_t>;
+        WrapperOperandType outputType(WrapperType::TENSOR_FLOAT32,
+                                      specified ? DimsType{elts} : DimsType{});
+        e.setOutput(index, buffer, elts * sizeof(float), &outputType.operandType);
+    };
+    float opnd2ModelOutput[2], opnd4ModelOutput[2];
+    setOutput(0, opnd2ModelOutput, opnd2ModelOutputBigEnough,
+              mOpnd2ModelAndPartitionOutputSpecified);
+    setOutput(1, opnd4ModelOutput, opnd4ModelOutputBigEnough, mOpnd4ModelOutputSpecified);
+
+    const Result expectResult = opnd2ModelOutputBigEnough && opnd4ModelOutputBigEnough
+                                        ? Result::NO_ERROR
+                                        : Result::OUTPUT_INSUFFICIENT_SIZE;
+    ASSERT_EQ(e.compute(), expectResult);
+    if (expectResult == Result::NO_ERROR) {
+        ASSERT_TRUE(std::all_of(std::begin(opnd2ModelOutput), std::end(opnd2ModelOutput),
+                                [](float v) { return v == kFillValue; }));
+        ASSERT_TRUE(std::all_of(std::begin(opnd4ModelOutput), std::end(opnd4ModelOutput),
+                                [](float v) { return v == kFillValue * 2; }));
+    }
+}
+
+#if 0
+// TODO: enable this test once b/168657259 is fixed
+TEST_F(DynamicTemporariesTest, ModelOutputsSufficientSize) {
+    // The purpose of this test is to confirm that the partitioner and the
+    // runtime can handle a model output of unspecified dimensions but
+    // sufficient size that is written by one partition and read by another.
+
+    ASSERT_NO_FATAL_FAILURE(declareOutputDimensions(/*opnd2ModelAndPartitionOutputSpecified=*/false,
+                                                    /*opnd3PartitionOutputSpecified=*/true,
+                                                    /*opnd4ModelOutputSpecified=*/false));
+    ASSERT_NO_FATAL_FAILURE(makeModelAndValidate());
+    ASSERT_NO_FATAL_FAILURE(compileModelAndComparePlan());
+    ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(true, true));
+}
+
+// TODO: enable this test once b/168657259 is fixed
+TEST_F(DynamicTemporariesTest, DynamicTemporariesUnspecifiedOutputs) {
+    // The purpose of this test is to confirm that the partitioner can produce
+    // dynamic temporaries and that the runtime can handle them properly.  Note
+    // that all model outputs are of unspecified dimensions but sufficient size.
+
+    ASSERT_NO_FATAL_FAILURE(makeModelAndValidate());
+    ASSERT_NO_FATAL_FAILURE(compileModelAndComparePlan());
+    ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(true, true));
+}
+#endif
+
+TEST_F(DynamicTemporariesTest, DynamicTemporariesSpecifiedOutputs) {
+    // The purpose of this test is to confirm that the partitioner can produce
+    // dynamic temporaries and that the runtime can handle them properly.  Note
+    // that all model outputs are of specified dimensions.
+
+    ASSERT_NO_FATAL_FAILURE(declareOutputDimensions(/*opnd2ModelAndPartitionOutputSpecified=*/true,
+                                                    /*opnd3PartitionOutputSpecified=*/false,
+                                                    /*opnd4ModelOutputSpecified=*/true));
+    ASSERT_NO_FATAL_FAILURE(makeModelAndValidate());
+    ASSERT_NO_FATAL_FAILURE(compileModelAndComparePlan());
+    ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(true, true));
+}
+
+TEST_F(DynamicTemporariesTest, ModelOutputsInsufficientSizeWithDynamicTemporary) {
+    // The purpose of this test is to confirm that the runtime can detect a
+    // model output of insufficient size in the presence of a dynamic temporary.
+
+    ASSERT_NO_FATAL_FAILURE(makeModelAndValidate());
+    ASSERT_NO_FATAL_FAILURE(compileModelAndComparePlan());
+    ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(false, false));
+}
+
+TEST_F(DynamicTemporariesTest, ModelOutputsInsufficientSizeWithoutDynamicTemporary) {
+    // The purpose of this test is to confirm that the runtime can detect a
+    // model output of insufficient size in the absence of a dynamic temporary.
+
+    ASSERT_NO_FATAL_FAILURE(declareOutputDimensions(/*opnd2ModelAndPartitionOutputSpecified=*/false,
+                                                    /*opnd3PartitionOutputSpecified=*/true,
+                                                    /*opnd4ModelOutputSpecified=*/false));
+    ASSERT_NO_FATAL_FAILURE(makeModelAndValidate());
+    ASSERT_NO_FATAL_FAILURE(compileModelAndComparePlan());
+    ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(false, false));
+}
+
+TEST_F(DynamicTemporariesTest, ModelOutput2InsufficientSizeWithoutDynamicTemporary) {
+    // The purpose of this test is to confirm that the runtime can detect a model
+    // output (opnd2) of insufficient size in the absence of a dynamic temporary.
+
+    ASSERT_NO_FATAL_FAILURE(declareOutputDimensions(/*opnd2ModelAndPartitionOutputSpecified=*/false,
+                                                    /*opnd3PartitionOutputSpecified=*/true,
+                                                    /*opnd4ModelOutputSpecified=*/false));
+    ASSERT_NO_FATAL_FAILURE(makeModelAndValidate());
+    ASSERT_NO_FATAL_FAILURE(compileModelAndComparePlan());
+    ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(false, true));
+}
+
+#if 0
+// TODO: enable this test once b/168657259 is fixed
+TEST_F(DynamicTemporariesTest, ModelOutput4InsufficientSizeWithoutDynamicTemporary) {
+    // The purpose of this test is to confirm that the runtime can detect a model
+    // output (opnd4) of insufficient size in the absence of a dynamic temporary.
+
+    ASSERT_NO_FATAL_FAILURE(declareOutputDimensions(/*opnd2ModelAndPartitionOutputSpecified=*/false,
+                                                    /*opnd3PartitionOutputSpecified=*/true,
+                                                    /*opnd4ModelOutputSpecified=*/false));
+    ASSERT_NO_FATAL_FAILURE(makeModelAndValidate());
+    ASSERT_NO_FATAL_FAILURE(compileModelAndComparePlan());
+    ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(true, false));
+}
+#endif
+
 // Test token rehashing during the compilation step.
 class CacheTest : public PartitioningTest {
    protected:
@@ -2732,8 +3042,8 @@
 void ControlFlowPartitioningTest::testIfUnknownSize(Dimensioned dimensionedMain,
                                                     Dimensioned dimensionedThen,
                                                     Dimensioned dimensionedElse) {
-    if (dimensionedMain == Dimensioned::YES && dimensionedThen == Dimensioned::YES &&
-        dimensionedElse == Dimensioned::YES) {
+    if (dimensionedMain != Dimensioned::NO && dimensionedThen != Dimensioned::NO &&
+        dimensionedElse != Dimensioned::NO) {
         // No unknown size.
         return;
     }
@@ -2772,8 +3082,8 @@
 void ControlFlowPartitioningTest::testWhileUnknownSize(Dimensioned dimensionedMain,
                                                        Dimensioned dimensionedCond,
                                                        Dimensioned dimensionedBody) {
-    if (dimensionedMain == Dimensioned::YES && dimensionedCond == Dimensioned::YES &&
-        dimensionedBody == Dimensioned::YES) {
+    if (dimensionedMain != Dimensioned::NO && dimensionedCond != Dimensioned::NO &&
+        dimensionedBody != Dimensioned::NO) {
         // No unknown size.
         return;
     }
diff --git a/runtime/test/TestPartitioningRandom.cpp b/runtime/test/TestPartitioningRandom.cpp
index d94ec9f..51d7910 100644
--- a/runtime/test/TestPartitioningRandom.cpp
+++ b/runtime/test/TestPartitioningRandom.cpp
@@ -46,6 +46,12 @@
 //
 // #define VERBOSE VERBOSE
 
+// Uncomment the following line to generate some debugging output that
+// may be useful to determine test coverage for support of dynamic
+// temporaries (http://b/132458982):
+//
+// #define TRACE_DYNTEMP TRACE_DYNTEMP
+
 // We randomly generate tests (model + input data) at runtime, and verify
 // that we get the same results whether we do partitioned compilation/execution
 // or non partitioned compilation/execution.  We perform a test as follows:
@@ -779,10 +785,21 @@
     // joining disjoint subgraphs rather than by forcing a root.
     const bool forceCommonRoot = (randFrac() < 0.75);
 
+    auto computeMode = WrapperExecution::getComputeMode();
+    // We check randFrac() independent of compute mode, because we don't want
+    // the random number sequence to change depending on compute mode: Compute
+    // mode should only affect how we perform the inference, not how we build the
+    // Model, the Compilation, or the Execution.
+    if (randFrac() < 0.5 && computeMode == WrapperExecution::ComputeMode::ASYNC) {
+        computeMode = WrapperExecution::ComputeMode::FENCED;
+    }
+
     TestModel model;
     std::vector<uint32_t> modelInputs;
     std::vector<uint32_t> modelOutputs;
 
+    std::set<uint32_t> operandsWithUnknownDimensions;
+
     // Each region in weights is a problem-sized 2-D TENSOR_FLOAT32.
     TestMemories weights;
 
@@ -999,12 +1016,15 @@
         std::vector<uint32_t> operationOutputs(operationPattern.mNumOutputs);
         std::generate(
                 operationOutputs.begin(), operationOutputs.end(),
-                [&model, &problemType, &unknownDimensionsTypes, allowUnknownDimensions, this] {
+                [&operandsWithUnknownDimensions, &model, &problemType, &unknownDimensionsTypes,
+                 allowUnknownDimensions, this] {
                     // Before the fix for http://b/132458982, 3% unknowns
                     // causes ~35% of partitionings to fail.
                     if (allowUnknownDimensions && randFrac() < 0.03) {
-                        return model.addOperand(
+                        uint32_t opndIdx = model.addOperand(
                                 &unknownDimensionsTypes[randUInt(kUnknownDimensionsTypesCount)]);
+                        operandsWithUnknownDimensions.insert(opndIdx);
+                        return opndIdx;
                     } else {
                         return model.addOperand(&problemType);
                     }
@@ -1091,6 +1111,21 @@
         const auto& outputs = model.getOperationOutputs(randUInt(model.operationCount()));
         modelOutputs.push_back(outputs[randUInt(outputs.size())]);
     }
+    if (computeMode == WrapperExecution::ComputeMode::FENCED) {
+        if (std::any_of(modelOutputs.begin(), modelOutputs.end(),
+                        [&operandsWithUnknownDimensions](uint32_t opndIdx) {
+                            return operandsWithUnknownDimensions.count(opndIdx) != 0;
+                        })) {
+            // Workaround for http://b/162980246: Fenced execution is documented
+            // as requiring model outputs to have fully specified dimensions,
+            // either from Model or from Execution, but its implementation
+            // requires this to come from Model.  This test only guarantees that
+            // they have fully specified dimensions from Execution.  So in the
+            // case of a Model where some output does not have fully specified
+            // dimensions, perform asynchronous execution instead.
+            computeMode = WrapperExecution::ComputeMode::ASYNC;
+        }
+    }
 
     model.identifyInputsAndOutputs(modelInputs, modelOutputs);
 #ifdef VERBOSE
@@ -1163,6 +1198,37 @@
     TestCompilation c2(&model, devices);
     ASSERT_EQ(c2.setPartitioning(DeviceManager::kPartitioningWithoutFallback), Result::NO_ERROR);
     ASSERT_EQ(c2.finish(), Result::NO_ERROR);
+#ifdef TRACE_DYNTEMP
+    {
+        const ExecutionPlan& plan = c2.getExecutionPlan();
+        const size_t dynamicTemporaryCount = plan.forTest_flatGetDynamicTemporaries().size();
+        std::cout << "TRACE_DYNTEMP: dynamic temporary count = " << dynamicTemporaryCount
+                  << std::endl;
+        if (plan.forTest_getKind() == ExecutionPlan::Kind::COMPOUND) {
+            size_t stepsWithModelOutputsThatAreDownstreamInputs = 0;
+            size_t countOfModelOutputsThatAreDownstreamInputs = 0;
+            for (const auto& step : plan.forTest_compoundGetSteps()) {
+                if (const size_t count = step->executionStep()
+                                                 ->getModelOutputsThatAreDownstreamInputs()
+                                                 .size()) {
+                    ++stepsWithModelOutputsThatAreDownstreamInputs;
+                    countOfModelOutputsThatAreDownstreamInputs += count;
+                }
+            }
+            if (countOfModelOutputsThatAreDownstreamInputs != 0) {
+                std::cout << "TRACE_DYNTEMP: model outputs that are downstream inputs: "
+                          << countOfModelOutputsThatAreDownstreamInputs << " / "
+                          << modelOutputs.size() << ", over "
+                          << stepsWithModelOutputsThatAreDownstreamInputs << " / "
+                          << plan.forTest_compoundGetSteps().size() << " steps" << std::endl;
+                EXPECT_LE(countOfModelOutputsThatAreDownstreamInputs, modelOutputs.size());
+            }
+        } else {
+            EXPECT_EQ(dynamicTemporaryCount, size_t(0))
+                    << "Only COMPOUND plan should have dynamic temporaries";
+        }
+    }
+#endif
 
 #ifdef VERBOSE
     {
@@ -1327,7 +1393,7 @@
     // Non-partitioned execution.
     WrapperExecution e(&c);
     ASSERT_NO_FATAL_FAILURE(prepareForExecution(&e));
-    ASSERT_EQ(e.compute(), Result::NO_ERROR);
+    ASSERT_EQ(e.compute(computeMode), Result::NO_ERROR);
 
     // Copy the outputs of the non-partitioned execution to a save area.
     std::vector<float> nonPartitionedOutputs(problemSize * problemSize * model.outputCount());
@@ -1360,7 +1426,7 @@
     // Partitioned execution.
     WrapperExecution e2(&c2);
     ASSERT_NO_FATAL_FAILURE(prepareForExecution(&e2));
-    ASSERT_EQ(e2.compute(), Result::NO_ERROR);
+    ASSERT_EQ(e2.compute(computeMode), Result::NO_ERROR);
 
     // Compare the outputs of the partitioned execution to the save
     // area containing the outpus of the non-partitioned execution.