Support the CPU fallback path with device memories.

CPU fallback with device memories:
1. Allocate BLOB-mode AHWBs for the input and output device memories.
2. Copy the input device memories out to the AHWBs with IBuffer::copyTo.
3. Compute on the CPU using the AHWBs in place of the device memories.
4. Copy the results back to the output device memories with IBuffer::copyFrom.

Bug: 152208838
Test: NNT_static with a broken driver
Change-Id: I9fed3134a7c56c893ff94e892cc25d230a1bd972
Merged-In: I9fed3134a7c56c893ff94e892cc25d230a1bd972
(cherry picked from commit 7b352ec1d710de550df2719a98083b04ccadfe1a)
diff --git a/runtime/ExecutionBuilder.cpp b/runtime/ExecutionBuilder.cpp
index 27e8212..61e320f 100644
--- a/runtime/ExecutionBuilder.cpp
+++ b/runtime/ExecutionBuilder.cpp
@@ -1071,6 +1071,12 @@
 std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::compute(
         const std::optional<Deadline>& deadline,
         const std::shared_ptr<ExecutionBurstController>& burstController) {
+    return computeWithMemories(deadline, mMemories.getObjects(), burstController);
+}
+
+std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::computeWithMemories(
+        const std::optional<Deadline>& deadline, const std::vector<const Memory*>& memories,
+        const std::shared_ptr<ExecutionBurstController>& burstController) {
     CHECK(mPreparedModel != nullptr);
 
     if (VLOG_IS_ON(EXECUTION)) {
@@ -1081,9 +1087,8 @@
     const MeasureTiming measure = measureTiming(mExecutionBuilder);
     const OptionalTimeoutDuration loopTimeoutDuration =
             makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration());
-    const auto [n, outputShapes, timing] =
-            mPreparedModel->execute(mInputs, mOutputs, mMemories.getObjects(), burstController,
-                                    measure, deadline, loopTimeoutDuration);
+    const auto [n, outputShapes, timing] = mPreparedModel->execute(
+            mInputs, mOutputs, memories, burstController, measure, deadline, loopTimeoutDuration);
     mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing);
 
     return {n, std::move(outputShapes), timing};
@@ -1128,13 +1133,74 @@
     const ExecutionPreference preference =
             static_cast<ExecutionPreference>(ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER);
     const Priority priority = convertToHalPriority(ANEURALNETWORKS_PRIORITY_DEFAULT);
-    const auto [n, preparedModel] =
-            mDevice->prepareModel(makeModel, preference, priority, {}, {}, {});
-    mPreparedModel = preparedModel;
+    auto [n, preparedModel] = mDevice->prepareModel(makeModel, preference, priority, {}, {}, {});
+    mPreparedModel = std::move(preparedModel);
     if (n != ANEURALNETWORKS_NO_ERROR) {
         return {n, {}, kNoTiming};
     }
-    return compute({}, /*burstController=*/nullptr);
+
+    // Prepare device memories for CPU fallback.
+    std::vector<const Memory*> memories = mMemories.getObjects();
+    std::vector<bool> isUsedAsInput(memories.size(), false);
+    std::vector<bool> isUsedAsOutput(memories.size(), false);
+    std::vector<std::unique_ptr<Memory>> blobAhwbs;
+
+    // Mark the input and output usages.
+    for (auto& input : mInputs) {
+        if (input.state() == ModelArgumentInfo::MEMORY) {
+            const uint32_t poolIndex = input.locationAndLength().poolIndex;
+            isUsedAsInput[poolIndex] = true;
+        }
+    }
+    for (auto& output : mOutputs) {
+        if (output.state() == ModelArgumentInfo::MEMORY) {
+            const uint32_t poolIndex = output.locationAndLength().poolIndex;
+            // Cannot allocate output buffers with unknown shapes.
+            if (mMemories[poolIndex]->getValidator().createdWithUnknownShape()) {
+                LOG(ERROR) << "Cannot fallback to CPU because at least one of the output operands "
+                              "has unknown shape.";
+                return {ANEURALNETWORKS_OP_FAILED, {}, kNoTiming};
+            }
+            isUsedAsOutput[poolIndex] = true;
+        }
+    }
+
+    // Allocate BLOB mode AHardwareBuffers and read the data from input device memories.
+    for (uint32_t i = 0; i < memories.size(); i++) {
+        const Memory* memory = mMemories[i];
+        if (memory->getIBuffer() != nullptr) {
+            const uint32_t size = memory->getValidator().getMetadata().logicalSize;
+            auto [nAhwb, blobAhwb] = MemoryRuntimeAHWB::create(size);
+            if (nAhwb != ANEURALNETWORKS_NO_ERROR) {
+                return {nAhwb, {}, kNoTiming};
+            }
+            if (isUsedAsInput[i]) {
+                n = copyIBufferToHidlMemory(memory->getIBuffer(), blobAhwb->getHidlMemory());
+                if (n != ANEURALNETWORKS_NO_ERROR) {
+                    return {n, {}, kNoTiming};
+                }
+            }
+            memories[i] = blobAhwb.get();
+            blobAhwbs.push_back(std::move(blobAhwb));
+        }
+    }
+
+    auto [nCompute, outputShapes, timing] = computeWithMemories({}, memories);
+    if (nCompute != ANEURALNETWORKS_NO_ERROR) {
+        return {nCompute, std::move(outputShapes), timing};
+    }
+
+    // Write back to output device memories.
+    for (uint32_t i = 0; i < memories.size(); i++) {
+        const Memory* memory = mMemories[i];
+        if (memory->getIBuffer() != nullptr && isUsedAsOutput[i]) {
+            n = copyHidlMemoryToIBuffer(memories[i]->getHidlMemory(), memory->getIBuffer(), {});
+            if (n != ANEURALNETWORKS_NO_ERROR) {
+                return {n, {}, kNoTiming};
+            }
+        }
+    }
+    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
 }
 
 }  // namespace nn
diff --git a/runtime/ExecutionBuilder.h b/runtime/ExecutionBuilder.h
index f32e8c1..f61df4c 100644
--- a/runtime/ExecutionBuilder.h
+++ b/runtime/ExecutionBuilder.h
@@ -287,6 +287,10 @@
     int setInputOrOutputFromMemory(const hal::Operand& inputOrOutputOperand, const Memory* memory,
                                    uint32_t offset, ModelArgumentInfo* inputOrOutputInfo);
 
+    std::tuple<int, std::vector<hal::OutputShape>, hal::Timing> computeWithMemories(
+            const std::optional<Deadline>& deadline, const std::vector<const Memory*>& memories,
+            const std::shared_ptr<ExecutionBurstController>& burstController = nullptr);
+
     // describes the full (possibly multiple-"step") execution
     ExecutionBuilder* mExecutionBuilder;
 
diff --git a/runtime/Memory.cpp b/runtime/Memory.cpp
index 7ff3651..09e597e 100644
--- a/runtime/Memory.cpp
+++ b/runtime/Memory.cpp
@@ -134,7 +134,6 @@
     }
 
     Metadata getMetadata() const override {
-        CHECK(mInitialized);
         return {.logicalSize = TypeManager::get()->getSizeOfData(kOperand.type, mUpdatedDimensions),
                 .dimensions = mUpdatedDimensions,
                 .operand = kOperand};
@@ -158,6 +157,10 @@
         return true;
     }
 
+    bool createdWithUnknownShape() const override {
+        return TypeManager::get()->getSizeOfData(kOperand.type, kInitialDimensions) == 0;
+    }
+
     void setInitialized(bool initialized) override { mInitialized = initialized; }
     bool isInitialized() const override { return mInitialized; }
 
@@ -243,7 +246,7 @@
     return ANEURALNETWORKS_NO_ERROR;
 }
 
-static int copyIBufferToHidlMemory(const sp<IBuffer>& src, const hidl_memory& dst) {
+int copyIBufferToHidlMemory(const sp<IBuffer>& src, const hidl_memory& dst) {
     const auto ret = src->copyTo(dst);
     if (!ret.isOk()) {
         LOG(ERROR) << "ANeuralNetworksMemory_copy failure: " << ret.description();
@@ -252,8 +255,8 @@
     return convertErrorStatusToResultCode(static_cast<ErrorStatus>(ret));
 }
 
-static int copyHidlMemoryToIBuffer(const hidl_memory& src, const sp<IBuffer>& dst,
-                                   const std::vector<uint32_t>& dimensions) {
+int copyHidlMemoryToIBuffer(const hidl_memory& src, const sp<IBuffer>& dst,
+                            const std::vector<uint32_t>& dimensions) {
     const auto ret = dst->copyFrom(src, dimensions);
     if (!ret.isOk()) {
         LOG(ERROR) << "ANeuralNetworksMemory_copy failure: " << ret.description();
diff --git a/runtime/Memory.h b/runtime/Memory.h
index dcedffa..56bf81d 100644
--- a/runtime/Memory.h
+++ b/runtime/Memory.h
@@ -151,10 +151,18 @@
     // Try update the memory metadata with the provided metadata. Return false if incompatible.
     virtual bool updateMetadata(const Metadata& metadata) = 0;
 
+    // Whether the memory is created with unknown dimensions or rank.
+    virtual bool createdWithUnknownShape() const { return false; }
+
     virtual void setInitialized(bool) {}
     virtual bool isInitialized() const { return true; }
 };
 
+int copyIBufferToHidlMemory(const sp<hal::IBuffer>& src, const hal::hidl_memory& dst);
+
+int copyHidlMemoryToIBuffer(const hal::hidl_memory& src, const sp<hal::IBuffer>& dst,
+                            const std::vector<uint32_t>& dimensions);
+
 // Represents a memory region.
 class Memory {
     // Disallow copy and assign to prevent slicing