Make ASG destruction async

... to avoid a deadlock in the case outlined in b/329287602 where

*  Host-Virtio-GpuThread is performing a VIRTIO_GPU_CMD_RESOURCE_UNREF
   on virtio gpu resource-1 and is waiting for Host-RenderThread-1 to
   finish (since it is using resource-1 as the ASG channel) before it
   can finish destroying virtio gpu resource-1

*  Host-RenderThread-1 is spinning inside of VkDecoder waiting for
   Host-RenderThread-2 to execute sequence-number-N

*  Host-RenderThread-2 is asleep waiting for a GFXSTREAM_CONTEXT_PING
   via a VIRTIO_GPU_CMD_SUBMIT_3D from Host-Virtio-GpuThread in order
   to resume processing

By pushing the ASG destruction into a separate cleanup thread, the main
virtio gpu thread can continue and run the VIRTIO_GPU_CMD_SUBMIT_3D
command that wakes up Host-RenderThread-2 in the above example.

Bug: b/329287602
Test: GfxstreamEnd2EndVkTest.MultiThreadedShutdown with 50000 iterations
Change-Id: I7b51a5557c3424b78d4ce9db160aa9bcada483d6
diff --git a/host/virtio-gpu-gfxstream-renderer.cpp b/host/virtio-gpu-gfxstream-renderer.cpp
index 9b38f00..02b226d 100644
--- a/host/virtio-gpu-gfxstream-renderer.cpp
+++ b/host/virtio-gpu-gfxstream-renderer.cpp
@@ -18,6 +18,7 @@
 #include <deque>
 #include <type_traits>
 #include <unordered_map>
+#include <variant>
 
 #include "BlobManager.h"
 #include "FrameBuffer.h"
@@ -30,6 +31,7 @@
 #include "aemu/base/Tracing.h"
 #include "aemu/base/memory/SharedMemory.h"
 #include "aemu/base/synchronization/Lock.h"
+#include "aemu/base/threads/WorkerThread.h"
 #include "gfxstream/Strings.h"
 #include "gfxstream/host/Features.h"
 #include "host-common/AddressSpaceService.h"
@@ -252,6 +254,47 @@
     COLOR_BUFFER,
 };
 
+struct AlignedMemory {
+    void* addr = nullptr;
+
+    AlignedMemory(size_t align, size_t size)
+        : addr(android::aligned_buf_alloc(align, size)) {}
+
+    ~AlignedMemory() {
+        if (addr != nullptr) {
+            android::aligned_buf_free(addr);
+        }
+    }
+
+    // AlignedMemory is neither copyable nor movable.
+    AlignedMemory(const AlignedMemory& other) = delete;
+    AlignedMemory& operator=(const AlignedMemory& other) = delete;
+    AlignedMemory(AlignedMemory&& other) = delete;
+    AlignedMemory& operator=(AlignedMemory&& other) = delete;
+};
+
+// Memory used as a ring buffer for communication between the guest and host.
+class RingBlob : public std::variant<std::unique_ptr<AlignedMemory>,
+                                     std::unique_ptr<SharedMemory>> {
+  public:
+    using BaseType = std::variant<std::unique_ptr<AlignedMemory>,
+                                  std::unique_ptr<SharedMemory>>;
+    // Inherit constructors.
+    using BaseType::BaseType;
+
+    bool isExportable() const {
+        return std::holds_alternative<std::unique_ptr<SharedMemory>>(*this);
+    }
+
+    SharedMemory::handle_type releaseHandle() {
+        if (!isExportable()) {
+            return SharedMemory::invalidHandle();
+        }
+        return std::get<std::unique_ptr<SharedMemory>>(*this)->releaseHandle();
+    }
+};
+
+
 struct PipeResEntry {
     stream_renderer_resource_create_args args;
     iovec* iov;
@@ -267,7 +310,7 @@
     uint32_t blobFlags;
     uint32_t caching;
     ResType type;
-    std::shared_ptr<SharedMemory> ringBlob = nullptr;
+    std::shared_ptr<RingBlob> ringBlob;
     bool externalAddr = false;
     std::shared_ptr<ManagedDescriptorInfo> descriptorInfo = nullptr;
 };
@@ -592,6 +635,47 @@
     return ((uint64_t)lo) | (((uint64_t)hi) << 32);
 }
 
+class CleanupThread {
+  public:
+    using GenericCleanup = std::function<void()>;
+
+    CleanupThread() : mWorker([](CleanupTask task) {
+            return std::visit([](auto&& work) {
+                using T = std::decay_t<decltype(work)>;
+                if constexpr (std::is_same_v<T, GenericCleanup>) {
+                    work();
+                    return android::base::WorkerProcessingResult::Continue;
+                } else if constexpr (std::is_same_v<T, Exit>) {
+                    return android::base::WorkerProcessingResult::Stop;
+                }
+            }, std::move(task));
+          }) {
+        mWorker.start();
+    }
+
+    ~CleanupThread() { stop(); }
+
+    // CleanupThread is neither copyable nor movable.
+    CleanupThread(const CleanupThread& other) = delete;
+    CleanupThread& operator=(const CleanupThread& other) = delete;
+    CleanupThread(CleanupThread&& other) = delete;
+    CleanupThread& operator=(CleanupThread&& other) = delete;
+
+    void enqueueCleanup(GenericCleanup command) {
+        mWorker.enqueue(std::move(command));
+    }
+
+    void stop() {
+        mWorker.enqueue(Exit{});
+        mWorker.join();
+    }
+
+  private:
+    struct Exit {};
+    using CleanupTask = std::variant<GenericCleanup, Exit>;
+    android::base::WorkerThread<CleanupTask> mWorker;
+};
+
 class PipeVirglRenderer {
    public:
     PipeVirglRenderer() = default;
@@ -618,9 +702,15 @@
         mPageSize = getpagesize();
 #endif
 
+        mCleanupThread.reset(new CleanupThread());
+
         return 0;
     }
 
+    void teardown() {
+        mCleanupThread.reset();
+    }
+
     int resetPipe(GoldfishHwPipe* hwPipe, GoldfishHostPipe* hostPipe) {
         stream_renderer_debug("Want to reset hwpipe %p to hostpipe %p", hwPipe, hostPipe);
         VirtioGpuCtxId asCtxId = (VirtioGpuCtxId)(uintptr_t)hwPipe;
@@ -701,6 +791,8 @@
 
         if (it->second.hasAddressSpaceHandle) {
             for (auto const& [resourceId, handle] : it->second.addressSpaceHandles) {
+                // Note: this can hang as-is, but it has only been observed to
+                // happen during shutdown. See b/329287602#comment8.
                 mAddressSpaceDeviceControlOps->destroy_handle(handle);
             }
         }
@@ -779,6 +871,9 @@
                 auto& resEntry = resEntryIt->second;
 
                 std::string name = ctxEntry.name + "-" + std::to_string(contextCreate.resourceId);
+
+                // Note: resource ids cannot be used as ASG handles because ASGs may outlive the
+                // containing resource due to asynchronous ASG destruction.
                 uint32_t handle = mAddressSpaceDeviceControlOps->gen_handle();
 
                 struct AddressSpaceCreateInfo createInfo = {
@@ -1107,10 +1202,6 @@
             entry.numIovs = 0;
         }
 
-        if (entry.externalAddr && !entry.ringBlob) {
-            android::aligned_buf_free(entry.hva);
-        }
-
         entry.hva = nullptr;
         entry.hvaSize = 0;
         entry.blobId = 0;
@@ -1558,24 +1649,25 @@
                        const struct stream_renderer_handle* handle) {
         if (mFeatures.ExternalBlob.enabled) {
             std::string name = "shared-memory-" + std::to_string(res_handle);
-            auto ringBlob = std::make_shared<SharedMemory>(name, create_blob->size);
-            int ret = ringBlob->create(0600);
+            auto shmem = std::make_unique<SharedMemory>(name, create_blob->size);
+            int ret = shmem->create(0600);
             if (ret) {
                 stream_renderer_error("Failed to create shared memory blob");
                 return ret;
             }
 
-            entry.ringBlob = ringBlob;
-            entry.hva = ringBlob->get();
+            entry.hva = shmem->get();
+            entry.ringBlob = std::make_shared<RingBlob>(std::move(shmem));
+
         } else {
-            void* addr =
-                android::aligned_buf_alloc(mPageSize, create_blob->size);
-            if (addr == nullptr) {
+            auto mem = std::make_unique<AlignedMemory>(mPageSize, create_blob->size);
+            if (mem->addr == nullptr) {
                 stream_renderer_error("Failed to allocate ring blob");
                 return -ENOMEM;
             }
 
-            entry.hva = addr;
+            entry.hva = mem->addr;
+            entry.ringBlob = std::make_shared<RingBlob>(std::move(mem));
         }
 
         entry.hvaSize = create_blob->size;
@@ -1717,7 +1809,7 @@
         }
 
         auto& entry = it->second;
-        if (entry.ringBlob) {
+        if (entry.ringBlob && entry.ringBlob->isExportable()) {
             // Handle ownership transferred to VMM, gfxstream keeps the mapping.
 #ifdef _WIN32
             handle->os_handle =
@@ -1822,18 +1914,23 @@
         }
         mContextResources[ctxId] = withoutRes;
 
-        auto resIt = mResources.find(toUnrefId);
-        if (resIt == mResources.end()) return;
+        auto resourceIt = mResources.find(toUnrefId);
+        if (resourceIt == mResources.end()) return;
+        auto& resource = resourceIt->second;
 
-        resIt->second.hostPipe = 0;
-        resIt->second.ctxId = 0;
+        resource.hostPipe = 0;
+        resource.ctxId = 0;
 
         auto ctxIt = mContexts.find(ctxId);
         if (ctxIt != mContexts.end()) {
             auto& ctxEntry = ctxIt->second;
             if (ctxEntry.addressSpaceHandles.count(toUnrefId)) {
-                uint32_t handle = ctxEntry.addressSpaceHandles[toUnrefId];
-                mAddressSpaceDeviceControlOps->destroy_handle(handle);
+                uint32_t asgHandle = ctxEntry.addressSpaceHandles[toUnrefId];
+
+                mCleanupThread->enqueueCleanup([this, asgBlob = resource.ringBlob, asgHandle](){
+                    mAddressSpaceDeviceControlOps->destroy_handle(asgHandle);
+                });
+
                 ctxEntry.addressSpaceHandles.erase(toUnrefId);
             }
         }
@@ -1863,6 +1960,8 @@
     // fences created for that context should not be signaled immediately.
     // Rather, they should get in line.
     std::unique_ptr<VirtioGpuTimelines> mVirtioGpuTimelines = nullptr;
+
+    std::unique_ptr<CleanupThread> mCleanupThread;
 };
 
 static PipeVirglRenderer* sRenderer() {
@@ -2604,6 +2703,8 @@
     android_finishOpenglesRenderer();
     android_hideOpenglesWindow();
     android_stopOpenglesRenderer(true);
+
+    sRenderer()->teardown();
 }
 
 VG_EXPORT void gfxstream_backend_set_screen_mask(int width, int height,