vulkan: wait sync fd for vkQSRI

Bug: 193809913

This CL adds async vkQSRI (vkQueueSignalReleaseImageANDROID) wait support
and enables the feature flag when fence contexts are available. Guest QSRI
exports are forwarded to the SyncThread, which waits for the image's next
present via VkDecoderGlobalState::waitQsri before signaling the guest fence.

Change-Id: I5e67167618b318a74bfb55528951ca95007df614
diff --git a/stream-servers/FrameBuffer.cpp b/stream-servers/FrameBuffer.cpp
index 13baee8..1e10c8d 100644
--- a/stream-servers/FrameBuffer.cpp
+++ b/stream-servers/FrameBuffer.cpp
@@ -3337,6 +3337,16 @@
     SyncThread::get()->triggerWaitVkWithCompletionCallback((VkFence)fenceHandle, std::move(cb));
 }
 
+void FrameBuffer::asyncWaitForGpuVulkanQsriWithCb(uint64_t image, FenceCompletionCallback cb) {
+    SyncThread::get()->triggerWaitVkQsriWithCompletionCallback((VkImage)image, std::move(cb));
+}
+
+void FrameBuffer::waitForGpuVulkanQsri(uint64_t image) {
+    (void)image;
+    // Signal immediately, because this was a sync wait and it's Vulkan.
+    SyncThread::get()->triggerBlockedWaitNoTimeline(nullptr);
+}
+
 void FrameBuffer::setGuestManagedColorBufferLifetime(bool guestManaged) {
     m_guestManagedColorBufferLifetime = guestManaged;
 }
diff --git a/stream-servers/FrameBuffer.h b/stream-servers/FrameBuffer.h
index ab21563..37951cf 100644
--- a/stream-servers/FrameBuffer.h
+++ b/stream-servers/FrameBuffer.h
@@ -579,6 +579,8 @@
     void waitForGpuVulkan(uint64_t deviceHandle, uint64_t fenceHandle);
     void asyncWaitForGpuWithCb(uint64_t eglsync, FenceCompletionCallback cb);
     void asyncWaitForGpuVulkanWithCb(uint64_t deviceHandle, uint64_t fenceHandle, FenceCompletionCallback cb);
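+    // Waits for the next vkQueueSignalReleaseImageANDROID (QSRI) on the given
+    // VkImage. The async variant runs |cb| on the SyncThread once the present
+    // completes; the sync variant signals immediately.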
+    void asyncWaitForGpuVulkanQsriWithCb(uint64_t image, FenceCompletionCallback cb);
+    void waitForGpuVulkanQsri(uint64_t image);
 
     void setGuestManagedColorBufferLifetime(bool guestManaged);
 
diff --git a/stream-servers/GfxStreamBackend.cpp b/stream-servers/GfxStreamBackend.cpp
index 19fa746..6ef5dd6 100644
--- a/stream-servers/GfxStreamBackend.cpp
+++ b/stream-servers/GfxStreamBackend.cpp
@@ -396,6 +396,12 @@
                                  useVulkanNativeSwapchain);
     feature_set_enabled_override(
             kFeature_VulkanBatchedDescriptorSetUpdate, true);
+    // TODO: Strictly speaking, the renderer_flags check is insufficient;
+    // fence contexts also require a new-enough guest kernel.
+    feature_set_enabled_override(
+           kFeature_VirtioGpuFenceContexts,
+           !syncFdDisabledByFlag &&
+           (renderer_flags & GFXSTREAM_RENDERER_FLAGS_ASYNC_FENCE_CB));
 
     if (useVulkanNativeSwapchain && !enableVk) {
         fprintf(stderr,
diff --git a/stream-servers/RenderControl.cpp b/stream-servers/RenderControl.cpp
index 540bfcf..b03ac7e 100644
--- a/stream-servers/RenderControl.cpp
+++ b/stream-servers/RenderControl.cpp
@@ -350,7 +350,8 @@
 static bool shouldEnableVulkanAsyncQsri() {
     return shouldEnableVulkan() &&
         (feature_is_enabled(kFeature_GLAsyncSwap) ||
-         feature_is_enabled(kFeature_VirtioGpuNativeSync));
+         (feature_is_enabled(kFeature_VirtioGpuNativeSync) &&
+          feature_is_enabled(kFeature_VirtioGpuFenceContexts)));
 }
 
 const char* maxVersionToFeatureString(GLESDispatchMaxVersion version) {
@@ -619,12 +620,10 @@
         glStr += " ";
     }
 
-    // Bug: 193809913
-    // Only switch on after everything else about this is merged / works.
-    // if (vulkanAsyncQsri) {
-    //     glStr += kVulkanAsyncQsri;
-    //     glStr += " ";
-    // }
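+    // Advertise the async QSRI extension string only when async QSRI is
+    // enabled (see shouldEnableVulkanAsyncQsri above).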
+    if (vulkanAsyncQsri && name == GL_EXTENSIONS) {
+        glStr += kVulkanAsyncQsri;
+        glStr += " ";
+    }
 
     if (name == GL_EXTENSIONS) {
 
diff --git a/stream-servers/RendererImpl.cpp b/stream-servers/RendererImpl.cpp
index 5fc0012..29e417f 100644
--- a/stream-servers/RendererImpl.cpp
+++ b/stream-servers/RendererImpl.cpp
@@ -598,6 +598,12 @@
         .async_wait_for_gpu_vulkan_with_cb = [](uint64_t device, uint64_t fence, FenceCompletionCallback cb) {
             FrameBuffer::getFB()->asyncWaitForGpuVulkanWithCb(device, fence, cb);
         },
+        .async_wait_for_gpu_vulkan_qsri_with_cb = [](uint64_t image, FenceCompletionCallback cb) {
+            FrameBuffer::getFB()->asyncWaitForGpuVulkanQsriWithCb(image, cb);
+        },
+        .wait_for_gpu_vulkan_qsri = [](uint64_t image) {
+            FrameBuffer::getFB()->waitForGpuVulkanQsri(image);
+        },
 };
 
 struct AndroidVirtioGpuOps* RendererImpl::getVirtioGpuOps() {
diff --git a/stream-servers/SyncThread.cpp b/stream-servers/SyncThread.cpp
index 30bb432..8cfa2d5 100644
--- a/stream-servers/SyncThread.cpp
+++ b/stream-servers/SyncThread.cpp
@@ -140,6 +140,36 @@
     DPRINT("exit");
 }
 
+void SyncThread::triggerWaitVkQsriWithCompletionCallback(VkImage vkImage, FenceCompletionCallback cb) {
+    DPRINT("vkImage=%p ...", vkImage);
+    SyncThreadCmd to_send;
+    to_send.opCode = SYNC_THREAD_WAIT_VK_QSRI;
+    to_send.vkImage = vkImage;
+    to_send.useFenceCompletionCallback = true;
+    to_send.fenceCompletionCallback = cb;
+    DPRINT("opcode=%u", to_send.opCode);
+    sendAsync(to_send);
+    DPRINT("exit");
+}
+
+void SyncThread::triggerWaitVkQsriBlockedNoTimeline(VkImage vkImage) {
+    DPRINT("vkImage=%p ...", vkImage);
+    SyncThreadCmd to_send;
+    to_send.opCode = SYNC_THREAD_WAIT_VK_QSRI;
+    to_send.vkImage = vkImage;
+    DPRINT("opcode=%u", to_send.opCode);
+    sendAndWaitForResult(to_send);
+    DPRINT("exit");
+}
+
+void SyncThread::triggerGeneral(FenceCompletionCallback cb) {
+    SyncThreadCmd to_send;
+    to_send.opCode = SYNC_THREAD_GENERAL;
+    to_send.useFenceCompletionCallback = true;
+    to_send.fenceCompletionCallback = cb;
+    sendAsync(to_send);
+}
+
 void SyncThread::cleanup() {
     DPRINT("enter");
     SyncThreadCmd to_send;
@@ -336,6 +366,52 @@
     return result;
 }
 
+int SyncThread::doSyncWaitVkQsri(SyncThreadCmd* cmd) {
+    DPRINT("enter");
+
+    auto decoder = goldfish_vk::VkDecoderGlobalState::get();
+    DPRINT("doSyncWaitVkQsri for image %p", cmd->vkImage);
+    auto result = decoder->waitQsri(cmd->vkImage, kDefaultTimeoutNsecs);
+    DPRINT("doSyncWaitVkQsri for image %p (done, do signal/callback)", cmd->vkImage);
+    if (result == VK_TIMEOUT) {
+        fprintf(stderr, "SyncThread::%s: SYNC_WAIT_VK_QSRI timeout: vkImage=%p\n",
+                __func__, cmd->vkImage);
+    } else if (result != VK_SUCCESS) {
+        fprintf(stderr, "SyncThread::%s: SYNC_WAIT_VK_QSRI error: %d vkImage=%p\n",
+                __func__, result, cmd->vkImage);
+    }
+
+    DPRINT("issue timeline increment");
+
+    // We always signal completion at this point, even if waitQsri returned
+    // abnormally. See the comments in |doSyncWait| for the rationale.
+    if (cmd->useFenceCompletionCallback) {
+        DPRINT("wait done, use completion callback");
+        cmd->fenceCompletionCallback();
+    } else {
+        DPRINT("wait done, use goldfish sync timeline inc");
+        emugl::emugl_sync_timeline_inc(cmd->timeline, kTimelineInterval);
+    }
+
+    DPRINT("done timeline increment");
+
+    DPRINT("exit");
+    return result;
+}
+
+int SyncThread::doSyncGeneral(SyncThreadCmd* cmd) {
+    DPRINT("enter");
+    if (cmd->useFenceCompletionCallback) {
+        DPRINT("wait done, use completion callback");
+        cmd->fenceCompletionCallback();
+    } else {
+        DPRINT("warning, completion callback not provided in general op!");
+    }
+
+    return 0;
+}
+
 void SyncThread::doSyncBlockedWaitNoTimeline(SyncThreadCmd* cmd) {
     DPRINT("enter");
 
@@ -396,6 +472,14 @@
         DPRINT("exec SYNC_THREAD_WAIT_VK");
         result = doSyncWaitVk(cmd);
         break;
+    case SYNC_THREAD_WAIT_VK_QSRI:
+        DPRINT("exec SYNC_THREAD_WAIT_VK_QSRI");
+        result = doSyncWaitVkQsri(cmd);
+        break;
+    case SYNC_THREAD_GENERAL:
+        DPRINT("exec SYNC_THREAD_GENERAL");
+        result = doSyncGeneral(cmd);
+        break;
     case SYNC_THREAD_EXIT:
         DPRINT("exec SYNC_THREAD_EXIT");
         doExit(cmd);
diff --git a/stream-servers/SyncThread.h b/stream-servers/SyncThread.h
index 0e58f8c..521dda0 100644
--- a/stream-servers/SyncThread.h
+++ b/stream-servers/SyncThread.h
@@ -53,6 +53,10 @@
     // and timeline handle.
     // A fence FD object / Zircon eventpair in the guest is signaled.
     SYNC_THREAD_WAIT_VK = 4,
+    // Command to wait on the presentation of the given VkImage.
+    SYNC_THREAD_WAIT_VK_QSRI = 5,
+    // Command that consists only of a callback.
+    SYNC_THREAD_GENERAL = 6,
 };
 
 struct SyncThreadCmd {
@@ -65,6 +69,7 @@
     union {
         FenceSync* fenceSync = nullptr;
         VkFence vkFence;
+        VkImage vkImage;
     };
     uint64_t timeline = 0;
 
@@ -111,6 +116,10 @@
     // For use with virtio-gpu and async fence completion callback. This is async like triggerWait, but takes a fence completion callback instead of incrementing some timeline directly.
     void triggerWaitWithCompletionCallback(FenceSync* fenceSync, FenceCompletionCallback);
     void triggerWaitVkWithCompletionCallback(VkFence fenceHandle, FenceCompletionCallback);
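+    // Like the above, but waits for the next vkQueueSignalReleaseImageANDROID
+    // (QSRI) on |image| rather than on a VkFence.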
+    void triggerWaitVkQsriWithCompletionCallback(VkImage image, FenceCompletionCallback);
+    void triggerWaitVkQsriBlockedNoTimeline(VkImage image);
+
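+    // Queues an arbitrary completion callback on the sync thread. Used by
+    // syncImageToColorBuffer to wait on a QSRI fence without blocking the
+    // caller.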
+    void triggerGeneral(FenceCompletionCallback);
 
     // |cleanup|: for use with destructors and other cleanup functions.
     // it destroys the sync context and exits the sync thread.
@@ -152,6 +161,8 @@
     void doSyncContextInit(SyncThreadCmd* cmd);
     void doSyncWait(SyncThreadCmd* cmd);
     int doSyncWaitVk(SyncThreadCmd* cmd);
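+    // Waits for the next present (QSRI) of cmd->vkImage via
+    // VkDecoderGlobalState::waitQsri, then signals the completion callback or
+    // increments the timeline.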
+    int doSyncWaitVkQsri(SyncThreadCmd* cmd);
+    int doSyncGeneral(SyncThreadCmd* cmd);
     void doSyncBlockedWaitNoTimeline(SyncThreadCmd* cmd);
     void doExit(SyncThreadCmd* cmd);
 
diff --git a/stream-servers/virtio-gpu-gfxstream-renderer.cpp b/stream-servers/virtio-gpu-gfxstream-renderer.cpp
index 6a2dcf7..0b752ba 100644
--- a/stream-servers/virtio-gpu-gfxstream-renderer.cpp
+++ b/stream-servers/virtio-gpu-gfxstream-renderer.cpp
@@ -502,7 +502,7 @@
 const uint32_t kVirtioGpuNativeSyncVulkanCreateExportFd = 0xa000;
 const uint32_t kVirtioGpuNativeSyncVulkanCreateImportFd = 0xa001;
 
-const uint32_t kVirtioGpuNativeSyncVulkanCreateExportFdForQueueSignalReleaseImage = 0xa002;
+const uint32_t kVirtioGpuNativeSyncVulkanQsriExport = 0xa002;
 
 class PipeVirglRenderer {
 public:
@@ -820,7 +820,7 @@
                     AutoLock lock(mCtxPendingFencesLock);
                     mCtxNeededFencingTypes[ctxId] = CtxFencingType::AsyncSignal;
                     mVirtioGpuOps->async_wait_for_gpu_with_cb(sync_handle, [this, ctxId] {
-                            this->completionCallback(ctxId);
+                        this->completionCallback(ctxId);
                     });
                 } else {
                     mVirtioGpuOps->wait_for_gpu(sync_handle);
@@ -839,28 +839,32 @@
 
                 VGPLOG("wait for gpu vk ctx id %u", ctxId);
                 if (mUseAsyncFenceCb) {
+                    AutoLock lock(mCtxPendingFencesLock);
+                    mCtxNeededFencingTypes[ctxId] = CtxFencingType::AsyncSignal;
                     mVirtioGpuOps->async_wait_for_gpu_vulkan_with_cb(device_handle, fence_handle, [this, ctxId] {
-                            this->completionCallback(ctxId);
+                        this->completionCallback(ctxId);
                     });
                 } else {
                     mVirtioGpuOps->wait_for_gpu_vulkan(device_handle, fence_handle);
                 }
                 break;
             }
-            // case kVirtioGpuNativeSyncVulkanCreateExportFdForQueueSignalReleaseImage: {
-            //     uint64_t queue_handle_lo = dwords[1];
-            //     uint64_t queue_handle_hi = dwords[2];
-            //     uint64_t queue_handle = convert32to64(device_handle_lo, device_handle_hi);
-            //     fprintf(stderr, "%s: wait for gpu vk qsri id %u queue 0x%llx\n", __func__, ctxId, (unsigned long long)queue_handle);
-            //     if (mUseAsyncFenceCb) {
-            //         mVirtioGpuOps->async_wait_for_gpu_vulkan_qsri_with_cb(queue_handle, [this, ctxId] {
-            //                 this->completionCallback(ctxId);
-            //         });
-            //     } else {
-            //         mVirtioGpuOps->wait_for_gpu_vulkan_qsri(queue_handle);
-            //     }
-            //     break;
-            // }
+            case kVirtioGpuNativeSyncVulkanQsriExport: {
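+                // The guest sends the host VkImage handle split across two
+                // dwords; reassemble it and wait for its next QSRI before
+                // signaling the fence.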
+                uint64_t image_handle_lo = dwords[1];
+                uint64_t image_handle_hi = dwords[2];
+                uint64_t image_handle = convert32to64(image_handle_lo, image_handle_hi);
+                VGPLOG("wait for gpu vk qsri id %u image 0x%llx", ctxId, (unsigned long long)image_handle);
+                if (mUseAsyncFenceCb) {
+                    AutoLock lock(mCtxPendingFencesLock);
+                    mCtxNeededFencingTypes[ctxId] = CtxFencingType::AsyncSignal;
+                    mVirtioGpuOps->async_wait_for_gpu_vulkan_qsri_with_cb(image_handle, [this, ctxId] {
+                        this->completionCallback(ctxId);
+                    });
+                } else {
+                    mVirtioGpuOps->wait_for_gpu_vulkan_qsri(image_handle);
+                }
+                break;
+            }
             default:
                 return -1;
         }
@@ -919,8 +923,7 @@
 
     int contextCreateFence(uint32_t fence_id, uint32_t ctx_id, uint32_t fence_ctx_idx) {
         AutoLock lock(mLock);
-        fprintf(stderr, "%s: fence id %u ctx id %u fence_ctx_idx %u\n", __func__, fence_id, ctx_id, fence_ctx_idx);
-        VGPLOG("fenceid: %u cmdtype: %u", fence_id, ctx_id);
+        VGPLOG("fenceid: %u cmdtype: %u fence_ctx_idx: %u", fence_id, ctx_id, fence_ctx_idx);
         if (mUseAsyncFenceCb) {
             fprintf(stderr, "%s: create fence using async fence cb\n", __func__);
             if (0 == ctx_id) {
@@ -1679,7 +1682,11 @@
 
             if (pendingState.type == CtxFencingType::SyncSignal) {
                 VGPLOG("This was a sync signal, write fence, erase it and continue")
-                mVirglRendererCallbacks.write_fence2(mCookie, fence_value, ctx_id, 0 /* ring_idx */);
+                if (mUseAsyncFenceCb) {
+                    mVirglRendererCallbacks.write_fence2(mCookie, fence_value, ctx_id, 0 /* ring_idx */);
+                } else {
+                    mVirglRendererCallbacks.write_fence(mCookie, fence_value);
+                }
                 it = pendingFencesThisCtx.erase(it);
             } else {
                 if (CtxFencingType::AsyncSignal != pendingState.type) {
diff --git a/stream-servers/virtio_gpu_ops.h b/stream-servers/virtio_gpu_ops.h
index 8c08bac..490036e 100644
--- a/stream-servers/virtio_gpu_ops.h
+++ b/stream-servers/virtio_gpu_ops.h
@@ -89,6 +89,9 @@
 typedef void (*async_wait_for_gpu_with_cb_t)(uint64_t eglsync, FenceCompletionCallback);
 typedef void (*async_wait_for_gpu_vulkan_with_cb_t)(uint64_t device, uint64_t fence, FenceCompletionCallback);
 
+typedef void (*async_wait_for_gpu_vulkan_qsri_with_cb_t)(uint64_t image, FenceCompletionCallback);
+typedef void (*wait_for_gpu_vulkan_qsri_t)(uint64_t image);
+
 struct AndroidVirtioGpuOps {
     create_color_buffer_with_handle_t create_color_buffer_with_handle;
     open_color_buffer_t open_color_buffer;
@@ -113,4 +116,7 @@
     set_guest_managed_color_buffer_lifetime_t set_guest_managed_color_buffer_lifetime;
     async_wait_for_gpu_with_cb_t async_wait_for_gpu_with_cb;
     async_wait_for_gpu_vulkan_with_cb_t async_wait_for_gpu_vulkan_with_cb;
+
+    async_wait_for_gpu_vulkan_qsri_with_cb_t async_wait_for_gpu_vulkan_qsri_with_cb;
+    wait_for_gpu_vulkan_qsri_t wait_for_gpu_vulkan_qsri;
 };
diff --git a/stream-servers/vulkan/VkAndroidNativeBuffer.cpp b/stream-servers/vulkan/VkAndroidNativeBuffer.cpp
index 7cbfb5a..a47bb4d 100644
--- a/stream-servers/vulkan/VkAndroidNativeBuffer.cpp
+++ b/stream-servers/vulkan/VkAndroidNativeBuffer.cpp
@@ -19,13 +19,61 @@
 #include "GrallocDefs.h"
 #include "VkCommonOperations.h"
 #include "VulkanDispatch.h"
+#include "SyncThread.h"
 
 #include <string.h>
 
 #define VK_ANB_ERR(fmt,...) fprintf(stderr, "%s:%d " fmt "\n", __func__, __LINE__, ##__VA_ARGS__);
 
+#define VK_ANB_DEBUG_ENABLED 0
+
+#if VK_ANB_DEBUG_ENABLED
+#define VK_ANB_DEBUG(fmt,...) fprintf(stderr, "vk-anb-debug: %s:%d " fmt "\n", __func__, __LINE__, ##__VA_ARGS__);
+#define VK_ANB_DEBUG_OBJ(obj, fmt,...) fprintf(stderr, "vk-anb-debug: %s:%d:%p " fmt "\n", __func__, __LINE__, obj, ##__VA_ARGS__);
+#else
+#define VK_ANB_DEBUG(fmt,...)
+#define VK_ANB_DEBUG_OBJ(obj, fmt,...)
+#endif
+
+using android::base::AutoLock;
+using android::base::Lock;
+
 namespace goldfish_vk {
 
+VkFence AndroidNativeBufferInfo::QsriWaitInfo::getFenceFromPoolLocked() {
+    VK_ANB_DEBUG("enter");
+
+    if (!vk) return VK_NULL_HANDLE;
+
+    if (fencePool.empty()) {
+        VkFence fence;
+        VkFenceCreateInfo fenceCreateInfo = {
+            VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, 0, 0,
+        };
+        vk->vkCreateFence(device, &fenceCreateInfo, nullptr, &fence);
+        VK_ANB_DEBUG("no fences in pool, created %p", fence);
+        return fence;
+    } else {
+        VkFence res = fencePool.back();
+        fencePool.pop_back();
+        vk->vkResetFences(device, 1, &res);
+        VK_ANB_DEBUG("existing fence in pool: %p. also reset the fence", res);
+        return res;
+    }
+}
+
+AndroidNativeBufferInfo::QsriWaitInfo::~QsriWaitInfo() {
+    VK_ANB_DEBUG("enter");
+    if (!vk) return;
+    if (!device) return;
+    // Every fence in the pool is signaled, so it's safe to destroy them.
+    for (auto fence : fencePool) {
+        VK_ANB_DEBUG("destroy fence %p", fence);
+        vk->vkDestroyFence(device, fence, nullptr);
+    }
+    VK_ANB_DEBUG("exit");
+}
+
 bool parseAndroidNativeBufferInfo(
     const VkImageCreateInfo* pCreateInfo,
     AndroidNativeBufferInfo* info_out) {
@@ -48,8 +96,7 @@
     const VkPhysicalDeviceMemoryProperties* memProps,
     AndroidNativeBufferInfo* out) {
 
-    *out = {};
-
+    out->vk = vk;
     out->device = device;
     out->vkFormat = pCreateInfo->format;
     out->extent = pCreateInfo->extent;
@@ -273,11 +320,22 @@
     for (auto queueState : anbInfo->queueStates) {
         queueState.teardown(vk, device);
     }
+
     anbInfo->queueStates.clear();
 
     anbInfo->acquireQueueState.teardown(vk, device);
 
-    *anbInfo = {};
+    anbInfo->vk = nullptr;
+    anbInfo->device = VK_NULL_HANDLE;
+    anbInfo->image = VK_NULL_HANDLE;
+    anbInfo->imageMemory = VK_NULL_HANDLE;
+    anbInfo->stagingBuffer = VK_NULL_HANDLE;
+    anbInfo->mappedStagingPtr = nullptr;
+    anbInfo->stagingMemory = VK_NULL_HANDLE;
+
+    AutoLock lock(anbInfo->qsriWaitInfo.lock);
+    anbInfo->qsriWaitInfo.presentCount = 0;
+    anbInfo->qsriWaitInfo.requestedPresentCount = 0;
 }
 
 void getGralloc0Usage(VkFormat format, VkImageUsageFlags imageUsage,
@@ -505,6 +563,8 @@
     return VK_SUCCESS;
 }
 
+static constexpr uint64_t kTimeoutNs = 3ULL * 1000000000ULL;
+
 VkResult syncImageToColorBuffer(
     VulkanDispatch* vk,
     uint32_t queueFamilyIndex,
@@ -512,7 +572,14 @@
     uint32_t waitSemaphoreCount,
     const VkSemaphore* pWaitSemaphores,
     int* pNativeFenceFd,
-    AndroidNativeBufferInfo* anbInfo) {
+    std::shared_ptr<AndroidNativeBufferInfo> anbInfo) {
+
+    auto anbInfoPtr = anbInfo.get();
+    {
+        AutoLock lock(anbInfo->qsriWaitInfo.lock);
+        VK_ANB_DEBUG_OBJ(anbInfoPtr, "ensure dispatch %p device %p", vk, anbInfo->device);
+        anbInfo->qsriWaitInfo.ensureDispatchAndDevice(vk, anbInfo->device);
+    }
 
     auto fb = FrameBuffer::getFB();
     fb->lock();
@@ -662,13 +729,46 @@
     };
 
     // TODO(kaiyili): initiate ownership transfer to DisplayVk here.
-    vk->vkQueueSubmit(queueState.queue, 1, &submitInfo, VK_NULL_HANDLE);
-
+    VkFence qsriFence = VK_NULL_HANDLE;
+    {
+        VK_ANB_DEBUG_OBJ(anbInfoPtr, "trying to get qsri fence");
+        AutoLock lock(anbInfo->qsriWaitInfo.lock);
+        VK_ANB_DEBUG_OBJ(anbInfoPtr, "trying to get qsri fence (got lock)");
+        qsriFence = anbInfo->qsriWaitInfo.getFenceFromPoolLocked();
+        VK_ANB_DEBUG_OBJ(anbInfoPtr, "got qsri fence %p", qsriFence);
+    }
+    vk->vkQueueSubmit(queueState.queue, 1, &submitInfo, qsriFence);
     fb->unlock();
 
     if (anbInfo->useVulkanNativeImage) {
+        VK_ANB_DEBUG_OBJ(anbInfoPtr, "using native image, so use sync thread to wait");
         fb->setColorBufferInUse(anbInfo->colorBufferHandle, false);
+        VkDevice device = anbInfo->device;
+        // Queue wait to sync thread with completion callback
+        // Pass anbInfo by value to get a ref
+        SyncThread::get()->triggerGeneral([anbInfoPtr, anbInfo, vk, device, qsriFence] {
+            VK_ANB_DEBUG_OBJ(anbInfoPtr, "wait callback: enter");
+            if (qsriFence) {
+                VK_ANB_DEBUG_OBJ(anbInfoPtr, "wait callback: wait for fence %p...", qsriFence);
+                vk->vkWaitForFences(device, 1, &qsriFence, VK_FALSE, kTimeoutNs);
+                VK_ANB_DEBUG_OBJ(anbInfoPtr, "wait callback: wait for fence %p...(done)", qsriFence);
+            }
+            AutoLock lock(anbInfo->qsriWaitInfo.lock);
+            VK_ANB_DEBUG_OBJ(anbInfoPtr, "wait callback: return fence and signal");
+            if (qsriFence) {
+                anbInfo->qsriWaitInfo.returnFenceLocked(qsriFence);
+            }
+            ++anbInfo->qsriWaitInfo.presentCount;
+            VK_ANB_DEBUG_OBJ(anbInfoPtr, "wait callback: done, present count is now %llu", (unsigned long long)anbInfo->qsriWaitInfo.presentCount);
+            anbInfo->qsriWaitInfo.cv.signal();
+            VK_ANB_DEBUG_OBJ(anbInfoPtr, "wait callback: exit");
+        });
     } else {
+        VK_ANB_DEBUG_OBJ(anbInfoPtr, "not using native image, so wait right away");
+        if (qsriFence) {
+            vk->vkWaitForFences(anbInfo->device, 1, &qsriFence, VK_FALSE, kTimeoutNs);
+        }
+
         VkMappedMemoryRange toInvalidate = {
             VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, 0,
             anbInfo->stagingMemory,
@@ -701,6 +801,14 @@
                 colorBufferHandle,
                 anbInfo->mappedStagingPtr,
                 bpp * anbInfo->extent.width * anbInfo->extent.height);
+
+        AutoLock lock(anbInfo->qsriWaitInfo.lock);
+        ++anbInfo->qsriWaitInfo.presentCount;
+        VK_ANB_DEBUG_OBJ(anbInfoPtr, "done, present count is now %llu", (unsigned long long)anbInfo->qsriWaitInfo.presentCount);
+        anbInfo->qsriWaitInfo.cv.signal();
+        if (qsriFence) {
+            anbInfo->qsriWaitInfo.returnFenceLocked(qsriFence);
+        }
     }
 
     return VK_SUCCESS;
diff --git a/stream-servers/vulkan/VkAndroidNativeBuffer.h b/stream-servers/vulkan/VkAndroidNativeBuffer.h
index 4feed9c..72e84b6 100644
--- a/stream-servers/vulkan/VkAndroidNativeBuffer.h
+++ b/stream-servers/vulkan/VkAndroidNativeBuffer.h
@@ -17,12 +17,17 @@
 
 #include <vulkan/vulkan.h>
 
+#include "base/Lock.h"
+#include "base/ConditionVariable.h"
 #include "cereal/common/goldfish_vk_private_defs.h"
 
+#include <deque>
+#include <memory>
 #include <vector>
 
 namespace goldfish_vk {
 
+struct AndroidNativeBufferInfo;
 struct VulkanDispatch;
 
 // This class provides methods to create and query information about Android
@@ -32,7 +37,18 @@
 // This is to be refactored to move to external memory only once we get that
 // working.
 
+void teardownAndroidNativeBufferImage(
+    VulkanDispatch* vk,
+    AndroidNativeBufferInfo* anbInfo);
+
 struct AndroidNativeBufferInfo {
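+    // Instances are heap-allocated and held by shared_ptr (see
+    // VkDecoderGlobalState::ImageInfo), so the Vulkan objects are torn down
+    // once the last reference goes away.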
+    ~AndroidNativeBufferInfo() {
+        if (vk) {
+            teardownAndroidNativeBufferImage(vk, this);
+        }
+    }
+
+    VulkanDispatch* vk = nullptr;
     VkDevice device = VK_NULL_HANDLE;
     VkFormat vkFormat;
     VkExtent3D extent;
@@ -87,6 +103,7 @@
     // We keep one QueueState for each queue family index used by the guest
     // in vkQueuePresentKHR.
     std::vector<QueueState> queueStates;
+
     // Did we ever sync the Vulkan image with a ColorBuffer?
     // If so, set everSynced along with the queue family index
     // used to do that.
@@ -101,6 +118,43 @@
     // Track that here.
     bool everAcquired = false;
     QueueState acquireQueueState;
+
+    // State that is of interest when interacting with sync fds and SyncThread.
+    // Protected by this lock and condition variable.
+    struct QsriWaitInfo {
+        android::base::Lock lock;
+        android::base::ConditionVariable cv;
+
+        VulkanDispatch* vk = nullptr;
+        VkDevice device = VK_NULL_HANDLE;
+
+        // A pool of VkFences for waiting (an optimization so we don't
+        // recreate a fence on every present).
+        std::vector<VkFence> fencePool;
+
+        // How many presents of this image we are waiting on versus how many
+        // times it has actually been presented via
+        // vkQueueSignalReleaseImageANDROID (for sync fd fence waiting).
+        // Incremented by waitQsri.
+        uint64_t requestedPresentCount = 0;
+        // Incremented by syncImageToColorBuffer once the QSRI fence signals:
+        // on the sync thread in the Vulkan-native-image (zero-copy) case, or
+        // inline in the copy-to-ColorBuffer case.
+        uint64_t presentCount = 0;
+
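+        // Records the dispatch table and device used for QSRI waits so
+        // pooled fences can be created here and destroyed in the destructor.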
+        void ensureDispatchAndDevice(VulkanDispatch* vkIn, VkDevice deviceIn) {
+            vk = vkIn;
+            device = deviceIn;
+        }
+
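+        // Returns a reset fence for the next QSRI submission, reusing one
+        // from the pool when possible. Caller must hold |lock|.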
+        VkFence getFenceFromPoolLocked();
+
+        // requires fence to be signaled
+        void returnFenceLocked(VkFence fence) {
+            fencePool.push_back(fence);
+        }
+
+        ~QsriWaitInfo();
+    };
+
+    QsriWaitInfo qsriWaitInfo;
 };
 
 VkResult prepareAndroidNativeBufferImage(
@@ -112,10 +166,6 @@
     const VkPhysicalDeviceMemoryProperties* memProps,
     AndroidNativeBufferInfo* out);
 
-void teardownAndroidNativeBufferImage(
-    VulkanDispatch* vk,
-    AndroidNativeBufferInfo* anbInfo);
-
 void getGralloc0Usage(VkFormat format, VkImageUsageFlags imageUsage,
                       int* usage_out);
 void getGralloc1Usage(VkFormat format, VkImageUsageFlags imageUsage,
@@ -139,6 +189,6 @@
     uint32_t waitSemaphoreCount,
     const VkSemaphore* pWaitSemaphores,
     int* pNativeFenceFd,
-    AndroidNativeBufferInfo* anbInfo);
+    std::shared_ptr<AndroidNativeBufferInfo> anbInfo);
 
 } // namespace goldfish_vk
diff --git a/stream-servers/vulkan/VkDecoderGlobalState.cpp b/stream-servers/vulkan/VkDecoderGlobalState.cpp
index 835aa87..31f840c 100644
--- a/stream-servers/vulkan/VkDecoderGlobalState.cpp
+++ b/stream-servers/vulkan/VkDecoderGlobalState.cpp
@@ -1559,7 +1559,7 @@
         }
         cmpInfo.device = device;
 
-        AndroidNativeBufferInfo anbInfo;
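+        // Heap-allocate; the info is owned by a shared_ptr in ImageInfo and
+        // may be kept alive by waitQsri or the sync thread past vkDestroyImage.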
+        AndroidNativeBufferInfo* anbInfo = new AndroidNativeBufferInfo;
         const VkNativeBufferANDROID* nativeBufferANDROID =
             vk_find_struct<VkNativeBufferANDROID>(pCreateInfo);
 
@@ -1572,9 +1572,9 @@
             createRes =
                 prepareAndroidNativeBufferImage(
                         vk, device, pCreateInfo, nativeBufferANDROID, pAllocator,
-                        memProps, &anbInfo);
+                        memProps, anbInfo);
             if (createRes == VK_SUCCESS) {
-                *pImage = anbInfo.image;
+                *pImage = anbInfo->image;
             }
         } else {
             createRes =
@@ -1589,7 +1589,9 @@
         }
 
         auto& imageInfo = mImageInfo[*pImage];
-        imageInfo.anbInfo = anbInfo;
+
+        if (anbInfo) imageInfo.anbInfo.reset(anbInfo);
+
         imageInfo.cmpInfo = cmpInfo;
 
         *pImage = new_boxed_non_dispatchable_VkImage(*pImage);
@@ -1613,9 +1615,7 @@
 
         auto info = it->second;
 
-        if (info.anbInfo.image) {
-            teardownAndroidNativeBufferImage(vk, &info.anbInfo);
-        } else {
+        if (!info.anbInfo) {
             if (info.cmpInfo.isCompressed) {
                 CompressedImageInfo& cmpInfo = info.cmpInfo;
                 if (image != cmpInfo.decompImg) {
@@ -1744,9 +1744,10 @@
                 pCreateInfo = &createInfo;
             }
         }
-        if (imageInfoIt->second.anbInfo.externallyBacked) {
+        if (imageInfoIt->second.anbInfo &&
+            imageInfoIt->second.anbInfo->externallyBacked) {
             createInfo = *pCreateInfo;
-            createInfo.format = imageInfoIt->second.anbInfo.vkFormat;
+            createInfo.format = imageInfoIt->second.anbInfo->vkFormat;
             pCreateInfo = &createInfo;
         }
 
@@ -3470,7 +3471,7 @@
             return VK_ERROR_INITIALIZATION_FAILED;
         }
 
-        AndroidNativeBufferInfo* anbInfo = &imageInfo->anbInfo;
+        AndroidNativeBufferInfo* anbInfo = imageInfo->anbInfo.get();
 
         return
             setAndroidNativeImageSemaphoreSignaled(
@@ -3499,7 +3500,7 @@
         }
 
         auto imageInfo = android::base::find(mImageInfo, image);
-        AndroidNativeBufferInfo* anbInfo = &imageInfo->anbInfo;
+        auto anbInfo = imageInfo->anbInfo;
 
         return
             syncImageToColorBuffer(
@@ -4712,6 +4713,59 @@
         return vk->vkGetFenceStatus(device, fence);
     }
 
+    VkResult waitQsri(VkImage boxed_image, uint64_t timeout) {
+        (void)timeout; // TODO
+
+        AutoLock lock(mLock);
+
+        VkImage image = unbox_VkImage(boxed_image);
+
+        if (mLogging) {
+            fprintf(stderr, "%s: for boxed image 0x%llx image %p\n",
+                             __func__, (unsigned long long)boxed_image, image);
+        }
+
+        if (image == VK_NULL_HANDLE || mImageInfo.find(image) == mImageInfo.end()) {
+            // No image
+            return VK_SUCCESS;
+        }
+
+        auto anbInfo = mImageInfo[image].anbInfo; // shared ptr, take ref
+        lock.unlock();
+
+        if (!anbInfo) {
+            fprintf(stderr, "%s: warning: image %p doesn't have anb info\n", __func__, image);
+            return VK_SUCCESS;
+        }
+        if (!anbInfo->vk) {
+            fprintf(stderr, "%s:%p warning: image %p anb info not initialized\n", __func__, anbInfo.get(), image);
+            return VK_SUCCESS;
+        }
+        // The recorded image could be null or mismatched; bail out if so.
+        if (image != anbInfo->image) {
+            fprintf(stderr, "%s:%p warning: image %p anb info has wrong image: %p\n", __func__, anbInfo.get(), image, anbInfo->image);
+            return VK_SUCCESS;
+        }
+
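+        // Each waitQsri call requests one more present; block until
+        // presentCount (bumped by syncImageToColorBuffer) catches up.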
+        AutoLock qsriLock(anbInfo->qsriWaitInfo.lock);
+        ++anbInfo->qsriWaitInfo.requestedPresentCount;
+        uint64_t targetPresentCount = anbInfo->qsriWaitInfo.requestedPresentCount;
+
+        if (mLogging) {
+            fprintf(stderr, "%s:%p New target present count %llu\n",
+                    __func__, anbInfo.get(), (unsigned long long)targetPresentCount);
+        }
+
+        anbInfo->qsriWaitInfo.cv.wait(&anbInfo->qsriWaitInfo.lock, [anbInfo, targetPresentCount] {
+            return targetPresentCount <= anbInfo->qsriWaitInfo.presentCount;
+        });
+
+        if (mLogging) {
+            fprintf(stderr, "%s:%p Done waiting\n", __func__, anbInfo.get());
+        }
+        return VK_SUCCESS;
+    }
+
 #define GUEST_EXTERNAL_MEMORY_HANDLE_TYPES                                \
     (VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID | \
      VK_EXTERNAL_MEMORY_HANDLE_TYPE_TEMP_ZIRCON_VMO_BIT_FUCHSIA |         \
@@ -6430,7 +6484,7 @@
     };
 
     struct ImageInfo {
-        AndroidNativeBufferInfo anbInfo;
+        std::shared_ptr<AndroidNativeBufferInfo> anbInfo;
         CompressedImageInfo cmpInfo;
     };
 
@@ -7716,6 +7770,10 @@
     return mImpl->getFenceStatus(boxed_fence);
 }
 
+VkResult VkDecoderGlobalState::waitQsri(VkImage image, uint64_t timeout) {
+    return mImpl->waitQsri(image, timeout);
+}
+
 void VkDecoderGlobalState::deviceMemoryTransform_tohost(
     VkDeviceMemory* memory, uint32_t memoryCount,
     VkDeviceSize* offset, uint32_t offsetCount,
diff --git a/stream-servers/vulkan/VkDecoderGlobalState.h b/stream-servers/vulkan/VkDecoderGlobalState.h
index f2864e5..6cbf223 100644
--- a/stream-servers/vulkan/VkDecoderGlobalState.h
+++ b/stream-servers/vulkan/VkDecoderGlobalState.h
@@ -780,6 +780,13 @@
 
     VkResult getFenceStatus(VkFence boxed_fence);
 
+    // Wait for present (vkQueueSignalReleaseImageANDROID). Each call
+    // increments a "target present count" for the image and then blocks until
+    // the image has been vkQSRI'ed at least that many times.
+    VkResult waitQsri(VkImage boxed_image, uint64_t timeout);
+
     // Transformations
     void deviceMemoryTransform_tohost(
         VkDeviceMemory* memory, uint32_t memoryCount,