gfxstream renderer: always use VirtioGpuTimelines to manage fences
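
Previously, fences were only tracked with VirtioGpuTimelines when the
GFXSTREAM_RENDERER_FLAGS_ASYNC_FENCE_CB flag was set; without it, pending
fence ids were kept in a separate mFenceDeque and written back from poll().
Route both modes through VirtioGpuTimelines instead: the timelines object
is created with a withAsyncCallback flag, and when the async callback is
disabled, completed fences are signaled from the new
VirtioGpuTimelines::poll() entry point.

Also key timelines by a VirtioGpuRing variant (global, or context specific
with a ring index) instead of a bare context id, so that fences can be
ordered per ring, and update the unit tests accordingly.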

Change-Id: I9e37d8ee4adc0ee74ec6a9a7d48512fbe6f5cc38
diff --git a/stream-servers/VirtioGpuTimelines.cpp b/stream-servers/VirtioGpuTimelines.cpp
index 2953703..d71642b 100644
--- a/stream-servers/VirtioGpuTimelines.cpp
+++ b/stream-servers/VirtioGpuTimelines.cpp
@@ -19,34 +19,41 @@
 #include "host-common/GfxstreamFatalError.h"
 
 using TaskId = VirtioGpuTimelines::TaskId;
-using CtxId = VirtioGpuTimelines::CtxId;
+using Ring = VirtioGpuTimelines::Ring;
 using FenceId = VirtioGpuTimelines::FenceId;
 using AutoLock = android::base::AutoLock;
 using emugl::ABORT_REASON_OTHER;
 using emugl::FatalError;
 
-VirtioGpuTimelines::VirtioGpuTimelines() : mNextId(0) {}
+std::unique_ptr<VirtioGpuTimelines> VirtioGpuTimelines::create(bool withAsyncCallback) {
+    return std::unique_ptr<VirtioGpuTimelines>(new VirtioGpuTimelines(withAsyncCallback));
+}
 
-TaskId VirtioGpuTimelines::enqueueTask(CtxId ctxId) {
+VirtioGpuTimelines::VirtioGpuTimelines(bool withAsyncCallback)
+    : mNextId(0), mWithAsyncCallback(withAsyncCallback) {}
+
+TaskId VirtioGpuTimelines::enqueueTask(const Ring& ring) {
     AutoLock lock(mLock);
 
     TaskId id = mNextId++;
-    std::shared_ptr<Task> task(new Task(id, ctxId), [this](Task *task) {
+    std::shared_ptr<Task> task(new Task(id, ring), [this](Task* task) {
         mTaskIdToTask.erase(task->mId);
         delete task;
     });
     mTaskIdToTask[id] = task;
-    mTimelineQueues[ctxId].emplace_back(std::move(task));
+    mTimelineQueues[ring].emplace_back(std::move(task));
     return id;
 }
 
-void VirtioGpuTimelines::enqueueFence(
-    CtxId ctxId, FenceId, FenceCompletionCallback fenceCompletionCallback) {
+void VirtioGpuTimelines::enqueueFence(const Ring& ring, FenceId,
+                                      FenceCompletionCallback fenceCompletionCallback) {
     AutoLock lock(mLock);
 
     auto fence = std::make_unique<Fence>(fenceCompletionCallback);
-    mTimelineQueues[ctxId].emplace_back(std::move(fence));
-    poll_locked(ctxId);
+    mTimelineQueues[ring].emplace_back(std::move(fence));
+    if (mWithAsyncCallback) {
+        poll_locked(ring);
+    }
 }
 
 void VirtioGpuTimelines::notifyTaskCompletion(TaskId taskId) {
@@ -71,14 +78,26 @@
             << "Task(id = " << static_cast<uint64_t>(taskId) << ") has been set to completed.";
     }
     task->mHasCompleted = true;
-    poll_locked(task->mCtxId);
+    if (mWithAsyncCallback) {
+        poll_locked(task->mRing);
+    }
 }
 
-void VirtioGpuTimelines::poll_locked(CtxId ctxId) {
-    auto iTimelineQueue = mTimelineQueues.find(ctxId);
+void VirtioGpuTimelines::poll() {
+    if (mWithAsyncCallback) {
+        GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER))
+            << "Can't call poll with async callback enabled.";
+    }
+    AutoLock lock(mLock);
+    for (const auto& [ring, timeline] : mTimelineQueues) {
+        poll_locked(ring);
+    }
+}
+
+void VirtioGpuTimelines::poll_locked(const Ring& ring) {
+    auto iTimelineQueue = mTimelineQueues.find(ring);
     if (iTimelineQueue == mTimelineQueues.end()) {
         GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER))
-            << "Context(id = " << ctxId << " doesn't exist";
+            << "Ring(" << to_string(ring) << ") doesn't exist.";
     }
     std::list<TimelineItem> &timelineQueue = iTimelineQueue->second;
     auto i = timelineQueue.begin();
diff --git a/stream-servers/VirtioGpuTimelines.h b/stream-servers/VirtioGpuTimelines.h
index 7d08473..bd166a3 100644
--- a/stream-servers/VirtioGpuTimelines.h
+++ b/stream-servers/VirtioGpuTimelines.h
@@ -15,8 +15,11 @@
 #define VIRTIO_GPU_TIMELINES_H
 
 #include <atomic>
+#include <functional>
 #include <list>
 #include <memory>
+#include <sstream>
+#include <string>
 #include <unordered_map>
 #include <variant>
 
@@ -24,18 +27,64 @@
 #include "virtio-gpu-gfxstream-renderer.h"
 #include "virtio_gpu_ops.h"
 
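+// A fence can be attached either to the global timeline or to a timeline
+// specific to a (context id, ring index) pair.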
+struct VirtioGpuRingGlobal {};
+struct VirtioGpuRingContextSpecific {
+    VirtioGpuCtxId mCtxId;
+    VirtioGpuRingIdx mRingIdx;
+};
+using VirtioGpuRing = std::variant<VirtioGpuRingGlobal, VirtioGpuRingContextSpecific>;
+
+template <>
+struct std::hash<VirtioGpuRingGlobal> {
+    std::size_t operator()(VirtioGpuRingGlobal const&) const noexcept { return 0; }
+};
+
+inline bool operator==(const VirtioGpuRingGlobal&, const VirtioGpuRingGlobal&) { return true; }
+
+template <>
+struct std::hash<VirtioGpuRingContextSpecific> {
+    std::size_t operator()(VirtioGpuRingContextSpecific const& ringContextSpecific) const noexcept {
+        std::size_t ctxHash = std::hash<VirtioGpuCtxId>{}(ringContextSpecific.mCtxId);
+        std::size_t ringHash = std::hash<VirtioGpuRingIdx>{}(ringContextSpecific.mRingIdx);
+        // Use the hash_combine from
+        // https://www.boost.org/doc/libs/1_78_0/boost/container_hash/hash.hpp.
+        std::size_t res = ctxHash;
+        res ^= ringHash + 0x9e3779b9 + (res << 6) + (res >> 2);
+        return res;
+    }
+};
+
+inline bool operator==(const VirtioGpuRingContextSpecific& lhs,
+                       const VirtioGpuRingContextSpecific& rhs) {
+    return lhs.mCtxId == rhs.mCtxId && lhs.mRingIdx == rhs.mRingIdx;
+}
+
+inline std::string to_string(const VirtioGpuRing& ring) {
+    struct {
+        std::string operator()(const VirtioGpuRingGlobal&) { return "global"; }
+        std::string operator()(const VirtioGpuRingContextSpecific& ring) {
+            std::stringstream ss;
+            ss << "context specific {ctx = " << ring.mCtxId << ", ring = " << ring.mRingIdx << "}";
+            return ss.str();
+        }
+    } visitor;
+    return std::visit(visitor, ring);
+}
+
 class VirtioGpuTimelines {
    public:
     using FenceId = uint64_t;
-    using CtxId = VirtioGpuCtxId;
+    using Ring = VirtioGpuRing;
     using TaskId = uint64_t;
-    VirtioGpuTimelines();
 
-    TaskId enqueueTask(CtxId);
-    void enqueueFence(CtxId, FenceId, FenceCompletionCallback);
+    TaskId enqueueTask(const Ring&);
+    void enqueueFence(const Ring&, FenceId, FenceCompletionCallback);
     void notifyTaskCompletion(TaskId);
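+    // Signal completed fences on every timeline. Only allowed when the object was
+    // created with withAsyncCallback == false; with the async callback enabled,
+    // fences are signaled as soon as their preceding tasks complete.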
+    void poll();
+    static std::unique_ptr<VirtioGpuTimelines> create(bool withAsyncCallback);
 
    private:
+    VirtioGpuTimelines(bool withAsyncCallback);
     struct Fence {
         std::unique_ptr<FenceCompletionCallback> mCompletionCallback;
         Fence(FenceCompletionCallback completionCallback)
@@ -44,10 +93,9 @@
     };
     struct Task {
         TaskId mId;
-        CtxId mCtxId;
+        Ring mRing;
         std::atomic_bool mHasCompleted;
-        Task(TaskId id, CtxId ctxId)
-            : mId(id), mCtxId(ctxId), mHasCompleted(false) {}
+        Task(TaskId id, const Ring& ring) : mId(id), mRing(ring), mHasCompleted(false) {}
     };
     using TimelineItem =
         std::variant<std::unique_ptr<Fence>, std::shared_ptr<Task>>;
@@ -57,10 +105,11 @@
     // mTimelineQueues, is destroyed, because the deleter of Task will
     // automatically remove the entry in mTaskIdToTask.
     std::unordered_map<TaskId, std::weak_ptr<Task>> mTaskIdToTask;
-    std::unordered_map<CtxId, std::list<TimelineItem>> mTimelineQueues;
+    std::unordered_map<Ring, std::list<TimelineItem>> mTimelineQueues;
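+    // When true, fences are signaled as soon as their preceding tasks complete;
+    // when false, completed fences are only signaled from poll().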
+    const bool mWithAsyncCallback;
     // Go over the timeline, signal any fences without pending tasks, and remove
     // timeline items that are no longer needed.
-    void poll_locked(CtxId);
+    void poll_locked(const Ring&);
 };
 
 #endif  // VIRTIO_GPU_TIMELINES_H
diff --git a/stream-servers/tests/VirtioGpuTimelines_unittest.cpp b/stream-servers/tests/VirtioGpuTimelines_unittest.cpp
index a5aa148..f0abc28 100644
--- a/stream-servers/tests/VirtioGpuTimelines_unittest.cpp
+++ b/stream-servers/tests/VirtioGpuTimelines_unittest.cpp
@@ -18,29 +18,68 @@
 
 #include <memory>
 
-class VirtioGpuTimelinesTest : public ::testing::Test {
-   protected:
-    std::unique_ptr<VirtioGpuTimelines> mVirtioGpuTimelines;
-    void SetUp() override {
-        mVirtioGpuTimelines = std::make_unique<VirtioGpuTimelines>();
-    }
-};
+using RingGlobal = VirtioGpuRingGlobal;
+using RingContextSpecific = VirtioGpuRingContextSpecific;
 
-TEST_F(VirtioGpuTimelinesTest, Init) {}
+TEST(VirtioGpuTimelinesTest, Init) {
+    std::unique_ptr<VirtioGpuTimelines> virtioGpuTimelines = VirtioGpuTimelines::create(true);
+    virtioGpuTimelines = VirtioGpuTimelines::create(false);
+}
 
-TEST_F(VirtioGpuTimelinesTest, TasksShouldHaveDifferentIds) {
-    auto taskId1 = mVirtioGpuTimelines->enqueueTask(0);
-    auto taskId2 = mVirtioGpuTimelines->enqueueTask(0);
+TEST(VirtioGpuTimelinesTest, TasksShouldHaveDifferentIds) {
+    std::unique_ptr<VirtioGpuTimelines> virtioGpuTimelines = VirtioGpuTimelines::create(true);
+    auto taskId1 = virtioGpuTimelines->enqueueTask(RingGlobal{});
+    auto taskId2 = virtioGpuTimelines->enqueueTask(RingGlobal{});
     ASSERT_NE(taskId1, taskId2);
 }
 
-TEST_F(VirtioGpuTimelinesTest, MultipleTasksAndFences) {
+TEST(VirtioGpuTimelinesTest, CantPollWithAsyncCallbackEnabled) {
+    EXPECT_DEATH(
+        {
+            std::unique_ptr<VirtioGpuTimelines> virtioGpuTimelines =
+                VirtioGpuTimelines::create(true);
+            virtioGpuTimelines->poll();
+        },
+        ".*");
+}
+
+TEST(VirtioGpuTimelinesTest, MultipleTasksAndFencesWithSyncCallback) {
+    std::unique_ptr<VirtioGpuTimelines> virtioGpuTimelines = VirtioGpuTimelines::create(false);
+    using namespace testing;
+    MockFunction<void()> check;
+    MockFunction<void()> fence1Callback;
+    MockFunction<void()> fence2Callback;
+    MockFunction<void()> fence3Callback;
+    VirtioGpuTimelines::FenceId fenceId = 0;
+    {
+        InSequence s;
+
+        EXPECT_CALL(check, Call());
+        EXPECT_CALL(fence1Callback, Call());
+        EXPECT_CALL(fence2Callback, Call());
+        EXPECT_CALL(fence3Callback, Call());
+    }
+
+    auto task1Id = virtioGpuTimelines->enqueueTask(RingGlobal{});
+    virtioGpuTimelines->enqueueFence(RingGlobal{}, fenceId++, fence1Callback.AsStdFunction());
+    auto task2Id = virtioGpuTimelines->enqueueTask(RingGlobal{});
+    virtioGpuTimelines->enqueueFence(RingGlobal{}, fenceId++, fence2Callback.AsStdFunction());
+    virtioGpuTimelines->notifyTaskCompletion(task1Id);
+    auto task3Id = virtioGpuTimelines->enqueueTask(RingGlobal{});
+    virtioGpuTimelines->enqueueFence(RingGlobal{}, fenceId++, fence3Callback.AsStdFunction());
+    virtioGpuTimelines->notifyTaskCompletion(task2Id);
+    virtioGpuTimelines->notifyTaskCompletion(task3Id);
+    check.Call();
+    virtioGpuTimelines->poll();
+}
+
+TEST(VirtioGpuTimelinesTest, MultipleTasksAndFencesWithAsyncCallback) {
+    std::unique_ptr<VirtioGpuTimelines> virtioGpuTimelines = VirtioGpuTimelines::create(true);
     using namespace testing;
     MockFunction<void(int)> check;
     MockFunction<void()> fence1Callback;
     MockFunction<void()> fence2Callback;
     MockFunction<void()> fence3Callback;
-    VirtioGpuTimelines::CtxId ctxId = 0;
     VirtioGpuTimelines::FenceId fenceId = 0;
     {
         InSequence s;
@@ -54,29 +93,26 @@
         EXPECT_CALL(fence3Callback, Call());
     }
 
-    auto task1Id = mVirtioGpuTimelines->enqueueTask(ctxId);
-    mVirtioGpuTimelines->enqueueFence(ctxId, fenceId++,
-                                      fence1Callback.AsStdFunction());
-    auto task2Id = mVirtioGpuTimelines->enqueueTask(ctxId);
-    mVirtioGpuTimelines->enqueueFence(ctxId, fenceId++,
-                                      fence2Callback.AsStdFunction());
+    auto task1Id = virtioGpuTimelines->enqueueTask(RingGlobal{});
+    virtioGpuTimelines->enqueueFence(RingGlobal{}, fenceId++, fence1Callback.AsStdFunction());
+    auto task2Id = virtioGpuTimelines->enqueueTask(RingGlobal{});
+    virtioGpuTimelines->enqueueFence(RingGlobal{}, fenceId++, fence2Callback.AsStdFunction());
     check.Call(1);
-    mVirtioGpuTimelines->notifyTaskCompletion(task1Id);
+    virtioGpuTimelines->notifyTaskCompletion(task1Id);
     check.Call(2);
-    auto task3Id = mVirtioGpuTimelines->enqueueTask(ctxId);
-    mVirtioGpuTimelines->enqueueFence(ctxId, fenceId++,
-                                      fence3Callback.AsStdFunction());
+    auto task3Id = virtioGpuTimelines->enqueueTask(RingGlobal{});
+    virtioGpuTimelines->enqueueFence(RingGlobal{}, fenceId++, fence3Callback.AsStdFunction());
     check.Call(3);
-    mVirtioGpuTimelines->notifyTaskCompletion(task2Id);
+    virtioGpuTimelines->notifyTaskCompletion(task2Id);
     check.Call(4);
-    mVirtioGpuTimelines->notifyTaskCompletion(task3Id);
+    virtioGpuTimelines->notifyTaskCompletion(task3Id);
 }
 
-TEST_F(VirtioGpuTimelinesTest, FencesWithoutPendingTasks) {
+TEST(VirtioGpuTimelinesTest, FencesWithoutPendingTasksWithAsyncCallback) {
+    std::unique_ptr<VirtioGpuTimelines> virtioGpuTimelines = VirtioGpuTimelines::create(true);
     using namespace testing;
     MockFunction<void()> fenceCallback1;
     MockFunction<void()> fenceCallback2;
-    VirtioGpuTimelines::CtxId ctxId = 0;
     VirtioGpuTimelines::FenceId fenceId = 0;
     {
         InSequence s;
@@ -84,18 +120,16 @@
         EXPECT_CALL(fenceCallback2, Call());
     }
 
-    mVirtioGpuTimelines->enqueueFence(ctxId, fenceId++,
-                                      fenceCallback1.AsStdFunction());
-    mVirtioGpuTimelines->enqueueFence(ctxId, fenceId++,
-                                      fenceCallback2.AsStdFunction());
+    virtioGpuTimelines->enqueueFence(RingGlobal{}, fenceId++, fenceCallback1.AsStdFunction());
+    virtioGpuTimelines->enqueueFence(RingGlobal{}, fenceId++, fenceCallback2.AsStdFunction());
 }
 
-TEST_F(VirtioGpuTimelinesTest, FencesSharingSamePendingTasks) {
+TEST(VirtioGpuTimelinesTest, FencesSharingSamePendingTasksWithAsyncCallback) {
+    std::unique_ptr<VirtioGpuTimelines> virtioGpuTimelines = VirtioGpuTimelines::create(true);
     using namespace testing;
     MockFunction<void()> fenceCallback1;
     MockFunction<void()> fenceCallback2;
     MockFunction<void(int)> check;
-    VirtioGpuTimelines::CtxId ctxId = 0;
     VirtioGpuTimelines::FenceId fenceId = 0;
     {
         InSequence s;
@@ -104,16 +138,15 @@
         EXPECT_CALL(fenceCallback2, Call());
     }
 
-    auto taskId = mVirtioGpuTimelines->enqueueTask(ctxId);
-    mVirtioGpuTimelines->enqueueFence(ctxId, fenceId++,
-                                      fenceCallback1.AsStdFunction());
-    mVirtioGpuTimelines->enqueueFence(ctxId, fenceId++,
-                                      fenceCallback2.AsStdFunction());
+    auto taskId = virtioGpuTimelines->enqueueTask(RingGlobal{});
+    virtioGpuTimelines->enqueueFence(RingGlobal{}, fenceId++, fenceCallback1.AsStdFunction());
+    virtioGpuTimelines->enqueueFence(RingGlobal{}, fenceId++, fenceCallback2.AsStdFunction());
     check.Call(1);
-    mVirtioGpuTimelines->notifyTaskCompletion(taskId);
+    virtioGpuTimelines->notifyTaskCompletion(taskId);
 }
 
-TEST_F(VirtioGpuTimelinesTest, TasksAndFencesOnMultipleContexts) {
+TEST(VirtioGpuTimelinesTest, TasksAndFencesOnMultipleContextsWithAsyncCallback) {
+    std::unique_ptr<VirtioGpuTimelines> virtioGpuTimelines = VirtioGpuTimelines::create(true);
     using namespace testing;
     MockFunction<void()> fence1Callback;
     MockFunction<void()> fence2Callback;
@@ -129,14 +162,80 @@
         EXPECT_CALL(check, Call(3));
         EXPECT_CALL(fence3Callback, Call());
     }
-    auto taskId2 = mVirtioGpuTimelines->enqueueTask(2);
-    auto taskId3 = mVirtioGpuTimelines->enqueueTask(3);
+    auto taskId2 = virtioGpuTimelines->enqueueTask(RingContextSpecific{
+        .mCtxId = 2,
+        .mRingIdx = 0,
+    });
+    auto taskId3 = virtioGpuTimelines->enqueueTask(RingContextSpecific{
+        .mCtxId = 3,
+        .mRingIdx = 0,
+    });
     check.Call(1);
-    mVirtioGpuTimelines->enqueueFence(1, 1, fence1Callback.AsStdFunction());
+    virtioGpuTimelines->enqueueFence(RingGlobal{}, 1, fence1Callback.AsStdFunction());
     check.Call(2);
-    mVirtioGpuTimelines->enqueueFence(2, 2, fence2Callback.AsStdFunction());
-    mVirtioGpuTimelines->enqueueFence(3, 3, fence3Callback.AsStdFunction());
-    mVirtioGpuTimelines->notifyTaskCompletion(taskId2);
+    virtioGpuTimelines->enqueueFence(
+        RingContextSpecific{
+            .mCtxId = 2,
+            .mRingIdx = 0,
+        },
+        2, fence2Callback.AsStdFunction());
+    virtioGpuTimelines->enqueueFence(
+        RingContextSpecific{
+            .mCtxId = 3,
+            .mRingIdx = 0,
+        },
+        3, fence3Callback.AsStdFunction());
+    virtioGpuTimelines->notifyTaskCompletion(taskId2);
     check.Call(3);
-    mVirtioGpuTimelines->notifyTaskCompletion(taskId3);
+    virtioGpuTimelines->notifyTaskCompletion(taskId3);
+}
+
+TEST(VirtioGpuTimelinesTest, TasksAndFencesOnMultipleRingsWithAsyncCallback) {
+    std::unique_ptr<VirtioGpuTimelines> virtioGpuTimelines = VirtioGpuTimelines::create(true);
+    using namespace testing;
+    MockFunction<void()> fence1Callback;
+    MockFunction<void()> fence2Callback;
+    MockFunction<void()> fence3Callback;
+    MockFunction<void(int)> check;
+    {
+        InSequence s;
+
+        EXPECT_CALL(check, Call(1));
+        EXPECT_CALL(fence1Callback, Call());
+        EXPECT_CALL(check, Call(2));
+        EXPECT_CALL(fence2Callback, Call());
+        EXPECT_CALL(check, Call(3));
+        EXPECT_CALL(fence3Callback, Call());
+    }
+    auto taskId2 = virtioGpuTimelines->enqueueTask(RingContextSpecific{
+        .mCtxId = 1,
+        .mRingIdx = 2,
+    });
+    auto taskId3 = virtioGpuTimelines->enqueueTask(RingContextSpecific{
+        .mCtxId = 1,
+        .mRingIdx = 3,
+    });
+    check.Call(1);
+    virtioGpuTimelines->enqueueFence(
+        RingContextSpecific{
+            .mCtxId = 1,
+            .mRingIdx = 1,
+        },
+        1, fence1Callback.AsStdFunction());
+    check.Call(2);
+    virtioGpuTimelines->enqueueFence(
+        RingContextSpecific{
+            .mCtxId = 1,
+            .mRingIdx = 2,
+        },
+        2, fence2Callback.AsStdFunction());
+    virtioGpuTimelines->enqueueFence(
+        RingContextSpecific{
+            .mCtxId = 1,
+            .mRingIdx = 3,
+        },
+        3, fence3Callback.AsStdFunction());
+    virtioGpuTimelines->notifyTaskCompletion(taskId2);
+    check.Call(3);
+    virtioGpuTimelines->notifyTaskCompletion(taskId3);
 }
diff --git a/stream-servers/virtio-gpu-gfxstream-renderer.cpp b/stream-servers/virtio-gpu-gfxstream-renderer.cpp
index 489c38f..56fdac7 100644
--- a/stream-servers/virtio-gpu-gfxstream-renderer.cpp
+++ b/stream-servers/virtio-gpu-gfxstream-renderer.cpp
@@ -21,13 +21,13 @@
 #include "base/Lock.h"
 #include "base/Tracing.h"
 #include "host-common/AddressSpaceService.h"
+#include "host-common/GfxstreamFatalError.h"
 #include "host-common/HostmemIdMapping.h"
 #include "host-common/address_space_device.h"
 #include "host-common/android_pipe_common.h"
-#include "host-common/GfxstreamFatalError.h"
+#include "host-common/linux_types.h"
 #include "host-common/opengles.h"
 #include "host-common/vm_operations.h"
-#include "host-common/linux_types.h"
 
 extern "C" {
 #include "virtio-gpu-gfxstream-renderer.h"
@@ -120,16 +120,6 @@
 // Pipe read() operation corresponds to performing a TRANSFER_FROM_HOST ioctl on
 // the resource created alongside open().
 //
-// A note on synchronization----------------------------------------------------
-//
-// Unlike goldfish-pipe which handles write/read/open/close on the vcpu thread
-// that triggered the particular operation, virtio-gpu handles the
-// corresponding virgl operations in a bottom half that is triggered off the
-// vcpu thread on a timer. This means that in the guest, if we want to ensure
-// that a particular operation such as TRANSFER_TO_HOST completed on the host,
-// we need to call VIRTGPU_WAIT, which ends up polling fences here. This is why
-// we insert a fence after every operation in this code.
-//
 // Details on transfer mechanism: mapping 2D transfer to 1D ones----------------
 //
 // Resource objects are typically 2D textures, while we're wanting to transmit
@@ -552,13 +542,8 @@
             GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER))
                 << "Could not get address space device control ops!";
         }
-        if (flags & GFXSTREAM_RENDERER_FLAGS_ASYNC_FENCE_CB) {
-            VGPLOG("Using async fence cb.");
-            mVirtioGpuTimelines = std::make_unique<VirtioGpuTimelines>();
-        } else {
-            VGPLOG("Not using async fence cb.");
-            mVirtioGpuTimelines = nullptr;
-        }
+        mVirtioGpuTimelines =
+            VirtioGpuTimelines::create(flags & GFXSTREAM_RENDERER_FLAGS_ASYNC_FENCE_CB);
         VGPLOG("done");
         return 0;
     }
@@ -812,7 +797,11 @@
     }
 
     int submitCmd(VirtioGpuCtxId ctxId, void* buffer, int dwordCount) {
-        VGPLOG("ctxid: %u buffer: %p dwords: %d", ctxId, buffer, dwordCount);
+        // TODO(kaiyili): embed the ring_idx into the command buffer to make it possible to dispatch
+        // commands on different rings.
+        const VirtioGpuRing ring = VirtioGpuRingGlobal{};
+        VGPLOG("ctx: %" PRIu32 ", ring: %s buffer: %p dwords: %d", ctxId, to_string(ring).c_str(),
+               buffer, dwordCount);
 
         if (!buffer) {
             fprintf(stderr, "%s: error: buffer null\n", __func__);
@@ -841,17 +830,11 @@
                 uint32_t sync_handle_hi = dwords[2];
                 uint64_t sync_handle = convert32to64(sync_handle_lo, sync_handle_hi);
 
-                VGPLOG("wait for gpu ctx id %u", ctxId);
-                if (mVirtioGpuTimelines) {
-                    auto taskId = mVirtioGpuTimelines->enqueueTask(
-                        static_cast<VirtioGpuTimelines::CtxId>(ctxId));
-                    mVirtioGpuOps->async_wait_for_gpu_with_cb(
-                        sync_handle, [this, ctxId, taskId] {
-                            mVirtioGpuTimelines->notifyTaskCompletion(taskId);
-                        });
-                } else {
-                    mVirtioGpuOps->wait_for_gpu(sync_handle);
-                }
+                VGPLOG("wait for gpu ring %s", to_string(ring));
+                auto taskId = mVirtioGpuTimelines->enqueueTask(ring);
+                mVirtioGpuOps->async_wait_for_gpu_with_cb(sync_handle, [this, taskId] {
+                    mVirtioGpuTimelines->notifyTaskCompletion(taskId);
+                });
                 break;
             }
             case kVirtioGpuNativeSyncVulkanCreateExportFd:
@@ -864,34 +847,23 @@
                 uint32_t fence_handle_hi = dwords[4];
                 uint64_t fence_handle = convert32to64(fence_handle_lo, fence_handle_hi);
 
-                VGPLOG("wait for gpu vk ctx id %u", ctxId);
-                if (mVirtioGpuTimelines) {
-                    auto taskId = mVirtioGpuTimelines->enqueueTask(
-                        static_cast<VirtioGpuTimelines::CtxId>(ctxId));
-                    mVirtioGpuOps->async_wait_for_gpu_vulkan_with_cb(
-                        device_handle, fence_handle, [this, ctxId, taskId] {
-                            mVirtioGpuTimelines->notifyTaskCompletion(taskId);
-                        });
-                } else {
-                    mVirtioGpuOps->wait_for_gpu_vulkan(device_handle, fence_handle);
-                }
+                VGPLOG("wait for gpu ring %s", to_string(ring));
+                auto taskId = mVirtioGpuTimelines->enqueueTask(ring);
+                mVirtioGpuOps->async_wait_for_gpu_vulkan_with_cb(
+                    device_handle, fence_handle,
+                    [this, taskId] { mVirtioGpuTimelines->notifyTaskCompletion(taskId); });
                 break;
             }
             case kVirtioGpuNativeSyncVulkanQsriExport: {
                 uint64_t image_handle_lo = dwords[1];
                 uint64_t image_handle_hi = dwords[2];
                 uint64_t image_handle = convert32to64(image_handle_lo, image_handle_hi);
-                VGPLOG("wait for gpu vk qsri id %u image 0x%llx", ctxId, (unsigned long long)image_handle);
-                if (mVirtioGpuTimelines) {
-                    auto taskId = mVirtioGpuTimelines->enqueueTask(
-                        static_cast<VirtioGpuTimelines::CtxId>(ctxId));
-                    mVirtioGpuOps->async_wait_for_gpu_vulkan_qsri_with_cb(
-                        image_handle, [this, ctxId, taskId] {
-                            mVirtioGpuTimelines->notifyTaskCompletion(taskId);
-                        });
-                } else {
-                    mVirtioGpuOps->wait_for_gpu_vulkan_qsri(image_handle);
-                }
+                VGPLOG("wait for gpu vk qsri ring %u image 0x%llx", to_string(ring).c_str(),
+                       (unsigned long long)image_handle);
+                auto taskId = mVirtioGpuTimelines->enqueueTask(ring);
+                mVirtioGpuOps->async_wait_for_gpu_vulkan_qsri_with_cb(image_handle, [this, taskId] {
+                    mVirtioGpuTimelines->notifyTaskCompletion(taskId);
+                });
                 break;
             }
             default:
@@ -901,89 +873,46 @@
         return 0;
     }
 
-    enum VirtioGpuFenceType {
-        Global,
-        ContextFence,
-    };
+    int createFence(uint64_t fence_id, const VirtioGpuRing& ring) {
+        VGPLOG("fenceid: %llu ring: %s", (unsigned long long)fence_id, to_string(ring).c_str());
 
-    enum CtxSyncingType {
-        SyncSignal,
-        AsyncSignal,
-    };
-
-    struct CtxPendingFence {
-        VirtioGpuFenceType fenceType;
-        CtxSyncingType syncType;
-        uint64_t fence_value;
-    };
-
-    int createFence(int client_fence_id, uint32_t ctx_id) {
-        AutoLock lock(mLock);
-        VGPLOG("fenceid: %u cmdtype: %u", client_fence_id, ctx_id);
-        if (mVirtioGpuTimelines) {
-            VGPLOG("create fence using async fence cb");
-            if (0 == ctx_id) {
-                VGPLOG("is 0 ctx id, signal right away as everything's serialized to this point");
-                mVirglRendererCallbacks.write_fence(mCookie, (uint32_t)client_fence_id);
-            } else {
-                VGPLOG("is Not 0 ctx id (%u), do not signal right away if async signal on top.. the client fence id was %d", ctx_id, client_fence_id);
-                mVirtioGpuTimelines->enqueueFence(
-                    0,
-                    static_cast<VirtioGpuTimelines::FenceId>(client_fence_id),
-                    [this, client_fence_id]() {
-                        mVirglRendererCallbacks.write_fence(
-                            mCookie, static_cast<uint32_t>(client_fence_id));
-                    });
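+        // Pick the callback used to signal this fence once it is reached: global
+        // fences go through write_fence, context specific fences through
+        // write_context_fence (which requires VIRGL_RENDERER_UNSTABLE_APIS).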
+        struct {
+            FenceCompletionCallback operator()(const VirtioGpuRingGlobal&) {
+                return [renderer = mRenderer, fenceId = mFenceId] {
+                    renderer->mVirglRendererCallbacks.write_fence(renderer->mCookie, fenceId);
+                };
             }
-        } else {
-            VGPLOG("create fence without async fence cb");
-            mFenceDeque.push_back((uint64_t)client_fence_id);
-        }
-        return 0;
-    }
-
-    int contextCreateFence(uint64_t fence_id, uint32_t ctx_id, uint8_t ring_idx) {
-        AutoLock lock(mLock);
-        VGPLOG("fenceid: %llu cmdtype: %u ring_idx: %u", (unsigned long long)fence_id, ctx_id, ring_idx);
-        if (mVirtioGpuTimelines) {
-            VGPLOG("create fence using async fence cb");
-            if (0 == ctx_id) {
-                VGPLOG("is 0 ctx id, signal right away as everything's serialized to this point");
-                mVirglRendererCallbacks.write_fence(mCookie, (uint32_t)fence_id);
-            } else {
-                VGPLOG("is Not 0 ctx id (%u), do not signal right away if async signal on top.. the client fence id was %llu",
-                       ctx_id, (unsigned long long)fence_id);
+            FenceCompletionCallback operator()(const VirtioGpuRingContextSpecific& ring) {
 #ifdef VIRGL_RENDERER_UNSTABLE_APIS
-                mVirtioGpuTimelines->enqueueFence(
-                    static_cast<VirtioGpuTimelines::CtxId>(ctx_id),
-                    static_cast<VirtioGpuTimelines::FenceId>(fence_id),
-                    [this, fence_id, ctx_id, ring_idx]() {
-                        mVirglRendererCallbacks.write_context_fence(
-                            mCookie, fence_id, ctx_id, ring_idx);
-                    });
+                return [renderer = mRenderer, fenceId = mFenceId, ring] {
+                    renderer->mVirglRendererCallbacks.write_context_fence(
+                        renderer->mCookie, fenceId, ring.mCtxId, ring.mRingIdx);
+                };
 #else
-                VGPLOG("enable unstable apis for this feature");
-                return -EINVAL;
+                VGPLOG("enable unstable apis for the context specific fence feature");
+                return {};
 #endif
             }
-        } else {
-            fprintf(stderr, "%s: create fence without async fence cb\n", __func__);
-            mFenceDeque.push_back(fence_id);
+
+            PipeVirglRenderer* mRenderer;
+            VirtioGpuTimelines::FenceId mFenceId;
+        } visitor{
+            .mRenderer = this,
+            .mFenceId = fence_id,
+        };
+        FenceCompletionCallback callback = std::visit(visitor, ring);
+        if (!callback) {
+            // A context specific ring was passed in, but the project was compiled
+            // without VIRGL_RENDERER_UNSTABLE_APIS defined.
+            return -EINVAL;
         }
+        AutoLock lock(mLock);
+        mVirtioGpuTimelines->enqueueFence(ring, fence_id, callback);
+
         return 0;
     }
 
-    void poll() {
-        VGPLOG("start");
-        AutoLock lock(mLock);
-        for (auto fence : mFenceDeque) {
-            VGPLOG("write fence: %llu", (unsigned long long)fence);
-            mVirglRendererCallbacks.write_fence(mCookie, (uint32_t)fence);
-            VGPLOG("write fence: %llu (done with callback)", (unsigned long long)fence);
-        }
-        mFenceDeque.clear();
-        VGPLOG("end");
-    }
+    void poll() { mVirtioGpuTimelines->poll(); }
 
     enum pipe_texture_target {
         PIPE_BUFFER,
@@ -1639,14 +1568,10 @@
     std::unordered_map<VirtioGpuCtxId, std::vector<VirtioGpuResId>> mContextResources;
     std::unordered_map<VirtioGpuResId, std::vector<VirtioGpuCtxId>> mResourceContexts;
 
-    // For use with the async fence cb.
     // When we wait for gpu or wait for gpu vulkan, the next (and subsequent)
     // fences created for that context should not be signaled immediately.
     // Rather, they should get in line.
     std::unique_ptr<VirtioGpuTimelines> mVirtioGpuTimelines = nullptr;
-
-    // For use without the async fence cb.
-    std::deque<uint64_t> mFenceDeque;
 };
 
 static PipeVirglRenderer* sRenderer() {
@@ -1662,9 +1587,7 @@
     return 0;
 }
 
-VG_EXPORT void pipe_virgl_renderer_poll(void) {
-    sRenderer()->poll();
-}
+VG_EXPORT void pipe_virgl_renderer_poll(void) { sRenderer()->poll(); }
 
 VG_EXPORT void* pipe_virgl_renderer_get_cursor_data(
     uint32_t resource_id, uint32_t *width, uint32_t *height) {
@@ -1737,7 +1660,7 @@
 
 VG_EXPORT int pipe_virgl_renderer_create_fence(
     int client_fence_id, uint32_t ctx_id) {
-    sRenderer()->createFence(client_fence_id, ctx_id);
+    sRenderer()->createFence(client_fence_id, VirtioGpuRingGlobal{});
     return 0;
 }
 
@@ -1804,7 +1727,10 @@
 
 VG_EXPORT int stream_renderer_context_create_fence(
     uint64_t fence_id, uint32_t ctx_id, uint8_t ring_idx) {
-    sRenderer()->contextCreateFence(fence_id, ctx_id, ring_idx);
+    sRenderer()->createFence(fence_id, VirtioGpuRingContextSpecific{
+                                           .mCtxId = ctx_id,
+                                           .mRingIdx = ring_idx,
+                                       });
     return 0;
 }
 
diff --git a/stream-servers/virtio-gpu-gfxstream-renderer.h b/stream-servers/virtio-gpu-gfxstream-renderer.h
index 8cb3a81..b33d291 100644
--- a/stream-servers/virtio-gpu-gfxstream-renderer.h
+++ b/stream-servers/virtio-gpu-gfxstream-renderer.h
@@ -12,6 +12,7 @@
 extern "C" {
 #endif
 typedef uint32_t VirtioGpuCtxId;
+typedef uint8_t VirtioGpuRingIdx;
 struct virgl_renderer_virtio_interface*
     get_goldfish_pipe_virgl_renderer_virtio_interface(void);