Adds helper for tracking unfinished command buffers on device lost

... which was useful for debugging b/347288539.

Bug: b/347288539
Test: cvd start \
 --gpu_mode=gfxstream_guest_angle \
 --gpu_renderer_features=VulkanCommandBufferCheckpoints:enabled
Change-Id: I8056ba820a1c28410d94fa56608f38fbf0328148
diff --git a/codegen/vulkan/vulkan-docs-next/scripts/cerealgenerator.py b/codegen/vulkan/vulkan-docs-next/scripts/cerealgenerator.py
index cb32fd3..720d218 100644
--- a/codegen/vulkan/vulkan-docs-next/scripts/cerealgenerator.py
+++ b/codegen/vulkan/vulkan-docs-next/scripts/cerealgenerator.py
@@ -123,6 +123,7 @@
     "VK_EXT_metal_objects",
     "VK_KHR_external_semaphore_win32",
     "VK_KHR_external_memory_win32",
+    "VK_NV_device_diagnostic_checkpoints",
     # Android
     "VK_ANDROID_native_buffer",
     "VK_ANDROID_external_memory_android_hardware_buffer",
@@ -169,6 +170,7 @@
     "VK_KHR_android_surface": ["func_table"],
     "VK_EXT_swapchain_maintenance1" : HOST_MODULES,
     "VK_KHR_swapchain" : HOST_MODULES,
+    "VK_NV_device_diagnostic_checkpoints": ["goldfish_vk_dispatch"],
 }
 
 # These modules will be used when the feature is not supported.
diff --git a/host/FrameBuffer.cpp b/host/FrameBuffer.cpp
index 6c827e8..daba974 100644
--- a/host/FrameBuffer.cpp
+++ b/host/FrameBuffer.cpp
@@ -2849,6 +2849,13 @@
     return colorBufferPtr->borrowForDisplay(api);
 }
 
+void FrameBuffer::logVulkanDeviceLost() {
+    if (!m_emulationVk) {
+        GFXSTREAM_ABORT(FatalError(ABORT_REASON_OTHER)) << "Device lost without VkEmulation?";
+    }
+    vk::onVkDeviceLost();
+}
+
 void FrameBuffer::logVulkanOutOfMemory(VkResult result, const char* function, int line,
                                        std::optional<uint64_t> allocationSize) {
     m_logger->logMetricEvent(MetricEventVulkanOutOfMemory{
diff --git a/host/FrameBuffer.h b/host/FrameBuffer.h
index e4ecb84..c00ee27 100644
--- a/host/FrameBuffer.h
+++ b/host/FrameBuffer.h
@@ -482,6 +482,7 @@
         return *m_logger;
     }
 
+    void logVulkanDeviceLost();
     void logVulkanOutOfMemory(VkResult result, const char* function, int line,
                               std::optional<uint64_t> allocationSize = std::nullopt);
 
diff --git a/host/features/include/gfxstream/host/Features.h b/host/features/include/gfxstream/host/Features.h
index 3e2aa74..38e034c 100644
--- a/host/features/include/gfxstream/host/Features.h
+++ b/host/features/include/gfxstream/host/Features.h
@@ -299,6 +299,13 @@
         "labels on Vulkan resources and operation",
         &map,
     };
+    FeatureInfo VulkanCommandBufferCheckpoints = {
+        "VulkanCommandBufferCheckpoints",
+        "If enabled, the host will enable the VK_NV_device_diagnostic_checkpoints extension "
+        "when available, track command buffers with markers, and report unfinished command "
+        "buffers on device lost. (TODO: VK_AMD_buffer_marker)",
+        &map,
+    };
 };
 
 #define GFXSTREAM_SET_FEATURE_ON_CONDITION(set, feature, condition) \
diff --git a/host/virtio-gpu-gfxstream-renderer.cpp b/host/virtio-gpu-gfxstream-renderer.cpp
index 5ff4904..8532421 100644
--- a/host/virtio-gpu-gfxstream-renderer.cpp
+++ b/host/virtio-gpu-gfxstream-renderer.cpp
@@ -2802,6 +2802,15 @@
     gfxstream::vk::vk_util::setVkCheckCallbacks(
         std::make_unique<gfxstream::vk::vk_util::VkCheckCallbacks>(
             gfxstream::vk::vk_util::VkCheckCallbacks{
+                .onVkErrorDeviceLost =
+                    []() {
+                        auto fb = gfxstream::FrameBuffer::getFB();
+                        if (!fb) {
+                            ERR("FrameBuffer not yet initialized. Dropping device lost event");
+                            return;
+                        }
+                        fb->logVulkanDeviceLost();
+                    },
                 .onVkErrorOutOfMemory =
                     [](VkResult result, const char* function, int line) {
                         auto fb = gfxstream::FrameBuffer::getFB();
diff --git a/host/vulkan/Android.bp b/host/vulkan/Android.bp
index f51c207..6b809ae 100644
--- a/host/vulkan/Android.bp
+++ b/host/vulkan/Android.bp
@@ -45,6 +45,7 @@
         "ColorBufferVk.cpp",
         "CompositorVk.cpp",
         "DebugUtilsHelper.cpp",
+        "DeviceLostHelper.cpp",
         "DeviceOpTracker.cpp",
         "DisplaySurfaceVk.cpp",
         "DisplayVk.cpp",
diff --git a/host/vulkan/BUILD.bazel b/host/vulkan/BUILD.bazel
index 2ef519d..3b2d174 100644
--- a/host/vulkan/BUILD.bazel
+++ b/host/vulkan/BUILD.bazel
@@ -6,6 +6,7 @@
         "ColorBufferVk.cpp",
         "CompositorVk.cpp",
         "DebugUtilsHelper.cpp",
+        "DeviceLostHelper.cpp",
         "DeviceOpTracker.cpp",
         "DisplaySurfaceVk.cpp",
         "DisplayVk.cpp",
diff --git a/host/vulkan/CMakeLists.txt b/host/vulkan/CMakeLists.txt
index 779aa42..7da5d9e 100644
--- a/host/vulkan/CMakeLists.txt
+++ b/host/vulkan/CMakeLists.txt
@@ -11,6 +11,7 @@
             BufferVk.cpp
             ColorBufferVk.cpp
             CompositorVk.cpp
+            DeviceLostHelper.cpp
             DeviceOpTracker.cpp
             DisplayVk.cpp
             DisplaySurfaceVk.cpp
diff --git a/host/vulkan/DeviceLostHelper.cpp b/host/vulkan/DeviceLostHelper.cpp
new file mode 100644
index 0000000..ed6f989
--- /dev/null
+++ b/host/vulkan/DeviceLostHelper.cpp
@@ -0,0 +1,190 @@
+// Copyright 2024 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "DeviceLostHelper.h"
+
+#include <algorithm>
+#include <unordered_set>
+#include <vector>
+
+#include "host-common/logging.h"
+
+namespace gfxstream {
+namespace vk {
+
+// Turns the helper on. Until this is called every other entry point is a no-op.
+void DeviceLostHelper::enableWithNvidiaDeviceDiagnosticCheckpoints() { mEnabled = true; }
+
+// Returns a pointer identifying the (`commandBuffer`, `type`) marker, adding the
+// marker to `mMarkers` first when it is not registered yet.
+const void* DeviceLostHelper::createMarkerForCommandBuffer(const VkCommandBuffer& commandBuffer,
+                                                           MarkerType type) {
+    std::lock_guard<std::mutex> lock(mMarkersMutex);
+
+    // insert() returns the already stored element when an equal marker exists.
+    const auto inserted = mMarkers.insert(CheckpointMarker{commandBuffer, type});
+
+    // References and pointers to data stored in the container are only
+    // invalidated by erasing that element, even when the corresponding
+    // iterator is invalidated.
+    return static_cast<const void*>(&(*inserted.first));
+}
+
+// Erases both the begin and the end marker registered for `commandBuffer`.
+void DeviceLostHelper::removeMarkersForCommandBuffer(const VkCommandBuffer& commandBuffer) {
+    std::lock_guard<std::mutex> lock(mMarkersMutex);
+    for (const MarkerType type : {MarkerType::kBegin, MarkerType::kEnd}) {
+        mMarkers.erase(CheckpointMarker{
+            .commandBuffer = commandBuffer,
+            .type = type,
+        });
+    }
+}
+
+// Appends VK_NV_device_diagnostic_checkpoints to `deviceExtensions` when enabled.
+void DeviceLostHelper::addNeededDeviceExtensions(std::vector<const char*>* deviceExtensions) {
+    if (mEnabled) {
+        deviceExtensions->push_back(VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME);
+    }
+}
+
+// Records the begin checkpoint for `commandBuffer`. No-op unless enabled.
+void DeviceLostHelper::onBeginCommandBuffer(const VkCommandBuffer& commandBuffer,
+                                            const VulkanDispatch* vk) {
+    if (!mEnabled) {
+        return;
+    }
+
+    const void* marker = createMarkerForCommandBuffer(commandBuffer, MarkerType::kBegin);
+    vk->vkCmdSetCheckpointNV(commandBuffer, marker);
+}
+
+// Records the end checkpoint for `commandBuffer`. No-op unless enabled.
+void DeviceLostHelper::onEndCommandBuffer(const VkCommandBuffer& commandBuffer,
+                                          const VulkanDispatch* vk) {
+    if (!mEnabled) {
+        return;
+    }
+
+    const void* marker = createMarkerForCommandBuffer(commandBuffer, MarkerType::kEnd);
+    vk->vkCmdSetCheckpointNV(commandBuffer, marker);
+}
+
+// Forgets the markers of a command buffer that is being reset. No-op unless enabled.
+void DeviceLostHelper::onResetCommandBuffer(const VkCommandBuffer& commandBuffer) {
+    if (!mEnabled) {
+        return;
+    }
+
+    removeMarkersForCommandBuffer(commandBuffer);
+}
+
+// Forgets the markers of a command buffer that is being freed. No-op unless enabled.
+void DeviceLostHelper::onFreeCommandBuffer(const VkCommandBuffer& commandBuffer) {
+    if (!mEnabled) {
+        return;
+    }
+
+    removeMarkersForCommandBuffer(commandBuffer);
+}
+
+// For every device in `devicesWithQueues` whose vkDeviceWaitIdle() reports
+// VK_ERROR_DEVICE_LOST, logs the command buffers for which a queue reports a
+// begin checkpoint without a matching end checkpoint. No-op unless enabled.
+void DeviceLostHelper::onDeviceLost(const std::vector<DeviceWithQueues>& devicesWithQueues) {
+    if (!mEnabled) {
+        return;
+    }
+
+    ERR("DeviceLostHelper starting lost device checks...");
+
+    for (const DeviceWithQueues& deviceWithQueues : devicesWithQueues) {
+        const auto& device = deviceWithQueues.device;
+        const auto* deviceDispatch = deviceWithQueues.deviceDispatch;
+        if (deviceDispatch->vkDeviceWaitIdle(device) != VK_ERROR_DEVICE_LOST) {
+            continue;
+        }
+        ERR("VkDevice:%p was lost, checking for unfinished VkCommandBuffers...", device);
+
+        struct CommandBufferOnQueue {
+            VkCommandBuffer commandBuffer = VK_NULL_HANDLE;
+            VkQueue queue = VK_NULL_HANDLE;
+        };
+        std::vector<CommandBufferOnQueue> unfinishedCommandBuffers;
+
+        for (const VkQueue& queue : deviceWithQueues.queues) {
+            uint32_t checkpointDataCount = 0;
+            deviceDispatch->vkGetQueueCheckpointDataNV(queue, &checkpointDataCount, nullptr);
+            if (checkpointDataCount == 0) continue;
+
+            std::vector<VkCheckpointDataNV> checkpointDatas(
+                static_cast<size_t>(checkpointDataCount),
+                VkCheckpointDataNV{
+                    .sType = VK_STRUCTURE_TYPE_CHECKPOINT_DATA_NV,
+                });
+            deviceDispatch->vkGetQueueCheckpointDataNV(queue, &checkpointDataCount,
+                                                       checkpointDatas.data());
+            // The second query reports how many entries were actually written.
+            checkpointDatas.resize(
+                std::min(checkpointDatas.size(), static_cast<size_t>(checkpointDataCount)));
+
+            std::unordered_set<VkCommandBuffer> unfinishedCommandBuffersForQueue;
+            {
+                // Resetting or freeing a command buffer erases its markers, yet a
+                // queue may still report such a marker as the last checkpoint
+                // reached. Only read markers that are still stored in mMarkers,
+                // and hold the mutex so they cannot be erased while being read.
+                std::lock_guard<std::mutex> lock(mMarkersMutex);
+                for (const VkCheckpointDataNV& checkpointData : checkpointDatas) {
+                    const void* reported = checkpointData.pCheckpointMarker;
+                    const auto live =
+                        std::find_if(mMarkers.begin(), mMarkers.end(),
+                                     [reported](const CheckpointMarker& candidate) {
+                                         return static_cast<const void*>(&candidate) == reported;
+                                     });
+                    if (live == mMarkers.end()) {
+                        continue;
+                    }
+                    if (live->type == MarkerType::kBegin) {
+                        unfinishedCommandBuffersForQueue.insert(live->commandBuffer);
+                    } else {
+                        unfinishedCommandBuffersForQueue.erase(live->commandBuffer);
+                    }
+                }
+            }
+
+            for (const VkCommandBuffer commandBuffer : unfinishedCommandBuffersForQueue) {
+                unfinishedCommandBuffers.push_back(CommandBufferOnQueue{
+                    .commandBuffer = commandBuffer,
+                    .queue = queue,
+                });
+            }
+        }
+
+        if (unfinishedCommandBuffers.empty()) {
+            ERR("VkDevice:%p has no outstanding VkCommandBuffers.", device);
+        } else {
+            ERR("VkDevice:%p has outstanding VkCommandBuffers:", device);
+            for (const CommandBufferOnQueue& unfinished : unfinishedCommandBuffers) {
+                ERR("   - VkCommandBuffer:%p on VkQueue:%p", unfinished.commandBuffer,
+                    unfinished.queue);
+            }
+        }
+    }
+
+    ERR("DeviceLostHelper finished lost device checks.");
+}
+
+}  // namespace vk
+}  // namespace gfxstream
diff --git a/host/vulkan/DeviceLostHelper.h b/host/vulkan/DeviceLostHelper.h
new file mode 100644
index 0000000..90e0fc5
--- /dev/null
+++ b/host/vulkan/DeviceLostHelper.h
@@ -0,0 +1,108 @@
+// Copyright 2024 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vulkan/vulkan.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+#include <unordered_set>
+#include <vector>
+
+#include "vulkan/cereal/common/goldfish_vk_dispatch.h"
+
+namespace gfxstream {
+namespace vk {
+
+// Tracks begin/end checkpoint markers for command buffers using
+// VK_NV_device_diagnostic_checkpoints so that command buffers which started
+// but did not finish can be logged when a device is lost. Every entry point is
+// a no-op until enableWithNvidiaDeviceDiagnosticCheckpoints() has been called.
+//
+// TODO: Support VK_AMD_buffer_marker.
+class DeviceLostHelper {
+   public:
+    DeviceLostHelper() = default;
+
+    DeviceLostHelper(const DeviceLostHelper&) = delete;
+    DeviceLostHelper& operator=(const DeviceLostHelper&) = delete;
+
+    DeviceLostHelper(DeviceLostHelper&&) = delete;
+    DeviceLostHelper& operator=(DeviceLostHelper&&) = delete;
+
+    // Enables marker tracking backed by VK_NV_device_diagnostic_checkpoints.
+    void enableWithNvidiaDeviceDiagnosticCheckpoints();
+
+    // Appends the device extensions required by this helper (if enabled) to
+    // `deviceExtensions`.
+    void addNeededDeviceExtensions(std::vector<const char*>* deviceExtensions);
+
+    // Record a begin / end checkpoint into `commandBuffer` through `vk`.
+    void onBeginCommandBuffer(const VkCommandBuffer& commandBuffer, const VulkanDispatch* vk);
+    void onEndCommandBuffer(const VkCommandBuffer& commandBuffer, const VulkanDispatch* vk);
+
+    // Drop the markers tracked for `commandBuffer`.
+    void onResetCommandBuffer(const VkCommandBuffer& commandBuffer);
+    void onFreeCommandBuffer(const VkCommandBuffer& commandBuffer);
+
+    // A device, the dispatch table used to call into it, and the queues to
+    // query for checkpoint data.
+    struct DeviceWithQueues {
+        VkDevice device;
+        const VulkanDispatch* deviceDispatch;
+        std::vector<VkQueue> queues;
+    };
+    // Logs the unfinished command buffers of every lost device in
+    // `devicesWithQueues`.
+    void onDeviceLost(const std::vector<DeviceWithQueues>& devicesWithQueues);
+
+   private:
+    enum class MarkerType { kBegin, kEnd };
+
+    // The value pointed to by the checkpoint marker handed to Vulkan.
+    struct CheckpointMarker {
+        VkCommandBuffer commandBuffer;
+        MarkerType type;
+    };
+
+    struct CheckpointMarkerEq {
+        bool operator()(const CheckpointMarker& lhs, const CheckpointMarker& rhs) const {
+            return lhs.commandBuffer == rhs.commandBuffer && lhs.type == rhs.type;
+        }
+    };
+
+    struct CheckpointMarkerHash {
+        size_t operator()(const CheckpointMarker& marker) const {
+            const auto h1 = static_cast<std::size_t>(
+                reinterpret_cast<std::uintptr_t>(marker.commandBuffer));
+            const auto h2 = static_cast<std::size_t>(marker.type);
+            return h1 ^ (h2 << 1);
+        }
+    };
+
+    const void* createMarkerForCommandBuffer(const VkCommandBuffer& commandBuffer, MarkerType type);
+    void removeMarkersForCommandBuffer(const VkCommandBuffer& commandBuffer);
+
+    bool mEnabled = false;
+
+    // Guards `mMarkers`.
+    std::mutex mMarkersMutex;
+    std::unordered_set<CheckpointMarker, CheckpointMarkerHash, CheckpointMarkerEq> mMarkers;
+};
+
+}  // namespace vk
+}  // namespace gfxstream
diff --git a/host/vulkan/VkCommonOperations.cpp b/host/vulkan/VkCommonOperations.cpp
index 038ae0a..1119967 100644
--- a/host/vulkan/VkCommonOperations.cpp
+++ b/host/vulkan/VkCommonOperations.cpp
@@ -881,15 +881,21 @@
 
         deviceInfos[i].hasSamplerYcbcrConversionExtension =
             extensionsSupported(deviceExts, {VK_KHR_SAMPLER_YCBCR_CONVERSION_EXTENSION_NAME});
+
+        deviceInfos[i].hasNvidiaDeviceDiagnosticCheckpointsExtension =
+            extensionsSupported(deviceExts, {VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME});
+
         if (sVkEmulation->getPhysicalDeviceFeatures2Func) {
             VkPhysicalDeviceFeatures2 features2 = {
                 .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
             };
             auto features2Chain = vk_make_chain_iterator(&features2);
+
             VkPhysicalDeviceSamplerYcbcrConversionFeatures samplerYcbcrConversionFeatures = {
                 .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES,
             };
             vk_append_struct(&features2Chain, &samplerYcbcrConversionFeatures);
+
 #if defined(__QNX__)
             VkPhysicalDeviceExternalMemoryScreenBufferFeaturesQNX extMemScreenBufferFeatures = {
                 .sType =
@@ -897,10 +903,23 @@
             };
             vk_append_struct(&features2Chain, &extMemScreenBufferFeatures);
 #endif
+
+            VkPhysicalDeviceDiagnosticsConfigFeaturesNV deviceDiagnosticsConfigFeatures = {
+                .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DIAGNOSTICS_CONFIG_FEATURES_NV,
+                .diagnosticsConfig = VK_FALSE,
+            };
+            if (deviceInfos[i].hasNvidiaDeviceDiagnosticCheckpointsExtension) {
+                vk_append_struct(&features2Chain, &deviceDiagnosticsConfigFeatures);
+            }
+
             sVkEmulation->getPhysicalDeviceFeatures2Func(physdevs[i], &features2);
 
             deviceInfos[i].supportsSamplerYcbcrConversion =
                 samplerYcbcrConversionFeatures.samplerYcbcrConversion == VK_TRUE;
+
+            deviceInfos[i].supportsNvidiaDeviceDiagnosticCheckpoints =
+                deviceDiagnosticsConfigFeatures.diagnosticsConfig == VK_TRUE;
+
 #if defined(__QNX__)
             deviceInfos[i].supportsExternalMemoryImport =
                 extMemScreenBufferFeatures.screenBufferImport == VK_TRUE;
@@ -1121,6 +1140,7 @@
                 });
         vk_append_struct(&deviceCiChain, samplerYcbcrConversionFeatures.get());
     }
+
 #if defined(__QNX__)
     std::unique_ptr<VkPhysicalDeviceExternalMemoryScreenBufferFeaturesQNX>
         extMemScreenBufferFeaturesQNX = nullptr;
@@ -1136,6 +1156,25 @@
     }
 #endif
 
+    const bool commandBufferCheckpointsSupported =
+        sVkEmulation->deviceInfo.supportsNvidiaDeviceDiagnosticCheckpoints;
+    const bool commandBufferCheckpointsRequested =
+        sVkEmulation->features.VulkanCommandBufferCheckpoints.enabled;
+    const bool commandBufferCheckpointsSupportedAndRequested =
+        commandBufferCheckpointsSupported && commandBufferCheckpointsRequested;
+    VkPhysicalDeviceDiagnosticsConfigFeaturesNV deviceDiagnosticsConfigFeatures = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DIAGNOSTICS_CONFIG_FEATURES_NV,
+        .diagnosticsConfig = VK_TRUE,
+    };
+    if (commandBufferCheckpointsSupportedAndRequested) {
+        INFO("Enabling command buffer checkpoints with VK_NV_device_diagnostic_checkpoints.");
+        vk_append_struct(&deviceCiChain, &deviceDiagnosticsConfigFeatures);
+    } else if (commandBufferCheckpointsRequested) {
+        WARN(
+            "VulkanCommandBufferCheckpoints was requested but the "
+            "VK_NV_device_diagnostic_checkpoints extension is not supported.");
+    }
+
     ivk->vkCreateDevice(sVkEmulation->physdev, &dCi, nullptr, &sVkEmulation->device);
 
     if (res != VK_SUCCESS) {
@@ -1329,8 +1368,8 @@
                                              string_VkResult(stagingBufferBindRes));
     }
 
-    sVkEmulation->debugUtilsAvailableAndRequested = debugUtilsAvailableAndRequested;
-    if (sVkEmulation->debugUtilsAvailableAndRequested) {
+    if (debugUtilsAvailableAndRequested) {
+        sVkEmulation->debugUtilsAvailableAndRequested = true;
         sVkEmulation->debugUtilsHelper =
             DebugUtilsHelper::withUtilsEnabled(sVkEmulation->device, sVkEmulation->ivk);
 
@@ -1342,6 +1381,11 @@
                                                      "AEMU_CommandBuffer");
     }
 
+    if (commandBufferCheckpointsSupportedAndRequested) {
+        sVkEmulation->commandBufferCheckpointsSupportedAndRequested = true;
+        sVkEmulation->deviceLostHelper.enableWithNvidiaDeviceDiagnosticCheckpoints();
+    }
+
     VERBOSE("Vulkan global emulation state successfully initialized.");
     sVkEmulation->live = true;
 
@@ -1454,6 +1498,8 @@
     sVkEmulation = nullptr;
 }
 
+void onVkDeviceLost() { VkDecoderGlobalState::get()->on_DeviceLost(); }
+
 std::unique_ptr<gfxstream::DisplaySurface> createDisplaySurface(FBNativeWindowType window,
                                                                 uint32_t width, uint32_t height) {
     if (!sVkEmulation || !sVkEmulation->live) {
diff --git a/host/vulkan/VkCommonOperations.h b/host/vulkan/VkCommonOperations.h
index cb53bf2..40f83f2 100644
--- a/host/vulkan/VkCommonOperations.h
+++ b/host/vulkan/VkCommonOperations.h
@@ -26,6 +26,7 @@
 #include "BorrowedImageVk.h"
 #include "CompositorVk.h"
 #include "DebugUtilsHelper.h"
+#include "DeviceLostHelper.h"
 #include "DeviceOpTracker.h"
 #include "DisplayVk.h"
 #include "FrameworkFormats.h"
@@ -158,6 +159,9 @@
     bool debugUtilsAvailableAndRequested = false;
     DebugUtilsHelper debugUtilsHelper = DebugUtilsHelper::withUtilsDisabled();
 
+    bool commandBufferCheckpointsSupportedAndRequested = false;
+    DeviceLostHelper deviceLostHelper{};
+
     // Queue, command pool, and command buffer
     // for running commands to sync stuff system-wide.
     // TODO(b/197362803): Encapsulate host side VkQueue and the lock.
@@ -204,6 +208,8 @@
         bool hasSamplerYcbcrConversionExtension = false;
         bool supportsSamplerYcbcrConversion = false;
         bool glInteropSupported = false;
+        bool hasNvidiaDeviceDiagnosticCheckpointsExtension = false;
+        bool supportsNvidiaDeviceDiagnosticCheckpoints = false;
 
         std::vector<VkExtensionProperties> extensions;
 
@@ -453,6 +459,8 @@
 VkEmulation* getGlobalVkEmulation();
 void teardownGlobalVkEmulation();
 
+void onVkDeviceLost();
+
 std::unique_ptr<gfxstream::DisplaySurface> createDisplaySurface(FBNativeWindowType window,
                                                                 uint32_t width, uint32_t height);
 
diff --git a/host/vulkan/VkDecoderGlobalState.cpp b/host/vulkan/VkDecoderGlobalState.cpp
index df2e145..9a880b0 100644
--- a/host/vulkan/VkDecoderGlobalState.cpp
+++ b/host/vulkan/VkDecoderGlobalState.cpp
@@ -1643,10 +1643,12 @@
         auto physicalDevice = unbox_VkPhysicalDevice(boxed_physicalDevice);
         auto vk = dispatch_VkPhysicalDevice(boxed_physicalDevice);
 
-        const std::vector<const char*> finalExts =
+        std::vector<const char*> updatedDeviceExtensions =
             filteredDeviceExtensionNames(vk, physicalDevice, pCreateInfo->enabledExtensionCount,
                                          pCreateInfo->ppEnabledExtensionNames);
 
+        m_emu->deviceLostHelper.addNeededDeviceExtensions(&updatedDeviceExtensions);
+
         // Run the underlying API call, filtering extensions.
         VkDeviceCreateInfo createInfoFiltered = *pCreateInfo;
         // According to the spec, it seems that the application can use compressed texture formats
@@ -1756,8 +1758,8 @@
         // Filter device memory report as callbacks can not be passed between guest and host.
         vk_struct_chain_filter<VkDeviceDeviceMemoryReportCreateInfoEXT>(&createInfoFiltered);
 
-        createInfoFiltered.enabledExtensionCount = (uint32_t)finalExts.size();
-        createInfoFiltered.ppEnabledExtensionNames = finalExts.data();
+        createInfoFiltered.enabledExtensionCount = (uint32_t)updatedDeviceExtensions.size();
+        createInfoFiltered.ppEnabledExtensionNames = updatedDeviceExtensions.data();
 
         // bug: 155795731
         bool swiftshader =
@@ -5571,6 +5573,8 @@
         auto commandBuffer = unbox_VkCommandBuffer(boxed_commandBuffer);
         auto vk = dispatch_VkCommandBuffer(boxed_commandBuffer);
 
+        m_emu->deviceLostHelper.onResetCommandBuffer(commandBuffer);
+
         VkResult result = vk->vkResetCommandBuffer(commandBuffer, flags);
         if (VK_SUCCESS == result) {
             std::lock_guard<std::recursive_mutex> lock(mLock);
@@ -5587,7 +5591,13 @@
         auto vk = dispatch_VkDevice(boxed_device);
 
         if (!device) return;
+
+        for (uint32_t i = 0; i < commandBufferCount; i++) {
+            m_emu->deviceLostHelper.onFreeCommandBuffer(pCommandBuffers[i]);
+        }
+
         vk->vkFreeCommandBuffers(device, commandPool, commandBufferCount, pCommandBuffers);
+
         std::lock_guard<std::recursive_mutex> lock(mLock);
         for (uint32_t i = 0; i < commandBufferCount; i++) {
             const auto& cmdBufferInfoIt = mCmdBufferInfo.find(pCommandBuffers[i]);
@@ -5894,6 +5904,8 @@
             return result;
         }
 
+        m_emu->deviceLostHelper.onBeginCommandBuffer(commandBuffer, vk);
+
         std::lock_guard<std::recursive_mutex> lock(mLock);
 
         auto* commandBufferInfo = android::base::find(mCmdBufferInfo, commandBuffer);
@@ -5921,6 +5933,8 @@
         auto commandBuffer = unbox_VkCommandBuffer(boxed_commandBuffer);
         auto vk = dispatch_VkCommandBuffer(boxed_commandBuffer);
 
+        m_emu->deviceLostHelper.onEndCommandBuffer(commandBuffer, vk);
+
         std::lock_guard<std::recursive_mutex> lock(mLock);
 
         auto* commandBufferInfo = android::base::find(mCmdBufferInfo, commandBuffer);
@@ -6574,9 +6588,25 @@
         return;
     }
 
-    void on_DeviceLost() { GFXSTREAM_ABORT(FatalError(VK_ERROR_DEVICE_LOST)); }
+    void on_DeviceLost() {
+        {
+            std::lock_guard<std::recursive_mutex> lock(mLock);
 
-    void DeviceLostHandler() {}
+            std::vector<DeviceLostHelper::DeviceWithQueues> devicesToQueues;
+            for (const auto& [device, deviceInfo] : mDeviceInfo) {
+                auto& deviceToQueues = devicesToQueues.emplace_back();
+                deviceToQueues.device = device;
+                deviceToQueues.deviceDispatch = dispatch_VkDevice(deviceInfo.boxed);
+                for (const auto& [queueIndex, queues] : deviceInfo.queues) {
+                    deviceToQueues.queues.insert(deviceToQueues.queues.end(), queues.begin(),
+                                                 queues.end());
+                }
+            }
+            m_emu->deviceLostHelper.onDeviceLost(devicesToQueues);
+        }
+
+        GFXSTREAM_ABORT(FatalError(VK_ERROR_DEVICE_LOST));
+    }
 
     void on_CheckOutOfMemory(VkResult result, uint32_t opCode, const VkDecoderContext& context,
                              std::optional<uint64_t> allocationSize = std::nullopt) {
@@ -9080,8 +9110,6 @@
 
 void VkDecoderGlobalState::on_DeviceLost() { mImpl->on_DeviceLost(); }
 
-void VkDecoderGlobalState::DeviceLostHandler() { mImpl->DeviceLostHandler(); }
-
 void VkDecoderGlobalState::on_CheckOutOfMemory(VkResult result, uint32_t opCode,
                                                const VkDecoderContext& context,
                                                std::optional<uint64_t> allocationSize) {
diff --git a/host/vulkan/VkDecoderGlobalState.h b/host/vulkan/VkDecoderGlobalState.h
index 705c077..46beda4 100644
--- a/host/vulkan/VkDecoderGlobalState.h
+++ b/host/vulkan/VkDecoderGlobalState.h
@@ -705,8 +705,6 @@
 
     void on_DeviceLost();
 
-    void DeviceLostHandler();
-
     void on_CheckOutOfMemory(VkResult result, uint32_t opCode, const VkDecoderContext& context,
                              std::optional<uint64_t> allocationSize = std::nullopt);
 
diff --git a/host/vulkan/cereal/common/goldfish_vk_dispatch.cpp b/host/vulkan/cereal/common/goldfish_vk_dispatch.cpp
index 50903dd..786ffd0 100644
--- a/host/vulkan/cereal/common/goldfish_vk_dispatch.cpp
+++ b/host/vulkan/cereal/common/goldfish_vk_dispatch.cpp
@@ -659,6 +659,11 @@
     out->vkCmdInsertDebugUtilsLabelEXT =
         (PFN_vkCmdInsertDebugUtilsLabelEXT)dlSymFunc(lib, "vkCmdInsertDebugUtilsLabelEXT");
 #endif
+#ifdef VK_NV_device_diagnostic_checkpoints
+    out->vkCmdSetCheckpointNV = (PFN_vkCmdSetCheckpointNV)dlSymFunc(lib, "vkCmdSetCheckpointNV");
+    out->vkGetQueueCheckpointDataNV =
+        (PFN_vkGetQueueCheckpointDataNV)dlSymFunc(lib, "vkGetQueueCheckpointDataNV");
+#endif
 #ifdef VK_EXT_tooling_info
     out->vkGetPhysicalDeviceToolPropertiesEXT = (PFN_vkGetPhysicalDeviceToolPropertiesEXT)dlSymFunc(
         lib, "vkGetPhysicalDeviceToolPropertiesEXT");
@@ -1596,6 +1601,12 @@
         (PFN_vkCmdInsertDebugUtilsLabelEXT)vk->vkGetInstanceProcAddr(
             instance, "vkCmdInsertDebugUtilsLabelEXT");
 #endif
+#ifdef VK_NV_device_diagnostic_checkpoints
+    out->vkCmdSetCheckpointNV =
+        (PFN_vkCmdSetCheckpointNV)vk->vkGetInstanceProcAddr(instance, "vkCmdSetCheckpointNV");
+    out->vkGetQueueCheckpointDataNV = (PFN_vkGetQueueCheckpointDataNV)vk->vkGetInstanceProcAddr(
+        instance, "vkGetQueueCheckpointDataNV");
+#endif
 #ifdef VK_EXT_tooling_info
     out->vkGetPhysicalDeviceToolPropertiesEXT =
         (PFN_vkGetPhysicalDeviceToolPropertiesEXT)vk->vkGetInstanceProcAddr(
@@ -2537,6 +2548,12 @@
     out->vkCmdInsertDebugUtilsLabelEXT = (PFN_vkCmdInsertDebugUtilsLabelEXT)vk->vkGetDeviceProcAddr(
         device, "vkCmdInsertDebugUtilsLabelEXT");
 #endif
+#ifdef VK_NV_device_diagnostic_checkpoints
+    out->vkCmdSetCheckpointNV =
+        (PFN_vkCmdSetCheckpointNV)vk->vkGetDeviceProcAddr(device, "vkCmdSetCheckpointNV");
+    out->vkGetQueueCheckpointDataNV = (PFN_vkGetQueueCheckpointDataNV)vk->vkGetDeviceProcAddr(
+        device, "vkGetQueueCheckpointDataNV");
+#endif
 #ifdef VK_EXT_tooling_info
     out->vkGetPhysicalDeviceToolPropertiesEXT =
         (PFN_vkGetPhysicalDeviceToolPropertiesEXT)vk->vkGetDeviceProcAddr(
diff --git a/host/vulkan/cereal/common/goldfish_vk_dispatch.h b/host/vulkan/cereal/common/goldfish_vk_dispatch.h
index d4f2a9c..8062063 100644
--- a/host/vulkan/cereal/common/goldfish_vk_dispatch.h
+++ b/host/vulkan/cereal/common/goldfish_vk_dispatch.h
@@ -466,6 +466,10 @@
     PFN_vkDestroyDebugUtilsMessengerEXT vkDestroyDebugUtilsMessengerEXT;
     PFN_vkSubmitDebugUtilsMessageEXT vkSubmitDebugUtilsMessageEXT;
 #endif
+#ifdef VK_NV_device_diagnostic_checkpoints
+    PFN_vkCmdSetCheckpointNV vkCmdSetCheckpointNV;
+    PFN_vkGetQueueCheckpointDataNV vkGetQueueCheckpointDataNV;
+#endif
 #ifdef VK_EXT_metal_surface
     PFN_vkCreateMetalSurfaceEXT vkCreateMetalSurfaceEXT;
 #endif
diff --git a/host/vulkan/cereal/common/goldfish_vk_extension_structs.cpp b/host/vulkan/cereal/common/goldfish_vk_extension_structs.cpp
index c420927..61c2754 100644
--- a/host/vulkan/cereal/common/goldfish_vk_extension_structs.cpp
+++ b/host/vulkan/cereal/common/goldfish_vk_extension_structs.cpp
@@ -1136,11 +1136,6 @@
             return sizeof(VkPhysicalDeviceExclusiveScissorFeaturesNV);
         }
 #endif
-#ifdef VK_NV_device_diagnostic_checkpoints
-        case VK_STRUCTURE_TYPE_QUEUE_FAMILY_CHECKPOINT_PROPERTIES_NV: {
-            return sizeof(VkQueueFamilyCheckpointPropertiesNV);
-        }
-#endif
 #ifdef VK_INTEL_shader_integer_functions2
         case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_FUNCTIONS_2_FEATURES_INTEL: {
             return sizeof(VkPhysicalDeviceShaderIntegerFunctions2FeaturesINTEL);
@@ -3255,11 +3250,6 @@
             return sizeof(VkPhysicalDeviceExclusiveScissorFeaturesNV);
         }
 #endif
-#ifdef VK_NV_device_diagnostic_checkpoints
-        case VK_STRUCTURE_TYPE_QUEUE_FAMILY_CHECKPOINT_PROPERTIES_NV: {
-            return sizeof(VkQueueFamilyCheckpointPropertiesNV);
-        }
-#endif
 #ifdef VK_INTEL_shader_integer_functions2
         case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_FUNCTIONS_2_FEATURES_INTEL: {
             return sizeof(VkPhysicalDeviceShaderIntegerFunctions2FeaturesINTEL);
diff --git a/host/vulkan/cereal/common/vk_struct_id.h b/host/vulkan/cereal/common/vk_struct_id.h
index 72d9b9d..e4abeb2 100644
--- a/host/vulkan/cereal/common/vk_struct_id.h
+++ b/host/vulkan/cereal/common/vk_struct_id.h
@@ -122,4 +122,7 @@
                       VK_STRUCTURE_TYPE_EXPORT_METAL_OBJECT_CREATE_INFO_EXT);
 #endif
 
+REGISTER_VK_STRUCT_ID(VkPhysicalDeviceDiagnosticsConfigFeaturesNV,
+                      VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DIAGNOSTICS_CONFIG_FEATURES_NV);
+
 #undef REGISTER_VK_STRUCT_ID
diff --git a/host/vulkan/meson.build b/host/vulkan/meson.build
index ae7e03a..ccc03a9 100644
--- a/host/vulkan/meson.build
+++ b/host/vulkan/meson.build
@@ -13,6 +13,7 @@
   'BufferVk.cpp',
   'ColorBufferVk.cpp',
   'CompositorVk.cpp',
+  'DeviceLostHelper.cpp',
   'DeviceOpTracker.cpp',
   'DisplayVk.cpp',
   'DisplaySurfaceVk.cpp',