drm/amdgpu: avoid ras error injection for retired page
check whether a page is bad page before umc error injection, bad page
should not be accessed again
Signed-off-by: Tao Zhou <[email protected]>
Reviewed-by: Guchun Chen <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 84d8c33..49bdee9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -71,6 +71,9 @@ const char *ras_block_string[] = {
atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
+static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+ uint64_t addr);
+
static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
size_t size, loff_t *pos)
{
@@ -291,6 +294,14 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
break;
}
+ /* umc ce/ue error injection for a bad page is not allowed */
+ if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
+ amdgpu_ras_check_bad_page(adev, data.inject.address)) {
+ DRM_WARN("RAS WARN: 0x%llx has been marked as bad before error injection!\n",
+ data.inject.address);
+ break;
+ }
+
/* data.inject.address is offset instead of absolute gpu address */
ret = amdgpu_ras_error_inject(adev, &data.inject);
break;
@@ -1431,6 +1442,39 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
return ret;
}
+/*
+ * check if an address belongs to bad page
+ *
+ * Note: this check is only for umc block
+ */
+static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+ uint64_t addr)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_err_handler_data *data;
+ int i;
+ bool ret = false;
+
+ if (!con || !con->eh_data)
+ return ret;
+
+ mutex_lock(&con->recovery_lock);
+ data = con->eh_data;
+ if (!data)
+ goto out;
+
+ addr >>= AMDGPU_GPU_PAGE_SHIFT;
+ for (i = 0; i < data->count; i++)
+ if (addr == data->bps[i].retired_page) {
+ ret = true;
+ goto out;
+ }
+
+out:
+ mutex_unlock(&con->recovery_lock);
+ return ret;
+}
+
/* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{