www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
drm/amdgpu: process RAS fatal error MB notification
author: Vignesh Chander <Vignesh.Chander@amd.com>
Mon, 24 Jun 2024 21:44:26 +0000 (16:44 -0500)
committer: Alex Deucher <alexander.deucher@amd.com>
Thu, 27 Jun 2024 21:31:37 +0000 (17:31 -0400)
For the RAS fatal error scenario, the VF guest driver will check the mailbox
and set the fed flag to avoid unnecessary HW accesses.
Additionally, poll for the reset completion message first
to avoid accidentally spamming multiple reset requests to the host.

v2: add another mailbox check to handle the case where KFD detects the
timeout first

v3: set host_flr bit and use wait_for_reset

Signed-off-by: Vignesh Chander <Vignesh.Chander@amd.com>
Reviewed-by: Zhigang Luo <Zhigang.Luo@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h

index 183e219b6a851feb80c5096cda1f54c76db77592..b27336a05aaee6eee347225c3c8db09e47a639c6 100644 (file)
@@ -5069,7 +5069,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
        struct amdgpu_hive_info *hive = NULL;
 
        if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
-               amdgpu_virt_ready_to_reset(adev);
+               if (!amdgpu_ras_get_fed_status(adev))
+                       amdgpu_virt_ready_to_reset(adev);
                amdgpu_virt_wait_reset(adev);
                clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
                r = amdgpu_virt_request_full_gpu(adev, true);
@@ -5837,6 +5838,12 @@ retry:   /* Rest of adevs pre asic reset from XGMI hive. */
        /* Actual ASIC resets if needed.*/
        /* Host driver will handle XGMI hive reset for SRIOV */
        if (amdgpu_sriov_vf(adev)) {
+               if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
+                       dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
+                       amdgpu_ras_set_fed(adev, true);
+                       set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
+               }
+
                r = amdgpu_device_reset_sriov(adev, reset_context);
                if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
                        amdgpu_virt_release_full_gpu(adev, true);
index 63f2286858c4844fcf286610ea028f65107eba6e..ccb3d041c2b24937d16b802895c0f5fecb862ae3 100644 (file)
@@ -229,6 +229,22 @@ void amdgpu_virt_free_mm_table(struct amdgpu_device *adev)
        adev->virt.mm_table.gpu_addr = 0;
 }
 
+/**
+ * amdgpu_virt_rcvd_ras_interrupt() - receive ras interrupt
+ * @adev:      amdgpu device.
+ * Check whether host sent RAS error message
+ * Return: true if found, otherwise false
+ */
+bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev)
+{
+       struct amdgpu_virt *virt = &adev->virt;
+
+       if (!virt->ops || !virt->ops->rcvd_ras_intr)
+               return false;
+
+       return virt->ops->rcvd_ras_intr(adev);
+}
+
 
 unsigned int amd_sriov_msg_checksum(void *obj,
                                unsigned long obj_size,
@@ -612,11 +628,14 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
        ret = amdgpu_virt_read_pf2vf_data(adev);
        if (ret) {
                adev->virt.vf2pf_update_retry_cnt++;
-               if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
-                   amdgpu_sriov_runtime(adev)) {
+
+               if ((amdgpu_virt_rcvd_ras_interrupt(adev) ||
+                       adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
+                       amdgpu_sriov_runtime(adev)) {
+
                        amdgpu_ras_set_fed(adev, true);
                        if (amdgpu_reset_domain_schedule(adev->reset_domain,
-                                                         &adev->kfd.reset_work))
+                                                       &adev->kfd.reset_work))
                                return;
                        else
                                dev_err(adev->dev, "Failed to queue work! at %s", __func__);
index f04cd1586c7220c64921188adea4b70b418d90a9..b42a8854dca0cb3a9f1d56e3bac1438a0c7717d6 100644 (file)
@@ -52,7 +52,7 @@
 /* tonga/fiji use this offset */
 #define mmBIF_IOV_FUNC_IDENTIFIER 0x1503
 
-#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 5
+#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 2
 
 enum amdgpu_sriov_vf_mode {
        SRIOV_VF_MODE_BARE_METAL = 0,
@@ -94,6 +94,7 @@ struct amdgpu_virt_ops {
                          u32 data1, u32 data2, u32 data3);
        void (*ras_poison_handler)(struct amdgpu_device *adev,
                                        enum amdgpu_ras_block block);
+       bool (*rcvd_ras_intr)(struct amdgpu_device *adev);
 };
 
 /*
@@ -352,6 +353,7 @@ void amdgpu_virt_ready_to_reset(struct amdgpu_device *adev);
 int amdgpu_virt_wait_reset(struct amdgpu_device *adev);
 int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev);
 void amdgpu_virt_free_mm_table(struct amdgpu_device *adev);
+bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev);
 void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev);
 void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev);
 void amdgpu_virt_exchange_data(struct amdgpu_device *adev);
index 65656afc6ed1c2055d4e22a23708d8477d6e6695..f5411b798e1116d28ede811c4966eec60bcc3125 100644 (file)
@@ -408,6 +408,13 @@ static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev,
        xgpu_ai_send_access_requests(adev, IDH_RAS_POISON);
 }
 
+static bool xgpu_ai_rcvd_ras_intr(struct amdgpu_device *adev)
+{
+       enum idh_event msg = xgpu_ai_mailbox_peek_msg(adev);
+
+       return (msg == IDH_RAS_ERROR_DETECTED || msg == 0xFFFFFFFF);
+}
+
 const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
        .req_full_gpu   = xgpu_ai_request_full_gpu_access,
        .rel_full_gpu   = xgpu_ai_release_full_gpu_access,
@@ -417,4 +424,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
        .trans_msg = xgpu_ai_mailbox_trans_msg,
        .req_init_data  = xgpu_ai_request_init_data,
        .ras_poison_handler = xgpu_ai_ras_poison_handler,
+       .rcvd_ras_intr = xgpu_ai_rcvd_ras_intr,
 };
index c520b2fabfb9a8021e328a3358e024ac5e735ee5..ed57cbc150afba29d6f609f5b050746ef1fb7381 100644 (file)
@@ -51,7 +51,9 @@ enum idh_event {
        IDH_FAIL,
        IDH_QUERY_ALIVE,
        IDH_REQ_GPU_INIT_DATA_READY,
-
+       IDH_RAS_POISON_READY,
+       IDH_PF_SOFT_FLR_NOTIFICATION,
+       IDH_RAS_ERROR_DETECTED,
        IDH_TEXT_MESSAGE = 255,
 };
 
index 17e1e8cc243752a0a31d03770e996f92d53628c1..f47bd7ada4d79cd7b392583c03e056f5e570196f 100644 (file)
@@ -449,6 +449,13 @@ static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev,
        }
 }
 
+static bool xgpu_nv_rcvd_ras_intr(struct amdgpu_device *adev)
+{
+       enum idh_event msg = xgpu_nv_mailbox_peek_msg(adev);
+
+       return (msg == IDH_RAS_ERROR_DETECTED || msg == 0xFFFFFFFF);
+}
+
 const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
        .req_full_gpu   = xgpu_nv_request_full_gpu_access,
        .rel_full_gpu   = xgpu_nv_release_full_gpu_access,
@@ -458,4 +465,5 @@ const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
        .wait_reset = xgpu_nv_wait_reset,
        .trans_msg = xgpu_nv_mailbox_trans_msg,
        .ras_poison_handler = xgpu_nv_ras_poison_handler,
+       .rcvd_ras_intr = xgpu_nv_rcvd_ras_intr,
 };
index 1e8fd90cab434724a04aa2ce45f8b44b0ce37865..caf616a2c8a6c52db903c5ad11aed1643596723f 100644 (file)
@@ -26,7 +26,7 @@
 
 #define NV_MAILBOX_POLL_ACK_TIMEDOUT   500
 #define NV_MAILBOX_POLL_MSG_TIMEDOUT   6000
-#define NV_MAILBOX_POLL_FLR_TIMEDOUT   5000
+#define NV_MAILBOX_POLL_FLR_TIMEDOUT   10000
 #define NV_MAILBOX_POLL_MSG_REP_MAX    11
 
 enum idh_request {
@@ -52,7 +52,8 @@ enum idh_event {
        IDH_QUERY_ALIVE,
        IDH_REQ_GPU_INIT_DATA_READY,
        IDH_RAS_POISON_READY,
-
+       IDH_PF_SOFT_FLR_NOTIFICATION,
+       IDH_RAS_ERROR_DETECTED,
        IDH_TEXT_MESSAGE = 255,
 };