www.infradead.org Git - users/dwmw2/linux.git/commitdiff
drm/amdgpu: make reset method configurable for RAS poison
author Tao Zhou <tao.zhou1@amd.com>
Tue, 12 Mar 2024 03:30:09 +0000 (11:30 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 20 Mar 2024 17:38:15 +0000 (13:38 -0400)
Each RAS block has different requirements for GPU reset in poison
consumption handling.
Add support for mmhub RAS poison consumption handling.

v2: remove the mmhub poison support for kfd int v10.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c

index 8ee18c2c082a98542235ca465043bd400c12d373..3b4591f554f1b07b0d9b7a85e0077ec6d10bbaa4 100644 (file)
@@ -748,7 +748,7 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
 }
 
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
-       enum amdgpu_ras_block block, bool reset)
+       enum amdgpu_ras_block block, uint32_t reset)
 {
        amdgpu_umc_poison_handler(adev, block, reset);
 }
index 6b67f00259663d9ed6b9a9b4027b72b6273b6f7a..c51954c9052e14e875cc2c177c2a54157261d4c6 100644 (file)
@@ -336,7 +336,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
 int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
                                struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
-                       enum amdgpu_ras_block block, bool reset);
+                       enum amdgpu_ras_block block, uint32_t reset);
 bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
index 26662c76b293d313d53b2928ce91c96e87562f21..3c6d532824f6143ea07d31f43fa6416f5af7a1a9 100644 (file)
@@ -2051,7 +2051,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
                }
        }
 
-       amdgpu_umc_poison_handler(adev, obj->head.block, false);
+       amdgpu_umc_poison_handler(adev, obj->head.block, 0);
 
        if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
                poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
@@ -2704,7 +2704,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
                atomic_dec(&con->page_retirement_req_cnt);
 
                amdgpu_umc_bad_page_polling_timeout(adev,
-                               false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
+                               0, MAX_UMC_POISON_POLLING_TIME_ASYNC);
        }
 
        return 0;
index 20436f81856ad280f112bf52dd42ea6157443b04..f486510fc94c7d5c746aaad9920bc4edc941b6cf 100644 (file)
@@ -177,7 +177,7 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
 static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
                void *ras_error_status,
                struct amdgpu_iv_entry *entry,
-               bool reset)
+               uint32_t reset)
 {
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -186,9 +186,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
        amdgpu_umc_handle_bad_pages(adev, ras_error_status);
 
        if (err_data->ue_count && reset) {
-               /* use mode-2 reset for poison consumption */
-               if (!entry)
-                       con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+               con->gpu_reset_flags |= reset;
                amdgpu_ras_reset_gpu(adev);
        }
 
@@ -196,7 +194,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 }
 
 int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
-                       bool reset, uint32_t timeout_ms)
+                       uint32_t reset, uint32_t timeout_ms)
 {
        struct ras_err_data err_data;
        struct ras_common_if head = {
@@ -238,8 +236,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
        if (reset) {
                struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
-               /* use mode-2 reset for poison consumption */
-               con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+               con->gpu_reset_flags |= reset;
                amdgpu_ras_reset_gpu(adev);
        }
 
@@ -247,7 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
 }
 
 int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-                       enum amdgpu_ras_block block, bool reset)
+                       enum amdgpu_ras_block block, uint32_t reset)
 {
        int ret = AMDGPU_RAS_SUCCESS;
 
@@ -311,7 +308,8 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
                void *ras_error_status,
                struct amdgpu_iv_entry *entry)
 {
-       return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
+       return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry,
+                               AMDGPU_RAS_GPU_RESET_MODE1_RESET);
 }
 
 int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
index 5954e839d5808dcd4de710355149c3763a68a825..563b0249247ea951a1f44c2c8685aed163460c20 100644 (file)
@@ -101,7 +101,7 @@ struct amdgpu_umc {
 int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
 int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-                       enum amdgpu_ras_block block, bool reset);
+                       enum amdgpu_ras_block block, uint32_t reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
                struct amdgpu_irq_src *source,
                struct amdgpu_iv_entry *entry);
@@ -121,5 +121,5 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
                        umc_func func, void *data);
 
 int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
-                       bool reset, uint32_t timeout_ms);
+                       uint32_t reset, uint32_t timeout_ms);
 #endif
index 650da18b0d87b3289d4c8be3ce70f54e34f3ecf0..740f89aafbc06408dd03ff2eb3cdfa0685f6b028 100644 (file)
@@ -134,6 +134,7 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
 {
        enum amdgpu_ras_block block = 0;
        int old_poison, ret = -EINVAL;
+       uint32_t reset = 0;
        struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 
        if (!p)
@@ -153,6 +154,8 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
        case SOC15_IH_CLIENTID_UTCL2:
                ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
                block = AMDGPU_RAS_BLOCK__GFX;
+               if (ret)
+                       reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
                break;
        case SOC15_IH_CLIENTID_SDMA0:
        case SOC15_IH_CLIENTID_SDMA1:
@@ -160,6 +163,7 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
        case SOC15_IH_CLIENTID_SDMA3:
        case SOC15_IH_CLIENTID_SDMA4:
                block = AMDGPU_RAS_BLOCK__SDMA;
+               reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
                break;
        default:
                break;
@@ -170,17 +174,16 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
        /* resetting queue passes, do page retirement without gpu reset
         * resetting queue fails, fallback to gpu reset solution
         */
-       if (!ret) {
+       if (!ret)
                dev_warn(dev->adev->dev,
                        "RAS poison consumption, unmap queue flow succeeded: client id %d\n",
                        client_id);
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
-       } else {
+       else
                dev_warn(dev->adev->dev,
                        "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
                        client_id);
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
-       }
+
+       amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
 }
 
 static bool event_interrupt_isr_v10(struct kfd_node *dev,
index 7e2859736a558fe899c8d1bb438daa07523f2c59..d3d6b5c180b36ee635f6d656f4759b389ca5d117 100644 (file)
@@ -193,6 +193,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
 {
        enum amdgpu_ras_block block = 0;
        int ret = -EINVAL;
+       uint32_t reset = 0;
        struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 
        if (!p)
@@ -212,10 +213,13 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
                if (dev->dqm->ops.reset_queues)
                        ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
                block = AMDGPU_RAS_BLOCK__GFX;
+               if (ret)
+                       reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
                break;
        case SOC21_INTSRC_SDMA_ECC:
        default:
                block = AMDGPU_RAS_BLOCK__GFX;
+               reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
                break;
        }
 
@@ -223,10 +227,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
 
        /* resetting queue passes, do page retirement without gpu reset
           resetting queue fails, fallback to gpu reset solution */
-       if (!ret)
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
-       else
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
+       amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
 }
 
 static bool event_interrupt_isr_v11(struct kfd_node *dev,
index 11641f4645e6c1e049ec7ba735352615436e03b9..2a37ab7a715020078233fb931910d202173c913c 100644 (file)
@@ -145,6 +145,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
 {
        enum amdgpu_ras_block block = 0;
        int old_poison, ret = -EINVAL;
+       uint32_t reset = 0;
        struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 
        if (!p)
@@ -164,6 +165,15 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
        case SOC15_IH_CLIENTID_UTCL2:
                ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
                block = AMDGPU_RAS_BLOCK__GFX;
+               if (ret)
+                       reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+               break;
+       case SOC15_IH_CLIENTID_VMC:
+       case SOC15_IH_CLIENTID_VMC1:
+               ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
+               block = AMDGPU_RAS_BLOCK__MMHUB;
+               if (ret)
+                       reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
                break;
        case SOC15_IH_CLIENTID_SDMA0:
        case SOC15_IH_CLIENTID_SDMA1:
@@ -171,6 +181,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
        case SOC15_IH_CLIENTID_SDMA3:
        case SOC15_IH_CLIENTID_SDMA4:
                block = AMDGPU_RAS_BLOCK__SDMA;
+               reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
                break;
        default:
                break;
@@ -181,17 +192,16 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
        /* resetting queue passes, do page retirement without gpu reset
         * resetting queue fails, fallback to gpu reset solution
         */
-       if (!ret) {
+       if (!ret)
                dev_warn(dev->adev->dev,
                        "RAS poison consumption, unmap queue flow succeeded: client id %d\n",
                        client_id);
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
-       } else {
+       else
                dev_warn(dev->adev->dev,
                        "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
                        client_id);
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
-       }
+
+       amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
 }
 
 static bool context_id_expected(struct kfd_dev *dev)