drm/amdgpu: Add init level for post reset reinit
author     Lijo Lazar <lijo.lazar@amd.com>
           Fri, 15 Nov 2024 05:38:02 +0000 (11:08 +0530)
committer  Alex Deucher <alexander.deucher@amd.com>
           Wed, 20 Nov 2024 15:03:05 +0000 (10:03 -0500)
When a device needs to be reset before initialization, not all IPs are
required to be initialized before the reset. In such cases, the driver
needs to identify whether an IP/feature is being initialized for the
first time or reinitialized after a reset.

Add a RESET_RECOVERY init level to identify the post-reset
reinitialization phase. This only provides device-level identification;
IPs/features may also choose to track their state independently.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Acked-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
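
The intended usage pattern, roughly: a reset handler raises the level before
restoring IPs and drops it back to default once reinitialization succeeds.
The sketch below is illustrative only and assumes a hypothetical
example_restore_ip() helper standing in for the ASIC-specific restore path;
the real changes are in the diffs that follow.

        /*
         * Sketch only (not part of this patch): example_restore_ip() is a
         * hypothetical stand-in for the ASIC-specific mode2 restore path.
         */
        static int example_mode2_restore_hwcontext(struct amdgpu_device *adev)
        {
                int r;

                /* Mark the device as reinitializing after a reset */
                amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_RESET_RECOVERY);

                r = example_restore_ip(adev);
                if (r)
                        return r;

                /* Reinit is complete, drop back to the default init level */
                amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
                amdgpu_irq_gpu_reset_resume_helper(adev);

                return amdgpu_ib_ring_tests(adev);
        }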
drivers/gpu/drm/amd/amdgpu/aldebaran.c
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c

drivers/gpu/drm/amd/amdgpu/aldebaran.c
index 3a588fecb0c58362e149a556ea22ae95db3bc03d..f44de9d4b6a17f212bb9911bb3f33df69a349315 100644
@@ -330,6 +330,8 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
        }
 
        list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
+               amdgpu_set_init_level(tmp_adev,
+                               AMDGPU_INIT_LEVEL_RESET_RECOVERY);
                dev_info(tmp_adev->dev,
                         "GPU reset succeeded, trying to resume\n");
                r = aldebaran_mode2_restore_ip(tmp_adev);
@@ -375,6 +377,8 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
                                                        tmp_adev);
 
                if (!r) {
+                       amdgpu_set_init_level(tmp_adev,
+                                             AMDGPU_INIT_LEVEL_DEFAULT);
                        amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
 
                        r = amdgpu_ib_ring_tests(tmp_adev);
drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d8bc6da5001614e3add6999f8b53df2df42be6a3..4653a8d2823a6d6f645c620ba56caf0769e0d2b1 100644
@@ -839,6 +839,7 @@ struct amdgpu_mqd {
 enum amdgpu_init_lvl_id {
        AMDGPU_INIT_LEVEL_DEFAULT,
        AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
+       AMDGPU_INIT_LEVEL_RESET_RECOVERY,
 };
 
 struct amdgpu_init_level {
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0171d240fcb05e37d9a8a037b4d94e9103a57a36..5ef95161e632c9874661234df5ba7a6340407d68 100644
@@ -156,6 +156,11 @@ struct amdgpu_init_level amdgpu_init_default = {
        .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
 };
 
+struct amdgpu_init_level amdgpu_init_recovery = {
+       .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
+       .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
+};
+
 /*
  * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
  * is used for cases like reset on initialization where the entire hive needs to
@@ -182,6 +187,9 @@ void amdgpu_set_init_level(struct amdgpu_device *adev,
        case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
                adev->init_lvl = &amdgpu_init_minimal_xgmi;
                break;
+       case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
+               adev->init_lvl = &amdgpu_init_recovery;
+               break;
        case AMDGPU_INIT_LEVEL_DEFAULT:
                fallthrough;
        default:
@@ -5419,7 +5427,7 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
        struct list_head *device_list_handle;
        bool full_reset, vram_lost = false;
        struct amdgpu_device *tmp_adev;
-       int r;
+       int r, init_level;
 
        device_list_handle = reset_context->reset_device_list;
 
@@ -5428,10 +5436,18 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
 
        full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
 
+       /*
+        * If it's a reset on init, use the default init level, otherwise keep
+        * the level as recovery level.
+        */
+       if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
+               init_level = AMDGPU_INIT_LEVEL_DEFAULT;
+       else
+               init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;
+
        r = 0;
        list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
-               /* After reset, it's default init level */
-               amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT);
+               amdgpu_set_init_level(tmp_adev, init_level);
                if (full_reset) {
                        /* post card */
                        amdgpu_ras_set_fed(tmp_adev, false);
@@ -5518,6 +5534,9 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
 
 out:
                if (!r) {
+                       /* IP init is complete now, set level as default */
+                       amdgpu_set_init_level(tmp_adev,
+                                             AMDGPU_INIT_LEVEL_DEFAULT);
                        amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
                        r = amdgpu_ib_ring_tests(tmp_adev);
                        if (r) {
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 24dae7cdbe9547ed76b7bcd69ab2a87072235dd9..a0acb65f4b40afbcd9d1a305b4893aa66fadfde7 100644
@@ -342,3 +342,8 @@ void amdgpu_reset_get_desc(struct amdgpu_reset_context *rst_ctxt, char *buf,
                strscpy(buf, "unknown", len);
        }
 }
+
+bool amdgpu_reset_in_recovery(struct amdgpu_device *adev)
+{
+       return (adev->init_lvl->level == AMDGPU_INIT_LEVEL_RESET_RECOVERY);
+}
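
The new helper gives IP code a single device-level check. A hypothetical
consumer (not part of this patch) might look like the sketch below; all
example_* names are placeholders, only amdgpu_reset_in_recovery() comes from
this change.

        /* Sketch only; example_* names are illustrative, not existing amdgpu code. */
        static int example_ip_hw_init(struct amdgpu_device *adev)
        {
                int r;

                if (!amdgpu_reset_in_recovery(adev)) {
                        /* First-time-only setup, skipped when recovering from a reset */
                        r = example_alloc_persistent_state(adev);
                        if (r)
                                return r;
                }

                /* Programming that must run on every init, including post-reset */
                return example_program_hw(adev);
        }

As the commit message notes, this is only a device-level hint; IP blocks that
need finer-grained tracking can still keep their own state.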
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index f8628bc898df45183c6b6bf3155a3c1c82f32d92..4d9b9701139be520c2cfcc94bf6b1a182130959a 100644
@@ -158,4 +158,6 @@ extern struct amdgpu_reset_handler xgmi_reset_on_init_handler;
 int amdgpu_reset_do_xgmi_reset_on_init(
        struct amdgpu_reset_context *reset_context);
 
+bool amdgpu_reset_in_recovery(struct amdgpu_device *adev);
+
 #endif
drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
index 9b01e074af471e14790416eb56dce53edace03c1..2594467bdd8735dbea28648bfe3065614f757f65 100644
@@ -220,6 +220,7 @@ sienna_cichlid_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
        int r;
        struct amdgpu_device *tmp_adev = (struct amdgpu_device *)reset_ctl->handle;
 
+       amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_RESET_RECOVERY);
        dev_info(tmp_adev->dev,
                        "GPU reset succeeded, trying to resume\n");
        r = sienna_cichlid_mode2_restore_ip(tmp_adev);
@@ -237,6 +238,7 @@ sienna_cichlid_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
 
        amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
 
+       amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT);
        r = amdgpu_ib_ring_tests(tmp_adev);
        if (r) {
                dev_err(tmp_adev->dev,
drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
index e70ebad3f9fac499406deffa23bd66761cd7d001..70569ea906bca7652b4047496c0414781ce67eb3 100644
@@ -221,6 +221,7 @@ smu_v13_0_10_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
        int r;
        struct amdgpu_device *tmp_adev = (struct amdgpu_device *)reset_ctl->handle;
 
+       amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_RESET_RECOVERY);
        dev_info(tmp_adev->dev,
                        "GPU reset succeeded, trying to resume\n");
        r = smu_v13_0_10_mode2_restore_ip(tmp_adev);
@@ -234,6 +235,7 @@ smu_v13_0_10_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
 
        amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
 
+       amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT);
        r = amdgpu_ib_ring_tests(tmp_adev);
        if (r) {
                dev_err(tmp_adev->dev,