www.infradead.org Git - users/dwmw2/linux.git/commitdiff
drm/amdgpu: add ras POISON_CONSUMPTION event id support
author Yang Wang <kevinyang.wang@amd.com>
Fri, 28 Jun 2024 08:24:39 +0000 (16:24 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 8 Jul 2024 20:55:37 +0000 (16:55 -0400)
Add amdgpu RAS POISON_CONSUMPTION event id support.

Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c

index ff90b8e4bc29b92deaab26386961271c42fa9da8..04278f13fd4b08a04b298f6347261ee5d4b8eb07 100644 (file)
@@ -2076,10 +2076,17 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
        struct amdgpu_ras_block_object *block_obj =
                amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
+       u64 event_id;
+       int ret;
 
        if (!block_obj || !con)
                return;
 
+       ret = amdgpu_ras_mark_ras_event(adev, type);
+       if (ret)
+               return;
+
        /* both query_poison_status and handle_poison_consumption are optional,
         * but at least one of them should be implemented if we need poison
         * consumption handler
@@ -2104,8 +2111,10 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
         * For RMA case, amdgpu_umc_poison_handler will handle gpu reset.
         */
        if (poison_stat && !con->is_rma) {
-               dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
-                               block_obj->ras_comm.name);
+               event_id = amdgpu_ras_acquire_event_id(adev, type);
+               RAS_EVENT_LOG(adev, event_id,
+                             "GPU reset for %s RAS poison consumption is issued!\n",
+                             block_obj->ras_comm.name);
                amdgpu_ras_reset_gpu(adev);
        }
 
@@ -2498,7 +2507,7 @@ static enum ras_event_type amdgpu_ras_get_fatal_error_event(struct amdgpu_device
        if (amdgpu_ras_intr_triggered())
                return RAS_EVENT_TYPE_FATAL;
        else
-               return RAS_EVENT_TYPE_INVALID;
+               return RAS_EVENT_TYPE_POISON_CONSUMPTION;
 }
 
 static void amdgpu_ras_do_recovery(struct work_struct *work)
@@ -3986,6 +3995,7 @@ u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type
        switch (type) {
        case RAS_EVENT_TYPE_FATAL:
        case RAS_EVENT_TYPE_POISON_CREATION:
+       case RAS_EVENT_TYPE_POISON_CONSUMPTION:
                event_mgr = __get_ras_event_mgr(adev);
                if (!event_mgr)
                        return RAS_EVENT_INVALID_ID;
index cc7a9be4fc1aa79fd545926ef841a8f5709af661..925b4df3109a9483b21dd5c82de4a1ec44786f61 100644 (file)
@@ -436,6 +436,7 @@ enum ras_event_type {
        RAS_EVENT_TYPE_INVALID = 0,
        RAS_EVENT_TYPE_FATAL,
        RAS_EVENT_TYPE_POISON_CREATION,
+       RAS_EVENT_TYPE_POISON_CONSUMPTION,
        RAS_EVENT_TYPE_COUNT,
 };
 
index da95b8ba87e4221d842ba9b4f125a4e4222a607b..a9c3580be8c9b9d149a475470e8f13f36d1cb2c4 100644 (file)
@@ -27,6 +27,7 @@
 #include "soc15_int.h"
 #include "kfd_device_queue_manager.h"
 #include "kfd_smi_events.h"
+#include "amdgpu_ras.h"
 
 /*
  * GFX9 SQ Interrupts
@@ -144,9 +145,11 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
                                uint16_t pasid, uint16_t client_id)
 {
        enum amdgpu_ras_block block = 0;
-       int old_poison;
        uint32_t reset = 0;
        struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+       enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
+       u64 event_id;
+       int old_poison, ret;
 
        if (!p)
                return;
@@ -193,10 +196,16 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
                return;
        }
 
+       ret = amdgpu_ras_mark_ras_event(dev->adev, type);
+       if (ret)
+               return;
+
        kfd_signal_poison_consumed_event(dev, pasid);
 
-       dev_warn(dev->adev->dev,
-                "poison is consumed by client %d, kick off gpu reset flow\n", client_id);
+       event_id = amdgpu_ras_acquire_event_id(dev->adev, type);
+
+       RAS_EVENT_LOG(dev->adev, event_id,
+                     "poison is consumed by client %d, kick off gpu reset flow\n", client_id);
 
        amdgpu_amdkfd_ras_pasid_poison_consumption_handler(dev->adev,
                block, pasid, NULL, NULL, reset);