From: Tao Zhou Date: Mon, 6 Dec 2021 07:54:54 +0000 (+0800) Subject: drm/amdkfd: reset queue which consumes RAS poison (v2) X-Git-Tag: perf_urgent_for_v5.17_rc2~93^2^2~40 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=b6485bed40d7859735bdbfedbd55dcc8366a88a7;p=users%2Fdwmw2%2Flinux.git drm/amdkfd: reset queue which consumes RAS poison (v2) CP supports unmap queue with reset mode which only destroys specific queue without affecting others. Replacing whole gpu reset with reset queue mode for RAS poison consumption saves much time, and we can also fallback to gpu reset solution if reset queue fails. v2: Return directly if process is NULL; Reset queue solution is not applicable to SDMA, fallback to legacy way; Call kfd_unref_process after lookup process. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Acked-by: Felix Kuehling Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 46cf48b3904ac..0bf09a94d9441 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -721,13 +721,13 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev) return adev->have_atomics_support; } -void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev) +void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset) { struct ras_err_data err_data = {0, 0, 0, NULL}; /* CPU MCA will handle page retirement if connected_to_cpu is 1 */ if (!adev->gmc.xgmi.connected_to_cpu) - amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL); - else + amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset); + else if (reset) amdgpu_amdkfd_gpu_reset(adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index fcbc8a9c9e06d..61f899e54fd5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -296,7 +296,8 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev, uint64_t *mmap_offset); int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev, struct tile_config *config); -void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev); +void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, + bool reset); #if IS_ENABLED(CONFIG_HSA_AMD) void amdgpu_amdkfd_gpuvm_init_mem_limits(void); void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index deb64168c9e8b..b8ac28fb12315 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -89,6 +89,44 @@ enum SQ_INTERRUPT_ERROR_TYPE { #define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000 #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20 +static void event_interrupt_poison_consumption(struct kfd_dev *dev, + uint16_t pasid, uint16_t source_id) +{ + int ret = -EINVAL; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + + if (!p) + return; + + /* all queues of a process will be unmapped in one time */ + if (atomic_read(&p->poison)) { + kfd_unref_process(p); + return; + } + + atomic_set(&p->poison, 1); + kfd_unref_process(p); + + switch (source_id) { + case SOC15_INTSRC_SQ_INTERRUPT_MSG: + if (dev->dqm->ops.reset_queues) + ret = dev->dqm->ops.reset_queues(dev->dqm, pasid); + break; + case SOC15_INTSRC_SDMA_ECC: + default: + break; + } + + kfd_signal_poison_consumed_event(dev, pasid); + + /* resetting queue passes, do page retirement without gpu reset + resetting queue fails, fallback to gpu reset solution */ + if (!ret) + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false); + else + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true); +} + static bool event_interrupt_isr_v9(struct kfd_dev *dev, const uint32_t *ih_ring_entry, uint32_t *patched_ihre, @@ -230,8 +268,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, sq_intr_err); if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST && sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) { - kfd_signal_poison_consumed_event(dev, pasid); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev); + event_interrupt_poison_consumption(dev, pasid, source_id); return; } break; @@ -252,8 +289,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, if (source_id == SOC15_INTSRC_SDMA_TRAP) { kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28); } else if (source_id == SOC15_INTSRC_SDMA_ECC) { - kfd_signal_poison_consumed_event(dev, pasid); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev); + event_interrupt_poison_consumption(dev, pasid, source_id); return; } } else if (client_id == SOC15_IH_CLIENTID_VMC || diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 0c3f911e3bf4b..ea68f3b3a4e9c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -856,6 +856,8 @@ struct kfd_process { struct svm_range_list svms; bool xnack_enabled; + + atomic_t poison; }; #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */