From: Andrey Grodzovsky Date: Thu, 29 Nov 2018 20:14:27 +0000 (-0500) Subject: drm/amdgpu: Implement concurrent asic reset for XGMI. X-Git-Tag: v5.0-rc1~185^2~4^2~39 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=d4535e2c018bba71b49edeb5e396183920f5d341;p=users%2Fhch%2Fuuid.git drm/amdgpu: Implement concurrent asic reset for XGMI. Use per hive wq to concurrently send reset commands to all nodes in the hive. v2: Switch to system_highpri_wq after dropping dedicated queue. Fix non XGMI code path KASAN error. Stop the hive reset for each node loop if there is a reset failure on any of the nodes. Signed-off-by: Andrey Grodzovsky Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index c8ad6bf6618a..6fc023bae7fe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -910,7 +910,9 @@ struct amdgpu_device { bool in_gpu_reset; struct mutex lock_reset; struct amdgpu_doorbell_index doorbell_index; + int asic_reset_res; + struct work_struct xgmi_reset_work; }; static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index bfd286c40631..9fd9f63adc08 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2356,6 +2356,19 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) return amdgpu_device_asic_has_dc_support(adev->asic_type); } + +static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) +{ + struct amdgpu_device *adev = + container_of(__work, struct amdgpu_device, xgmi_reset_work); + + adev->asic_reset_res = amdgpu_asic_reset(adev); + if (adev->asic_reset_res) + DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s", + adev->asic_reset_res, adev->ddev->unique); +} + + /** * amdgpu_device_init - initialize the driver * @@ -2454,6 +2467,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, amdgpu_device_delay_enable_gfx_off); + INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); + adev->gfx.gfx_off_req_count = 1; adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false; @@ -3331,10 +3346,31 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, */ if (need_full_reset) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - r = amdgpu_asic_reset(tmp_adev); - if (r) - DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s", + /* For XGMI run all resets in parallel to speed up the process */ + if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { + if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) + r = -EALREADY; + } else + r = amdgpu_asic_reset(tmp_adev); + + if (r) { + DRM_ERROR("ASIC reset failed with err r, %d for drm dev, %s", r, tmp_adev->ddev->unique); + break; + } + } + + /* For XGMI wait for all PSP resets to complete before proceed */ + if (!r) { + list_for_each_entry(tmp_adev, device_list_handle, + gmc.xgmi.head) { + if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { + flush_work(&tmp_adev->xgmi_reset_work); + r = tmp_adev->asic_reset_res; + if (r) + break; + } + } } } @@ -3521,8 +3557,6 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ if (tmp_adev == adev) continue; - dev_info(tmp_adev->dev, "GPU reset begin for drm dev %s!\n", adev->ddev->unique); - amdgpu_device_lock_adev(tmp_adev); r = amdgpu_device_pre_asic_reset(tmp_adev, NULL,