From ad3d93230d6b6e65b7c320e211e5008ee86627b5 Mon Sep 17 00:00:00 2001 From: Asad Kamal Date: Sat, 12 Apr 2025 17:30:09 +0800 Subject: [PATCH 01/16] drm/amd/pm: Fill static metrics data Fill static metrics data for smu_v13_0_6 v2: Proceed with driver load just with warning even if board voltage reads invalid value Signed-off-by: Asad Kamal Reviewed-by: Lijo Lazar Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h | 1 + .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h index 21589c4583e6..9678d2593f8f 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h @@ -112,6 +112,7 @@ struct smu_13_0_dpm_context { uint32_t workload_policy_mask; uint32_t dcef_min_ds_clk; uint64_t caps; + uint32_t board_volt; }; enum smu_13_0_power_state { diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index b4bea0881ac9..41a9829215b8 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -739,6 +739,19 @@ static ssize_t smu_v13_0_6_get_pm_metrics(struct smu_context *smu, return pm_metrics->common_header.structure_size; } +static void smu_v13_0_6_fill_static_metrics_table(struct smu_context *smu, + StaticMetricsTable_t *static_metrics) +{ + struct smu_13_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context; + + if (!static_metrics->InputTelemetryVoltageInmV) { + dev_warn(smu->adev->dev, "Invalid board voltage %d\n", + static_metrics->InputTelemetryVoltageInmV); + } + + dpm_context->board_volt = static_metrics->InputTelemetryVoltageInmV; +} + int smu_v13_0_6_get_static_metrics_table(struct smu_context *smu) { struct smu_table_context *smu_table = &smu->smu_table; @@ -762,6 +775,7 @@ int 
smu_v13_0_6_get_static_metrics_table(struct smu_context *smu) static int smu_v13_0_6_setup_driver_pptable(struct smu_context *smu) { struct smu_table_context *smu_table = &smu->smu_table; + StaticMetricsTable_t *static_metrics = (StaticMetricsTable_t *)smu_table->metrics_table; MetricsTableV0_t *metrics_v0 = (MetricsTableV0_t *)smu_table->metrics_table; MetricsTableV1_t *metrics_v1 = (MetricsTableV1_t *)smu_table->metrics_table; MetricsTableV2_t *metrics_v2 = (MetricsTableV2_t *)smu_table->metrics_table; @@ -830,6 +844,7 @@ static int smu_v13_0_6_setup_driver_pptable(struct smu_context *smu) ret = smu_v13_0_6_get_static_metrics_table(smu); if (ret) return ret; + smu_v13_0_6_fill_static_metrics_table(smu, static_metrics); } } -- 2.51.0 From 3a2191efe45d00f9890071a9a81be15db9dab68d Mon Sep 17 00:00:00 2001 From: Asad Kamal Date: Sat, 12 Apr 2025 17:34:41 +0800 Subject: [PATCH 02/16] drm/amd/pm: Add voltage caps for smu_v13_0_6 Add & enable board voltage caps for smu_v13_0_6 v3: Update version check for board voltage support Signed-off-by: Asad Kamal Reviewed-by: Lijo Lazar Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 4 +++- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 41a9829215b8..f8e06913cd72 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -392,8 +392,10 @@ static void smu_v13_0_6_init_caps(struct smu_context *smu) if ((pgm == 7 && fw_ver >= 0x7550E00) || (pgm == 0 && fw_ver >= 0x00557E00)) smu_v13_0_6_cap_set(smu, SMU_CAP(HST_LIMIT_METRICS)); - if (fw_ver >= 0x00557F01) + if (fw_ver >= 0x00557F01) { smu_v13_0_6_cap_set(smu, SMU_CAP(STATIC_METRICS)); + smu_v13_0_6_cap_set(smu, SMU_CAP(BOARD_VOLTAGE)); + } } if (((pgm == 7) && (fw_ver 
>= 0x7550700)) || ((pgm == 0) && (fw_ver >= 0x00557900)) || diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h index c7a07aa5ef14..5313206ae4bb 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h @@ -66,6 +66,7 @@ enum smu_v13_0_6_caps { SMU_CAP(SDMA_RESET), SMU_CAP(STATIC_METRICS), SMU_CAP(HST_LIMIT_METRICS), + SMU_CAP(BOARD_VOLTAGE), SMU_CAP(ALL), }; -- 2.51.0 From ad7c088e31f026d71fe87fd09473fafb7d6ed006 Mon Sep 17 00:00:00 2001 From: "Jesse.Zhang" Date: Mon, 28 Apr 2025 10:35:19 +0800 Subject: [PATCH 03/16] drm/amdgpu: Fix API status offset for MES queue reset The mes_v11_0_reset_hw_queue and mes_v12_0_reset_hw_queue functions were using the wrong union type (MESAPI__REMOVE_QUEUE) when getting the offset for api_status. Since these functions handle queue reset operations, they should use MESAPI__RESET union instead. This fixes the polling of API status during hardware queue reset operations in the MES for both v11 and v12 versions. 
Signed-off-by: Jesse Zhang Reviewed-By: Shaoyun.liu Reviewed-by: Prike Liang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 0a5b7a296f08..b34d7bedc317 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -497,7 +497,7 @@ static int mes_v11_0_reset_hw_queue(struct amdgpu_mes *mes, return mes_v11_0_submit_pkt_and_poll_completion(mes, &mes_reset_queue_pkt, sizeof(mes_reset_queue_pkt), - offsetof(union MESAPI__REMOVE_QUEUE, api_status)); + offsetof(union MESAPI__RESET, api_status)); } static int mes_v11_0_map_legacy_queue(struct amdgpu_mes *mes, diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 1f7614dccb00..ee8b531b713d 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -517,7 +517,7 @@ static int mes_v12_0_reset_hw_queue(struct amdgpu_mes *mes, return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_reset_queue_pkt, sizeof(mes_reset_queue_pkt), - offsetof(union MESAPI__REMOVE_QUEUE, api_status)); + offsetof(union MESAPI__RESET, api_status)); } static int mes_v12_0_map_legacy_queue(struct amdgpu_mes *mes, -- 2.51.0 From 96ac487c120041715e3b87969a15e1c27d9d2d65 Mon Sep 17 00:00:00 2001 From: Asad Kamal Date: Thu, 20 Mar 2025 18:21:57 +0800 Subject: [PATCH 04/16] drm/amd/pm: Add board voltage node to hwmon Add and expose board voltage node as vddboard to hwmon for smu_v13_0_6 v2: Replace ip check with supported sensor attribute(Lijo) Signed-off-by: Asad Kamal Reviewed-by: Lijo Lazar Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- .../gpu/drm/amd/include/kgd_pp_interface.h | 1 + drivers/gpu/drm/amd/pm/amdgpu_pm.c | 34 +++++++++++++++++++ .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 10 
++++++ 3 files changed, 45 insertions(+) diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h b/drivers/gpu/drm/amd/include/kgd_pp_interface.h index 21dc956b5f35..0f7542d7074b 100644 --- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h @@ -128,6 +128,7 @@ enum amd_pp_sensors { AMDGPU_PP_SENSOR_CPU_CLK, AMDGPU_PP_SENSOR_VDDNB, AMDGPU_PP_SENSOR_VDDGFX, + AMDGPU_PP_SENSOR_VDDBOARD, AMDGPU_PP_SENSOR_UVD_VCLK, AMDGPU_PP_SENSOR_UVD_DCLK, AMDGPU_PP_SENSOR_VCE_ECCLK, diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index 922def51685b..5537dcf23b5c 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -2944,6 +2944,23 @@ static ssize_t amdgpu_hwmon_show_vddgfx(struct device *dev, return sysfs_emit(buf, "%d\n", vddgfx); } +static ssize_t amdgpu_hwmon_show_vddboard(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct amdgpu_device *adev = dev_get_drvdata(dev); + u32 vddboard; + int r; + + /* get the voltage */ + r = amdgpu_hwmon_get_sensor_generic(adev, AMDGPU_PP_SENSOR_VDDBOARD, + (void *)&vddboard); + if (r) + return r; + + return sysfs_emit(buf, "%d\n", vddboard); +} + static ssize_t amdgpu_hwmon_show_vddgfx_label(struct device *dev, struct device_attribute *attr, char *buf) @@ -2951,6 +2968,12 @@ static ssize_t amdgpu_hwmon_show_vddgfx_label(struct device *dev, return sysfs_emit(buf, "vddgfx\n"); } +static ssize_t amdgpu_hwmon_show_vddboard_label(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "vddboard\n"); +} static ssize_t amdgpu_hwmon_show_vddnb(struct device *dev, struct device_attribute *attr, char *buf) @@ -3294,6 +3317,8 @@ static SENSOR_DEVICE_ATTR(in0_input, S_IRUGO, amdgpu_hwmon_show_vddgfx, NULL, 0) static SENSOR_DEVICE_ATTR(in0_label, S_IRUGO, amdgpu_hwmon_show_vddgfx_label, NULL, 0); static SENSOR_DEVICE_ATTR(in1_input, S_IRUGO, amdgpu_hwmon_show_vddnb, 
NULL, 0); static SENSOR_DEVICE_ATTR(in1_label, S_IRUGO, amdgpu_hwmon_show_vddnb_label, NULL, 0); +static SENSOR_DEVICE_ATTR(in2_input, S_IRUGO, amdgpu_hwmon_show_vddboard, NULL, 0); +static SENSOR_DEVICE_ATTR(in2_label, S_IRUGO, amdgpu_hwmon_show_vddboard_label, NULL, 0); static SENSOR_DEVICE_ATTR(power1_average, S_IRUGO, amdgpu_hwmon_show_power_avg, NULL, 0); static SENSOR_DEVICE_ATTR(power1_input, S_IRUGO, amdgpu_hwmon_show_power_input, NULL, 0); static SENSOR_DEVICE_ATTR(power1_cap_max, S_IRUGO, amdgpu_hwmon_show_power_cap_max, NULL, 0); @@ -3341,6 +3366,8 @@ static struct attribute *hwmon_attributes[] = { &sensor_dev_attr_in0_label.dev_attr.attr, &sensor_dev_attr_in1_input.dev_attr.attr, &sensor_dev_attr_in1_label.dev_attr.attr, + &sensor_dev_attr_in2_input.dev_attr.attr, + &sensor_dev_attr_in2_label.dev_attr.attr, &sensor_dev_attr_power1_average.dev_attr.attr, &sensor_dev_attr_power1_input.dev_attr.attr, &sensor_dev_attr_power1_cap_max.dev_attr.attr, @@ -3492,6 +3519,13 @@ static umode_t hwmon_attributes_visible(struct kobject *kobj, attr == &sensor_dev_attr_in1_label.dev_attr.attr)) return 0; + /* only few boards support vddboard */ + if ((attr == &sensor_dev_attr_in2_input.dev_attr.attr || + attr == &sensor_dev_attr_in2_label.dev_attr.attr) && + amdgpu_hwmon_get_sensor_generic(adev, AMDGPU_PP_SENSOR_VDDBOARD, + (void *)&tmp) == -EOPNOTSUPP) + return 0; + /* no mclk on APUs other than gc 9,4,3*/ if (((adev->flags & AMD_IS_APU) && (gc_ver != IP_VERSION(9, 4, 3))) && (attr == &sensor_dev_attr_freq2_input.dev_attr.attr || diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index f8e06913cd72..b6e5da7b06ef 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -1652,6 +1652,7 @@ static int smu_v13_0_6_read_sensor(struct smu_context *smu, enum amd_pp_sensors sensor, void *data, uint32_t *size) { + struct smu_13_0_dpm_context 
*dpm_context = smu->smu_dpm.dpm_context; int ret = 0; if (amdgpu_ras_intr_triggered()) @@ -1696,6 +1697,15 @@ static int smu_v13_0_6_read_sensor(struct smu_context *smu, ret = smu_v13_0_get_gfx_vdd(smu, (uint32_t *)data); *size = 4; break; + case AMDGPU_PP_SENSOR_VDDBOARD: + if (smu_v13_0_6_cap_supported(smu, SMU_CAP(BOARD_VOLTAGE))) { + *(uint32_t *)data = dpm_context->board_volt; + *size = 4; + break; + } else { + ret = -EOPNOTSUPP; + break; + } case AMDGPU_PP_SENSOR_GPU_AVG_POWER: default: ret = -EOPNOTSUPP; -- 2.51.0 From 3805e6959ced4c0735a4ae53f0c56324c87c72b2 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Fri, 25 Apr 2025 12:31:19 +0530 Subject: [PATCH 05/16] drm/amdgpu: Fix query order of XGMI v6.4.1 status Keep the register offsets as per link order for querying XGMI v6.4.1 link status. Signed-off-by: Lijo Lazar Acked-by: Alex Deucher Tested-by: Mangesh Gadre Fixes: 6dee64e765c4 ("drm/amdgpu: Fix xgmi v6.4.1 link status reporting") Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 95231de26cb1..f51ef4cf16e0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -297,8 +297,8 @@ static const struct amdgpu_pcs_ras_field xgmi3x16_pcs_ras_fields[] = { static u32 xgmi_v6_4_get_link_status(struct amdgpu_device *adev, int global_link_num) { const u32 smn_xgmi_6_4_pcs_state_hist1[2] = { 0x11a00070, 0x11b00070 }; - const u32 smn_xgmi_6_4_1_pcs_state_hist1[2] = { 0x11b00070, - 0x12100070 }; + const u32 smn_xgmi_6_4_1_pcs_state_hist1[2] = { 0x12100070, + 0x11b00070 }; u32 i, n; u64 addr; -- 2.51.0 From 161949dd716e1dc738da0a0ff0efb20dd914e408 Mon Sep 17 00:00:00 2001 From: Yifan Zha Date: Mon, 21 Apr 2025 17:06:52 +0800 Subject: [PATCH 06/16] drm/amdgpu: refine MES register print for devices of hive [Why] Register access print missed 
device info. [How] Using dev_xxx instead of DRM_xxx to indicate which device of a hive is the message for. Signed-off-by: Yifan Zha Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 38ea64d87a0a..8d1a930e93ba 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -428,7 +428,7 @@ uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg) uint32_t *read_val_ptr; if (amdgpu_device_wb_get(adev, &addr_offset)) { - DRM_ERROR("critical bug! too many mes readers\n"); + dev_err(adev->dev, "critical bug! too many mes readers\n"); goto error; } read_val_gpu_addr = adev->wb.gpu_addr + (addr_offset * 4); @@ -438,13 +438,13 @@ uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg) op_input.read_reg.buffer_addr = read_val_gpu_addr; if (!adev->mes.funcs->misc_op) { - DRM_ERROR("mes rreg is not supported!\n"); + dev_err(adev->dev, "mes rreg is not supported!\n"); goto error; } r = adev->mes.funcs->misc_op(&adev->mes, &op_input); if (r) - DRM_ERROR("failed to read reg (0x%x)\n", reg); + dev_err(adev->dev, "failed to read reg (0x%x)\n", reg); else val = *(read_val_ptr); @@ -465,14 +465,14 @@ int amdgpu_mes_wreg(struct amdgpu_device *adev, op_input.write_reg.reg_value = val; if (!adev->mes.funcs->misc_op) { - DRM_ERROR("mes wreg is not supported!\n"); + dev_err(adev->dev, "mes wreg is not supported!\n"); r = -EINVAL; goto error; } r = adev->mes.funcs->misc_op(&adev->mes, &op_input); if (r) - DRM_ERROR("failed to write reg (0x%x)\n", reg); + dev_err(adev->dev, "failed to write reg (0x%x)\n", reg); error: return r; @@ -492,14 +492,14 @@ int amdgpu_mes_reg_write_reg_wait(struct amdgpu_device *adev, op_input.wrm_reg.mask = mask; if (!adev->mes.funcs->misc_op) { - DRM_ERROR("mes reg_write_reg_wait is not 
supported!\n"); + dev_err(adev->dev, "mes reg_write_reg_wait is not supported!\n"); r = -EINVAL; goto error; } r = adev->mes.funcs->misc_op(&adev->mes, &op_input); if (r) - DRM_ERROR("failed to reg_write_reg_wait\n"); + dev_err(adev->dev, "failed to reg_write_reg_wait\n"); error: return r; @@ -517,14 +517,14 @@ int amdgpu_mes_reg_wait(struct amdgpu_device *adev, uint32_t reg, op_input.wrm_reg.mask = mask; if (!adev->mes.funcs->misc_op) { - DRM_ERROR("mes reg wait is not supported!\n"); + dev_err(adev->dev, "mes reg wait is not supported!\n"); r = -EINVAL; goto error; } r = adev->mes.funcs->misc_op(&adev->mes, &op_input); if (r) - DRM_ERROR("failed to reg_write_reg_wait\n"); + dev_err(adev->dev, "failed to reg_write_reg_wait\n"); error: return r; -- 2.51.0 From cf1fcdeec4cacf12cad6a4b2cfb79fdc8e13fe9b Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Mon, 28 Apr 2025 15:07:07 +0530 Subject: [PATCH 07/16] drm/amdgpu: Print bootloader status for long waits If it needs a long wait for completion of bootloader execution, report the status in between. That helps to know if there is some issue during bootloader execution. 
Signed-off-by: Lijo Lazar Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 31 ++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c index f5f616ab20e7..f8af2cc63446 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c @@ -85,6 +85,8 @@ MODULE_FIRMWARE("amdgpu/psp_14_0_4_ta.bin"); #define regMP1_PUB_SCRATCH0 0x3b10090 +#define PSP13_BL_STATUS_SIZE 100 + static int psp_v13_0_init_microcode(struct psp_context *psp) { struct amdgpu_device *adev = psp->adev; @@ -151,6 +153,32 @@ static bool psp_v13_0_is_sos_alive(struct psp_context *psp) return sol_reg != 0x0; } +static void psp_v13_0_bootloader_print_status(struct psp_context *psp, + const char *msg) +{ + struct amdgpu_device *adev = psp->adev; + u32 bl_status_reg; + char bl_status_msg[PSP13_BL_STATUS_SIZE]; + int i, at; + + if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) || + amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) || + amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14)) { + at = 0; + for_each_inst(i, adev->aid_mask) { + bl_status_reg = + (SOC15_REG_OFFSET(MP0, 0, regMP0_SMN_C2PMSG_92) + << 2) + + adev->asic_funcs->encode_ext_smn_addressing(i); + at += snprintf(bl_status_msg + at, + PSP13_BL_STATUS_SIZE - at, + " status(%02i): 0x%08x", i, + RREG32_PCIE_EXT(bl_status_reg)); + } + dev_info(adev->dev, "%s - %s", msg, bl_status_msg); + } +} + static int psp_v13_0_wait_for_vmbx_ready(struct psp_context *psp) { struct amdgpu_device *adev = psp->adev; @@ -196,6 +224,9 @@ static int psp_v13_0_wait_for_bootloader(struct psp_context *psp) if (ret == 0) return 0; + if (retry_loop && !(retry_loop % 10)) + psp_v13_0_bootloader_print_status( + psp, "Waiting for bootloader completion"); } return ret; -- 2.51.0 From 3580440308a1817ce286351530b5b6a09493ba46 Mon Sep 17 00:00:00 2001 From: Lijo Lazar 
Date: Mon, 28 Apr 2025 10:29:04 +0530 Subject: [PATCH 08/16] drm/amd/pm: Fix comment style Fix code comment style Signed-off-by: Lijo Lazar Reviewed-by: Yang Wang Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202504271422.D6cqMlZ0-lkp@intel.com/ Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c index 453952cdc353..9ad46f545d15 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c @@ -1347,7 +1347,7 @@ static int arcturus_get_power_limit(struct smu_context *smu, *default_power_limit = power_limit; if (max_power_limit) *max_power_limit = power_limit; - /** + /* * No lower bound is imposed on the limit. Any unreasonable limit set * will result in frequent throttling. */ -- 2.51.0 From 0105725e2d985899cef5ee187bb27f040f24f2ab Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Mon, 28 Apr 2025 10:36:35 +0530 Subject: [PATCH 09/16] drm/amdgpu: Fix comment style Fix code comment style Signed-off-by: Lijo Lazar Reviewed-by: Asad Kamal Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202504271826.xy2fFO28-lkp@intel.com/ Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 17f0911ee7e9..82013b495436 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2165,7 +2165,7 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev) /* Fatal error events are handled on host side */ if (amdgpu_sriov_vf(adev)) return; - /** + /* * If the current interrupt is caused by a non-fatal RAS error, skip * check for fatal error. 
For fatal errors, FED status of all devices * in XGMI hive gets set when the first device gets fatal error -- 2.51.0 From 4e24c6bb5fab4d74205dde07c5c4e6c004f11938 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Sun, 27 Apr 2025 09:36:26 +0700 Subject: [PATCH 10/16] drm/amdgpu/userq: fix user_queue parameters list Sphinx reports htmldocs warning: Documentation/gpu/amdgpu/module-parameters:7: drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c:1119: ERROR: Unexpected indentation. [docutils] Fix the warning by using reST bullet list syntax for user_queue parameter options, separated from preceding paragraph by a blank line. Fixes: fb20954c9717 ("drm/amdgpu/userq: rework driver parameter") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20250422202956.176fb590@canb.auug.org.au/ Signed-off-by: Bagas Sanjaya Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index b9a1ef343c79..ec8057597c5a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1115,11 +1115,12 @@ module_param_named(rebar, amdgpu_rebar, int, 0444); /** * DOC: user_queue (int) - * Enable user queues on systems that support user queues. - * -1 = auto (ASIC specific default) - * 0 = user queues disabled - * 1 = user queues enabled and kernel queues enabled (if supported) - * 2 = user queues enabled and kernel queues disabled + * Enable user queues on systems that support user queues. 
Possible values: + * + * - -1 = auto (ASIC specific default) + * - 0 = user queues disabled + * - 1 = user queues enabled and kernel queues enabled (if supported) + * - 2 = user queues enabled and kernel queues disabled */ MODULE_PARM_DESC(user_queue, "Enable user queues (-1 = auto (default), 0 = disable, 1 = enable, 2 = enable UQs and disable KQs)"); module_param_named(user_queue, amdgpu_user_queue, int, 0444); -- 2.51.0 From 6535348a3eaa6a4521ff35367d661c432dedaf7b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 24 Apr 2025 16:35:30 -0400 Subject: [PATCH 11/16] drm/amdgpu/mes: remove more unused functions These were leftover from mes bring up and are unused. Reviewed-by: Sunil Khatri Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 62 ------------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 19 -------- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 26 ----------- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 27 ----------- 4 files changed, 134 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 8d1a930e93ba..2103c8dcd7f3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -285,68 +285,6 @@ int amdgpu_mes_resume(struct amdgpu_device *adev) return r; } -int amdgpu_mes_reset_hw_queue(struct amdgpu_device *adev, int queue_id) -{ - unsigned long flags; - struct amdgpu_mes_queue *queue; - struct amdgpu_mes_gang *gang; - struct mes_reset_queue_input queue_input; - int r; - - /* - * Avoid taking any other locks under MES lock to avoid circular - * lock dependencies. 
- */ - amdgpu_mes_lock(&adev->mes); - - /* remove the mes gang from idr list */ - spin_lock_irqsave(&adev->mes.queue_id_lock, flags); - - queue = idr_find(&adev->mes.queue_id_idr, queue_id); - if (!queue) { - spin_unlock_irqrestore(&adev->mes.queue_id_lock, flags); - amdgpu_mes_unlock(&adev->mes); - DRM_ERROR("queue id %d doesn't exist\n", queue_id); - return -EINVAL; - } - spin_unlock_irqrestore(&adev->mes.queue_id_lock, flags); - - DRM_DEBUG("try to reset queue, doorbell off = 0x%llx\n", - queue->doorbell_off); - - gang = queue->gang; - queue_input.doorbell_offset = queue->doorbell_off; - queue_input.gang_context_addr = gang->gang_ctx_gpu_addr; - - r = adev->mes.funcs->reset_hw_queue(&adev->mes, &queue_input); - if (r) - DRM_ERROR("failed to reset hardware queue, queue id = %d\n", - queue_id); - - amdgpu_mes_unlock(&adev->mes); - - return 0; -} - -int amdgpu_mes_reset_hw_queue_mmio(struct amdgpu_device *adev, int queue_type, - int me_id, int pipe_id, int queue_id, int vmid) -{ - struct mes_reset_queue_input queue_input; - int r; - - queue_input.queue_type = queue_type; - queue_input.use_mmio = true; - queue_input.me_id = me_id; - queue_input.pipe_id = pipe_id; - queue_input.queue_id = queue_id; - queue_input.vmid = vmid; - r = adev->mes.funcs->reset_hw_queue(&adev->mes, &queue_input); - if (r) - DRM_ERROR("failed to reset hardware queue by mmio, queue id = %d\n", - queue_id); - return r; -} - int amdgpu_mes_map_legacy_queue(struct amdgpu_device *adev, struct amdgpu_ring *ring) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index be3390d26301..af6e341f6411 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -235,18 +235,6 @@ struct mes_remove_queue_input { uint64_t gang_context_addr; }; -struct mes_reset_queue_input { - uint32_t doorbell_offset; - uint64_t gang_context_addr; - bool use_mmio; - uint32_t queue_type; - uint32_t me_id; - uint32_t pipe_id; - uint32_t 
queue_id; - uint32_t xcc_id; - uint32_t vmid; -}; - struct mes_map_legacy_queue_input { uint32_t queue_type; uint32_t doorbell_offset; @@ -377,9 +365,6 @@ struct amdgpu_mes_funcs { int (*reset_legacy_queue)(struct amdgpu_mes *mes, struct mes_reset_legacy_queue_input *input); - - int (*reset_hw_queue)(struct amdgpu_mes *mes, - struct mes_reset_queue_input *input); }; #define amdgpu_mes_kiq_hw_init(adev) (adev)->mes.kiq_hw_init((adev)) @@ -394,10 +379,6 @@ void amdgpu_mes_fini(struct amdgpu_device *adev); int amdgpu_mes_suspend(struct amdgpu_device *adev); int amdgpu_mes_resume(struct amdgpu_device *adev); -int amdgpu_mes_reset_hw_queue(struct amdgpu_device *adev, int queue_id); -int amdgpu_mes_reset_hw_queue_mmio(struct amdgpu_device *adev, int queue_type, - int me_id, int pipe_id, int queue_id, int vmid); - int amdgpu_mes_map_legacy_queue(struct amdgpu_device *adev, struct amdgpu_ring *ring); int amdgpu_mes_unmap_legacy_queue(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index b34d7bedc317..5ce62a3f01e7 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -475,31 +475,6 @@ static int mes_v11_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t queue_typ return r; } -static int mes_v11_0_reset_hw_queue(struct amdgpu_mes *mes, - struct mes_reset_queue_input *input) -{ - if (input->use_mmio) - return mes_v11_0_reset_queue_mmio(mes, input->queue_type, - input->me_id, input->pipe_id, - input->queue_id, input->vmid); - - union MESAPI__RESET mes_reset_queue_pkt; - - memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt)); - - mes_reset_queue_pkt.header.type = MES_API_TYPE_SCHEDULER; - mes_reset_queue_pkt.header.opcode = MES_SCH_API_RESET; - mes_reset_queue_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; - - mes_reset_queue_pkt.doorbell_offset = input->doorbell_offset; - mes_reset_queue_pkt.gang_context_addr = input->gang_context_addr; - 
/*mes_reset_queue_pkt.reset_queue_only = 1;*/ - - return mes_v11_0_submit_pkt_and_poll_completion(mes, - &mes_reset_queue_pkt, sizeof(mes_reset_queue_pkt), - offsetof(union MESAPI__RESET, api_status)); -} - static int mes_v11_0_map_legacy_queue(struct amdgpu_mes *mes, struct mes_map_legacy_queue_input *input) { @@ -817,7 +792,6 @@ static const struct amdgpu_mes_funcs mes_v11_0_funcs = { .resume_gang = mes_v11_0_resume_gang, .misc_op = mes_v11_0_misc_op, .reset_legacy_queue = mes_v11_0_reset_legacy_queue, - .reset_hw_queue = mes_v11_0_reset_hw_queue, }; static int mes_v11_0_allocate_ucode_buffer(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index ee8b531b713d..a3391810c897 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -494,32 +494,6 @@ static int mes_v12_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t queue_typ return r; } -static int mes_v12_0_reset_hw_queue(struct amdgpu_mes *mes, - struct mes_reset_queue_input *input) -{ - union MESAPI__RESET mes_reset_queue_pkt; - int pipe; - - memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt)); - - mes_reset_queue_pkt.header.type = MES_API_TYPE_SCHEDULER; - mes_reset_queue_pkt.header.opcode = MES_SCH_API_RESET; - mes_reset_queue_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; - - mes_reset_queue_pkt.doorbell_offset = input->doorbell_offset; - mes_reset_queue_pkt.gang_context_addr = input->gang_context_addr; - /*mes_reset_queue_pkt.reset_queue_only = 1;*/ - - if (mes->adev->enable_uni_mes) - pipe = AMDGPU_MES_KIQ_PIPE; - else - pipe = AMDGPU_MES_SCHED_PIPE; - - return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, - &mes_reset_queue_pkt, sizeof(mes_reset_queue_pkt), - offsetof(union MESAPI__RESET, api_status)); -} - static int mes_v12_0_map_legacy_queue(struct amdgpu_mes *mes, struct mes_map_legacy_queue_input *input) { @@ -914,7 +888,6 @@ static const struct amdgpu_mes_funcs 
mes_v12_0_funcs = { .resume_gang = mes_v12_0_resume_gang, .misc_op = mes_v12_0_misc_op, .reset_legacy_queue = mes_v12_0_reset_legacy_queue, - .reset_hw_queue = mes_v12_0_reset_hw_queue, }; static int mes_v12_0_allocate_ucode_buffer(struct amdgpu_device *adev, -- 2.51.0 From 2408b0272b042e751b6a3cbbc14bb638bccfcefc Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 24 Apr 2025 16:46:55 -0400 Subject: [PATCH 12/16] drm/amdgpu/mes: consolidate on a single mes reset callback Use the legacy one as it covers both kernel queues and user queues. Reviewed-by: Sunil Khatri Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 7 ++++--- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 8 ++++---- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 8 ++++---- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 2103c8dcd7f3..5de0d6c528f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -335,7 +335,7 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev, unsigned int vmid, bool use_mmio) { - struct mes_reset_legacy_queue_input queue_input; + struct mes_reset_queue_input queue_input; int r; memset(&queue_input, 0, sizeof(queue_input)); @@ -349,8 +349,10 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev, queue_input.wptr_addr = ring->wptr_gpu_addr; queue_input.vmid = vmid; queue_input.use_mmio = use_mmio; + if (ring->funcs->type == AMDGPU_RING_TYPE_GFX) + queue_input.legacy_gfx = true; - r = adev->mes.funcs->reset_legacy_queue(&adev->mes, &queue_input); + r = adev->mes.funcs->reset_hw_queue(&adev->mes, &queue_input); if (r) DRM_ERROR("failed to reset legacy queue\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index af6e341f6411..e98b0d892a59 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -266,7 +266,7 @@ struct mes_resume_gang_input { uint64_t gang_context_addr; }; -struct mes_reset_legacy_queue_input { +struct mes_reset_queue_input { uint32_t queue_type; uint32_t doorbell_offset; bool use_mmio; @@ -276,6 +276,7 @@ struct mes_reset_legacy_queue_input { uint64_t mqd_addr; uint64_t wptr_addr; uint32_t vmid; + bool legacy_gfx; }; enum mes_misc_opcode { @@ -363,8 +364,8 @@ struct amdgpu_mes_funcs { int (*misc_op)(struct amdgpu_mes *mes, struct mes_misc_op_input *input); - int (*reset_legacy_queue)(struct amdgpu_mes *mes, - struct mes_reset_legacy_queue_input *input); + int (*reset_hw_queue)(struct amdgpu_mes *mes, + struct mes_reset_queue_input *input); }; #define amdgpu_mes_kiq_hw_init(adev) (adev)->mes.kiq_hw_init((adev)) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 5ce62a3f01e7..c9eba537de09 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -746,8 +746,8 @@ static int mes_v11_0_set_hw_resources_1(struct amdgpu_mes *mes) offsetof(union MESAPI_SET_HW_RESOURCES_1, api_status)); } -static int mes_v11_0_reset_legacy_queue(struct amdgpu_mes *mes, - struct mes_reset_legacy_queue_input *input) +static int mes_v11_0_reset_hw_queue(struct amdgpu_mes *mes, + struct mes_reset_queue_input *input) { union MESAPI__RESET mes_reset_queue_pkt; @@ -765,7 +765,7 @@ static int mes_v11_0_reset_legacy_queue(struct amdgpu_mes *mes, mes_reset_queue_pkt.queue_type = convert_to_mes_queue_type(input->queue_type); - if (mes_reset_queue_pkt.queue_type == MES_QUEUE_TYPE_GFX) { + if (input->legacy_gfx) { mes_reset_queue_pkt.reset_legacy_gfx = 1; mes_reset_queue_pkt.pipe_id_lp = input->pipe_id; mes_reset_queue_pkt.queue_id_lp = input->queue_id; @@ -791,7 +791,7 @@ static const struct amdgpu_mes_funcs mes_v11_0_funcs = { .suspend_gang = mes_v11_0_suspend_gang, .resume_gang = mes_v11_0_resume_gang, .misc_op = mes_v11_0_misc_op, - 
.reset_legacy_queue = mes_v11_0_reset_legacy_queue, + .reset_hw_queue = mes_v11_0_reset_hw_queue, }; static int mes_v11_0_allocate_ucode_buffer(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index a3391810c897..f9f2fbc0a716 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -836,8 +836,8 @@ static void mes_v12_0_enable_unmapped_doorbell_handling( WREG32_SOC15(GC, 0, regCP_UNMAPPED_DOORBELL, data); } -static int mes_v12_0_reset_legacy_queue(struct amdgpu_mes *mes, - struct mes_reset_legacy_queue_input *input) +static int mes_v12_0_reset_hw_queue(struct amdgpu_mes *mes, + struct mes_reset_queue_input *input) { union MESAPI__RESET mes_reset_queue_pkt; int pipe; @@ -856,7 +856,7 @@ static int mes_v12_0_reset_legacy_queue(struct amdgpu_mes *mes, mes_reset_queue_pkt.queue_type = convert_to_mes_queue_type(input->queue_type); - if (mes_reset_queue_pkt.queue_type == MES_QUEUE_TYPE_GFX) { + if (input->legacy_gfx) { mes_reset_queue_pkt.reset_legacy_gfx = 1; mes_reset_queue_pkt.pipe_id_lp = input->pipe_id; mes_reset_queue_pkt.queue_id_lp = input->queue_id; @@ -887,7 +887,7 @@ static const struct amdgpu_mes_funcs mes_v12_0_funcs = { .suspend_gang = mes_v12_0_suspend_gang, .resume_gang = mes_v12_0_resume_gang, .misc_op = mes_v12_0_misc_op, - .reset_legacy_queue = mes_v12_0_reset_legacy_queue, + .reset_hw_queue = mes_v12_0_reset_hw_queue, }; static int mes_v12_0_allocate_ucode_buffer(struct amdgpu_device *adev, -- 2.51.0 From 2e828a25f850f1b7bd9be61fdac07bf6901b0d08 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 29 Apr 2025 15:27:20 -0400 Subject: [PATCH 13/16] drm/amdgpu/mes: use correct MES pipe for resets Use the KIQ pipe for kernel queues and the SCHED pipe for user queues. 
Fixes: 2408b0272b04 ("drm/amdgpu/mes: consolidate on a single mes reset callback") Cc: Michael Chen Cc: Shaoyun Liu Reviewed-by: Michael Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 + drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 5de0d6c528f4..2febb63ab232 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -349,6 +349,7 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev, queue_input.wptr_addr = ring->wptr_gpu_addr; queue_input.vmid = vmid; queue_input.use_mmio = use_mmio; + queue_input.is_kq = true; if (ring->funcs->type == AMDGPU_RING_TYPE_GFX) queue_input.legacy_gfx = true; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index e98b0d892a59..a41f65b4f733 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -277,6 +277,7 @@ struct mes_reset_queue_input { uint64_t wptr_addr; uint32_t vmid; bool legacy_gfx; + bool is_kq; }; enum mes_misc_opcode { diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index f9f2fbc0a716..b4f17332d466 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -869,7 +869,7 @@ static int mes_v12_0_reset_hw_queue(struct amdgpu_mes *mes, mes_reset_queue_pkt.doorbell_offset = input->doorbell_offset; } - if (mes->adev->enable_uni_mes) + if (input->is_kq) pipe = AMDGPU_MES_KIQ_PIPE; else pipe = AMDGPU_MES_SCHED_PIPE; -- 2.51.0 From aded8b3c36f17575604544fb10bfb01f1b197db1 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 29 Apr 2025 13:46:12 -0400 Subject: [PATCH 14/16] drm/amdgpu: properly handle GC vs MM in amdgpu_vmid_mgr_init() When kernel queues are disabled, all GC 
vmids are available for the scheduler. MM vmids are still managed by the driver so make all 16 available. Also fix gmc 10 vs 11 mix up in commit 1f61fc28b939 ("drm/amdgpu/mes: make more vmids available when disable_kq=1") v2: Properly handle pre-GC 10 hardware Fixes: 1f61fc28b939 ("drm/amdgpu/mes: make more vmids available when disable_kq=1") Cc: Arvind Yadav Reviewed-by: Arvind Yadav Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 12 ++++++++++-- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c index 359c19de9a5b..5dd78a9cb12d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c @@ -576,8 +576,16 @@ void amdgpu_vmid_mgr_init(struct amdgpu_device *adev) INIT_LIST_HEAD(&id_mgr->ids_lru); id_mgr->reserved_use_count = 0; - /* manage only VMIDs not used by KFD */ - id_mgr->num_ids = adev->vm_manager.first_kfd_vmid; + /* for GC <10, SDMA uses MMHUB so use first_kfd_vmid for both GC and MM */ + if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(10, 0, 0)) + /* manage only VMIDs not used by KFD */ + id_mgr->num_ids = adev->vm_manager.first_kfd_vmid; + else if (AMDGPU_IS_MMHUB0(i) || + AMDGPU_IS_MMHUB1(i)) + id_mgr->num_ids = 16; + else + /* manage only VMIDs not used by KFD */ + id_mgr->num_ids = adev->vm_manager.first_kfd_vmid; /* skip over VMID 0, since it is the system VM */ for (j = 1; j < id_mgr->num_ids; ++j) { diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 7648e977b44b..a3e2787501f1 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -896,7 +896,7 @@ static int gmc_v10_0_sw_init(struct amdgpu_ip_block *ip_block) * amdgpu graphics/compute will use VMIDs 1-7 * amdkfd will use VMIDs 8-15 */ - 
adev->vm_manager.first_kfd_vmid = adev->gfx.disable_kq ? 1 : 8; + adev->vm_manager.first_kfd_vmid = 8; amdgpu_vm_manager_init(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index 7f5ca170f141..917d894a1316 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -828,7 +828,7 @@ static int gmc_v11_0_sw_init(struct amdgpu_ip_block *ip_block) * amdgpu graphics/compute will use VMIDs 1-7 * amdkfd will use VMIDs 8-15 */ - adev->vm_manager.first_kfd_vmid = 8; + adev->vm_manager.first_kfd_vmid = adev->gfx.disable_kq ? 1 : 8; amdgpu_vm_manager_init(adev); -- 2.51.0 From d6c6d5ec6652f0c2492767d886e2c39d089d58b9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 30 Apr 2025 11:01:24 +0300 Subject: [PATCH 15/16] drm/amdgpu/userq: Call unreserve on error in amdgpu_userq_fence_read_wptr() This error path should call amdgpu_bo_unreserve() before returning. Fixes: d8675102ba32 ("drm/amdgpu: add vm root BO lock before accessing the vm") Signed-off-by: Dan Carpenter Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index 3288c2ff692e..34200cd04f27 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -370,6 +370,7 @@ static int amdgpu_userq_fence_read_wptr(struct amdgpu_usermode_queue *queue, mapping = amdgpu_vm_bo_lookup_mapping(queue->vm, addr >> PAGE_SHIFT); if (!mapping) { + amdgpu_bo_unreserve(queue->vm->root.bo); DRM_ERROR("Failed to lookup amdgpu_bo_va_mapping\n"); return -EINVAL; } -- 2.51.0 From 97c39b4da606f6e6ec62689d1963ee6a5ad7f8ac Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 30 Apr 2025 11:05:13 +0300 Subject: [PATCH 16/16] drm/amdgpu/userq: remove unnecessary NULL check MIME-Version: 1.0 Content-Type: text/plain; 
charset=utf8 Content-Transfer-Encoding: 8bit The "ticket" pointer points into the middle of the &exec struct so it can't be NULL. Remove the check. Reviewed-by: Christian König Acked-by: Shashank Sharma Signed-off-by: Dan Carpenter Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index afbe01149ed3..33544586ffaa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -623,7 +623,7 @@ amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr) clear = false; unlock = true; /* The caller is already holding the reservation lock */ - } else if (ticket && dma_resv_locking_ctx(resv) == ticket) { + } else if (dma_resv_locking_ctx(resv) == ticket) { clear = false; unlock = false; /* Somebody else is using the BO right now */ -- 2.51.0