struct ras_ih_if ih_info = {
.cb = amdgpu_gfx_process_ras_data_cb,
};
+ struct ras_query_if info = { 0 };
if (!adev->gfx.ras_if) {
adev->gfx.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
if (!adev->gfx.ras_if)
return -ENOMEM;
strcpy(adev->gfx.ras_if->name, "gfx");
}
fs_info.head = ih_info.head = *adev->gfx.ras_if;
-
r = amdgpu_ras_late_init(adev, adev->gfx.ras_if,
&fs_info, &ih_info);
if (r)
goto free;
if (amdgpu_ras_is_supported(adev, adev->gfx.ras_if->block)) {
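+ /* When the GPU is XGMI-connected to the CPU, query the current GFX
+  * error status; otherwise just clear any stale GFX error status
+  * before enabling the ECC error interrupt.
+  */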
+ if (adev->gmc.xgmi.connected_to_cpu) {
+ info.head = *adev->gfx.ras_if;
+ amdgpu_ras_query_error_status(adev, &info);
+ } else {
+ amdgpu_ras_reset_error_status(adev, AMDGPU_RAS_BLOCK__GFX);
+ }
+
r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
if (r)
goto late_fini;
void (*reset_ras_error_count) (struct amdgpu_device *adev);
void (*init_spm_golden)(struct amdgpu_device *adev);
void (*query_ras_error_status) (struct amdgpu_device *adev);
+ void (*reset_ras_error_status) (struct amdgpu_device *adev);
void (*update_perfmon_mgcg)(struct amdgpu_device *adev, bool enable);
void (*enable_watchdog_timer)(struct amdgpu_device *adev);
- void (*query_sq_timeout_status)(struct amdgpu_device *adev);
};
struct sq_work {
ssize_t s;
char val[128];
- if (amdgpu_ras_error_query(obj->adev, &info))
+ if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL;
s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
return snprintf(buf, PAGE_SIZE,
"Query currently inaccessible\n");
- if (amdgpu_ras_error_query(obj->adev, &info))
+ if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL;
return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
/* feature ctl end */
/* query/inject/cure begin */
-int amdgpu_ras_error_query(struct amdgpu_device *adev,
- struct ras_query_if *info)
+int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
+ struct ras_query_if *info)
{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
struct ras_err_data err_data = {0, 0, 0, NULL};
case AMDGPU_RAS_BLOCK__GFX:
if (adev->gfx.funcs->query_ras_error_count)
adev->gfx.funcs->query_ras_error_count(adev, &err_data);
+
+ if (adev->gfx.funcs->query_ras_error_status)
+ adev->gfx.funcs->query_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__MMHUB:
if (adev->mmhub.funcs->query_ras_error_count)
adev->mmhub.funcs->query_ras_error_count(adev, &err_data);
+
+ if (adev->mmhub.funcs->query_ras_error_status)
+ adev->mmhub.funcs->query_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__PCIE_BIF:
if (adev->nbio.funcs->query_ras_error_count)
return 0;
}
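+/*
+ * amdgpu_ras_reset_error_status - clear latched RAS error state for a block.
+ *
+ * Invokes the per-IP reset_ras_error_count callback and, for GFX, the
+ * reset_ras_error_status callback when implemented. Returns -EINVAL if
+ * RAS is not supported for the given block.
+ */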
+int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block)
+{
+ if (!amdgpu_ras_is_supported(adev, block))
+ return -EINVAL;
+
+ switch (block) {
+ case AMDGPU_RAS_BLOCK__GFX:
+ if (adev->gfx.funcs->reset_ras_error_count)
+ adev->gfx.funcs->reset_ras_error_count(adev);
+
+ if (adev->gfx.funcs->reset_ras_error_status)
+ adev->gfx.funcs->reset_ras_error_status(adev);
+ break;
+ case AMDGPU_RAS_BLOCK__MMHUB:
+ if (adev->mmhub.funcs->reset_ras_error_count)
+ adev->mmhub.funcs->reset_ras_error_count(adev);
+ break;
+ case AMDGPU_RAS_BLOCK__SDMA:
+ if (adev->sdma.funcs->reset_ras_error_count)
+ adev->sdma.funcs->reset_ras_error_count(adev);
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
/* Trigger XGMI/WAFL error */
static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
struct ta_ras_trigger_error_input *block_info)
.head = obj->head,
};
- if (amdgpu_ras_error_query(adev, &info))
+ if (amdgpu_ras_query_error_status(adev, &info))
return 0;
data.ce_count += info.ce_count;
if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
continue;
- amdgpu_ras_error_query(adev, &info);
+ amdgpu_ras_query_error_status(adev, &info);
}
}
case AMDGPU_RAS_BLOCK__GFX:
if (adev->gfx.funcs->query_ras_error_status)
adev->gfx.funcs->query_ras_error_status(adev);
-
- if (adev->gfx.funcs->query_sq_timeout_status)
- adev->gfx.funcs->query_sq_timeout_status(adev);
break;
case AMDGPU_RAS_BLOCK__MMHUB:
if (adev->mmhub.funcs->query_ras_error_status)
void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev);
-int amdgpu_ras_error_query(struct amdgpu_device *adev,
+int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
struct ras_query_if *info);
+int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block);
+
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
struct ras_inject_if *info);
.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
.query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
+ .reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status,
.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
- .query_sq_timeout_status = &gfx_v9_4_2_query_sq_timeout_status,
};
static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
if (adev->asic_type == CHIP_ALDEBARAN)
gfx_v9_4_2_set_power_brake_sequence(adev);
- if (adev->gfx.funcs->enable_watchdog_timer)
- adev->gfx.funcs->enable_watchdog_timer(adev);
-
return r;
}
if (r)
return r;
- if (adev->gfx.funcs &&
- adev->gfx.funcs->reset_ras_error_count)
- adev->gfx.funcs->reset_ras_error_count(adev);
-
r = amdgpu_gfx_ras_late_init(adev);
if (r)
return r;
+ if (adev->gfx.funcs->enable_watchdog_timer)
+ adev->gfx.funcs->enable_watchdog_timer(adev);
+
return 0;
}
SOC15_REG_GOLDEN_VALUE(GC, 0, regTCI_CNTL_3, 0xff, 0x20),
};
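+/* SQ timeout status helpers are local to this file and are reached only
+ * through the RAS error status query/reset paths below.
+ */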
+static void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev);
+static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev);
+
void gfx_v9_4_2_init_golden_registers(struct amdgpu_device *adev,
uint32_t die_id)
{
gfx_v9_4_2_query_sram_edc_count(adev, NULL, NULL);
gfx_v9_4_2_query_utc_edc_count(adev, NULL, NULL);
- gfx_v9_4_2_reset_utc_err_status(adev);
- gfx_v9_4_2_reset_ea_err_status(adev);
}
int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_if)
if (reg_value)
dev_warn(adev->dev, "GCEA err detected at instance: %d, status: 0x%x!\n",
j, reg_value);
+ /* clear after read */
+ WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_rdrsp_status_regs), 0x10);
}
}
uint32_t data;
data = RREG32_SOC15(GC, 0, regUTCL2_MEM_ECC_STATUS);
- if (!data)
+ if (data) {
dev_warn(adev->dev, "GFX UTCL2 Mem Ecc Status: 0x%x!\n", data);
+ WREG32_SOC15(GC, 0, regUTCL2_MEM_ECC_STATUS, 0x3);
+ }
data = RREG32_SOC15(GC, 0, regVML2_MEM_ECC_STATUS);
- if (!data)
+ if (data) {
dev_warn(adev->dev, "GFX VML2 Mem Ecc Status: 0x%x!\n", data);
+ WREG32_SOC15(GC, 0, regVML2_MEM_ECC_STATUS, 0x3);
+ }
data = RREG32_SOC15(GC, 0, regVML2_WALKER_MEM_ECC_STATUS);
- if (!data)
+ if (data) {
dev_warn(adev->dev, "GFX VML2 Walker Mem Ecc Status: 0x%x!\n", data);
+ WREG32_SOC15(GC, 0, regVML2_WALKER_MEM_ECC_STATUS, 0x3);
+ }
}
void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev)
gfx_v9_4_2_query_ea_err_status(adev);
gfx_v9_4_2_query_utc_err_status(adev);
+ gfx_v9_4_2_query_sq_timeout_status(adev);
+}
+
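+/* Clear all latched GFX error state: UTC ECC status, GCEA error status and
+ * per-CU SQ timeout status.
+ */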
+void gfx_v9_4_2_reset_ras_error_status(struct amdgpu_device *adev)
+{
+ if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
+ return;
+
+ gfx_v9_4_2_reset_utc_err_status(adev);
+ gfx_v9_4_2_reset_ea_err_status(adev);
+ gfx_v9_4_2_reset_sq_timeout_status(adev);
}
void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev)
}
}
-void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev)
+static void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev)
{
uint32_t se_idx, sh_idx, cu_idx;
uint32_t status;
}
gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
mutex_unlock(&adev->grbm_idx_mutex);
+}
+
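+/* Walk every SE/SH/CU and write 0 to SQ_TIMEOUT_STATUS to clear any latched
+ * SQ timeout indication.
+ */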
+static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev)
+{
+ uint32_t se_idx, sh_idx, cu_idx;
+
+ mutex_lock(&adev->grbm_idx_mutex);
+ for (se_idx = 0; se_idx < adev->gfx.config.max_shader_engines;
+ se_idx++) {
+ for (sh_idx = 0; sh_idx < adev->gfx.config.max_sh_per_se;
+ sh_idx++) {
+ for (cu_idx = 0;
+ cu_idx < adev->gfx.config.max_cu_per_sh;
+ cu_idx++) {
+ gfx_v9_4_2_select_se_sh(adev, se_idx, sh_idx,
+ cu_idx);
+ WREG32_SOC15(GC, 0, regSQ_TIMEOUT_STATUS, 0);
+ }
+ }
+ }
+ gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+ mutex_unlock(&adev->grbm_idx_mutex);
}
void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev);
int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status);
-
+void gfx_v9_4_2_reset_ras_error_status(struct amdgpu_device *adev);
void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev);
-void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev);
#endif /* __GFX_V9_4_2_H__ */