From 96d01ef3b106799dc6fcecfe03ceb0ccc14a2d54 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 21 Feb 2025 16:10:42 -0800 Subject: [PATCH 01/16] drivers: base: devres: Fix find_group() documentation It returns the last open group, not the last group. Acked-by: Greg Kroah-Hartman Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-3-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi --- drivers/base/devres.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/base/devres.c b/drivers/base/devres.c index 68224f2f83ff..830e9f4eb148 100644 --- a/drivers/base/devres.c +++ b/drivers/base/devres.c @@ -576,7 +576,10 @@ void *devres_open_group(struct device *dev, void *id, gfp_t gfp) } EXPORT_SYMBOL_GPL(devres_open_group); -/* Find devres group with ID @id. If @id is NULL, look for the latest. */ +/* + * Find devres group with ID @id. If @id is NULL, look for the latest open + * group. + */ static struct devres_group *find_group(struct device *dev, void *id) { struct devres_node *node; -- 2.51.0 From 2babfdfe2e9bd0b6aad30684c92b08c57d476d88 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 21 Feb 2025 16:10:43 -0800 Subject: [PATCH 02/16] drivers: base: component: Add debug message for unbind Like when binding component, add a debug message to the unbinding case to make it easy to track the lifecycle. This also includes the component pointer since that is used to open a group in devres, making it easier to track the resources. 
Acked-by: Greg Kroah-Hartman Reviewed-by: Rodrigo Vivi Reviewed-by: Tejas Upadhyay Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-4-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi --- drivers/base/component.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/base/component.c b/drivers/base/component.c index 741497324d78..5d10600bbc25 100644 --- a/drivers/base/component.c +++ b/drivers/base/component.c @@ -574,6 +574,9 @@ static void component_unbind(struct component *component, { WARN_ON(!component->bound); + dev_dbg(adev->parent, "unbinding %s component %p (ops %ps)\n", + dev_name(component->dev), component, component->ops); + if (component->ops && component->ops->unbind) component->ops->unbind(component->dev, adev->parent, data); component->bound = false; -- 2.51.0 From 83e3d0876754f820cb2adef55275d09d31676020 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 21 Feb 2025 16:10:44 -0800 Subject: [PATCH 03/16] drm/xe: Stop setting drvdata to NULL PCI subsystem is not supposed to call the remove() function when probe fails and doesn't need a protection for that. The only places checking for NULL drvdata are 2 sysfs files, and those checks shouldn't be needed since the files are removed and reads on open fds just return an error. For this protection the core driver implementation in drivers/base/dd.c:device_unbind_cleanup() already sets it to NULL, after the release of dev resources. Remove the setting to NULL so it's possible to obtain the xe pointer from callbacks like the component unbind from device_unbind_cleanup(), i.e. after xe_pci_remove() already finished. 
Reviewed-by: Rodrigo Vivi Reviewed-by: Jonathan Cavitt Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-5-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_device_sysfs.c | 6 ------ drivers/gpu/drm/xe/xe_pci.c | 7 +------ drivers/gpu/drm/xe/xe_survivability_mode.c | 1 - 3 files changed, 1 insertion(+), 13 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.c b/drivers/gpu/drm/xe/xe_device_sysfs.c index 7375937934fa..7efbd4c52791 100644 --- a/drivers/gpu/drm/xe/xe_device_sysfs.c +++ b/drivers/gpu/drm/xe/xe_device_sysfs.c @@ -32,9 +32,6 @@ vram_d3cold_threshold_show(struct device *dev, struct xe_device *xe = pdev_to_xe_device(pdev); int ret; - if (!xe) - return -EINVAL; - xe_pm_runtime_get(xe); ret = sysfs_emit(buf, "%d\n", xe->d3cold.vram_threshold); xe_pm_runtime_put(xe); @@ -51,9 +48,6 @@ vram_d3cold_threshold_store(struct device *dev, struct device_attribute *attr, u32 vram_d3cold_threshold; int ret; - if (!xe) - return -EINVAL; - ret = kstrtou32(buff, 0, &vram_d3cold_threshold); if (ret) return ret; diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index f8417f4d8ce6..078cc8d96085 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -765,11 +765,7 @@ static int xe_info_init(struct xe_device *xe, static void xe_pci_remove(struct pci_dev *pdev) { - struct xe_device *xe; - - xe = pdev_to_xe_device(pdev); - if (!xe) /* driver load aborted, nothing to cleanup */ - return; + struct xe_device *xe = pdev_to_xe_device(pdev); if (IS_SRIOV_PF(xe)) xe_pci_sriov_configure(pdev, 0); @@ -779,7 +775,6 @@ static void xe_pci_remove(struct pci_dev *pdev) xe_device_remove(xe); xe_pm_runtime_fini(xe); - pci_set_drvdata(pdev, NULL); } /* diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c index 02b4eadf8407..04a341606a7c 100644 --- a/drivers/gpu/drm/xe/xe_survivability_mode.c +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c 
@@ -202,7 +202,6 @@ void xe_survivability_mode_remove(struct xe_device *xe) sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); xe_heci_gsc_fini(xe); kfree(survivability->info); - pci_set_drvdata(pdev, NULL); } /** -- 2.51.0 From 01b1ace3b48171c4cbdd9b2e79e25099f6e3c861 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 21 Feb 2025 16:10:45 -0800 Subject: [PATCH 04/16] drm/xe: Switch from xe to devm actions Now that component drivers are compatible with devm, switch to using it instead of our own. Reviewed-by: Rodrigo Vivi Reviewed-by: Jonathan Cavitt Reviewed-by: Tejas Upadhyay Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-6-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/display/xe_display.c | 2 +- drivers/gpu/drm/xe/xe_gsc_proxy.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c index 7fef78f5606d..1909effd35a9 100644 --- a/drivers/gpu/drm/xe/display/xe_display.c +++ b/drivers/gpu/drm/xe/display/xe_display.c @@ -184,7 +184,7 @@ int xe_display_init(struct xe_device *xe) if (err) return err; - return xe_device_add_action_or_reset(xe, xe_display_fini, xe); + return devm_add_action_or_reset(xe->drm.dev, xe_display_fini, xe); } void xe_display_register(struct xe_device *xe) diff --git a/drivers/gpu/drm/xe/xe_gsc_proxy.c b/drivers/gpu/drm/xe/xe_gsc_proxy.c index 31c90577faf0..8cf70b228ff3 100644 --- a/drivers/gpu/drm/xe/xe_gsc_proxy.c +++ b/drivers/gpu/drm/xe/xe_gsc_proxy.c @@ -490,7 +490,7 @@ int xe_gsc_proxy_init(struct xe_gsc *gsc) gsc->proxy.component_added = true; - return xe_device_add_action_or_reset(xe, xe_gsc_proxy_remove, gsc); + return devm_add_action_or_reset(xe->drm.dev, xe_gsc_proxy_remove, gsc); } /** -- 2.51.0 From d01bdc00254c2d12d36b0dbb5d098286edeb00ea Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 21 Feb 2025 16:10:46 -0800 Subject: [PATCH 05/16] drm/xe: 
Drop remove callback support Now that devres supports component driver cleanup during driver removal cleanup, the xe custom support for removal callbacks is not needed anymore. Drop it. Reviewed-by: Rodrigo Vivi Reviewed-by: Jonathan Cavitt Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-7-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_device.c | 68 ---------------------------- drivers/gpu/drm/xe/xe_device.h | 3 -- drivers/gpu/drm/xe/xe_device_types.h | 14 ------ drivers/gpu/drm/xe/xe_pci.c | 4 +- 4 files changed, 1 insertion(+), 88 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 06ccff145050..858b3e5da9c5 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -65,12 +65,6 @@ #include -struct xe_device_remove_action { - struct list_head node; - void (*action)(void *); - void *data; -}; - static int xe_file_open(struct drm_device *dev, struct drm_file *file) { struct xe_device *xe = to_xe_device(dev); @@ -752,9 +746,6 @@ int xe_device_probe(struct xe_device *xe) int err; u8 id; - xe->probing = true; - INIT_LIST_HEAD(&xe->remove_action_list); - xe_pat_init_early(xe); err = xe_sriov_init(xe); @@ -904,8 +895,6 @@ int xe_device_probe(struct xe_device *xe) xe_vsec_init(xe); - xe->probing = false; - return devm_add_action_or_reset(xe->drm.dev, xe_device_sanitize, xe); err_unregister_display: @@ -916,61 +905,6 @@ err_remove_display: return err; } -/** - * xe_device_call_remove_actions - Call the remove actions - * @xe: xe device instance - * - * This is only to be used by xe_pci and xe_device to call the remove actions - * while removing the driver or handling probe failures. 
- */ -void xe_device_call_remove_actions(struct xe_device *xe) -{ - struct xe_device_remove_action *ra, *tmp; - - list_for_each_entry_safe(ra, tmp, &xe->remove_action_list, node) { - ra->action(ra->data); - list_del(&ra->node); - kfree(ra); - } - - xe->probing = false; -} - -/** - * xe_device_add_action_or_reset - Add an action to run on driver removal - * @xe: xe device instance - * @action: Function that should be called on device remove - * @data: Pointer to data passed to @action implementation - * - * This adds a custom action to the list of remove callbacks executed on device - * remove, before any dev or drm managed resources are removed. This is only - * needed if the action leads to component_del()/component_master_del() since - * that is not compatible with devres cleanup. - * - * Returns: 0 on success or a negative error code on failure, in which case - * @action is already called. - */ -int xe_device_add_action_or_reset(struct xe_device *xe, - void (*action)(void *), void *data) -{ - struct xe_device_remove_action *ra; - - drm_WARN_ON(&xe->drm, !xe->probing); - - ra = kmalloc(sizeof(*ra), GFP_KERNEL); - if (!ra) { - action(data); - return -ENOMEM; - } - - INIT_LIST_HEAD(&ra->node); - ra->action = action; - ra->data = data; - list_add(&ra->node, &xe->remove_action_list); - - return 0; -} - void xe_device_remove(struct xe_device *xe) { xe_display_unregister(xe); @@ -980,8 +914,6 @@ void xe_device_remove(struct xe_device *xe) xe_display_driver_remove(xe); xe_heci_gsc_fini(xe); - - xe_device_call_remove_actions(xe); } void xe_device_shutdown(struct xe_device *xe) diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h index 079dad32a6f5..0bc3bc8e6803 100644 --- a/drivers/gpu/drm/xe/xe_device.h +++ b/drivers/gpu/drm/xe/xe_device.h @@ -45,9 +45,6 @@ struct xe_device *xe_device_create(struct pci_dev *pdev, const struct pci_device_id *ent); int xe_device_probe_early(struct xe_device *xe); int xe_device_probe(struct xe_device *xe); -int 
xe_device_add_action_or_reset(struct xe_device *xe, - void (*action)(void *), void *data); -void xe_device_call_remove_actions(struct xe_device *xe); void xe_device_remove(struct xe_device *xe); void xe_device_shutdown(struct xe_device *xe); diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index 4cf08c408b95..28d10a1d7b64 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -428,20 +428,6 @@ struct xe_device { /** @tiles: device tiles */ struct xe_tile tiles[XE_MAX_TILES_PER_DEVICE]; - /** - * @remove_action_list: list of actions to execute on device remove. - * Use xe_device_add_remove_action() for that. Actions can only be added - * during probe and are executed during the call from PCI subsystem to - * remove the driver from the device. - */ - struct list_head remove_action_list; - - /** - * @probing: cover the section in which @remove_action_list can be used - * to post cleaning actions - */ - bool probing; - /** * @mem_access: keep track of memory access in the device, possibly * triggering additional actions when they occur. diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index 078cc8d96085..a0f4bd45b61b 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -895,10 +895,8 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return err; err = xe_device_probe(xe); - if (err) { - xe_device_call_remove_actions(xe); + if (err) return err; - } err = xe_pm_init(xe); if (err) -- 2.51.0 From d41d048043c47a5fce1879e8e95dc93a573d3708 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 21 Feb 2025 16:10:47 -0800 Subject: [PATCH 06/16] drm/xe/display: Drop xe_display_driver_remove() Handle it as part of xe_display_fini(). The error handling was already calling it if a step after xe_display_init() failed. Just re-use the same xe_display_fini() for driver remove. 
Cc: Rodrigo Vivi Cc: Jani Nikula Reviewed-by: Jonathan Cavitt Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-8-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/display/xe_display.c | 11 +---------- drivers/gpu/drm/xe/display/xe_display.h | 1 - drivers/gpu/drm/xe/xe_device.c | 8 ++------ 3 files changed, 3 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c index 1909effd35a9..279b786d64dc 100644 --- a/drivers/gpu/drm/xe/display/xe_display.c +++ b/drivers/gpu/drm/xe/display/xe_display.c @@ -170,6 +170,7 @@ static void xe_display_fini(void *arg) intel_hpd_poll_fini(xe); intel_hdcp_component_fini(display); intel_audio_deinit(xe); + intel_display_driver_remove(display); } int xe_display_init(struct xe_device *xe) @@ -209,16 +210,6 @@ void xe_display_unregister(struct xe_device *xe) intel_display_driver_unregister(display); } -void xe_display_driver_remove(struct xe_device *xe) -{ - struct intel_display *display = &xe->display; - - if (!xe->info.probe_display) - return; - - intel_display_driver_remove(display); -} - /* IRQ-related functions */ void xe_display_irq_handler(struct xe_device *xe, u32 master_ctl) diff --git a/drivers/gpu/drm/xe/display/xe_display.h b/drivers/gpu/drm/xe/display/xe_display.h index 685dc74402fb..46e14f8dee28 100644 --- a/drivers/gpu/drm/xe/display/xe_display.h +++ b/drivers/gpu/drm/xe/display/xe_display.h @@ -14,7 +14,6 @@ struct drm_driver; bool xe_display_driver_probe_defer(struct pci_dev *pdev); void xe_display_driver_set_hooks(struct drm_driver *driver); -void xe_display_driver_remove(struct xe_device *xe); int xe_display_create(struct xe_device *xe); diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 858b3e5da9c5..d50ac3d43511 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -868,11 +868,11 @@ int xe_device_probe(struct xe_device *xe) err = 
xe_pxp_init(xe); if (err) - goto err_remove_display; + return err; err = drm_dev_register(&xe->drm, 0); if (err) - goto err_remove_display; + return err; xe_display_register(xe); @@ -899,8 +899,6 @@ int xe_device_probe(struct xe_device *xe) err_unregister_display: xe_display_unregister(xe); -err_remove_display: - xe_display_driver_remove(xe); return err; } @@ -911,8 +909,6 @@ void xe_device_remove(struct xe_device *xe) drm_dev_unplug(&xe->drm); - xe_display_driver_remove(xe); - xe_heci_gsc_fini(xe); } -- 2.51.0 From d40f275d96e890ac58cdaf2a46cb928c4240fcb7 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 21 Feb 2025 16:10:48 -0800 Subject: [PATCH 07/16] drm/xe: Move survivability entirely to xe_pci There's an odd split between xe_pci.c and xe_device.c wrt xe_survivability: it's initialized by xe_device, but then finalized by xe_pci. Move it entirely to the outer layer, xe_pci, so it controls the flow entirely. This also allows to stop ignoring some of the errors. E.g.: if there's an -ENOMEM, it shouldn't continue as if survivability had been enabled. One change worth mentioning is that if "wait for lmem" fails, it will also check the pcode status to decide whether or not it should enter survivability mode, which it was not doing before. The bit from pcode for that decision should remain the same after lmem failed initialization, so it should be fine. 
Cc: Riana Tauro Reviewed-by: Jonathan Cavitt Reviewed-by: Riana Tauro Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-9-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_device.c | 7 +-- drivers/gpu/drm/xe/xe_heci_gsc.c | 2 +- drivers/gpu/drm/xe/xe_pci.c | 17 ++--- drivers/gpu/drm/xe/xe_survivability_mode.c | 73 +++++++++++----------- drivers/gpu/drm/xe/xe_survivability_mode.h | 5 +- 5 files changed, 49 insertions(+), 55 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index d50ac3d43511..ef269227b64b 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -53,7 +53,6 @@ #include "xe_pxp.h" #include "xe_query.h" #include "xe_sriov.h" -#include "xe_survivability_mode.h" #include "xe_tile.h" #include "xe_ttm_stolen_mgr.h" #include "xe_ttm_sys_mgr.h" @@ -695,12 +694,8 @@ int xe_device_probe_early(struct xe_device *xe) update_device_info(xe); err = xe_pcode_probe_early(xe); - if (err) { - if (xe_survivability_mode_required(xe)) - xe_survivability_mode_init(xe); - + if (err) return err; - } err = wait_for_lmem_ready(xe); if (err) diff --git a/drivers/gpu/drm/xe/xe_heci_gsc.c b/drivers/gpu/drm/xe/xe_heci_gsc.c index 06dc78d3a812..992ee47abcdb 100644 --- a/drivers/gpu/drm/xe/xe_heci_gsc.c +++ b/drivers/gpu/drm/xe/xe_heci_gsc.c @@ -201,7 +201,7 @@ void xe_heci_gsc_init(struct xe_device *xe) return; } - if (!def->use_polling && !xe_survivability_mode_enabled(xe)) { + if (!def->use_polling && !xe_survivability_mode_is_enabled(xe)) { ret = heci_gsc_irq_setup(xe); if (ret) goto fail; diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index a0f4bd45b61b..8b6658b214be 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -770,8 +770,8 @@ static void xe_pci_remove(struct pci_dev *pdev) if (IS_SRIOV_PF(xe)) xe_pci_sriov_configure(pdev, 0); - if (xe_survivability_mode_enabled(xe)) - return 
xe_survivability_mode_remove(xe); + if (xe_survivability_mode_is_enabled(xe)) + return; xe_device_remove(xe); xe_pm_runtime_fini(xe); @@ -846,13 +846,14 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err = xe_device_probe_early(xe); /* - * In Boot Survivability mode, no drm card is exposed - * and driver is loaded with bare minimum to allow - * for firmware to be flashed through mei. Return - * success if survivability mode is enabled. + * In Boot Survivability mode, no drm card is exposed and driver is + * loaded with bare minimum to allow for firmware to be flashed through + * mei. If early probe fails, check if survivability mode is flagged by + * HW to be enabled. In that case enable it and return success. */ if (err) { - if (xe_survivability_mode_enabled(xe)) + if (xe_survivability_mode_required(xe) && + !xe_survivability_mode_enable(xe)) return 0; return err; @@ -946,7 +947,7 @@ static int xe_pci_suspend(struct device *dev) struct xe_device *xe = pdev_to_xe_device(pdev); int err; - if (xe_survivability_mode_enabled(xe)) + if (xe_survivability_mode_is_enabled(xe)) return -EBUSY; err = xe_pm_suspend(xe); diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c index 04a341606a7c..7ba02e085b5b 100644 --- a/drivers/gpu/drm/xe/xe_survivability_mode.c +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c @@ -127,40 +127,54 @@ static ssize_t survivability_mode_show(struct device *dev, static DEVICE_ATTR_ADMIN_RO(survivability_mode); -static void enable_survivability_mode(struct pci_dev *pdev) +static void xe_survivability_mode_fini(void *arg) +{ + struct xe_device *xe = arg; + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + struct device *dev = &pdev->dev; + + sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); + xe_heci_gsc_fini(xe); +} + +static int enable_survivability_mode(struct pci_dev *pdev) { struct device *dev = &pdev->dev; struct xe_device *xe = 
pdev_to_xe_device(pdev); struct xe_survivability *survivability = &xe->survivability; int ret = 0; - /* set survivability mode */ - survivability->mode = true; - dev_info(dev, "In Survivability Mode\n"); - /* create survivability mode sysfs */ ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr); if (ret) { dev_warn(dev, "Failed to create survivability sysfs files\n"); - return; + return ret; } + ret = devm_add_action_or_reset(xe->drm.dev, + xe_survivability_mode_fini, xe); + if (ret) + return ret; + xe_heci_gsc_init(xe); xe_vsec_init(xe); + + survivability->mode = true; + dev_err(dev, "In Survivability Mode\n"); + + return 0; } /** - * xe_survivability_mode_enabled - check if survivability mode is enabled + * xe_survivability_mode_is_enabled - check if survivability mode is enabled * @xe: xe device instance * * Returns true if in survivability mode, false otherwise */ -bool xe_survivability_mode_enabled(struct xe_device *xe) +bool xe_survivability_mode_is_enabled(struct xe_device *xe) { - struct xe_survivability *survivability = &xe->survivability; - - return survivability->mode; + return xe->survivability.mode; } /** @@ -183,34 +197,19 @@ bool xe_survivability_mode_required(struct xe_device *xe) data = xe_mmio_read32(mmio, PCODE_SCRATCH(0)); survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data); - return (survivability->boot_status == NON_CRITICAL_FAILURE || - survivability->boot_status == CRITICAL_FAILURE); + return survivability->boot_status == NON_CRITICAL_FAILURE || + survivability->boot_status == CRITICAL_FAILURE; } /** - * xe_survivability_mode_remove - remove survivability mode + * xe_survivability_mode_enable - Initialize and enable the survivability mode * @xe: xe device instance * - * clean up sysfs entries of survivability mode - */ -void xe_survivability_mode_remove(struct xe_device *xe) -{ - struct xe_survivability *survivability = &xe->survivability; - struct pci_dev *pdev = to_pci_dev(xe->drm.dev); - struct device *dev = 
&pdev->dev; - - sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); - xe_heci_gsc_fini(xe); - kfree(survivability->info); -} - -/** - * xe_survivability_mode_init - Initialize the survivability mode - * @xe: xe device instance + * Initialize survivability information and enable survivability mode * - * Initializes survivability information and enables survivability mode + * Return: 0 for success, negative error code otherwise. */ -void xe_survivability_mode_init(struct xe_device *xe) +int xe_survivability_mode_enable(struct xe_device *xe) { struct xe_survivability *survivability = &xe->survivability; struct xe_survivability_info *info; @@ -218,9 +217,10 @@ void xe_survivability_mode_init(struct xe_device *xe) survivability->size = MAX_SCRATCH_MMIO; - info = kcalloc(survivability->size, sizeof(*info), GFP_KERNEL); + info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info), + GFP_KERNEL); if (!info) - return; + return -ENOMEM; survivability->info = info; @@ -229,9 +229,8 @@ void xe_survivability_mode_init(struct xe_device *xe) /* Only log debug information and exit if it is a critical failure */ if (survivability->boot_status == CRITICAL_FAILURE) { log_survivability_info(pdev); - kfree(survivability->info); - return; + return -ENXIO; } - enable_survivability_mode(pdev); + return enable_survivability_mode(pdev); } diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.h b/drivers/gpu/drm/xe/xe_survivability_mode.h index f530507a22c6..f4df5f9025ce 100644 --- a/drivers/gpu/drm/xe/xe_survivability_mode.h +++ b/drivers/gpu/drm/xe/xe_survivability_mode.h @@ -10,9 +10,8 @@ struct xe_device; -void xe_survivability_mode_init(struct xe_device *xe); -void xe_survivability_mode_remove(struct xe_device *xe); -bool xe_survivability_mode_enabled(struct xe_device *xe); +int xe_survivability_mode_enable(struct xe_device *xe); +bool xe_survivability_mode_is_enabled(struct xe_device *xe); bool xe_survivability_mode_required(struct xe_device *xe); #endif /* 
_XE_SURVIVABILITY_MODE_H_ */ -- 2.51.0 From 292b1a8a50545b47d4fafc54452147abd2d1d86c Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 21 Feb 2025 16:10:49 -0800 Subject: [PATCH 08/16] drm/xe: Stop ignoring errors from xe_heci_gsc_init() Do not ignore errors from xe_heci_gsc_init(). For example, it shouldn't be fine to report successfully entering survivability mode when communication with gsc is not working. The driver should not be left half-initialized in the normal case either. Cc: Riana Tauro Cc: Alexander Usyskin Reviewed-by: Jonathan Cavitt Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-10-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_device.c | 6 ++-- drivers/gpu/drm/xe/xe_heci_gsc.c | 35 +++++++++------------- drivers/gpu/drm/xe/xe_heci_gsc.h | 3 +- drivers/gpu/drm/xe/xe_survivability_mode.c | 5 ++-- 4 files changed, 21 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index ef269227b64b..5ef8cffbc88f 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -851,7 +851,9 @@ int xe_device_probe(struct xe_device *xe) return err; } - xe_heci_gsc_init(xe); + err = xe_heci_gsc_init(xe); + if (err) + return err; err = xe_oa_init(xe); if (err) @@ -903,8 +905,6 @@ void xe_device_remove(struct xe_device *xe) xe_display_unregister(xe); drm_dev_unplug(&xe->drm); - - xe_heci_gsc_fini(xe); } void xe_device_shutdown(struct xe_device *xe) diff --git a/drivers/gpu/drm/xe/xe_heci_gsc.c b/drivers/gpu/drm/xe/xe_heci_gsc.c index 992ee47abcdb..3ea325d3db99 100644 --- a/drivers/gpu/drm/xe/xe_heci_gsc.c +++ b/drivers/gpu/drm/xe/xe_heci_gsc.c @@ -89,12 +89,9 @@ static void heci_gsc_release_dev(struct device *dev) kfree(adev); } -void xe_heci_gsc_fini(struct xe_device *xe) +static void xe_heci_gsc_fini(void *arg) { - struct xe_heci_gsc *heci_gsc = &xe->heci_gsc; - - if (!xe->info.has_heci_gscfi && 
!xe->info.has_heci_cscfi) - return; + struct xe_heci_gsc *heci_gsc = arg; if (heci_gsc->adev) { struct auxiliary_device *aux_dev = &heci_gsc->adev->aux_dev; @@ -106,6 +103,7 @@ void xe_heci_gsc_fini(struct xe_device *xe) if (heci_gsc->irq >= 0) irq_free_desc(heci_gsc->irq); + heci_gsc->irq = -1; } @@ -172,14 +170,14 @@ static int heci_gsc_add_device(struct xe_device *xe, const struct heci_gsc_def * return ret; } -void xe_heci_gsc_init(struct xe_device *xe) +int xe_heci_gsc_init(struct xe_device *xe) { struct xe_heci_gsc *heci_gsc = &xe->heci_gsc; - const struct heci_gsc_def *def; + const struct heci_gsc_def *def = NULL; int ret; if (!xe->info.has_heci_gscfi && !xe->info.has_heci_cscfi) - return; + return 0; heci_gsc->irq = -1; @@ -191,29 +189,24 @@ void xe_heci_gsc_init(struct xe_device *xe) def = &heci_gsc_def_dg2; } else if (xe->info.platform == XE_DG1) { def = &heci_gsc_def_dg1; - } else { - drm_warn_once(&xe->drm, "Unknown platform\n"); - return; } - if (!def->name) { - drm_warn_once(&xe->drm, "HECI is not implemented!\n"); - return; + if (!def || !def->name) { + drm_warn(&xe->drm, "HECI is not implemented!\n"); + return 0; } + ret = devm_add_action_or_reset(xe->drm.dev, xe_heci_gsc_fini, heci_gsc); + if (ret) + return ret; + if (!def->use_polling && !xe_survivability_mode_is_enabled(xe)) { ret = heci_gsc_irq_setup(xe); if (ret) - goto fail; + return ret; } - ret = heci_gsc_add_device(xe, def); - if (ret) - goto fail; - - return; -fail: - xe_heci_gsc_fini(xe); + return heci_gsc_add_device(xe, def); } void xe_heci_gsc_irq_handler(struct xe_device *xe, u32 iir) diff --git a/drivers/gpu/drm/xe/xe_heci_gsc.h b/drivers/gpu/drm/xe/xe_heci_gsc.h index 48b3b1838045..745eb6783942 100644 --- a/drivers/gpu/drm/xe/xe_heci_gsc.h +++ b/drivers/gpu/drm/xe/xe_heci_gsc.h @@ -33,8 +33,7 @@ struct xe_heci_gsc { int irq; }; -void xe_heci_gsc_init(struct xe_device *xe); -void xe_heci_gsc_fini(struct xe_device *xe); +int xe_heci_gsc_init(struct xe_device *xe); void xe_heci_gsc_irq_handler(struct xe_device *xe, u32 iir); void 
xe_heci_csc_irq_handler(struct xe_device *xe, u32 iir); diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c index 7ba02e085b5b..d939ce70e6fa 100644 --- a/drivers/gpu/drm/xe/xe_survivability_mode.c +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c @@ -134,7 +134,6 @@ static void xe_survivability_mode_fini(void *arg) struct device *dev = &pdev->dev; sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); - xe_heci_gsc_fini(xe); } static int enable_survivability_mode(struct pci_dev *pdev) @@ -156,7 +155,9 @@ static int enable_survivability_mode(struct pci_dev *pdev) if (ret) return ret; - xe_heci_gsc_init(xe); + ret = xe_heci_gsc_init(xe); + if (ret) + return ret; xe_vsec_init(xe); -- 2.51.0 From 1671c9617d7e987f7cb815a77dcb2dbcf6d28988 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 21 Feb 2025 16:10:50 -0800 Subject: [PATCH 09/16] drm/xe: Rename update_device_info() after sriov This is only changing info flags for SR-IOV reasons. Rename it accordingly, because there are several other places in probe where the flags are updated, which is not inside this function. 
Cc: Michal Wajdeczko Reviewed-by: Jonathan Cavitt Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-11-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 5ef8cffbc88f..ed1cc5983f74 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -660,7 +660,7 @@ static int wait_for_lmem_ready(struct xe_device *xe) } ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO); /* See xe_pci_probe() */ -static void update_device_info(struct xe_device *xe) +static void sriov_update_device_info(struct xe_device *xe) { /* disable features that are not available/applicable to VFs */ if (IS_SRIOV_VF(xe)) { @@ -691,7 +691,7 @@ int xe_device_probe_early(struct xe_device *xe) xe_sriov_probe_early(xe); - update_device_info(xe); + sriov_update_device_info(xe); err = xe_pcode_probe_early(xe); if (err) -- 2.51.0 From 35359c36356a4226af1ba3956d48abf7ed136ebb Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 21 Feb 2025 16:10:51 -0800 Subject: [PATCH 10/16] drm/xe: Stop ignoring errors from xe_ttm_sys_mgr_init() xe_ttm_sys_mgr_init() already cleans up after itself, just return error if that failed. 
Reviewed-by: Jonathan Cavitt Reviewed-by: Tejas Upadhyay Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-12-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_device.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index ed1cc5983f74..c9ab79da3f9f 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -748,6 +748,7 @@ int xe_device_probe(struct xe_device *xe) return err; xe->info.mem_region_mask = 1; + err = xe_set_dma_info(xe); if (err) return err; @@ -756,7 +757,9 @@ int xe_device_probe(struct xe_device *xe) if (err) return err; - xe_ttm_sys_mgr_init(xe); + err = xe_ttm_sys_mgr_init(xe); + if (err) + return err; for_each_gt(gt, xe, id) { err = xe_gt_init_early(gt); -- 2.51.0 From 6b68c4542ffecc36087a9e14db8fc990c88bb01b Mon Sep 17 00:00:00 2001 From: Mingcong Bai Date: Tue, 25 Feb 2025 15:31:01 +0800 Subject: [PATCH 11/16] drm/xe/regs: remove a duplicate definition for RING_CTL_SIZE(size) Commit b79e8fd954c4 ("drm/xe: Remove dependency on intel_engine_regs.h") introduced an internal set of engine registers; however, as part of that change, it also introduced two identical `define' lines for `RING_CTL_SIZE(size)'. This commit was introduced to the tree in v6.8-rc1. This is harmless because the two definitions are identical, so no compiler warning was observed. Drop the duplicate line anyway for the sake of correctness. 
Cc: stable@vger.kernel.org # v6.8-rc1+ Fixes: b79e8fd954c4 ("drm/xe: Remove dependency on intel_engine_regs.h") Signed-off-by: Mingcong Bai Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20250225073104.865230-1-jeffbai@aosc.io Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/regs/xe_engine_regs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h index c8fd3d5ca502..4f372dc2cb89 100644 --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h @@ -53,7 +53,6 @@ #define RING_CTL(base) XE_REG((base) + 0x3c) #define RING_CTL_SIZE(size) ((size) - PAGE_SIZE) /* in bytes -> pages */ -#define RING_CTL_SIZE(size) ((size) - PAGE_SIZE) /* in bytes -> pages */ #define RING_START_UDW(base) XE_REG((base) + 0x48) -- 2.51.0 From 18fbd567e75f9b97b699b2ab4f1fa76b7cf268f6 Mon Sep 17 00:00:00 2001 From: Tejas Upadhyay Date: Tue, 25 Feb 2025 10:27:54 +0530 Subject: [PATCH 12/16] drm/xe: cancel pending job timer before freeing scheduler The async call to __guc_exec_queue_fini_async frees the scheduler while a submission may time out and restart. To prevent this race condition, the pending job timer should be canceled before freeing the scheduler. 
V3(MattB): - Adjust position of cancel pending job - Remove gitlab issue# from commit message V2(MattB): - Cancel pending jobs before scheduler finish Fixes: a20c75dba192 ("drm/xe: Call __guc_exec_queue_fini_async direct for KERNEL exec_queues") Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20250225045754.600905-1-tejas.upadhyay@intel.com Signed-off-by: Tejas Upadhyay --- drivers/gpu/drm/xe/xe_guc_submit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 913c74d6e2ae..b6a2dd742ebd 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -1248,6 +1248,8 @@ static void __guc_exec_queue_fini_async(struct work_struct *w) if (xe_exec_queue_is_lr(q)) cancel_work_sync(&ge->lr_tdr); + /* Confirm no work left behind accessing device structures */ + cancel_delayed_work_sync(&ge->sched.base.work_tdr); release_guc_id(guc, q); xe_sched_entity_fini(&ge->entity); xe_sched_fini(&ge->sched); -- 2.51.0 From 4f109b061c12d63b332338ce9192593842fa09a4 Mon Sep 17 00:00:00 2001 From: Francois Dugast Date: Tue, 25 Feb 2025 20:57:33 +0100 Subject: [PATCH 13/16] drm/xe/gt_stats: Use atomic64_t for counters The stats counters are now used for things like counting the VMA bytes during page faults. During workload execution, the counter value can grow fast and easily reach the atomic int limit, in which case it overflows. To make this less likely to happen, push the limit by switching to 64b atomic to store the counter value. Overhead is very small as there are only 3 stat entries per GT as of now, and stats are only enabled with CONFIG_DEBUG_FS. 
Suggested-by: Matthew Auld Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20250225195902.1247100-2-francois.dugast@intel.com Signed-off-by: Francois Dugast --- drivers/gpu/drm/xe/xe_gt_stats.c | 6 +++--- drivers/gpu/drm/xe/xe_gt_types.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_stats.c b/drivers/gpu/drm/xe/xe_gt_stats.c index 2e9879ea4674..af3fd03f665c 100644 --- a/drivers/gpu/drm/xe/xe_gt_stats.c +++ b/drivers/gpu/drm/xe/xe_gt_stats.c @@ -23,7 +23,7 @@ void xe_gt_stats_incr(struct xe_gt *gt, const enum xe_gt_stats_id id, int incr) if (id >= __XE_GT_STATS_NUM_IDS) return; - atomic_add(incr, &gt->stats.counters[id]); + atomic64_add(incr, &gt->stats.counters[id]); } static const char *const stat_description[__XE_GT_STATS_NUM_IDS] = { @@ -44,8 +44,8 @@ int xe_gt_stats_print_info(struct xe_gt *gt, struct drm_printer *p) enum xe_gt_stats_id id; for (id = 0; id < __XE_GT_STATS_NUM_IDS; ++id) - drm_printf(p, "%s: %d\n", stat_description[id], - atomic_read(&gt->stats.counters[id])); + drm_printf(p, "%s: %lld\n", stat_description[id], + atomic64_read(&gt->stats.counters[id])); return 0; } diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h index 6e66bf0e8b3f..f72b965cc9e6 100644 --- a/drivers/gpu/drm/xe/xe_gt_types.h +++ b/drivers/gpu/drm/xe/xe_gt_types.h @@ -139,7 +139,7 @@ struct xe_gt { /** @stats: GT stats */ struct { /** @stats.counters: counters for various GT stats */ - atomic_t counters[__XE_GT_STATS_NUM_IDS]; + atomic64_t counters[__XE_GT_STATS_NUM_IDS]; } stats; #endif -- 2.51.0 From 278d4f429143d1c5e7c4deb7d7147063da12606d Mon Sep 17 00:00:00 2001 From: Francois Dugast Date: Tue, 25 Feb 2025 20:57:34 +0100 Subject: [PATCH 14/16] drm/xe/gt_pagefault: Change vma_pagefault unit to kilobyte Increase the amount of bytes that can be counted before the counter overflows, while not losing information as the VMA is not expected to have sub-kilobyte size.
Suggested-by: Matthew Auld Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20250225195902.1247100-3-francois.dugast@intel.com Signed-off-by: Francois Dugast --- drivers/gpu/drm/xe/xe_gt_pagefault.c | 2 +- drivers/gpu/drm/xe/xe_gt_stats.c | 2 +- drivers/gpu/drm/xe/xe_gt_stats_types.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c index 46701ca11ce0..17d69039b866 100644 --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c @@ -137,7 +137,7 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct pagefault *pf, bool atomic; xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_COUNT, 1); - xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_BYTES, xe_vma_size(vma)); + xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_KB, xe_vma_size(vma) / 1024); trace_xe_vma_pagefault(vma); atomic = access_is_atomic(pf->access_type); diff --git a/drivers/gpu/drm/xe/xe_gt_stats.c b/drivers/gpu/drm/xe/xe_gt_stats.c index af3fd03f665c..6155ea354432 100644 --- a/drivers/gpu/drm/xe/xe_gt_stats.c +++ b/drivers/gpu/drm/xe/xe_gt_stats.c @@ -29,7 +29,7 @@ void xe_gt_stats_incr(struct xe_gt *gt, const enum xe_gt_stats_id id, int incr) static const char *const stat_description[__XE_GT_STATS_NUM_IDS] = { "tlb_inval_count", "vma_pagefault_count", - "vma_pagefault_bytes", + "vma_pagefault_kb", }; /** diff --git a/drivers/gpu/drm/xe/xe_gt_stats_types.h b/drivers/gpu/drm/xe/xe_gt_stats_types.h index b072bd80c4b9..d556771f99d6 100644 --- a/drivers/gpu/drm/xe/xe_gt_stats_types.h +++ b/drivers/gpu/drm/xe/xe_gt_stats_types.h @@ -9,7 +9,7 @@ enum xe_gt_stats_id { XE_GT_STATS_ID_TLB_INVAL, XE_GT_STATS_ID_VMA_PAGEFAULT_COUNT, - XE_GT_STATS_ID_VMA_PAGEFAULT_BYTES, + XE_GT_STATS_ID_VMA_PAGEFAULT_KB, /* must be the last entry */ __XE_GT_STATS_NUM_IDS, }; -- 2.51.0 From 8c5fe7d88bc1c12662a804fd75edb6ac85225ce2 Mon Sep 17 00:00:00 2001 From: Aradhya Bhatia 
Date: Thu, 20 Feb 2025 15:16:44 +0530 Subject: [PATCH 15/16] drm/xe: Add Wa_16021333562 and Wa_14016712196 Wa_16021333562 and Wa_14016712196 are permanent workarounds that apply to multiple platforms. Wa_16021333562 applies to platforms ranging from TGL (12.00) to Xe_LPM (13.00), while Wa_14016712196 from DG2 (12.55) to Xe_LPG (12.74). Reviewed-by: Tejas Upadhyay Signed-off-by: Aradhya Bhatia Link: https://patchwork.freedesktop.org/patch/msgid/20250220094645.358647-2-aradhya.bhatia@intel.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_guc_ads.c | 2 +- drivers/gpu/drm/xe/xe_ring_ops.c | 4 ++++ drivers/gpu/drm/xe/xe_wa_oob.rules | 4 ++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c index fab259adc380..e7c9e095a19f 100644 --- a/drivers/gpu/drm/xe/xe_guc_ads.c +++ b/drivers/gpu/drm/xe/xe_guc_ads.c @@ -342,7 +342,7 @@ static void guc_waklv_init(struct xe_guc_ads *ads) offset = guc_ads_waklv_offset(ads); remain = guc_ads_waklv_size(ads); - if (XE_WA(gt, 14019882105)) + if (XE_WA(gt, 14019882105) || XE_WA(gt, 16021333562)) guc_waklv_enable_simple(ads, GUC_WORKAROUND_KLV_BLOCK_INTERRUPTS_WHEN_MGSR_BLOCKED, &offset, &remain); diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c index 0c230ee53bba..d2f604aa96fa 100644 --- a/drivers/gpu/drm/xe/xe_ring_ops.c +++ b/drivers/gpu/drm/xe/xe_ring_ops.c @@ -177,6 +177,10 @@ static int emit_render_cache_flush(struct xe_sched_job *job, u32 *dw, int i) bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK); u32 flags; + if (XE_WA(gt, 14016712196)) + i = emit_pipe_control(dw, i, 0, PIPE_CONTROL_DEPTH_CACHE_FLUSH, + LRC_PPHWSP_FLUSH_INVAL_SCRATCH_ADDR, 0); + flags = (PIPE_CONTROL_CS_STALL | PIPE_CONTROL_TILE_CACHE_FLUSH | PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules index 228436532282..ea72bcc02e1e 100644 --- 
a/drivers/gpu/drm/xe/xe_wa_oob.rules +++ b/drivers/gpu/drm/xe/xe_wa_oob.rules @@ -43,3 +43,7 @@ no_media_l3 MEDIA_VERSION(3000) 14022866841 GRAPHICS_VERSION(3000), GRAPHICS_STEP(A0, B0) MEDIA_VERSION(3000), MEDIA_STEP(A0, B0) +16021333562 GRAPHICS_VERSION_RANGE(1200, 1274) + MEDIA_VERSION(1300) +14016712196 GRAPHICS_VERSION(1255) + GRAPHICS_VERSION_RANGE(1270, 1274) -- 2.51.0 From eef3ede533aea7a40e2f72a7886da4827f10eeac Mon Sep 17 00:00:00 2001 From: Aradhya Bhatia Date: Thu, 20 Feb 2025 15:16:45 +0530 Subject: [PATCH 16/16] drm/xe/oa: Refactor WAs to use XE_WA() macro Refactor Wa_18013179988, Wa_14015568240, Wa_1508761755, and Wa_1509372804, to use the proper workaround-check implementation for out-of-band workarounds, XE_WA(), and drop the use of the platform based WA selection. Reviewed-by: Tejas Upadhyay Reviewed-by: Lucas De Marchi Signed-off-by: Aradhya Bhatia Link: https://patchwork.freedesktop.org/patch/msgid/20250220094645.358647-3-aradhya.bhatia@intel.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_oa.c | 30 +++++++++--------------------- drivers/gpu/drm/xe/xe_wa_oob.rules | 5 +++++ 2 files changed, 14 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index 6bf5b793b29f..6f185632da14 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -12,6 +12,8 @@ #include #include +#include + #include "abi/guc_actions_slpc_abi.h" #include "instructions/xe_mi_commands.h" #include "regs/xe_engine_regs.h" @@ -35,6 +37,7 @@ #include "xe_sched_job.h" #include "xe_sriov.h" #include "xe_sync.h" +#include "xe_wa.h" #define DEFAULT_POLL_FREQUENCY_HZ 200 #define DEFAULT_POLL_PERIOD_NS (NSEC_PER_SEC / DEFAULT_POLL_FREQUENCY_HZ) @@ -812,11 +815,8 @@ static void xe_oa_disable_metric_set(struct xe_oa_stream *stream) struct xe_mmio *mmio = &stream->gt->mmio; u32 sqcnt1; - /* - * Wa_1508761755:xehpsdv, dg2 - * Enable thread stall DOP gating and EU DOP gating. 
- */ - if (stream->oa->xe->info.platform == XE_DG2) { + /* Enable thread stall DOP gating and EU DOP gating. */ + if (XE_WA(stream->gt, 1508761755)) { xe_gt_mcr_multicast_write(stream->gt, ROW_CHICKEN, _MASKED_BIT_DISABLE(STALL_DOP_GATING_DISABLE)); xe_gt_mcr_multicast_write(stream->gt, ROW_CHICKEN2, @@ -1065,11 +1065,10 @@ static int xe_oa_enable_metric_set(struct xe_oa_stream *stream) int ret; /* - * Wa_1508761755:xehpsdv, dg2 * EU NOA signals behave incorrectly if EU clock gating is enabled. * Disable thread stall DOP gating and EU DOP gating. */ - if (stream->oa->xe->info.platform == XE_DG2) { + if (XE_WA(stream->gt, 1508761755)) { xe_gt_mcr_multicast_write(stream->gt, ROW_CHICKEN, _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE)); xe_gt_mcr_multicast_write(stream->gt, ROW_CHICKEN2, @@ -1720,12 +1719,10 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream, } /* - * Wa_1509372804:pvc - * * GuC reset of engines causes OA to lose configuration * state. Prevent this by overriding GUCRC mode. 
*/ - if (stream->oa->xe->info.platform == XE_PVC) { + if (XE_WA(stream->gt, 1509372804)) { ret = xe_guc_pc_override_gucrc_mode(&gt->uc.guc.pc, SLPC_GUCRC_MODE_GUCRC_NO_RC6); if (ret) @@ -1857,23 +1854,14 @@ u32 xe_oa_timestamp_frequency(struct xe_gt *gt) { u32 reg, shift; - /* - * Wa_18013179988:dg2 - * Wa_14015568240:pvc - * Wa_14015846243:mtl - */ - switch (gt_to_xe(gt)->info.platform) { - case XE_DG2: - case XE_PVC: - case XE_METEORLAKE: + if (XE_WA(gt, 18013179988) || XE_WA(gt, 14015568240)) { xe_pm_runtime_get(gt_to_xe(gt)); reg = xe_mmio_read32(&gt->mmio, RPM_CONFIG0); xe_pm_runtime_put(gt_to_xe(gt)); shift = REG_FIELD_GET(RPM_CONFIG0_CTC_SHIFT_PARAMETER_MASK, reg); return gt->info.reference_clock << (3 - shift); - - default: + } else { return gt->info.reference_clock; } } diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules index ea72bcc02e1e..1dd02a231926 100644 --- a/drivers/gpu/drm/xe/xe_wa_oob.rules +++ b/drivers/gpu/drm/xe/xe_wa_oob.rules @@ -47,3 +47,8 @@ no_media_l3 MEDIA_VERSION(3000) MEDIA_VERSION(1300) 14016712196 GRAPHICS_VERSION(1255) GRAPHICS_VERSION_RANGE(1270, 1274) +14015568240 GRAPHICS_VERSION_RANGE(1255, 1260) +18013179988 GRAPHICS_VERSION(1255) + GRAPHICS_VERSION_RANGE(1270, 1274) +1508761755 GRAPHICS_VERSION(1255) + GRAPHICS_VERSION(1260), GRAPHICS_STEP(A0, B0) -- 2.51.0