From 96d01ef3b106799dc6fcecfe03ceb0ccc14a2d54 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@intel.com>
Date: Fri, 21 Feb 2025 16:10:42 -0800
Subject: [PATCH 01/16] drivers: base: devres: Fix find_group() documentation

It returns the last open group, not the last group.

Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-3-lucas.demarchi@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/base/devres.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/base/devres.c b/drivers/base/devres.c
index 68224f2f83ff..830e9f4eb148 100644
--- a/drivers/base/devres.c
+++ b/drivers/base/devres.c
@@ -576,7 +576,10 @@ void *devres_open_group(struct device *dev, void *id, gfp_t gfp)
 }
 EXPORT_SYMBOL_GPL(devres_open_group);
 
-/* Find devres group with ID @id.  If @id is NULL, look for the latest. */
+/*
+ * Find devres group with ID @id.  If @id is NULL, look for the latest open
+ * group.
+ */
 static struct devres_group *find_group(struct device *dev, void *id)
 {
 	struct devres_node *node;
-- 
2.51.0


From 2babfdfe2e9bd0b6aad30684c92b08c57d476d88 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@intel.com>
Date: Fri, 21 Feb 2025 16:10:43 -0800
Subject: [PATCH 02/16] drivers: base: component: Add debug message for unbind

As is already done when binding a component, add a debug message to the
unbinding case to make it easy to track the lifecycle. This also includes
the component pointer since that is used to open a group in devres,
making it easier to track the resources.

Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Tejas Upadhyay <tejas.upadhyay@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-4-lucas.demarchi@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/base/component.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/base/component.c b/drivers/base/component.c
index 741497324d78..5d10600bbc25 100644
--- a/drivers/base/component.c
+++ b/drivers/base/component.c
@@ -574,6 +574,9 @@ static void component_unbind(struct component *component,
 {
 	WARN_ON(!component->bound);
 
+	dev_dbg(adev->parent, "unbinding %s component %p (ops %ps)\n",
+		dev_name(component->dev), component, component->ops);
+
 	if (component->ops && component->ops->unbind)
 		component->ops->unbind(component->dev, adev->parent, data);
 	component->bound = false;
-- 
2.51.0


From 83e3d0876754f820cb2adef55275d09d31676020 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@intel.com>
Date: Fri, 21 Feb 2025 16:10:44 -0800
Subject: [PATCH 03/16] drm/xe: Stop setting drvdata to NULL

The PCI subsystem is not supposed to call the remove() function when
probe fails, so no protection is needed for that. The only places
checking for NULL drvdata are 2 sysfs files, and those checks shouldn't
be needed since the files are removed and reads on open fds just return
an error.

For this protection the core driver implementation in
drivers/base/dd.c:device_unbind_cleanup() already sets it to NULL, after
the release of dev resources.

Remove the setting to NULL so it's possible to obtain the xe pointer
from callbacks like the component unbind from device_unbind_cleanup(),
i.e. after xe_pci_remove() already finished.

Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-5-lucas.demarchi@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/xe_device_sysfs.c       | 6 ------
 drivers/gpu/drm/xe/xe_pci.c                | 7 +------
 drivers/gpu/drm/xe/xe_survivability_mode.c | 1 -
 3 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.c b/drivers/gpu/drm/xe/xe_device_sysfs.c
index 7375937934fa..7efbd4c52791 100644
--- a/drivers/gpu/drm/xe/xe_device_sysfs.c
+++ b/drivers/gpu/drm/xe/xe_device_sysfs.c
@@ -32,9 +32,6 @@ vram_d3cold_threshold_show(struct device *dev,
 	struct xe_device *xe = pdev_to_xe_device(pdev);
 	int ret;
 
-	if (!xe)
-		return -EINVAL;
-
 	xe_pm_runtime_get(xe);
 	ret = sysfs_emit(buf, "%d\n", xe->d3cold.vram_threshold);
 	xe_pm_runtime_put(xe);
@@ -51,9 +48,6 @@ vram_d3cold_threshold_store(struct device *dev, struct device_attribute *attr,
 	u32 vram_d3cold_threshold;
 	int ret;
 
-	if (!xe)
-		return -EINVAL;
-
 	ret = kstrtou32(buff, 0, &vram_d3cold_threshold);
 	if (ret)
 		return ret;
diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
index f8417f4d8ce6..078cc8d96085 100644
--- a/drivers/gpu/drm/xe/xe_pci.c
+++ b/drivers/gpu/drm/xe/xe_pci.c
@@ -765,11 +765,7 @@ static int xe_info_init(struct xe_device *xe,
 
 static void xe_pci_remove(struct pci_dev *pdev)
 {
-	struct xe_device *xe;
-
-	xe = pdev_to_xe_device(pdev);
-	if (!xe) /* driver load aborted, nothing to cleanup */
-		return;
+	struct xe_device *xe = pdev_to_xe_device(pdev);
 
 	if (IS_SRIOV_PF(xe))
 		xe_pci_sriov_configure(pdev, 0);
@@ -779,7 +775,6 @@ static void xe_pci_remove(struct pci_dev *pdev)
 
 	xe_device_remove(xe);
 	xe_pm_runtime_fini(xe);
-	pci_set_drvdata(pdev, NULL);
 }
 
 /*
diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c
index 02b4eadf8407..04a341606a7c 100644
--- a/drivers/gpu/drm/xe/xe_survivability_mode.c
+++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
@@ -202,7 +202,6 @@ void xe_survivability_mode_remove(struct xe_device *xe)
 	sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
 	xe_heci_gsc_fini(xe);
 	kfree(survivability->info);
-	pci_set_drvdata(pdev, NULL);
 }
 
 /**
-- 
2.51.0


From 01b1ace3b48171c4cbdd9b2e79e25099f6e3c861 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@intel.com>
Date: Fri, 21 Feb 2025 16:10:45 -0800
Subject: [PATCH 04/16] drm/xe: Switch from xe to devm actions

Now that component drivers are compatible with devm, switch to using it
instead of our own.

Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Reviewed-by: Tejas Upadhyay <tejas.upadhyay@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-6-lucas.demarchi@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/display/xe_display.c | 2 +-
 drivers/gpu/drm/xe/xe_gsc_proxy.c       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c
index 7fef78f5606d..1909effd35a9 100644
--- a/drivers/gpu/drm/xe/display/xe_display.c
+++ b/drivers/gpu/drm/xe/display/xe_display.c
@@ -184,7 +184,7 @@ int xe_display_init(struct xe_device *xe)
 	if (err)
 		return err;
 
-	return xe_device_add_action_or_reset(xe, xe_display_fini, xe);
+	return devm_add_action_or_reset(xe->drm.dev, xe_display_fini, xe);
 }
 
 void xe_display_register(struct xe_device *xe)
diff --git a/drivers/gpu/drm/xe/xe_gsc_proxy.c b/drivers/gpu/drm/xe/xe_gsc_proxy.c
index 31c90577faf0..8cf70b228ff3 100644
--- a/drivers/gpu/drm/xe/xe_gsc_proxy.c
+++ b/drivers/gpu/drm/xe/xe_gsc_proxy.c
@@ -490,7 +490,7 @@ int xe_gsc_proxy_init(struct xe_gsc *gsc)
 
 	gsc->proxy.component_added = true;
 
-	return xe_device_add_action_or_reset(xe, xe_gsc_proxy_remove, gsc);
+	return devm_add_action_or_reset(xe->drm.dev, xe_gsc_proxy_remove, gsc);
 }
 
 /**
-- 
2.51.0


From d01bdc00254c2d12d36b0dbb5d098286edeb00ea Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@intel.com>
Date: Fri, 21 Feb 2025 16:10:46 -0800
Subject: [PATCH 05/16] drm/xe: Drop remove callback support

Now that devres supports component driver cleanup during driver removal
cleanup, the xe custom support for removal callbacks is not needed
anymore. Drop it.

Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-7-lucas.demarchi@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/xe_device.c       | 68 ----------------------------
 drivers/gpu/drm/xe/xe_device.h       |  3 --
 drivers/gpu/drm/xe/xe_device_types.h | 14 ------
 drivers/gpu/drm/xe/xe_pci.c          |  4 +-
 4 files changed, 1 insertion(+), 88 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 06ccff145050..858b3e5da9c5 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -65,12 +65,6 @@
 
 #include <generated/xe_wa_oob.h>
 
-struct xe_device_remove_action {
-	struct list_head node;
-	void (*action)(void *);
-	void *data;
-};
-
 static int xe_file_open(struct drm_device *dev, struct drm_file *file)
 {
 	struct xe_device *xe = to_xe_device(dev);
@@ -752,9 +746,6 @@ int xe_device_probe(struct xe_device *xe)
 	int err;
 	u8 id;
 
-	xe->probing = true;
-	INIT_LIST_HEAD(&xe->remove_action_list);
-
 	xe_pat_init_early(xe);
 
 	err = xe_sriov_init(xe);
@@ -904,8 +895,6 @@ int xe_device_probe(struct xe_device *xe)
 
 	xe_vsec_init(xe);
 
-	xe->probing = false;
-
 	return devm_add_action_or_reset(xe->drm.dev, xe_device_sanitize, xe);
 
 err_unregister_display:
@@ -916,61 +905,6 @@ err_remove_display:
 	return err;
 }
 
-/**
- * xe_device_call_remove_actions - Call the remove actions
- * @xe: xe device instance
- *
- * This is only to be used by xe_pci and xe_device to call the remove actions
- * while removing the driver or handling probe failures.
- */
-void xe_device_call_remove_actions(struct xe_device *xe)
-{
-	struct xe_device_remove_action *ra, *tmp;
-
-	list_for_each_entry_safe(ra, tmp, &xe->remove_action_list, node) {
-		ra->action(ra->data);
-		list_del(&ra->node);
-		kfree(ra);
-	}
-
-	xe->probing = false;
-}
-
-/**
- * xe_device_add_action_or_reset - Add an action to run on driver removal
- * @xe: xe device instance
- * @action: Function that should be called on device remove
- * @data: Pointer to data passed to @action implementation
- *
- * This adds a custom action to the list of remove callbacks executed on device
- * remove, before any dev or drm managed resources are removed.  This is only
- * needed if the action leads to component_del()/component_master_del() since
- * that is not compatible with devres cleanup.
- *
- * Returns: 0 on success or a negative error code on failure, in which case
- * @action is already called.
- */
-int xe_device_add_action_or_reset(struct xe_device *xe,
-				  void (*action)(void *), void *data)
-{
-	struct xe_device_remove_action *ra;
-
-	drm_WARN_ON(&xe->drm, !xe->probing);
-
-	ra = kmalloc(sizeof(*ra), GFP_KERNEL);
-	if (!ra) {
-		action(data);
-		return -ENOMEM;
-	}
-
-	INIT_LIST_HEAD(&ra->node);
-	ra->action = action;
-	ra->data = data;
-	list_add(&ra->node, &xe->remove_action_list);
-
-	return 0;
-}
-
 void xe_device_remove(struct xe_device *xe)
 {
 	xe_display_unregister(xe);
@@ -980,8 +914,6 @@ void xe_device_remove(struct xe_device *xe)
 	xe_display_driver_remove(xe);
 
 	xe_heci_gsc_fini(xe);
-
-	xe_device_call_remove_actions(xe);
 }
 
 void xe_device_shutdown(struct xe_device *xe)
diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
index 079dad32a6f5..0bc3bc8e6803 100644
--- a/drivers/gpu/drm/xe/xe_device.h
+++ b/drivers/gpu/drm/xe/xe_device.h
@@ -45,9 +45,6 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
 				   const struct pci_device_id *ent);
 int xe_device_probe_early(struct xe_device *xe);
 int xe_device_probe(struct xe_device *xe);
-int xe_device_add_action_or_reset(struct xe_device *xe,
-				  void (*action)(void *), void *data);
-void xe_device_call_remove_actions(struct xe_device *xe);
 void xe_device_remove(struct xe_device *xe);
 void xe_device_shutdown(struct xe_device *xe);
 
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 4cf08c408b95..28d10a1d7b64 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -428,20 +428,6 @@ struct xe_device {
 	/** @tiles: device tiles */
 	struct xe_tile tiles[XE_MAX_TILES_PER_DEVICE];
 
-	/**
-	 * @remove_action_list: list of actions to execute on device remove.
-	 * Use xe_device_add_remove_action() for that. Actions can only be added
-	 * during probe and are executed during the call from PCI subsystem to
-	 * remove the driver from the device.
-	 */
-	struct list_head remove_action_list;
-
-	/**
-	 * @probing: cover the section in which @remove_action_list can be used
-	 * to post cleaning actions
-	 */
-	bool probing;
-
 	/**
 	 * @mem_access: keep track of memory access in the device, possibly
 	 * triggering additional actions when they occur.
diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
index 078cc8d96085..a0f4bd45b61b 100644
--- a/drivers/gpu/drm/xe/xe_pci.c
+++ b/drivers/gpu/drm/xe/xe_pci.c
@@ -895,10 +895,8 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		return err;
 
 	err = xe_device_probe(xe);
-	if (err) {
-		xe_device_call_remove_actions(xe);
+	if (err)
 		return err;
-	}
 
 	err = xe_pm_init(xe);
 	if (err)
-- 
2.51.0


From d41d048043c47a5fce1879e8e95dc93a573d3708 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@intel.com>
Date: Fri, 21 Feb 2025 16:10:47 -0800
Subject: [PATCH 06/16] drm/xe/display: Drop xe_display_driver_remove()

Handle it as part of xe_display_fini(). The error handling was already
calling it if a step after xe_display_init() failed. Just re-use the
same xe_display_fini() for driver remove.

Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-8-lucas.demarchi@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/display/xe_display.c | 11 +----------
 drivers/gpu/drm/xe/display/xe_display.h |  1 -
 drivers/gpu/drm/xe/xe_device.c          |  8 ++------
 3 files changed, 3 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c
index 1909effd35a9..279b786d64dc 100644
--- a/drivers/gpu/drm/xe/display/xe_display.c
+++ b/drivers/gpu/drm/xe/display/xe_display.c
@@ -170,6 +170,7 @@ static void xe_display_fini(void *arg)
 	intel_hpd_poll_fini(xe);
 	intel_hdcp_component_fini(display);
 	intel_audio_deinit(xe);
+	intel_display_driver_remove(display);
 }
 
 int xe_display_init(struct xe_device *xe)
@@ -209,16 +210,6 @@ void xe_display_unregister(struct xe_device *xe)
 	intel_display_driver_unregister(display);
 }
 
-void xe_display_driver_remove(struct xe_device *xe)
-{
-	struct intel_display *display = &xe->display;
-
-	if (!xe->info.probe_display)
-		return;
-
-	intel_display_driver_remove(display);
-}
-
 /* IRQ-related functions */
 
 void xe_display_irq_handler(struct xe_device *xe, u32 master_ctl)
diff --git a/drivers/gpu/drm/xe/display/xe_display.h b/drivers/gpu/drm/xe/display/xe_display.h
index 685dc74402fb..46e14f8dee28 100644
--- a/drivers/gpu/drm/xe/display/xe_display.h
+++ b/drivers/gpu/drm/xe/display/xe_display.h
@@ -14,7 +14,6 @@ struct drm_driver;
 
 bool xe_display_driver_probe_defer(struct pci_dev *pdev);
 void xe_display_driver_set_hooks(struct drm_driver *driver);
-void xe_display_driver_remove(struct xe_device *xe);
 
 int xe_display_create(struct xe_device *xe);
 
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 858b3e5da9c5..d50ac3d43511 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -868,11 +868,11 @@ int xe_device_probe(struct xe_device *xe)
 
 	err = xe_pxp_init(xe);
 	if (err)
-		goto err_remove_display;
+		return err;
 
 	err = drm_dev_register(&xe->drm, 0);
 	if (err)
-		goto err_remove_display;
+		return err;
 
 	xe_display_register(xe);
 
@@ -899,8 +899,6 @@ int xe_device_probe(struct xe_device *xe)
 
 err_unregister_display:
 	xe_display_unregister(xe);
-err_remove_display:
-	xe_display_driver_remove(xe);
 
 	return err;
 }
@@ -911,8 +909,6 @@ void xe_device_remove(struct xe_device *xe)
 
 	drm_dev_unplug(&xe->drm);
 
-	xe_display_driver_remove(xe);
-
 	xe_heci_gsc_fini(xe);
 }
 
-- 
2.51.0


From d40f275d96e890ac58cdaf2a46cb928c4240fcb7 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@intel.com>
Date: Fri, 21 Feb 2025 16:10:48 -0800
Subject: [PATCH 07/16] drm/xe: Move survivability entirely to xe_pci

There's an odd split between xe_pci.c and xe_device.c wrt
xe_survivability: it's initialized by xe_device, but then finalized by
xe_pci. Move it entirely to the outer layer, xe_pci, so it controls
the flow entirely.

This also allows us to stop ignoring some of the errors. E.g.: if
there's an -ENOMEM, it shouldn't continue as if survivability had been
enabled.

One change worth mentioning is that if "wait for lmem" fails, it will
also check the pcode status to decide whether or not to enter
survivability mode, which it was not doing before. The bit from pcode
for that decision should remain the same after lmem initialization
failed, so it should be fine.

Cc: Riana Tauro <riana.tauro@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Reviewed-by: Riana Tauro <riana.tauro@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-9-lucas.demarchi@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/xe_device.c             |  7 +--
 drivers/gpu/drm/xe/xe_heci_gsc.c           |  2 +-
 drivers/gpu/drm/xe/xe_pci.c                | 17 ++---
 drivers/gpu/drm/xe/xe_survivability_mode.c | 73 +++++++++++-----------
 drivers/gpu/drm/xe/xe_survivability_mode.h |  5 +-
 5 files changed, 49 insertions(+), 55 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index d50ac3d43511..ef269227b64b 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -53,7 +53,6 @@
 #include "xe_pxp.h"
 #include "xe_query.h"
 #include "xe_sriov.h"
-#include "xe_survivability_mode.h"
 #include "xe_tile.h"
 #include "xe_ttm_stolen_mgr.h"
 #include "xe_ttm_sys_mgr.h"
@@ -695,12 +694,8 @@ int xe_device_probe_early(struct xe_device *xe)
 	update_device_info(xe);
 
 	err = xe_pcode_probe_early(xe);
-	if (err) {
-		if (xe_survivability_mode_required(xe))
-			xe_survivability_mode_init(xe);
-
+	if (err)
 		return err;
-	}
 
 	err = wait_for_lmem_ready(xe);
 	if (err)
diff --git a/drivers/gpu/drm/xe/xe_heci_gsc.c b/drivers/gpu/drm/xe/xe_heci_gsc.c
index 06dc78d3a812..992ee47abcdb 100644
--- a/drivers/gpu/drm/xe/xe_heci_gsc.c
+++ b/drivers/gpu/drm/xe/xe_heci_gsc.c
@@ -201,7 +201,7 @@ void xe_heci_gsc_init(struct xe_device *xe)
 		return;
 	}
 
-	if (!def->use_polling && !xe_survivability_mode_enabled(xe)) {
+	if (!def->use_polling && !xe_survivability_mode_is_enabled(xe)) {
 		ret = heci_gsc_irq_setup(xe);
 		if (ret)
 			goto fail;
diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
index a0f4bd45b61b..8b6658b214be 100644
--- a/drivers/gpu/drm/xe/xe_pci.c
+++ b/drivers/gpu/drm/xe/xe_pci.c
@@ -770,8 +770,8 @@ static void xe_pci_remove(struct pci_dev *pdev)
 	if (IS_SRIOV_PF(xe))
 		xe_pci_sriov_configure(pdev, 0);
 
-	if (xe_survivability_mode_enabled(xe))
-		return xe_survivability_mode_remove(xe);
+	if (xe_survivability_mode_is_enabled(xe))
+		return;
 
 	xe_device_remove(xe);
 	xe_pm_runtime_fini(xe);
@@ -846,13 +846,14 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	err = xe_device_probe_early(xe);
 
 	/*
-	 * In Boot Survivability mode, no drm card is exposed
-	 * and driver is loaded with bare minimum to allow
-	 * for firmware to be flashed through mei. Return
-	 * success if survivability mode is enabled.
+	 * In Boot Survivability mode, no drm card is exposed and driver is
+	 * loaded with bare minimum to allow for firmware to be flashed through
+	 * mei. If early probe fails, check if survivability mode is flagged by
+	 * HW to be enabled. In that case enable it and return success.
 	 */
 	if (err) {
-		if (xe_survivability_mode_enabled(xe))
+		if (xe_survivability_mode_required(xe) &&
+		    xe_survivability_mode_enable(xe))
 			return 0;
 
 		return err;
@@ -946,7 +947,7 @@ static int xe_pci_suspend(struct device *dev)
 	struct xe_device *xe = pdev_to_xe_device(pdev);
 	int err;
 
-	if (xe_survivability_mode_enabled(xe))
+	if (xe_survivability_mode_is_enabled(xe))
 		return -EBUSY;
 
 	err = xe_pm_suspend(xe);
diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c
index 04a341606a7c..7ba02e085b5b 100644
--- a/drivers/gpu/drm/xe/xe_survivability_mode.c
+++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
@@ -127,40 +127,54 @@ static ssize_t survivability_mode_show(struct device *dev,
 
 static DEVICE_ATTR_ADMIN_RO(survivability_mode);
 
-static void enable_survivability_mode(struct pci_dev *pdev)
+static void xe_survivability_mode_fini(void *arg)
+{
+	struct xe_device *xe = arg;
+	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+	struct device *dev = &pdev->dev;
+
+	sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
+	xe_heci_gsc_fini(xe);
+}
+
+static int enable_survivability_mode(struct pci_dev *pdev)
 {
 	struct device *dev = &pdev->dev;
 	struct xe_device *xe = pdev_to_xe_device(pdev);
 	struct xe_survivability *survivability = &xe->survivability;
 	int ret = 0;
 
-	/* set survivability mode */
-	survivability->mode = true;
-	dev_info(dev, "In Survivability Mode\n");
-
 	/* create survivability mode sysfs */
 	ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr);
 	if (ret) {
 		dev_warn(dev, "Failed to create survivability sysfs files\n");
-		return;
+		return ret;
 	}
 
+	ret = devm_add_action_or_reset(xe->drm.dev,
+				       xe_survivability_mode_fini, xe);
+	if (ret)
+		return ret;
+
 	xe_heci_gsc_init(xe);
 
 	xe_vsec_init(xe);
+
+	survivability->mode = true;
+	dev_err(dev, "In Survivability Mode\n");
+
+	return 0;
 }
 
 /**
- * xe_survivability_mode_enabled - check if survivability mode is enabled
+ * xe_survivability_mode_is_enabled - check if survivability mode is enabled
  * @xe: xe device instance
  *
  * Returns true if in survivability mode, false otherwise
  */
-bool xe_survivability_mode_enabled(struct xe_device *xe)
+bool xe_survivability_mode_is_enabled(struct xe_device *xe)
 {
-	struct xe_survivability *survivability = &xe->survivability;
-
-	return survivability->mode;
+	return xe->survivability.mode;
 }
 
 /**
@@ -183,34 +197,19 @@ bool xe_survivability_mode_required(struct xe_device *xe)
 	data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
 	survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
 
-	return (survivability->boot_status == NON_CRITICAL_FAILURE ||
-		survivability->boot_status == CRITICAL_FAILURE);
+	return survivability->boot_status == NON_CRITICAL_FAILURE ||
+		survivability->boot_status == CRITICAL_FAILURE;
 }
 
 /**
- * xe_survivability_mode_remove - remove survivability mode
+ * xe_survivability_mode_enable - Initialize and enable the survivability mode
  * @xe: xe device instance
  *
- * clean up sysfs entries of survivability mode
- */
-void xe_survivability_mode_remove(struct xe_device *xe)
-{
-	struct xe_survivability *survivability = &xe->survivability;
-	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
-	struct device *dev = &pdev->dev;
-
-	sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
-	xe_heci_gsc_fini(xe);
-	kfree(survivability->info);
-}
-
-/**
- * xe_survivability_mode_init - Initialize the survivability mode
- * @xe: xe device instance
+ * Initialize survivability information and enable survivability mode
  *
- * Initializes survivability information and enables survivability mode
+ * Return: 0 for success, negative error code otherwise.
  */
-void xe_survivability_mode_init(struct xe_device *xe)
+int xe_survivability_mode_enable(struct xe_device *xe)
 {
 	struct xe_survivability *survivability = &xe->survivability;
 	struct xe_survivability_info *info;
@@ -218,9 +217,10 @@ void xe_survivability_mode_init(struct xe_device *xe)
 
 	survivability->size = MAX_SCRATCH_MMIO;
 
-	info = kcalloc(survivability->size, sizeof(*info), GFP_KERNEL);
+	info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info),
+			    GFP_KERNEL);
 	if (!info)
-		return;
+		return -ENOMEM;
 
 	survivability->info = info;
 
@@ -229,9 +229,8 @@ void xe_survivability_mode_init(struct xe_device *xe)
 	/* Only log debug information and exit if it is a critical failure */
 	if (survivability->boot_status == CRITICAL_FAILURE) {
 		log_survivability_info(pdev);
-		kfree(survivability->info);
-		return;
+		return -ENXIO;
 	}
 
-	enable_survivability_mode(pdev);
+	return enable_survivability_mode(pdev);
 }
diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.h b/drivers/gpu/drm/xe/xe_survivability_mode.h
index f530507a22c6..f4df5f9025ce 100644
--- a/drivers/gpu/drm/xe/xe_survivability_mode.h
+++ b/drivers/gpu/drm/xe/xe_survivability_mode.h
@@ -10,9 +10,8 @@
 
 struct xe_device;
 
-void xe_survivability_mode_init(struct xe_device *xe);
-void xe_survivability_mode_remove(struct xe_device *xe);
-bool xe_survivability_mode_enabled(struct xe_device *xe);
+int xe_survivability_mode_enable(struct xe_device *xe);
+bool xe_survivability_mode_is_enabled(struct xe_device *xe);
 bool xe_survivability_mode_required(struct xe_device *xe);
 
 #endif /* _XE_SURVIVABILITY_MODE_H_ */
-- 
2.51.0


From 292b1a8a50545b47d4fafc54452147abd2d1d86c Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@intel.com>
Date: Fri, 21 Feb 2025 16:10:49 -0800
Subject: [PATCH 08/16] drm/xe: Stop ignoring errors from xe_heci_gsc_init()

Do not ignore errors from xe_heci_gsc_init(). For example, it shouldn't
be fine to report successfully entering survivability mode when
communication with gsc is not working. The driver should not be left
half-initialized in the normal case either.

Cc: Riana Tauro <riana.tauro@intel.com>
Cc: Alexander Usyskin <alexander.usyskin@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-10-lucas.demarchi@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/xe_device.c             |  6 ++--
 drivers/gpu/drm/xe/xe_heci_gsc.c           | 35 +++++++++-------------
 drivers/gpu/drm/xe/xe_heci_gsc.h           |  3 +-
 drivers/gpu/drm/xe/xe_survivability_mode.c |  5 ++--
 4 files changed, 21 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index ef269227b64b..5ef8cffbc88f 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -851,7 +851,9 @@ int xe_device_probe(struct xe_device *xe)
 			return err;
 	}
 
-	xe_heci_gsc_init(xe);
+	err = xe_heci_gsc_init(xe);
+	if (err)
+		return err;
 
 	err = xe_oa_init(xe);
 	if (err)
@@ -903,8 +905,6 @@ void xe_device_remove(struct xe_device *xe)
 	xe_display_unregister(xe);
 
 	drm_dev_unplug(&xe->drm);
-
-	xe_heci_gsc_fini(xe);
 }
 
 void xe_device_shutdown(struct xe_device *xe)
diff --git a/drivers/gpu/drm/xe/xe_heci_gsc.c b/drivers/gpu/drm/xe/xe_heci_gsc.c
index 992ee47abcdb..3ea325d3db99 100644
--- a/drivers/gpu/drm/xe/xe_heci_gsc.c
+++ b/drivers/gpu/drm/xe/xe_heci_gsc.c
@@ -89,12 +89,9 @@ static void heci_gsc_release_dev(struct device *dev)
 	kfree(adev);
 }
 
-void xe_heci_gsc_fini(struct xe_device *xe)
+static void xe_heci_gsc_fini(void *arg)
 {
-	struct xe_heci_gsc *heci_gsc = &xe->heci_gsc;
-
-	if (!xe->info.has_heci_gscfi && !xe->info.has_heci_cscfi)
-		return;
+	struct xe_heci_gsc *heci_gsc = arg;
 
 	if (heci_gsc->adev) {
 		struct auxiliary_device *aux_dev = &heci_gsc->adev->aux_dev;
@@ -106,6 +103,7 @@ void xe_heci_gsc_fini(struct xe_device *xe)
 
 	if (heci_gsc->irq >= 0)
 		irq_free_desc(heci_gsc->irq);
+
 	heci_gsc->irq = -1;
 }
 
@@ -172,14 +170,14 @@ static int heci_gsc_add_device(struct xe_device *xe, const struct heci_gsc_def *
 	return ret;
 }
 
-void xe_heci_gsc_init(struct xe_device *xe)
+int xe_heci_gsc_init(struct xe_device *xe)
 {
 	struct xe_heci_gsc *heci_gsc = &xe->heci_gsc;
 	const struct heci_gsc_def *def;
 	int ret;
 
 	if (!xe->info.has_heci_gscfi && !xe->info.has_heci_cscfi)
-		return;
+		return 0;
 
 	heci_gsc->irq = -1;
 
@@ -191,29 +189,24 @@ void xe_heci_gsc_init(struct xe_device *xe)
 		def = &heci_gsc_def_dg2;
 	} else if (xe->info.platform == XE_DG1) {
 		def = &heci_gsc_def_dg1;
-	} else {
-		drm_warn_once(&xe->drm, "Unknown platform\n");
-		return;
 	}
 
-	if (!def->name) {
-		drm_warn_once(&xe->drm, "HECI is not implemented!\n");
-		return;
+	if (!def || !def->name) {
+		drm_warn(&xe->drm, "HECI is not implemented!\n");
+		return 0;
 	}
 
+	ret = devm_add_action_or_reset(xe->drm.dev, xe_heci_gsc_fini, heci_gsc);
+	if (ret)
+		return ret;
+
 	if (!def->use_polling && !xe_survivability_mode_is_enabled(xe)) {
 		ret = heci_gsc_irq_setup(xe);
 		if (ret)
-			goto fail;
+			return ret;
 	}
 
-	ret = heci_gsc_add_device(xe, def);
-	if (ret)
-		goto fail;
-
-	return;
-fail:
-	xe_heci_gsc_fini(xe);
+	return heci_gsc_add_device(xe, def);
 }
 
 void xe_heci_gsc_irq_handler(struct xe_device *xe, u32 iir)
diff --git a/drivers/gpu/drm/xe/xe_heci_gsc.h b/drivers/gpu/drm/xe/xe_heci_gsc.h
index 48b3b1838045..745eb6783942 100644
--- a/drivers/gpu/drm/xe/xe_heci_gsc.h
+++ b/drivers/gpu/drm/xe/xe_heci_gsc.h
@@ -33,8 +33,7 @@ struct xe_heci_gsc {
 	int irq;
 };
 
-void xe_heci_gsc_init(struct xe_device *xe);
-void xe_heci_gsc_fini(struct xe_device *xe);
+int xe_heci_gsc_init(struct xe_device *xe);
 void xe_heci_gsc_irq_handler(struct xe_device *xe, u32 iir);
 void xe_heci_csc_irq_handler(struct xe_device *xe, u32 iir);
 
diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c
index 7ba02e085b5b..d939ce70e6fa 100644
--- a/drivers/gpu/drm/xe/xe_survivability_mode.c
+++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
@@ -134,7 +134,6 @@ static void xe_survivability_mode_fini(void *arg)
 	struct device *dev = &pdev->dev;
 
 	sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
-	xe_heci_gsc_fini(xe);
 }
 
 static int enable_survivability_mode(struct pci_dev *pdev)
@@ -156,7 +155,9 @@ static int enable_survivability_mode(struct pci_dev *pdev)
 	if (ret)
 		return ret;
 
-	xe_heci_gsc_init(xe);
+	ret = xe_heci_gsc_init(xe);
+	if (ret)
+		return ret;
 
 	xe_vsec_init(xe);
 
-- 
2.51.0


From 1671c9617d7e987f7cb815a77dcb2dbcf6d28988 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@intel.com>
Date: Fri, 21 Feb 2025 16:10:50 -0800
Subject: [PATCH 09/16] drm/xe: Rename update_device_info() after sriov

This is only changing info flags for SR-IOV reasons. Rename it
accordingly, because there are several other places in probe where the
flags are updated, none of which are inside this function.

Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-11-lucas.demarchi@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/xe_device.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 5ef8cffbc88f..ed1cc5983f74 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -660,7 +660,7 @@ static int wait_for_lmem_ready(struct xe_device *xe)
 }
 ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO); /* See xe_pci_probe() */
 
-static void update_device_info(struct xe_device *xe)
+static void sriov_update_device_info(struct xe_device *xe)
 {
 	/* disable features that are not available/applicable to VFs */
 	if (IS_SRIOV_VF(xe)) {
@@ -691,7 +691,7 @@ int xe_device_probe_early(struct xe_device *xe)
 
 	xe_sriov_probe_early(xe);
 
-	update_device_info(xe);
+	sriov_update_device_info(xe);
 
 	err = xe_pcode_probe_early(xe);
 	if (err)
-- 
2.51.0


From 35359c36356a4226af1ba3956d48abf7ed136ebb Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@intel.com>
Date: Fri, 21 Feb 2025 16:10:51 -0800
Subject: [PATCH 10/16] drm/xe: Stop ignoring errors from xe_ttm_sys_mgr_init()

xe_ttm_sys_mgr_init() already cleans up after itself, just return error
if that failed.

Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Reviewed-by: Tejas Upadhyay <tejas.upadhyay@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250222001051.3012936-12-lucas.demarchi@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/xe_device.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index ed1cc5983f74..c9ab79da3f9f 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -748,6 +748,7 @@ int xe_device_probe(struct xe_device *xe)
 		return err;
 
 	xe->info.mem_region_mask = 1;
+
 	err = xe_set_dma_info(xe);
 	if (err)
 		return err;
@@ -756,7 +757,9 @@ int xe_device_probe(struct xe_device *xe)
 	if (err)
 		return err;
 
-	xe_ttm_sys_mgr_init(xe);
+	err = xe_ttm_sys_mgr_init(xe);
+	if (err)
+		return err;
 
 	for_each_gt(gt, xe, id) {
 		err = xe_gt_init_early(gt);
-- 
2.51.0


From 6b68c4542ffecc36087a9e14db8fc990c88bb01b Mon Sep 17 00:00:00 2001
From: Mingcong Bai <jeffbai@aosc.io>
Date: Tue, 25 Feb 2025 15:31:01 +0800
Subject: [PATCH 11/16] drm/xe/regs: remove a duplicate definition for
 RING_CTL_SIZE(size)

Commit b79e8fd954c4 ("drm/xe: Remove dependency on intel_engine_regs.h")
introduced an internal set of engine registers, however, as part of this
change, it has also introduced two duplicate `define' lines for
`RING_CTL_SIZE(size)'. This commit was introduced to the tree in v6.8-rc1.

This is harmless as the definitions did not change, so no compiler
warning was observed.

Drop this line anyway for the sake of correctness.

Cc: stable@vger.kernel.org # v6.8-rc1+
Fixes: b79e8fd954c4 ("drm/xe: Remove dependency on intel_engine_regs.h")
Signed-off-by: Mingcong Bai <jeffbai@aosc.io>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250225073104.865230-1-jeffbai@aosc.io
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/xe/regs/xe_engine_regs.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
index c8fd3d5ca502..4f372dc2cb89 100644
--- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
@@ -53,7 +53,6 @@
 
 #define RING_CTL(base)				XE_REG((base) + 0x3c)
 #define   RING_CTL_SIZE(size)			((size) - PAGE_SIZE) /* in bytes -> pages */
-#define   RING_CTL_SIZE(size)			((size) - PAGE_SIZE) /* in bytes -> pages */
 
 #define RING_START_UDW(base)			XE_REG((base) + 0x48)
 
-- 
2.51.0


From 18fbd567e75f9b97b699b2ab4f1fa76b7cf268f6 Mon Sep 17 00:00:00 2001
From: Tejas Upadhyay <tejas.upadhyay@intel.com>
Date: Tue, 25 Feb 2025 10:27:54 +0530
Subject: [PATCH 12/16] drm/xe: cancel pending job timer before freeing
 scheduler

The async call to __guc_exec_queue_fini_async frees the scheduler
while a submission may time out and restart. To prevent this race
condition, the pending job timer should be canceled before freeing
the scheduler.

V3(MattB):
 - Adjust position of cancel pending job
 - Remove gitlab issue# from commit message
V2(MattB):
 - Cancel pending jobs before scheduler finish

Fixes: a20c75dba192 ("drm/xe: Call __guc_exec_queue_fini_async direct for KERNEL exec_queues")
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250225045754.600905-1-tejas.upadhyay@intel.com
Signed-off-by: Tejas Upadhyay <tejas.upadhyay@intel.com>
---
 drivers/gpu/drm/xe/xe_guc_submit.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 913c74d6e2ae..b6a2dd742ebd 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -1248,6 +1248,8 @@ static void __guc_exec_queue_fini_async(struct work_struct *w)
 
 	if (xe_exec_queue_is_lr(q))
 		cancel_work_sync(&ge->lr_tdr);
+	/* Confirm no work left behind accessing device structures */
+	cancel_delayed_work_sync(&ge->sched.base.work_tdr);
 	release_guc_id(guc, q);
 	xe_sched_entity_fini(&ge->entity);
 	xe_sched_fini(&ge->sched);
-- 
2.51.0


From 4f109b061c12d63b332338ce9192593842fa09a4 Mon Sep 17 00:00:00 2001
From: Francois Dugast <francois.dugast@intel.com>
Date: Tue, 25 Feb 2025 20:57:33 +0100
Subject: [PATCH 13/16] drm/xe/gt_stats: Use atomic64_t for counters

The stats counters are now used for things like counting the VMA
bytes during page faults. During workload execution, the counter
value can grow fast and easily reach the atomic int limit, in
which case it overflows. To make this less likely to happen, push
the limit by switching to 64b atomic to store the counter value.
Overhead is very small as there are only 3 stat entries per GT as
of now, and stats are only enabled with CONFIG_DEBUG_FS.

Suggested-by: Matthew Auld <matthew.auld@intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250225195902.1247100-2-francois.dugast@intel.com
Signed-off-by: Francois Dugast <francois.dugast@intel.com>
---
 drivers/gpu/drm/xe/xe_gt_stats.c | 6 +++---
 drivers/gpu/drm/xe/xe_gt_types.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_gt_stats.c b/drivers/gpu/drm/xe/xe_gt_stats.c
index 2e9879ea4674..af3fd03f665c 100644
--- a/drivers/gpu/drm/xe/xe_gt_stats.c
+++ b/drivers/gpu/drm/xe/xe_gt_stats.c
@@ -23,7 +23,7 @@ void xe_gt_stats_incr(struct xe_gt *gt, const enum xe_gt_stats_id id, int incr)
 	if (id >= __XE_GT_STATS_NUM_IDS)
 		return;
 
-	atomic_add(incr, &gt->stats.counters[id]);
+	atomic64_add(incr, &gt->stats.counters[id]);
 }
 
 static const char *const stat_description[__XE_GT_STATS_NUM_IDS] = {
@@ -44,8 +44,8 @@ int xe_gt_stats_print_info(struct xe_gt *gt, struct drm_printer *p)
 	enum xe_gt_stats_id id;
 
 	for (id = 0; id < __XE_GT_STATS_NUM_IDS; ++id)
-		drm_printf(p, "%s: %d\n", stat_description[id],
-			   atomic_read(&gt->stats.counters[id]));
+		drm_printf(p, "%s: %lld\n", stat_description[id],
+			   atomic64_read(&gt->stats.counters[id]));
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
index 6e66bf0e8b3f..f72b965cc9e6 100644
--- a/drivers/gpu/drm/xe/xe_gt_types.h
+++ b/drivers/gpu/drm/xe/xe_gt_types.h
@@ -139,7 +139,7 @@ struct xe_gt {
 	/** @stats: GT stats */
 	struct {
 		/** @stats.counters: counters for various GT stats */
-		atomic_t counters[__XE_GT_STATS_NUM_IDS];
+		atomic64_t counters[__XE_GT_STATS_NUM_IDS];
 	} stats;
 #endif
 
-- 
2.51.0


From 278d4f429143d1c5e7c4deb7d7147063da12606d Mon Sep 17 00:00:00 2001
From: Francois Dugast <francois.dugast@intel.com>
Date: Tue, 25 Feb 2025 20:57:34 +0100
Subject: [PATCH 14/16] drm/xe/gt_pagefault: Change vma_pagefault unit to
 kilobyte

Increase the amount of bytes that can be counted before the counter
overflows, while not losing information as the VMA is not expected
to have sub-kilobyte size.

Suggested-by: Matthew Auld <matthew.auld@intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250225195902.1247100-3-francois.dugast@intel.com
Signed-off-by: Francois Dugast <francois.dugast@intel.com>
---
 drivers/gpu/drm/xe/xe_gt_pagefault.c   | 2 +-
 drivers/gpu/drm/xe/xe_gt_stats.c       | 2 +-
 drivers/gpu/drm/xe/xe_gt_stats_types.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
index 46701ca11ce0..17d69039b866 100644
--- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
@@ -137,7 +137,7 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct pagefault *pf,
 	bool atomic;
 
 	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_COUNT, 1);
-	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_BYTES, xe_vma_size(vma));
+	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_KB, xe_vma_size(vma) / 1024);
 
 	trace_xe_vma_pagefault(vma);
 	atomic = access_is_atomic(pf->access_type);
diff --git a/drivers/gpu/drm/xe/xe_gt_stats.c b/drivers/gpu/drm/xe/xe_gt_stats.c
index af3fd03f665c..6155ea354432 100644
--- a/drivers/gpu/drm/xe/xe_gt_stats.c
+++ b/drivers/gpu/drm/xe/xe_gt_stats.c
@@ -29,7 +29,7 @@ void xe_gt_stats_incr(struct xe_gt *gt, const enum xe_gt_stats_id id, int incr)
 static const char *const stat_description[__XE_GT_STATS_NUM_IDS] = {
 	"tlb_inval_count",
 	"vma_pagefault_count",
-	"vma_pagefault_bytes",
+	"vma_pagefault_kb",
 };
 
 /**
diff --git a/drivers/gpu/drm/xe/xe_gt_stats_types.h b/drivers/gpu/drm/xe/xe_gt_stats_types.h
index b072bd80c4b9..d556771f99d6 100644
--- a/drivers/gpu/drm/xe/xe_gt_stats_types.h
+++ b/drivers/gpu/drm/xe/xe_gt_stats_types.h
@@ -9,7 +9,7 @@
 enum xe_gt_stats_id {
 	XE_GT_STATS_ID_TLB_INVAL,
 	XE_GT_STATS_ID_VMA_PAGEFAULT_COUNT,
-	XE_GT_STATS_ID_VMA_PAGEFAULT_BYTES,
+	XE_GT_STATS_ID_VMA_PAGEFAULT_KB,
 	/* must be the last entry */
 	__XE_GT_STATS_NUM_IDS,
 };
-- 
2.51.0


From 8c5fe7d88bc1c12662a804fd75edb6ac85225ce2 Mon Sep 17 00:00:00 2001
From: Aradhya Bhatia <aradhya.bhatia@intel.com>
Date: Thu, 20 Feb 2025 15:16:44 +0530
Subject: [PATCH 15/16] drm/xe: Add Wa_16021333562 and Wa_14016712196

Wa_16021333562 and Wa_14016712196 are permanent workarounds that apply
to multiple platforms. Wa_16021333562 applies to platforms ranging from
TGL (12.00) to Xe_LPM (13.00), while Wa_14016712196 applies from DG2
(12.55) to Xe_LPG (12.74).

Reviewed-by: Tejas Upadhyay <tejas.upadhyay@intel.com>
Signed-off-by: Aradhya Bhatia <aradhya.bhatia@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250220094645.358647-2-aradhya.bhatia@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/xe_guc_ads.c    | 2 +-
 drivers/gpu/drm/xe/xe_ring_ops.c   | 4 ++++
 drivers/gpu/drm/xe/xe_wa_oob.rules | 4 ++++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
index fab259adc380..e7c9e095a19f 100644
--- a/drivers/gpu/drm/xe/xe_guc_ads.c
+++ b/drivers/gpu/drm/xe/xe_guc_ads.c
@@ -342,7 +342,7 @@ static void guc_waklv_init(struct xe_guc_ads *ads)
 	offset = guc_ads_waklv_offset(ads);
 	remain = guc_ads_waklv_size(ads);
 
-	if (XE_WA(gt, 14019882105))
+	if (XE_WA(gt, 14019882105) || XE_WA(gt, 16021333562))
 		guc_waklv_enable_simple(ads,
 					GUC_WORKAROUND_KLV_BLOCK_INTERRUPTS_WHEN_MGSR_BLOCKED,
 					&offset, &remain);
diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c
index 0c230ee53bba..d2f604aa96fa 100644
--- a/drivers/gpu/drm/xe/xe_ring_ops.c
+++ b/drivers/gpu/drm/xe/xe_ring_ops.c
@@ -177,6 +177,10 @@ static int emit_render_cache_flush(struct xe_sched_job *job, u32 *dw, int i)
 	bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
 	u32 flags;
 
+	if (XE_WA(gt, 14016712196))
+		i = emit_pipe_control(dw, i, 0, PIPE_CONTROL_DEPTH_CACHE_FLUSH,
+				      LRC_PPHWSP_FLUSH_INVAL_SCRATCH_ADDR, 0);
+
 	flags = (PIPE_CONTROL_CS_STALL |
 		 PIPE_CONTROL_TILE_CACHE_FLUSH |
 		 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules
index 228436532282..ea72bcc02e1e 100644
--- a/drivers/gpu/drm/xe/xe_wa_oob.rules
+++ b/drivers/gpu/drm/xe/xe_wa_oob.rules
@@ -43,3 +43,7 @@
 no_media_l3	MEDIA_VERSION(3000)
 14022866841	GRAPHICS_VERSION(3000), GRAPHICS_STEP(A0, B0)
 		MEDIA_VERSION(3000), MEDIA_STEP(A0, B0)
+16021333562	GRAPHICS_VERSION_RANGE(1200, 1274)
+		MEDIA_VERSION(1300)
+14016712196	GRAPHICS_VERSION(1255)
+		GRAPHICS_VERSION_RANGE(1270, 1274)
-- 
2.51.0


From eef3ede533aea7a40e2f72a7886da4827f10eeac Mon Sep 17 00:00:00 2001
From: Aradhya Bhatia <aradhya.bhatia@intel.com>
Date: Thu, 20 Feb 2025 15:16:45 +0530
Subject: [PATCH 16/16] drm/xe/oa: Refactor WAs to use XE_WA() macro

Refactor Wa_18013179988, Wa_14015568240, Wa_1508761755, and
Wa_1509372804, to use the proper workaround-check implementation for
out-of-band workarounds, XE_WA(), and drop the use of the platform
based WA selection.

Reviewed-by: Tejas Upadhyay <tejas.upadhyay@intel.com>
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Aradhya Bhatia <aradhya.bhatia@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250220094645.358647-3-aradhya.bhatia@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/xe_oa.c         | 30 +++++++++---------------------
 drivers/gpu/drm/xe/xe_wa_oob.rules |  5 +++++
 2 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c
index 6bf5b793b29f..6f185632da14 100644
--- a/drivers/gpu/drm/xe/xe_oa.c
+++ b/drivers/gpu/drm/xe/xe_oa.c
@@ -12,6 +12,8 @@
 #include <drm/drm_managed.h>
 #include <uapi/drm/xe_drm.h>
 
+#include <generated/xe_wa_oob.h>
+
 #include "abi/guc_actions_slpc_abi.h"
 #include "instructions/xe_mi_commands.h"
 #include "regs/xe_engine_regs.h"
@@ -35,6 +37,7 @@
 #include "xe_sched_job.h"
 #include "xe_sriov.h"
 #include "xe_sync.h"
+#include "xe_wa.h"
 
 #define DEFAULT_POLL_FREQUENCY_HZ 200
 #define DEFAULT_POLL_PERIOD_NS (NSEC_PER_SEC / DEFAULT_POLL_FREQUENCY_HZ)
@@ -812,11 +815,8 @@ static void xe_oa_disable_metric_set(struct xe_oa_stream *stream)
 	struct xe_mmio *mmio = &stream->gt->mmio;
 	u32 sqcnt1;
 
-	/*
-	 * Wa_1508761755:xehpsdv, dg2
-	 * Enable thread stall DOP gating and EU DOP gating.
-	 */
-	if (stream->oa->xe->info.platform == XE_DG2) {
+	/* Enable thread stall DOP gating and EU DOP gating. */
+	if (XE_WA(stream->gt, 1508761755)) {
 		xe_gt_mcr_multicast_write(stream->gt, ROW_CHICKEN,
 					  _MASKED_BIT_DISABLE(STALL_DOP_GATING_DISABLE));
 		xe_gt_mcr_multicast_write(stream->gt, ROW_CHICKEN2,
@@ -1065,11 +1065,10 @@ static int xe_oa_enable_metric_set(struct xe_oa_stream *stream)
 	int ret;
 
 	/*
-	 * Wa_1508761755:xehpsdv, dg2
 	 * EU NOA signals behave incorrectly if EU clock gating is enabled.
 	 * Disable thread stall DOP gating and EU DOP gating.
 	 */
-	if (stream->oa->xe->info.platform == XE_DG2) {
+	if (XE_WA(stream->gt, 1508761755)) {
 		xe_gt_mcr_multicast_write(stream->gt, ROW_CHICKEN,
 					  _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE));
 		xe_gt_mcr_multicast_write(stream->gt, ROW_CHICKEN2,
@@ -1720,12 +1719,10 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream,
 	}
 
 	/*
-	 * Wa_1509372804:pvc
-	 *
 	 * GuC reset of engines causes OA to lose configuration
 	 * state. Prevent this by overriding GUCRC mode.
 	 */
-	if (stream->oa->xe->info.platform == XE_PVC) {
+	if (XE_WA(stream->gt, 1509372804)) {
 		ret = xe_guc_pc_override_gucrc_mode(&gt->uc.guc.pc,
 						    SLPC_GUCRC_MODE_GUCRC_NO_RC6);
 		if (ret)
@@ -1857,23 +1854,14 @@ u32 xe_oa_timestamp_frequency(struct xe_gt *gt)
 {
 	u32 reg, shift;
 
-	/*
-	 * Wa_18013179988:dg2
-	 * Wa_14015568240:pvc
-	 * Wa_14015846243:mtl
-	 */
-	switch (gt_to_xe(gt)->info.platform) {
-	case XE_DG2:
-	case XE_PVC:
-	case XE_METEORLAKE:
+	if (XE_WA(gt, 18013179988) || XE_WA(gt, 14015568240)) {
 		xe_pm_runtime_get(gt_to_xe(gt));
 		reg = xe_mmio_read32(&gt->mmio, RPM_CONFIG0);
 		xe_pm_runtime_put(gt_to_xe(gt));
 
 		shift = REG_FIELD_GET(RPM_CONFIG0_CTC_SHIFT_PARAMETER_MASK, reg);
 		return gt->info.reference_clock << (3 - shift);
-
-	default:
+	} else {
 		return gt->info.reference_clock;
 	}
 }
diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules
index ea72bcc02e1e..1dd02a231926 100644
--- a/drivers/gpu/drm/xe/xe_wa_oob.rules
+++ b/drivers/gpu/drm/xe/xe_wa_oob.rules
@@ -47,3 +47,8 @@ no_media_l3	MEDIA_VERSION(3000)
 		MEDIA_VERSION(1300)
 14016712196	GRAPHICS_VERSION(1255)
 		GRAPHICS_VERSION_RANGE(1270, 1274)
+14015568240	GRAPHICS_VERSION_RANGE(1255, 1260)
+18013179988	GRAPHICS_VERSION(1255)
+		GRAPHICS_VERSION_RANGE(1270, 1274)
+1508761755	GRAPHICS_VERSION(1255)
+		GRAPHICS_VERSION(1260), GRAPHICS_STEP(A0, B0)
-- 
2.51.0