From f350747a9935d6c1539684c37f30e45ce9a3319f Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Fri, 4 Apr 2025 15:00:54 -0700 Subject: [PATCH 01/16] drm/xe: Ensure XE_BO_FLAG_CPU_ADDR_MIRROR has a unique value When XE_BO_FLAG_PINNED_NORESTORE and XE_BO_FLAG_PINNED_LATE_RESTORE were added, they were assigned BO flag values in the middle of the flag range, requiring renumbering of the higher flags. In both cases, XE_BO_FLAG_CPU_ADDR_MIRROR was overlooked during renumbering because it was defined below XE_BO_FLAG_GGTT_ALL and thus was not immediately visible in code diffs changing this area of the code; this resulted in XE_BO_FLAG_CPU_ADDR_MIRROR clashing with another flag. Assign XE_BO_FLAG_CPU_ADDR_MIRROR a unique value, and also move the definition of XE_BO_FLAG_GGTT_ALL down below all of the individual flags so that this kind of mistake is less likely in the future. Also, while we're at it, fix up some space vs tab whitespace inconsistency in these flag definitions. Fixes: 7f387e6012b6 ("drm/xe: add XE_BO_FLAG_PINNED_LATE_RESTORE") Fixes: 045448da87bf ("drm/xe: Add XE_BO_FLAG_PINNED_NORESTORE") Cc: Matthew Auld Cc: Matthew Brost Reviewed-by: Matthew Auld Link: https://lore.kernel.org/r/20250404220053.1758356-2-matthew.d.roper@intel.com Signed-off-by: Matt Roper --- drivers/gpu/drm/xe/xe_bo.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_bo.h b/drivers/gpu/drm/xe/xe_bo.h index f7e716f59948..0a19b50045b2 100644 --- a/drivers/gpu/drm/xe/xe_bo.h +++ b/drivers/gpu/drm/xe/xe_bo.h @@ -40,21 +40,22 @@ #define XE_BO_FLAG_NEEDS_2M BIT(16) #define XE_BO_FLAG_GGTT_INVALIDATE BIT(17) #define XE_BO_FLAG_PINNED_NORESTORE BIT(18) -#define XE_BO_FLAG_PINNED_LATE_RESTORE BIT(19) -#define XE_BO_FLAG_GGTT0 BIT(20) -#define XE_BO_FLAG_GGTT1 BIT(21) -#define XE_BO_FLAG_GGTT2 BIT(22) -#define XE_BO_FLAG_GGTT3 BIT(23) -#define XE_BO_FLAG_GGTT_ALL (XE_BO_FLAG_GGTT0 | \ - XE_BO_FLAG_GGTT1 | \ - XE_BO_FLAG_GGTT2 | \ - XE_BO_FLAG_GGTT3) -#define XE_BO_FLAG_CPU_ADDR_MIRROR BIT(22) +#define XE_BO_FLAG_PINNED_LATE_RESTORE BIT(19) +#define XE_BO_FLAG_GGTT0 BIT(20) +#define XE_BO_FLAG_GGTT1 BIT(21) +#define XE_BO_FLAG_GGTT2 BIT(22) +#define XE_BO_FLAG_GGTT3 BIT(23) +#define XE_BO_FLAG_CPU_ADDR_MIRROR BIT(24) /* this one is trigger internally only */ #define XE_BO_FLAG_INTERNAL_TEST BIT(30) #define XE_BO_FLAG_INTERNAL_64K BIT(31) +#define XE_BO_FLAG_GGTT_ALL (XE_BO_FLAG_GGTT0 | \ + XE_BO_FLAG_GGTT1 | \ + XE_BO_FLAG_GGTT2 | \ + XE_BO_FLAG_GGTT3) + #define XE_BO_FLAG_GGTTx(tile) \ (XE_BO_FLAG_GGTT0 << (tile)->id) -- 2.51.0 From 1e1981b16bb1bbe2fafa57ed439b45cb5b34e32d Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 2 Apr 2025 22:38:05 -0700 Subject: [PATCH 02/16] drm/xe: Fix taking invalid lock on wedge If device wedges on e.g. GuC upload, the submission is not yet enabled and the state is not even initialized. Protect the wedge call so it does nothing in this case. It fixes the following splat: [] xe 0000:bf:00.0: [drm] device wedged, needs recovery [] ------------[ cut here ]------------ [] DEBUG_LOCKS_WARN_ON(lock->magic != lock) [] WARNING: CPU: 48 PID: 312 at kernel/locking/mutex.c:564 __mutex_lock+0x8a1/0xe60 ... 
[] RIP: 0010:__mutex_lock+0x8a1/0xe60 [] mutex_lock_nested+0x1b/0x30 [] xe_guc_submit_wedge+0x80/0x2b0 [xe] Reviewed-by: Balasubramani Vivekanandan Link: https://lore.kernel.org/r/20250402-warn-after-wedge-v1-1-93e971511fa5@intel.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_guc_submit.c | 9 +++++++++ drivers/gpu/drm/xe/xe_guc_types.h | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 31bc2022bfc2..813c3c0bb250 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -300,6 +300,8 @@ int xe_guc_submit_init(struct xe_guc *guc, unsigned int num_ids) primelockdep(guc); + guc->submission_state.initialized = true; + return drmm_add_action_or_reset(&xe->drm, guc_submit_fini, guc); } @@ -834,6 +836,13 @@ void xe_guc_submit_wedge(struct xe_guc *guc) xe_gt_assert(guc_to_gt(guc), guc_to_xe(guc)->wedged.mode); + /* + * If device is being wedged even before submission_state is + * initialized, there's nothing to do here. + */ + if (!guc->submission_state.initialized) + return; + err = devm_add_action_or_reset(guc_to_xe(guc)->drm.dev, guc_submit_wedged_fini, guc); if (err) { diff --git a/drivers/gpu/drm/xe/xe_guc_types.h b/drivers/gpu/drm/xe/xe_guc_types.h index 63bac64429a5..1fde7614fcc5 100644 --- a/drivers/gpu/drm/xe/xe_guc_types.h +++ b/drivers/gpu/drm/xe/xe_guc_types.h @@ -89,6 +89,11 @@ struct xe_guc { struct mutex lock; /** @submission_state.enabled: submission is enabled */ bool enabled; + /** + * @submission_state.initialized: mark when submission state is + * even initialized - before that not even the lock is valid + */ + bool initialized; /** @submission_state.fini_wq: submit fini wait queue */ wait_queue_head_t fini_wq; } submission_state; -- 2.51.0 From 16280ded45fba1216d1d4c6acfc20c2d5b45ef50 Mon Sep 17 00:00:00 2001 From: Riana Tauro Date: Mon, 7 Apr 2025 10:44:11 +0530 Subject: [PATCH 03/16] drm/xe: Add configfs to enable survivability mode Registers a configfs subsystem called 'xe' that creates a directory in the mounted configfs directory (/sys/kernel/config) Userspace can then create the device that has to be configured under the xe directory mkdir /sys/kernel/config/xe/0000:03:00.0 The device created will have the following attributes to be configured /sys/kernel/config/xe/ .. 0000:03:00.0/ ... survivability_mode v2: fix kernel-doc fix return value (Lucas) v3: fix kernel-doc (Lucas) Signed-off-by: Riana Tauro Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250407051414.1651616-2-riana.tauro@intel.com Signed-off-by: Lucas De Marchi --- Documentation/gpu/xe/index.rst | 1 + Documentation/gpu/xe/xe_configfs.rst | 10 ++ drivers/gpu/drm/xe/Makefile | 1 + drivers/gpu/drm/xe/xe_configfs.c | 188 +++++++++++++++++++++++++++ drivers/gpu/drm/xe/xe_configfs.h | 16 +++ drivers/gpu/drm/xe/xe_module.c | 5 + 6 files changed, 221 insertions(+) create mode 100644 Documentation/gpu/xe/xe_configfs.rst create mode 100644 drivers/gpu/drm/xe/xe_configfs.c create mode 100644 drivers/gpu/drm/xe/xe_configfs.h diff --git a/Documentation/gpu/xe/index.rst b/Documentation/gpu/xe/index.rst index 92cfb25e64d3..b2369561f24e 100644 --- a/Documentation/gpu/xe/index.rst +++ b/Documentation/gpu/xe/index.rst @@ -25,3 +25,4 @@ DG2, etc is provided to prototype the driver. 
 xe_debugging
 xe_devcoredump
 xe-drm-usage-stats.rst
+ xe_configfs
diff --git a/Documentation/gpu/xe/xe_configfs.rst b/Documentation/gpu/xe/xe_configfs.rst
new file mode 100644
index 000000000000..9b9d941eb20e
--- /dev/null
+++ b/Documentation/gpu/xe/xe_configfs.rst
@@ -0,0 +1,10 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+.. _xe_configfs:
+
+============
+Xe Configfs
+============
+
+.. kernel-doc:: drivers/gpu/drm/xe/xe_configfs.c
+   :doc: Xe Configfs
diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index e4fec90bab55..c1493ef2c12b 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -131,6 +131,7 @@ xe-$(CONFIG_DRM_XE_GPUSVM) += xe_svm.o
 
 xe-$(CONFIG_HWMON) += xe_hwmon.o
 xe-$(CONFIG_PERF_EVENTS) += xe_pmu.o
+xe-$(CONFIG_CONFIGFS_FS) += xe_configfs.o
 
 # graphics virtualization (SR-IOV) support
 xe-y += \
diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
new file mode 100644
index 000000000000..48a9f428bda9
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_configfs.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#include <linux/configfs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include "xe_configfs.h"
+#include "xe_module.h"
+
+/**
+ * DOC: Xe Configfs
+ *
+ * Overview
+ * =========
+ *
+ * Configfs is a filesystem-based manager of kernel objects. XE KMD registers a
+ * configfs subsystem called ``'xe'`` that creates a directory in the mounted
+ * configfs directory. The user can create devices under this directory and
+ * configure them as necessary. See Documentation/filesystems/configfs.rst for
+ * more information about how configfs works.
+ *
+ * Create devices
+ * ===============
+ *
+ * In order to create a device, the user has to create a directory inside ``'xe'``::
+ *
+ *	mkdir /sys/kernel/config/xe/0000:03:00.0/
+ *
+ * Every device created is populated by the driver with entries that can be
+ * used to configure it::
+ *
+ *	/sys/kernel/config/xe/
+ *	.. 0000:03:00.0/
+ *	... survivability_mode
+ *
+ * Configure Attributes
+ * ====================
+ *
+ * Survivability mode:
+ * -------------------
+ *
+ * Enable survivability mode on supported cards. This setting only takes
+ * effect when probing the device.
Example to enable it:: + * + * # echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode + * # echo 0000:03:00.0 > /sys/bus/pci/drivers/xe/bind (Enters survivability mode if supported) + * + * Remove devices + * ============== + * + * The created device directories can be removed using ``rmdir``:: + * + * rmdir /sys/kernel/config/xe/0000:03:00.0/ + */ + +struct xe_config_device { + struct config_group group; + + bool survivability_mode; + + /* protects attributes */ + struct mutex lock; +}; + +static struct xe_config_device *to_xe_config_device(struct config_item *item) +{ + return container_of(to_config_group(item), struct xe_config_device, group); +} + +static ssize_t survivability_mode_show(struct config_item *item, char *page) +{ + struct xe_config_device *dev = to_xe_config_device(item); + + return sprintf(page, "%d\n", dev->survivability_mode); +} + +static ssize_t survivability_mode_store(struct config_item *item, const char *page, size_t len) +{ + struct xe_config_device *dev = to_xe_config_device(item); + bool survivability_mode; + int ret; + + ret = kstrtobool(page, &survivability_mode); + if (ret) + return ret; + + mutex_lock(&dev->lock); + dev->survivability_mode = survivability_mode; + mutex_unlock(&dev->lock); + + return len; +} + +CONFIGFS_ATTR(, survivability_mode); + +static struct configfs_attribute *xe_config_device_attrs[] = { + &attr_survivability_mode, + NULL, +}; + +static void xe_config_device_release(struct config_item *item) +{ + struct xe_config_device *dev = to_xe_config_device(item); + + mutex_destroy(&dev->lock); + kfree(dev); +} + +static struct configfs_item_operations xe_config_device_ops = { + .release = xe_config_device_release, +}; + +static const struct config_item_type xe_config_device_type = { + .ct_item_ops = &xe_config_device_ops, + .ct_attrs = xe_config_device_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *xe_config_make_device_group(struct config_group *group, + const char *name) +{ + unsigned int domain, bus, slot, function; + struct xe_config_device *dev; + struct pci_dev *pdev; + int ret; + + ret = sscanf(name, "%04x:%02x:%02x.%x", &domain, &bus, &slot, &function); + if (ret != 4) + return ERR_PTR(-EINVAL); + + pdev = pci_get_domain_bus_and_slot(domain, bus, PCI_DEVFN(slot, function)); + if (!pdev) + return ERR_PTR(-EINVAL); + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + config_group_init_type_name(&dev->group, name, &xe_config_device_type); + + mutex_init(&dev->lock); + + return &dev->group; +} + +static struct configfs_group_operations xe_config_device_group_ops = { + .make_group = xe_config_make_device_group, +}; + +static const struct config_item_type xe_configfs_type = { + .ct_group_ops = &xe_config_device_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem xe_configfs = { + .su_group = { + .cg_item = { + .ci_namebuf = "xe", + .ci_type = &xe_configfs_type, + }, + }, +}; + +int __init xe_configfs_init(void) +{ + struct config_group *root = &xe_configfs.su_group; + int ret; + + config_group_init(root); + mutex_init(&xe_configfs.su_mutex); + ret = configfs_register_subsystem(&xe_configfs); + if (ret) { + pr_err("Error %d while registering %s subsystem\n", + ret, root->cg_item.ci_namebuf); + return ret; + } + + return 0; +} + +void __exit xe_configfs_exit(void) +{ + configfs_unregister_subsystem(&xe_configfs); +} + diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/xe_configfs.h new file mode 100644 index 000000000000..5532320818e4 
--- /dev/null +++ b/drivers/gpu/drm/xe/xe_configfs.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2025 Intel Corporation + */ +#ifndef _XE_CONFIGFS_H_ +#define _XE_CONFIGFS_H_ + +#if IS_ENABLED(CONFIG_CONFIGFS_FS) +int xe_configfs_init(void); +void xe_configfs_exit(void); +#else +static inline int xe_configfs_init(void) { return 0; }; +static inline void xe_configfs_exit(void) {}; +#endif + +#endif diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c index 9f4632e39a1a..be8603b16ff3 100644 --- a/drivers/gpu/drm/xe/xe_module.c +++ b/drivers/gpu/drm/xe/xe_module.c @@ -11,6 +11,7 @@ #include #include "xe_drv.h" +#include "xe_configfs.h" #include "xe_hw_fence.h" #include "xe_pci.h" #include "xe_pm.h" @@ -88,6 +89,10 @@ static const struct init_funcs init_funcs[] = { { .init = xe_check_nomodeset, }, + { + .init = xe_configfs_init, + .exit = xe_configfs_exit, + }, { .init = xe_hw_fence_module_init, .exit = xe_hw_fence_module_exit, -- 2.51.0 From 77052ab24590cb72598e31de4a7c29f99d51d201 Mon Sep 17 00:00:00 2001 From: Riana Tauro Date: Mon, 7 Apr 2025 10:44:12 +0530 Subject: [PATCH 04/16] drm/xe: Add documentation for survivability mode Add survivability mode document to pcode document as it is enabled when pcode detects a failure. v2: fix kernel-doc (Lucas) Signed-off-by: Riana Tauro Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250407051414.1651616-3-riana.tauro@intel.com Signed-off-by: Lucas De Marchi --- Documentation/gpu/xe/xe_pcode.rst | 7 +++++ drivers/gpu/drm/xe/xe_survivability_mode.c | 34 +++++++++++++++------- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/Documentation/gpu/xe/xe_pcode.rst b/Documentation/gpu/xe/xe_pcode.rst index d2e22cc45061..5937ef3599b0 100644 --- a/Documentation/gpu/xe/xe_pcode.rst +++ b/Documentation/gpu/xe/xe_pcode.rst @@ -12,3 +12,10 @@ Internal API .. kernel-doc:: drivers/gpu/drm/xe/xe_pcode.c :internal: + +================== +Boot Survivability +================== + +.. kernel-doc:: drivers/gpu/drm/xe/xe_survivability_mode.c + :doc: Xe Boot Survivability diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c index cb813b337fd3..399c06890b0b 100644 --- a/drivers/gpu/drm/xe/xe_survivability_mode.c +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c @@ -28,20 +28,32 @@ * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware * to be flashed through mei and collect telemetry. The driver's probe flow is modified * such that it enters survivability mode when pcode initialization is incomplete and boot status - * denotes a failure. The driver then populates the survivability_mode PCI sysfs indicating - * survivability mode and provides additional information required for debug + * denotes a failure. * - * KMD exposes below admin-only readable sysfs in survivability mode + * Survivability mode can also be entered manually using the survivability mode attribute available + * through configfs which is beneficial in several usecases. It can be used to address scenarios + * where pcode does not detect failure or for validation purposes. It can also be used in + * In-Field-Repair (IFR) to repair a single card without impacting the other cards in a node. * - * device/survivability_mode: The presence of this file indicates that the card is in survivability - * mode. Also, provides additional information on why the driver entered - * survivability mode. 
+ * Use the below command to enable survivability mode manually::
  *
- * Capability Information - Provides boot status
- * Postcode Information - Provides information about the failure
- * Overflow Information - Provides history of previous failures
- * Auxiliary Information - Certain failures may have information in
- *                         addition to postcode information
+ *	# echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
+ *
+ * Refer to :ref:`xe_configfs` for more details on how to use configfs.
+ *
+ * Survivability mode is indicated by the below admin-only readable sysfs which provides additional
+ * debug information::
+ *
+ *	/sys/bus/pci/devices//survivability_mode
+ *
+ * Capability Information:
+ *	Provides boot status
+ * Postcode Information:
+ *	Provides information about the failure
+ * Overflow Information:
+ *	Provides history of previous failures
+ * Auxiliary Information:
+ *	Certain failures may have information in addition to postcode information
  */
 
 static u32 aux_history_offset(u32 reg_value)
-- 
2.51.0

From bc417e54e24bc9c96d3c6eba2c8c60f7919e5afe Mon Sep 17 00:00:00 2001
From: Riana Tauro 
Date: Mon, 7 Apr 2025 10:44:13 +0530
Subject: [PATCH 05/16] drm/xe: Enable configfs support for survivability mode

Enable survivability mode if supported and the configfs attribute is set.

Enabling survivability mode manually is useful in cases where pcode does
not detect failure, for validation, and for IFR (in-field repair).

To set the configfs survivability_mode attribute for a device::

	echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode

The card enters survivability mode if supported.

v2: add a log if survivability mode is enabled for unsupported
    platforms (Rodrigo)

Signed-off-by: Riana Tauro 
Reviewed-by: Lucas De Marchi 
Link: https://lore.kernel.org/r/20250407051414.1651616-4-riana.tauro@intel.com
Signed-off-by: Lucas De Marchi 
---
 drivers/gpu/drm/xe/xe_configfs.c           | 62 ++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_configfs.h           |  8 +++
 drivers/gpu/drm/xe/xe_device.c             |  2 +-
 drivers/gpu/drm/xe/xe_pci.c                | 19 ++++---
 drivers/gpu/drm/xe/xe_survivability_mode.c | 35 +++++++++---
 drivers/gpu/drm/xe/xe_survivability_mode.h |  1 +
 6 files changed, 108 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
index 48a9f428bda9..cb9f175c89a1 100644
--- a/drivers/gpu/drm/xe/xe_configfs.c
+++ b/drivers/gpu/drm/xe/xe_configfs.c
@@ -164,6 +164,68 @@ static struct configfs_subsystem xe_configfs = {
 	},
 };
 
+static struct xe_config_device *configfs_find_group(struct pci_dev *pdev)
+{
+	struct config_item *item;
+	char name[64];
+
+	snprintf(name, sizeof(name), "%04x:%02x:%02x.%x", pci_domain_nr(pdev->bus),
+		 pdev->bus->number, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+
+	mutex_lock(&xe_configfs.su_mutex);
+	item = config_group_find_item(&xe_configfs.su_group, name);
+	mutex_unlock(&xe_configfs.su_mutex);
+
+	if (!item)
+		return NULL;
+
+	return to_xe_config_device(item);
+}
+
+/**
+ * xe_configfs_get_survivability_mode - get configfs survivability mode attribute
+ * @pdev: pci device
+ *
+ * Find the configfs group that belongs to the pci device and return
+ * the survivability mode attribute.
+ *
+ * Return: survivability mode if config group is found, false otherwise
+ */
+bool xe_configfs_get_survivability_mode(struct pci_dev *pdev)
+{
+	struct xe_config_device *dev = configfs_find_group(pdev);
+	bool mode;
+
+	if (!dev)
+		return false;
+
+	mode = dev->survivability_mode;
+	config_item_put(&dev->group.cg_item);
+
+	return mode;
+}
+
+/**
+ *
xe_configfs_clear_survivability_mode - clear configfs survivability mode attribute + * @pdev: pci device + * + * find the configfs group that belongs to the pci device and clear survivability + * mode attribute + */ +void xe_configfs_clear_survivability_mode(struct pci_dev *pdev) +{ + struct xe_config_device *dev = configfs_find_group(pdev); + + if (!dev) + return; + + mutex_lock(&dev->lock); + dev->survivability_mode = 0; + mutex_unlock(&dev->lock); + + config_item_put(&dev->group.cg_item); +} + int __init xe_configfs_init(void) { struct config_group *root = &xe_configfs.su_group; diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/xe_configfs.h index 5532320818e4..d7d041ec2611 100644 --- a/drivers/gpu/drm/xe/xe_configfs.h +++ b/drivers/gpu/drm/xe/xe_configfs.h @@ -5,12 +5,20 @@ #ifndef _XE_CONFIGFS_H_ #define _XE_CONFIGFS_H_ +#include + +struct pci_dev; + #if IS_ENABLED(CONFIG_CONFIGFS_FS) int xe_configfs_init(void); void xe_configfs_exit(void); +bool xe_configfs_get_survivability_mode(struct pci_dev *pdev); +void xe_configfs_clear_survivability_mode(struct pci_dev *pdev); #else static inline int xe_configfs_init(void) { return 0; }; static inline void xe_configfs_exit(void) {}; +static inline bool xe_configfs_get_survivability_mode(struct pci_dev *pdev) { return false; }; +static inline void xe_configfs_clear_survivability_mode(struct pci_dev *pdev) {}; #endif #endif diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index d8e227ddf255..75e753e0a682 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -712,7 +712,7 @@ int xe_device_probe_early(struct xe_device *xe) sriov_update_device_info(xe); err = xe_pcode_probe_early(xe); - if (err) { + if (err || xe_survivability_mode_is_requested(xe)) { int save_err = err; /* diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index 780287692e61..07fe994f2a80 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -812,18 +812,17 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return err; err = xe_device_probe_early(xe); - if (err) { - /* - * In Boot Survivability mode, no drm card is exposed and driver - * is loaded with bare minimum to allow for firmware to be - * flashed through mei. If early probe failed, but it managed to - * enable survivability mode, return success. - */ - if (xe_survivability_mode_is_enabled(xe)) - return 0; + /* + * In Boot Survivability mode, no drm card is exposed and driver + * is loaded with bare minimum to allow for firmware to be + * flashed through mei. 
Return success, if survivability mode + * is enabled due to pcode failure or configfs being set + */ + if (xe_survivability_mode_is_enabled(xe)) + return 0; + if (err) return err; - } err = xe_info_init(xe, desc); if (err) diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c index 399c06890b0b..1f710b3fc599 100644 --- a/drivers/gpu/drm/xe/xe_survivability_mode.c +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c @@ -10,6 +10,7 @@ #include #include +#include "xe_configfs.h" #include "xe_device.h" #include "xe_gt.h" #include "xe_heci_gsc.h" @@ -145,6 +146,7 @@ static void xe_survivability_mode_fini(void *arg) struct pci_dev *pdev = to_pci_dev(xe->drm.dev); struct device *dev = &pdev->dev; + xe_configfs_clear_survivability_mode(pdev); sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); } @@ -198,23 +200,40 @@ bool xe_survivability_mode_is_enabled(struct xe_device *xe) return xe->survivability.mode; } -/* - * survivability_mode_requested - check if it's possible to enable - * survivability mode and that was requested by firmware +/** + * xe_survivability_mode_is_requested - check if it's possible to enable survivability + * mode that was requested by firmware or userspace + * @xe: xe device instance * - * This function reads the boot status from Pcode. + * This function reads configfs and boot status from Pcode. * * Return: true if platform support is available and boot status indicates - * failure, false otherwise. + * failure or if survivability mode is requested, false otherwise. */ -static bool survivability_mode_requested(struct xe_device *xe) +bool xe_survivability_mode_is_requested(struct xe_device *xe) { struct xe_survivability *survivability = &xe->survivability; struct xe_mmio *mmio = xe_root_tile_mmio(xe); + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); u32 data; + bool survivability_mode; + + if (!IS_DGFX(xe) || IS_SRIOV_VF(xe)) + return false; + + survivability_mode = xe_configfs_get_survivability_mode(pdev); - if (!IS_DGFX(xe) || xe->info.platform < XE_BATTLEMAGE || IS_SRIOV_VF(xe)) + if (xe->info.platform < XE_BATTLEMAGE) { + if (survivability_mode) { + dev_err(&pdev->dev, "Survivability Mode is not supported on this card\n"); + xe_configfs_clear_survivability_mode(pdev); + } return false; + } + + /* Enable survivability mode if set via configfs */ + if (survivability_mode) + return true; data = xe_mmio_read32(mmio, PCODE_SCRATCH(0)); survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data); @@ -238,7 +257,7 @@ int xe_survivability_mode_enable(struct xe_device *xe) struct xe_survivability_info *info; struct pci_dev *pdev = to_pci_dev(xe->drm.dev); - if (!survivability_mode_requested(xe)) + if (!xe_survivability_mode_is_requested(xe)) return 0; survivability->size = MAX_SCRATCH_MMIO; diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.h b/drivers/gpu/drm/xe/xe_survivability_mode.h index d7e64885570d..02231c2bf008 100644 --- a/drivers/gpu/drm/xe/xe_survivability_mode.h +++ b/drivers/gpu/drm/xe/xe_survivability_mode.h @@ -12,5 +12,6 @@ struct xe_device; int xe_survivability_mode_enable(struct xe_device *xe); bool xe_survivability_mode_is_enabled(struct xe_device *xe); +bool xe_survivability_mode_is_requested(struct xe_device *xe); #endif /* _XE_SURVIVABILITY_MODE_H_ */ -- 2.51.0 From 3ded92c439449d69017d7a41c5eda3392d6724c7 Mon Sep 17 00:00:00 2001 From: Shuicheng Lin Date: Sat, 5 Apr 2025 17:15:39 +0000 Subject: [PATCH 06/16] drm/xe: remove unused LE_COS The LE_COS definition missed passing the value parameter to 
REG_FIELD_PREP. This didn't cause build errors because the entire macro
was unused. The value for this field is universally "0" for every MOCS
entry on the old Xe_LP platforms, and the whole field has been removed
from Xe_HP onward. Just delete the line so that we don't have an unused
definition.

Suggested-by: Matt Roper 
Reviewed-by: Matt Roper 
Cc: Lucas De Marchi 
Signed-off-by: Shuicheng Lin 
Link: https://lore.kernel.org/r/20250405171539.599850-1-shuicheng.lin@intel.com
Signed-off-by: Matt Roper 
---
 drivers/gpu/drm/xe/regs/xe_gt_regs.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
index 58f4218c2569..cbb9f7cbcfc0 100644
--- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
@@ -62,7 +62,6 @@
 #define LE_SSE_MASK			REG_GENMASK(18, 17)
 #define LE_SSE(value)			REG_FIELD_PREP(LE_SSE_MASK, value)
 #define LE_COS_MASK			REG_GENMASK(16, 15)
-#define LE_COS(value)			REG_FIELD_PREP(LE_COS_MASK)
 #define LE_SCF_MASK			REG_BIT(14)
 #define LE_SCF(value)			REG_FIELD_PREP(LE_SCF_MASK, value)
 #define LE_PFM_MASK			REG_GENMASK(13, 11)
-- 
2.51.0

From 29582e0ea75c95668d168b12406e3c56cf5a73c4 Mon Sep 17 00:00:00 2001
From: Matthew Brost 
Date: Tue, 8 Apr 2025 08:59:15 -0700
Subject: [PATCH 07/16] drm/xe: Add page queue multiplier

For an unknown reason the math to determine the PF queue size is not
correct - compute UMD applications are overflowing the PF queue, which
is fatal. A multiplier of 8 fixes the problem.

Fixes: 3338e4f90c14 ("drm/xe: Use topology to determine page fault queue size")
Cc: stable@vger.kernel.org
Signed-off-by: Matthew Brost 
Reviewed-by: Jagmeet Randhawa 
Link: https://lore.kernel.org/r/20250408155915.78770-1-matthew.brost@intel.com
---
 drivers/gpu/drm/xe/xe_gt_pagefault.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
index 9fa11e837dd1..10622ca471a2 100644
--- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
@@ -435,9 +435,16 @@ static int xe_alloc_pf_queue(struct xe_gt *gt, struct pf_queue *pf_queue)
 	num_eus = bitmap_weight(gt->fuse_topo.eu_mask_per_dss,
 				XE_MAX_EU_FUSE_BITS) * num_dss;
 
-	/* user can issue separate page faults per EU and per CS */
+	/*
+	 * user can issue separate page faults per EU and per CS
+	 *
+	 * XXX: Multiplier required as compute UMDs are getting PF queue
+	 * errors without it. Follow up on why this multiplier is required.
+	 */
+#define PF_MULTIPLIER	8
 	pf_queue->num_dw =
-		(num_eus + XE_NUM_HW_ENGINES) * PF_MSG_LEN_DW;
+		(num_eus + XE_NUM_HW_ENGINES) * PF_MSG_LEN_DW * PF_MULTIPLIER;
+#undef PF_MULTIPLIER
 
 	pf_queue->gt = gt;
 	pf_queue->data = devm_kcalloc(xe->drm.dev, pf_queue->num_dw,
-- 
2.51.0

From d3e8349edf7ed9eaf076ab3d9973331ccc20e26c Mon Sep 17 00:00:00 2001
From: John Harrison 
Date: Thu, 3 Apr 2025 11:56:11 -0700
Subject: [PATCH 08/16] drm/xe/guc: Enable w/a 16026508708

The workaround is only relevant to SRIOV, but it does affect all
platforms.
Signed-off-by: John Harrison 
Reviewed-by: Daniele Ceraolo Spurio 
Link: https://lore.kernel.org/r/20250403185619.1555853-2-John.C.Harrison@Intel.com
Signed-off-by: Lucas De Marchi 
---
 drivers/gpu/drm/xe/abi/guc_klvs_abi.h | 1 +
 drivers/gpu/drm/xe/xe_guc_ads.c       | 5 +++++
 drivers/gpu/drm/xe/xe_wa_oob.rules    | 2 ++
 3 files changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h
index d633f1c739e4..7de8f827281f 100644
--- a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h
+++ b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h
@@ -367,6 +367,7 @@ enum xe_guc_klv_ids {
 	GUC_WA_KLV_NP_RD_WRITE_TO_CLEAR_RCSM_AT_CGP_LATE_RESTORE = 0x9008,
 	GUC_WORKAROUND_KLV_ID_BACK_TO_BACK_RCS_ENGINE_RESET = 0x9009,
 	GUC_WA_KLV_WAKE_POWER_DOMAINS_FOR_OUTBOUND_MMIO = 0x900a,
+	GUC_WA_KLV_RESET_BB_STACK_PTR_ON_VF_SWITCH = 0x900b,
 };
 
 #endif
diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
index 88400f249e61..315f86c9164f 100644
--- a/drivers/gpu/drm/xe/xe_guc_ads.c
+++ b/drivers/gpu/drm/xe/xe_guc_ads.c
@@ -376,6 +376,11 @@ static void guc_waklv_init(struct xe_guc_ads *ads)
 					GUC_WORKAROUND_KLV_ID_BACK_TO_BACK_RCS_ENGINE_RESET,
 					&offset, &remain);
 
+	if (GUC_FIRMWARE_VER(&gt->uc.guc) >= MAKE_GUC_VER(70, 44, 0) && XE_WA(gt, 16026508708))
+		guc_waklv_enable_simple(ads,
+					GUC_WA_KLV_RESET_BB_STACK_PTR_ON_VF_SWITCH,
+					&offset, &remain);
+
 	size = guc_ads_waklv_size(ads) - remain;
 	if (!size)
 		return;
diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules
index 9b9e176992a8..9efc5accd43d 100644
--- a/drivers/gpu/drm/xe/xe_wa_oob.rules
+++ b/drivers/gpu/drm/xe/xe_wa_oob.rules
@@ -57,3 +57,5 @@ no_media_l3	MEDIA_VERSION(3000)
 		GRAPHICS_VERSION(1260), GRAPHICS_STEP(A0, B0)
 16023105232	GRAPHICS_VERSION_RANGE(2001, 3001)
 		MEDIA_VERSION_RANGE(1301, 3000)
+16026508708	GRAPHICS_VERSION_RANGE(1200, 3001)
+		MEDIA_VERSION_RANGE(1300, 3000)
-- 
2.51.0

From 725648bcf28feddb2c25d752d11b5f5070b0a963 Mon Sep 17 00:00:00 2001
From: John Harrison 
Date: Thu, 3 Apr 2025 11:56:15 -0700
Subject: [PATCH 09/16] drm/xe/guc: Bump the recommended GuC version to 70.44.1

A new workaround requires a newer GuC version, so recommend that users
install it.
Signed-off-by: John Harrison Reviewed-by: Daniele Ceraolo Spurio Link: https://lore.kernel.org/r/20250403185619.1555853-6-John.C.Harrison@Intel.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_uc_fw.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_uc_fw.c b/drivers/gpu/drm/xe/xe_uc_fw.c index 4a16d3c40ea9..2741849bbf4d 100644 --- a/drivers/gpu/drm/xe/xe_uc_fw.c +++ b/drivers/gpu/drm/xe/xe_uc_fw.c @@ -114,16 +114,16 @@ struct fw_blobs_by_type { #define XE_GT_TYPE_ANY XE_GT_TYPE_UNINITIALIZED #define XE_GUC_FIRMWARE_DEFS(fw_def, mmp_ver, major_ver) \ - fw_def(BATTLEMAGE, GT_TYPE_ANY, major_ver(xe, guc, bmg, 70, 29, 2)) \ - fw_def(LUNARLAKE, GT_TYPE_ANY, major_ver(xe, guc, lnl, 70, 29, 2)) \ - fw_def(METEORLAKE, GT_TYPE_ANY, major_ver(i915, guc, mtl, 70, 29, 2)) \ - fw_def(DG2, GT_TYPE_ANY, major_ver(i915, guc, dg2, 70, 29, 2)) \ - fw_def(DG1, GT_TYPE_ANY, major_ver(i915, guc, dg1, 70, 29, 2)) \ - fw_def(ALDERLAKE_N, GT_TYPE_ANY, major_ver(i915, guc, tgl, 70, 29, 2)) \ - fw_def(ALDERLAKE_P, GT_TYPE_ANY, major_ver(i915, guc, adlp, 70, 29, 2)) \ - fw_def(ALDERLAKE_S, GT_TYPE_ANY, major_ver(i915, guc, tgl, 70, 29, 2)) \ - fw_def(ROCKETLAKE, GT_TYPE_ANY, major_ver(i915, guc, tgl, 70, 29, 2)) \ - fw_def(TIGERLAKE, GT_TYPE_ANY, major_ver(i915, guc, tgl, 70, 29, 2)) + fw_def(BATTLEMAGE, GT_TYPE_ANY, major_ver(xe, guc, bmg, 70, 44, 1)) \ + fw_def(LUNARLAKE, GT_TYPE_ANY, major_ver(xe, guc, lnl, 70, 44, 1)) \ + fw_def(METEORLAKE, GT_TYPE_ANY, major_ver(i915, guc, mtl, 70, 44, 1)) \ + fw_def(DG2, GT_TYPE_ANY, major_ver(i915, guc, dg2, 70, 44, 1)) \ + fw_def(DG1, GT_TYPE_ANY, major_ver(i915, guc, dg1, 70, 44, 1)) \ + fw_def(ALDERLAKE_N, GT_TYPE_ANY, major_ver(i915, guc, tgl, 70, 44, 1)) \ + fw_def(ALDERLAKE_P, GT_TYPE_ANY, major_ver(i915, guc, adlp, 70, 44, 1)) \ + fw_def(ALDERLAKE_S, GT_TYPE_ANY, major_ver(i915, guc, tgl, 70, 44, 1)) \ + fw_def(ROCKETLAKE, GT_TYPE_ANY, major_ver(i915, guc, tgl, 70, 44, 1)) \ + fw_def(TIGERLAKE, GT_TYPE_ANY, major_ver(i915, guc, tgl, 70, 44, 1)) #define XE_HUC_FIRMWARE_DEFS(fw_def, mmp_ver, no_ver) \ fw_def(BATTLEMAGE, GT_TYPE_ANY, no_ver(xe, huc, bmg)) \ -- 2.51.0 From 80742a1aa26ea31a5c48da6f4da1d2e6ce6a2ab2 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 9 Apr 2025 07:09:56 -0700 Subject: [PATCH 10/16] drm/xe: Allow to drop vram resizing MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The default behavior if the LMEMBAR doesn't match the maximum possible size is to try to resize it. However the user might want to keep, even for testing the behavior with small BAR, whatever size was set via sysfs. Change the module parameter to int and check for negative value. 
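For illustration, the resulting behavior is (hypothetical invocations,
not part of this patch; values follow the MODULE_PARM_DESC below):

	# modprobe xe vram_bar_size=-1    # <0: keep the current BAR size (e.g. one set via sysfs)
	# modprobe xe vram_bar_size=0     # 0: default, resize to the maximum needed size
	# modprobe xe vram_bar_size=2048  # >0: force a 2048 MiB BAR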
Cc: Michał Winiarski 
Reviewed-by: Michał Winiarski 
Link: https://lore.kernel.org/r/20250409-bar-resize-param-v1-1-75bf4df38aa0@intel.com
Signed-off-by: Lucas De Marchi 
---
 drivers/gpu/drm/xe/xe_module.c | 4 ++--
 drivers/gpu/drm/xe/xe_vram.c   | 5 ++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
index be8603b16ff3..64bf46646544 100644
--- a/drivers/gpu/drm/xe/xe_module.c
+++ b/drivers/gpu/drm/xe/xe_module.c
@@ -39,8 +39,8 @@ MODULE_PARM_DESC(force_execlist, "Force Execlist submission");
 module_param_named(probe_display, xe_modparam.probe_display, bool, 0444);
 MODULE_PARM_DESC(probe_display, "Probe display HW, otherwise it's left untouched (default: true)");
 
-module_param_named(vram_bar_size, xe_modparam.force_vram_bar_size, uint, 0600);
-MODULE_PARM_DESC(vram_bar_size, "Set the vram bar size(in MiB)");
+module_param_named(vram_bar_size, xe_modparam.force_vram_bar_size, int, 0600);
+MODULE_PARM_DESC(vram_bar_size, "Set the vram bar size (in MiB) - <0=disable-resize, 0=max-needed-size[default], >0=force-size");
 
 module_param_named(guc_log_level, xe_modparam.guc_log_level, int, 0600);
 MODULE_PARM_DESC(guc_log_level, "GuC firmware logging level (0=disable, 1..5=enable with verbosity min..max)");
diff --git a/drivers/gpu/drm/xe/xe_vram.c b/drivers/gpu/drm/xe/xe_vram.c
index b1f81dca610d..e421a74fb87c 100644
--- a/drivers/gpu/drm/xe/xe_vram.c
+++ b/drivers/gpu/drm/xe/xe_vram.c
@@ -49,7 +49,7 @@ _resize_bar(struct xe_device *xe, int resno, resource_size_t size)
  */
 static void resize_vram_bar(struct xe_device *xe)
 {
-	u64 force_vram_bar_size = xe_modparam.force_vram_bar_size;
+	int force_vram_bar_size = xe_modparam.force_vram_bar_size;
 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
 	struct pci_bus *root = pdev->bus;
 	resource_size_t current_size;
@@ -66,6 +66,9 @@ static void resize_vram_bar(struct xe_device *xe)
 	if (!bar_size_mask)
 		return;
 
+	if (force_vram_bar_size < 0)
+		return;
+
 	/* set to a specific size? */
 	if (force_vram_bar_size) {
 		u32 bar_size_bit;
-- 
2.51.0

From e15826bb3c2c5377eedc757f2adec8dcaa5255f7 Mon Sep 17 00:00:00 2001
From: Michal Wajdeczko 
Date: Thu, 3 Apr 2025 16:26:33 +0200
Subject: [PATCH 11/16] drm/xe/guc: Refactor GuC debugfs initialization

We don't have to drmm_kmalloc() a local copy of debugfs_list just to
write our pointer to the struct xe_guc there, since we can extract a
pointer to the struct xe_gt from the grandparent debugfs entry, in a
similar way to what we did for the GT debugfs files.

Note that there is no change in the file/directory structure; this only
refactors how the files are created and how the functions are called.
Signed-off-by: Michal Wajdeczko Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250403142635.1821-2-michal.wajdeczko@intel.com --- drivers/gpu/drm/xe/xe_guc_debugfs.c | 130 ++++++++++++++-------------- 1 file changed, 67 insertions(+), 63 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_debugfs.c b/drivers/gpu/drm/xe/xe_guc_debugfs.c index c569ff456e74..9a1c78b89f45 100644 --- a/drivers/gpu/drm/xe/xe_guc_debugfs.c +++ b/drivers/gpu/drm/xe/xe_guc_debugfs.c @@ -17,101 +17,105 @@ #include "xe_macros.h" #include "xe_pm.h" -static struct xe_guc *node_to_guc(struct drm_info_node *node) -{ - return node->info_ent->data; -} - -static int guc_info(struct seq_file *m, void *data) +/* + * guc_debugfs_show - A show callback for struct drm_info_list + * @m: the &seq_file + * @data: data used by the drm debugfs helpers + * + * This callback can be used in struct drm_info_list to describe debugfs + * files that are &xe_guc specific in similar way how we handle &xe_gt + * specific files using &xe_gt_debugfs_simple_show. + * + * It is assumed that those debugfs files will be created on directory entry + * which grandparent struct dentry d_inode->i_private points to &xe_gt. + * + * /sys/kernel/debug/dri/0/ + * ├── gt0 # dent->d_parent->d_parent (d_inode->i_private == gt) + * │   ├── uc # dent->d_parent + * │   │   ├── guc_info # dent + * │   │   ├── guc_... + * + * This function assumes that &m->private will be set to the &struct + * drm_info_node corresponding to the instance of the info on a given &struct + * drm_minor (see struct drm_info_list.show for details). + * + * This function also assumes that struct drm_info_list.data will point to the + * function code that will actually print a file content:: + * + * int (*print)(struct xe_guc *, struct drm_printer *) + * + * Example:: + * + * int foo(struct xe_guc *guc, struct drm_printer *p) + * { + * drm_printf(p, "enabled %d\n", guc->submission_state.enabled); + * return 0; + * } + * + * static const struct drm_info_list bar[] = { + * { name = "foo", .show = guc_debugfs_show, .data = foo }, + * }; + * + * parent = debugfs_create_dir("uc", gtdir); + * drm_debugfs_create_files(bar, ARRAY_SIZE(bar), parent, minor); + * + * Return: 0 on success or a negative error code on failure. 
+ */
+static int guc_debugfs_show(struct seq_file *m, void *data)
 {
-	struct xe_guc *guc = node_to_guc(m->private);
-	struct xe_device *xe = guc_to_xe(guc);
 	struct drm_printer p = drm_seq_file_printer(m);
+	struct drm_info_node *node = m->private;
+	struct dentry *parent = node->dent->d_parent;
+	struct dentry *grandparent = parent->d_parent;
+	struct xe_gt *gt = grandparent->d_inode->i_private;
+	struct xe_device *xe = gt_to_xe(gt);
+	int (*print)(struct xe_guc *, struct drm_printer *) = node->info_ent->data;
+	int ret;
 
 	xe_pm_runtime_get(xe);
-	xe_guc_print_info(guc, &p);
+	ret = print(&gt->uc.guc, &p);
 	xe_pm_runtime_put(xe);
 
-	return 0;
+	return ret;
 }
 
-static int guc_log(struct seq_file *m, void *data)
+static int guc_log(struct xe_guc *guc, struct drm_printer *p)
 {
-	struct xe_guc *guc = node_to_guc(m->private);
-	struct xe_device *xe = guc_to_xe(guc);
-	struct drm_printer p = drm_seq_file_printer(m);
-
-	xe_pm_runtime_get(xe);
-	xe_guc_log_print(&guc->log, &p);
-	xe_pm_runtime_put(xe);
-
+	xe_guc_log_print(&guc->log, p);
 	return 0;
 }
 
-static int guc_log_dmesg(struct seq_file *m, void *data)
+static int guc_log_dmesg(struct xe_guc *guc, struct drm_printer *p)
 {
-	struct xe_guc *guc = node_to_guc(m->private);
-	struct xe_device *xe = guc_to_xe(guc);
-
-	xe_pm_runtime_get(xe);
 	xe_guc_log_print_dmesg(&guc->log);
-	xe_pm_runtime_put(xe);
-
 	return 0;
 }
 
-static int guc_ctb(struct seq_file *m, void *data)
+static int guc_ctb(struct xe_guc *guc, struct drm_printer *p)
 {
-	struct xe_guc *guc = node_to_guc(m->private);
-	struct xe_device *xe = guc_to_xe(guc);
-	struct drm_printer p = drm_seq_file_printer(m);
-
-	xe_pm_runtime_get(xe);
-	xe_guc_ct_print(&guc->ct, &p, true);
-	xe_pm_runtime_put(xe);
-
+	xe_guc_ct_print(&guc->ct, p, true);
 	return 0;
 }
 
-static int guc_pc(struct seq_file *m, void *data)
+static int guc_pc(struct xe_guc *guc, struct drm_printer *p)
 {
-	struct xe_guc *guc = node_to_guc(m->private);
-	struct xe_device *xe = guc_to_xe(guc);
-	struct drm_printer p = drm_seq_file_printer(m);
-
-	xe_pm_runtime_get(xe);
-	xe_guc_pc_print(&guc->pc, &p);
-	xe_pm_runtime_put(xe);
-
+	xe_guc_pc_print(&guc->pc, p);
 	return 0;
 }
 
 static const struct drm_info_list debugfs_list[] = {
-	{"guc_info", guc_info, 0},
-	{"guc_log", guc_log, 0},
-	{"guc_log_dmesg", guc_log_dmesg, 0},
-	{"guc_ctb", guc_ctb, 0},
-	{"guc_pc", guc_pc, 0},
+	{ "guc_info", .show = guc_debugfs_show, .data = xe_guc_print_info },
+	{ "guc_log", .show = guc_debugfs_show, .data = guc_log },
+	{ "guc_log_dmesg", .show = guc_debugfs_show, .data = guc_log_dmesg },
+	{ "guc_ctb", .show = guc_debugfs_show, .data = guc_ctb },
+	{ "guc_pc", .show = guc_debugfs_show, .data = guc_pc },
 };
 
 void xe_guc_debugfs_register(struct xe_guc *guc, struct dentry *parent)
 {
 	struct drm_minor *minor = guc_to_xe(guc)->drm.primary;
-	struct drm_info_list *local;
-	int i;
-
-#define DEBUGFS_SIZE	(ARRAY_SIZE(debugfs_list) * sizeof(struct drm_info_list))
-	local = drmm_kmalloc(&guc_to_xe(guc)->drm, DEBUGFS_SIZE, GFP_KERNEL);
-	if (!local)
-		return;
-
-	memcpy(local, debugfs_list, DEBUGFS_SIZE);
-#undef DEBUGFS_SIZE
-
-	for (i = 0; i < ARRAY_SIZE(debugfs_list); ++i)
-		local[i].data = guc;
 
-	drm_debugfs_create_files(local,
+	drm_debugfs_create_files(debugfs_list,
 				 ARRAY_SIZE(debugfs_list),
 				 parent, minor);
 }
-- 
2.51.0

From 387444984d7b53dbaee263887cad4ea7c8e57b34 Mon Sep 17 00:00:00 2001
From: Michal Wajdeczko 
Date: Thu, 3 Apr 2025 16:26:34 +0200
Subject: [PATCH 12/16] drm/xe/guc: Don't expose GuC privileged debugfs files if VF

Some of the GuC debugfs files require
access to the data that is not available on the VFs. Don't expose those files on the VF driver. Signed-off-by: Michal Wajdeczko Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250403142635.1821-3-michal.wajdeczko@intel.com --- drivers/gpu/drm/xe/xe_guc_debugfs.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_debugfs.c b/drivers/gpu/drm/xe/xe_guc_debugfs.c index 9a1c78b89f45..f33013f8a0f3 100644 --- a/drivers/gpu/drm/xe/xe_guc_debugfs.c +++ b/drivers/gpu/drm/xe/xe_guc_debugfs.c @@ -103,11 +103,20 @@ static int guc_pc(struct xe_guc *guc, struct drm_printer *p) return 0; } -static const struct drm_info_list debugfs_list[] = { +/* + * only for GuC debugfs files which can be safely used on the VF as well: + * - without access to the GuC privileged registers + * - without access to the PF specific GuC objects + */ +static const struct drm_info_list vf_safe_debugfs_list[] = { { "guc_info", .show = guc_debugfs_show, .data = xe_guc_print_info }, + { "guc_ctb", .show = guc_debugfs_show, .data = guc_ctb }, +}; + +/* everything else should be added here */ +static const struct drm_info_list pf_only_debugfs_list[] = { { "guc_log", .show = guc_debugfs_show, .data = guc_log }, { "guc_log_dmesg", .show = guc_debugfs_show, .data = guc_log_dmesg }, - { "guc_ctb", .show = guc_debugfs_show, .data = guc_ctb }, { "guc_pc", .show = guc_debugfs_show, .data = guc_pc }, }; @@ -115,7 +124,12 @@ void xe_guc_debugfs_register(struct xe_guc *guc, struct dentry *parent) { struct drm_minor *minor = guc_to_xe(guc)->drm.primary; - drm_debugfs_create_files(debugfs_list, - ARRAY_SIZE(debugfs_list), + drm_debugfs_create_files(vf_safe_debugfs_list, + ARRAY_SIZE(vf_safe_debugfs_list), parent, minor); + + if (!IS_SRIOV_VF(guc_to_xe(guc))) + drm_debugfs_create_files(pf_only_debugfs_list, + ARRAY_SIZE(pf_only_debugfs_list), + parent, minor); } -- 2.51.0 From d11c5a928a6e1d786e25a9284ef59bf58a02cf0d Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 3 Apr 2025 16:26:35 +0200 Subject: [PATCH 13/16] drm/xe/vf: Don't expose privileged GT debugfs files if VF Some of the debugfs files require access to the registers that are not accessible to the VFs. Don't expose those files on VF drivers. 
Signed-off-by: Michal Wajdeczko 
Cc: Marcin Bernatowicz 
Cc: Lucas De Marchi 
Tested-by: Marcin Bernatowicz 
Reviewed-by: Marcin Bernatowicz 
Reviewed-by: Lucas De Marchi 
Link: https://lore.kernel.org/r/20250403142635.1821-4-michal.wajdeczko@intel.com
---
 drivers/gpu/drm/xe/xe_gt_debugfs.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_gt_debugfs.c b/drivers/gpu/drm/xe/xe_gt_debugfs.c
index 2d63a69cbfa3..a88076e9cc7d 100644
--- a/drivers/gpu/drm/xe/xe_gt_debugfs.c
+++ b/drivers/gpu/drm/xe/xe_gt_debugfs.c
@@ -299,20 +299,20 @@ static int hwconfig(struct xe_gt *gt, struct drm_printer *p)
 	return 0;
 }
 
-static const struct drm_info_list debugfs_list[] = {
-	{"hw_engines", .show = xe_gt_debugfs_simple_show, .data = hw_engines},
+/*
+ * only for GT debugfs files which can be safely used on the VF as well:
+ * - without access to the GT privileged registers
+ * - without access to the PF specific data
+ */
+static const struct drm_info_list vf_safe_debugfs_list[] = {
 	{"force_reset", .show = xe_gt_debugfs_simple_show, .data = force_reset},
 	{"force_reset_sync", .show = xe_gt_debugfs_simple_show, .data = force_reset_sync},
 	{"sa_info", .show = xe_gt_debugfs_simple_show, .data = sa_info},
 	{"topology", .show = xe_gt_debugfs_simple_show, .data = topology},
-	{"steering", .show = xe_gt_debugfs_simple_show, .data = steering},
 	{"ggtt", .show = xe_gt_debugfs_simple_show, .data = ggtt},
-	{"powergate_info", .show = xe_gt_debugfs_simple_show, .data = powergate_info},
 	{"register-save-restore", .show = xe_gt_debugfs_simple_show, .data = register_save_restore},
 	{"workarounds", .show = xe_gt_debugfs_simple_show, .data = workarounds},
 	{"tunings", .show = xe_gt_debugfs_simple_show, .data = tunings},
-	{"pat", .show = xe_gt_debugfs_simple_show, .data = pat},
-	{"mocs", .show = xe_gt_debugfs_simple_show, .data = mocs},
 	{"default_lrc_rcs", .show = xe_gt_debugfs_simple_show, .data = rcs_default_lrc},
 	{"default_lrc_ccs", .show = xe_gt_debugfs_simple_show, .data = ccs_default_lrc},
 	{"default_lrc_bcs", .show = xe_gt_debugfs_simple_show, .data = bcs_default_lrc},
@@ -322,6 +322,15 @@ static const struct drm_info_list debugfs_list[] = {
 	{"hwconfig", .show = xe_gt_debugfs_simple_show, .data = hwconfig},
 };
 
+/* everything else should be added here */
+static const struct drm_info_list pf_only_debugfs_list[] = {
+	{"hw_engines", .show = xe_gt_debugfs_simple_show, .data = hw_engines},
+	{"mocs", .show = xe_gt_debugfs_simple_show, .data = mocs},
+	{"pat", .show = xe_gt_debugfs_simple_show, .data = pat},
+	{"powergate_info", .show = xe_gt_debugfs_simple_show, .data = powergate_info},
+	{"steering", .show = xe_gt_debugfs_simple_show, .data = steering},
+};
+
 void xe_gt_debugfs_register(struct xe_gt *gt)
 {
 	struct xe_device *xe = gt_to_xe(gt);
@@ -345,10 +354,15 @@ void xe_gt_debugfs_register(struct xe_gt *gt)
 	 */
 	root->d_inode->i_private = gt;
 
-	drm_debugfs_create_files(debugfs_list,
-				 ARRAY_SIZE(debugfs_list),
+	drm_debugfs_create_files(vf_safe_debugfs_list,
				 ARRAY_SIZE(vf_safe_debugfs_list),
 				 root, minor);
 
+	if (!IS_SRIOV_VF(xe))
+		drm_debugfs_create_files(pf_only_debugfs_list,
+					 ARRAY_SIZE(pf_only_debugfs_list),
+					 root, minor);
+
 	xe_uc_debugfs_register(&gt->uc, root);
 
 	if (IS_SRIOV_PF(xe))
-- 
2.51.0

From 84d37635986987649fec6dd7358392243a35601e Mon Sep 17 00:00:00 2001
From: Michal Wajdeczko 
Date: Fri, 11 Apr 2025 21:30:30 +0200
Subject: [PATCH 14/16] drm/xe/pf: Don't show GGTT/LMEM debugfs files under media GT

Most of the PF's debugfs files (and their
implementations) are based on the GT hierarchy even if files are related to GGTT or LMEM data, that are related to the tile. While we could reach the tile data from any GT, to avoid potential misuse, some functions allow to be used on the primary GT only, and may use asserts to enforce that. In our case, the following assert could be seen when reading the /sys/kernel/debug/dri/0000:00:02.0/gt1/pf/ggtt_available [ ] xe 0000:00:02.0: [drm] Assertion `!xe_gt_is_media_type(gt)` failed! [ ] WARNING: CPU: 4 PID: 10609 at drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c:379 pf_get_spare_ggtt+0x256/0x4e0 [xe] [ ] RIP: 0010:pf_get_spare_ggtt+0x256/0x4e0 [xe] [ ] Call Trace: [ ] [ ] xe_gt_sriov_pf_config_print_available_ggtt+0xb7/0x480 [xe] [ ] ? __memcg_slab_post_alloc_hook+0x12f/0x3f0 [ ] xe_gt_debugfs_simple_show+0x7b/0xb0 [xe] [ ] ? __pfx___drm_printfn_seq_file+0x10/0x10 [ ] ? __pfx___drm_puts_seq_file+0x10/0x10 [ ] seq_read_iter+0x139/0x4e0 [ ] seq_read+0x11d/0x160 [ ] full_proxy_read+0x6b/0xb0 [ ] vfs_read+0xfa/0x390 Fix that by moving GGTT/LMEM debugfs attributes to separate lists and register them only when applicable (on primary GT, on DGFX). Signed-off-by: Michal Wajdeczko Tested-by: Marcin Bernatowicz Reviewed-by: Marcin Bernatowicz Link: https://lore.kernel.org/r/20250411193030.1865-1-michal.wajdeczko@intel.com --- drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c | 66 +++++++++++++++------ 1 file changed, 49 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c index b2521dd6ec42..0fe47f41b63c 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c @@ -51,26 +51,17 @@ static unsigned int extract_vfid(struct dentry *d) * /sys/kernel/debug/dri/0/ * ├── gt0 * │   ├── pf - * │   │   ├── ggtt_available - * │   │   ├── ggtt_provisioned * │   │   ├── contexts_provisioned * │   │   ├── doorbells_provisioned * │   │   ├── runtime_registers * │   │   ├── negotiated_versions * │   │   ├── adverse_events + * ├── gt1 + * │   ├── pf + * │   │   ├── ... 
*/ static const struct drm_info_list pf_info[] = { - { - "ggtt_available", - .show = xe_gt_debugfs_simple_show, - .data = xe_gt_sriov_pf_config_print_available_ggtt, - }, - { - "ggtt_provisioned", - .show = xe_gt_debugfs_simple_show, - .data = xe_gt_sriov_pf_config_print_ggtt, - }, { "contexts_provisioned", .show = xe_gt_debugfs_simple_show, @@ -81,11 +72,6 @@ static const struct drm_info_list pf_info[] = { .show = xe_gt_debugfs_simple_show, .data = xe_gt_sriov_pf_config_print_dbs, }, - { - "lmem_provisioned", - .show = xe_gt_debugfs_simple_show, - .data = xe_gt_sriov_pf_config_print_lmem, - }, { "runtime_registers", .show = xe_gt_debugfs_simple_show, @@ -103,6 +89,42 @@ static const struct drm_info_list pf_info[] = { }, }; +/* + * /sys/kernel/debug/dri/0/ + * ├── gt0 + * │   ├── pf + * │   │   ├── ggtt_available + * │   │   ├── ggtt_provisioned + */ + +static const struct drm_info_list pf_ggtt_info[] = { + { + "ggtt_available", + .show = xe_gt_debugfs_simple_show, + .data = xe_gt_sriov_pf_config_print_available_ggtt, + }, + { + "ggtt_provisioned", + .show = xe_gt_debugfs_simple_show, + .data = xe_gt_sriov_pf_config_print_ggtt, + }, +}; + +/* + * /sys/kernel/debug/dri/0/ + * ├── gt0 + * │   ├── pf + * │   │   ├── lmem_provisioned + */ + +static const struct drm_info_list pf_lmem_info[] = { + { + "lmem_provisioned", + .show = xe_gt_debugfs_simple_show, + .data = xe_gt_sriov_pf_config_print_lmem, + }, +}; + /* * /sys/kernel/debug/dri/0/ * ├── gt0 @@ -532,6 +554,16 @@ void xe_gt_sriov_pf_debugfs_register(struct xe_gt *gt, struct dentry *root) pfdentry->d_inode->i_private = gt; drm_debugfs_create_files(pf_info, ARRAY_SIZE(pf_info), pfdentry, minor); + if (!xe_gt_is_media_type(gt)) { + drm_debugfs_create_files(pf_ggtt_info, + ARRAY_SIZE(pf_ggtt_info), + pfdentry, minor); + if (IS_DGFX(gt_to_xe(gt))) + drm_debugfs_create_files(pf_lmem_info, + ARRAY_SIZE(pf_lmem_info), + pfdentry, minor); + } + pf_add_policy_attrs(gt, pfdentry); pf_add_config_attrs(gt, pfdentry, PFID); -- 2.51.0 From c31a0b6402d15b530514eee9925adfcb8cfbb1c9 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 9 Apr 2025 21:59:34 -0700 Subject: [PATCH 15/16] drm/xe: Set LRC addresses before guc load The metadata saved in the ADS is read by GuC when it's initialized. Saving the addresses to the LRCs when they are populated is too late as GuC will keep using the old ones. This was causing GuC to use the RCS LRC for any engine class. It's not a big problem on a Linux-only scenario since the they are used by GuC only on media engines when the watchdog is triggered. However, in a virtualization scenario with Windows as the VF, it causes the wrong LRCs to be loaded as the watchdog is used for all engines. Fix it by letting guc_golden_lrc_init() initialize the metadata, like other *_init() functions, and later guc_golden_lrc_populate() to copy the LRCs to the right places. The former is called before the second GuC load, while the latter is called after LRCs have been recorded. 
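The resulting call flow is roughly the following (an illustrative
sketch based on the functions touched by this patch, not a verbatim
trace of the probe path):

	/* before the GuC is loaded: sizes and GGTT addresses go into the ADS */
	xe_guc_ads_populate(ads);		/* calls guc_golden_lrc_init() */

	/* after the default LRCs have been recorded by the first submission */
	xe_guc_ads_populate_post_load(ads);	/* calls guc_golden_lrc_populate() */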
Cc: Chee Yin Wong Cc: John Harrison Cc: Matt Roper Cc: Matthew Brost Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: # v6.11+ Reviewed-by: Matthew Brost Tested-by: Chee Yin Wong Link: https://lore.kernel.org/r/20250409-fix-guc-ads-v1-1-494135f7a5d0@intel.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_guc_ads.c | 75 ++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 30 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c index 315f86c9164f..44c1fa2fe7c8 100644 --- a/drivers/gpu/drm/xe/xe_guc_ads.c +++ b/drivers/gpu/drm/xe/xe_guc_ads.c @@ -496,24 +496,52 @@ static void fill_engine_enable_masks(struct xe_gt *gt, engine_enable_mask(gt, XE_ENGINE_CLASS_OTHER)); } -static void guc_prep_golden_lrc_null(struct xe_guc_ads *ads) +/* + * Write the offsets corresponding to the golden LRCs. The actual data is + * populated later by guc_golden_lrc_populate() + */ +static void guc_golden_lrc_init(struct xe_guc_ads *ads) { struct xe_device *xe = ads_to_xe(ads); + struct xe_gt *gt = ads_to_gt(ads); struct iosys_map info_map = IOSYS_MAP_INIT_OFFSET(ads_to_map(ads), offsetof(struct __guc_ads_blob, system_info)); - u8 guc_class; + size_t alloc_size, real_size; + u32 addr_ggtt, offset; + int class; + + offset = guc_ads_golden_lrc_offset(ads); + addr_ggtt = xe_bo_ggtt_addr(ads->bo) + offset; + + for (class = 0; class < XE_ENGINE_CLASS_MAX; ++class) { + u8 guc_class; + + guc_class = xe_engine_class_to_guc_class(class); - for (guc_class = 0; guc_class <= GUC_MAX_ENGINE_CLASSES; ++guc_class) { if (!info_map_read(xe, &info_map, engine_enabled_masks[guc_class])) continue; + real_size = xe_gt_lrc_size(gt, class); + alloc_size = PAGE_ALIGN(real_size); + + /* + * This interface is slightly confusing. We need to pass the + * base address of the full golden context and the size of just + * the engine state, which is the section of the context image + * that starts after the execlists LRC registers. This is + * required to allow the GuC to restore just the engine state + * when a watchdog reset occurs. + * We calculate the engine state size by removing the size of + * what comes before it in the context image (which is identical + * on all engines). + */ ads_blob_write(ads, ads.eng_state_size[guc_class], - guc_ads_golden_lrc_size(ads) - - xe_lrc_skip_size(xe)); + real_size - xe_lrc_skip_size(xe)); ads_blob_write(ads, ads.golden_context_lrca[guc_class], - xe_bo_ggtt_addr(ads->bo) + - guc_ads_golden_lrc_offset(ads)); + addr_ggtt); + + addr_ggtt += alloc_size; } } @@ -863,7 +891,7 @@ void xe_guc_ads_populate_minimal(struct xe_guc_ads *ads) xe_map_memset(ads_to_xe(ads), ads_to_map(ads), 0, 0, ads->bo->size); guc_policies_init(ads); - guc_prep_golden_lrc_null(ads); + guc_golden_lrc_init(ads); guc_mapping_table_init_invalid(gt, &info_map); guc_doorbell_init(ads); @@ -889,7 +917,7 @@ void xe_guc_ads_populate(struct xe_guc_ads *ads) guc_policies_init(ads); fill_engine_enable_masks(gt, &info_map); guc_mmio_reg_state_init(ads); - guc_prep_golden_lrc_null(ads); + guc_golden_lrc_init(ads); guc_mapping_table_init(gt, &info_map); guc_capture_prep_lists(ads); guc_doorbell_init(ads); @@ -909,18 +937,22 @@ void xe_guc_ads_populate(struct xe_guc_ads *ads) guc_ads_private_data_offset(ads)); } -static void guc_populate_golden_lrc(struct xe_guc_ads *ads) +/* + * After the golden LRC's are recorded for each engine class by the first + * submission, copy them to the ADS, as initialized earlier by + * guc_golden_lrc_init(). 
+ */ +static void guc_golden_lrc_populate(struct xe_guc_ads *ads) { struct xe_device *xe = ads_to_xe(ads); struct xe_gt *gt = ads_to_gt(ads); struct iosys_map info_map = IOSYS_MAP_INIT_OFFSET(ads_to_map(ads), offsetof(struct __guc_ads_blob, system_info)); size_t total_size = 0, alloc_size, real_size; - u32 addr_ggtt, offset; + u32 offset; int class; offset = guc_ads_golden_lrc_offset(ads); - addr_ggtt = xe_bo_ggtt_addr(ads->bo) + offset; for (class = 0; class < XE_ENGINE_CLASS_MAX; ++class) { u8 guc_class; @@ -937,26 +969,9 @@ static void guc_populate_golden_lrc(struct xe_guc_ads *ads) alloc_size = PAGE_ALIGN(real_size); total_size += alloc_size; - /* - * This interface is slightly confusing. We need to pass the - * base address of the full golden context and the size of just - * the engine state, which is the section of the context image - * that starts after the execlists LRC registers. This is - * required to allow the GuC to restore just the engine state - * when a watchdog reset occurs. - * We calculate the engine state size by removing the size of - * what comes before it in the context image (which is identical - * on all engines). - */ - ads_blob_write(ads, ads.eng_state_size[guc_class], - real_size - xe_lrc_skip_size(xe)); - ads_blob_write(ads, ads.golden_context_lrca[guc_class], - addr_ggtt); - xe_map_memcpy_to(xe, ads_to_map(ads), offset, gt->default_lrc[class], real_size); - addr_ggtt += alloc_size; offset += alloc_size; } @@ -965,7 +980,7 @@ static void guc_populate_golden_lrc(struct xe_guc_ads *ads) void xe_guc_ads_populate_post_load(struct xe_guc_ads *ads) { - guc_populate_golden_lrc(ads); + guc_golden_lrc_populate(ads); } static int guc_ads_action_update_policies(struct xe_guc_ads *ads, u32 policy_offset) -- 2.51.0 From 53e11d245c34656af56625bb06d59b9934428aba Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Thu, 3 Apr 2025 20:03:01 +0100 Subject: [PATCH 16/16] drm/xe: Adjust ringbuf emission for maximum possible size MAX_JOB_SIZE_DW seems to be undersized. For the worst case emission from __emit_job_gen12_render_compute I hand count 57 dwords so lets bump this to an even 58. Signed-off-by: Tvrtko Ursulin Reviewed-by: Francois Dugast Link: https://lore.kernel.org/r/20250403190317.6064-2-tvrtko.ursulin@igalia.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_ring_ops_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_ring_ops_types.h b/drivers/gpu/drm/xe/xe_ring_ops_types.h index 1ae56e2ee7b4..d7e3e150a9a5 100644 --- a/drivers/gpu/drm/xe/xe_ring_ops_types.h +++ b/drivers/gpu/drm/xe/xe_ring_ops_types.h @@ -8,7 +8,7 @@ struct xe_sched_job; -#define MAX_JOB_SIZE_DW 48 +#define MAX_JOB_SIZE_DW 58 #define MAX_JOB_SIZE_BYTES (MAX_JOB_SIZE_DW * 4) /** -- 2.51.0