From 16b7e65d299d56442879405ac85800877fa51355 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Mon, 12 May 2025 14:53:24 -0700 Subject: [PATCH 01/16] drm/xe/guc: Track FAST_REQ H2Gs to report where errors came from Most H2G messages are FAST_REQ which means no synchronous response is expected. The messages are sent as fire-and-forget with no tracking. However, errors can still be returned when something goes unexpectedly wrong. That leads to confusion due to not being able to match up the error response to the originating H2G. So add support for tracking the FAST_REQ H2Gs and matching up an error response to its originator. This is only enabled in XE_DEBUG builds given that such errors should never happen in a working system and there is an overhead for the tracking. Further, if XE_DEBUG_GUC is enabled then even more memory and time is used to record the call stack of each H2G and report that with the error. That makes it much easier to work out where a specific H2G came from if there are multiple code paths that can send it. v2: Some re-wording of comments and prints, more consistent use of #if vs stub functions - review feedback from Daniele & Michal). v3: Split config change to separate patch, improve a debug print (review feedback from Michal). v4: Bunch of minor tweaks (review feedback from Michal). Original-i915-code: Michal Wajdeczko Signed-off-by: John Harrison Reviewed-by: Daniele Ceraolo Spurio Reviewed-by: Michal Wajdeczko Link: https://lore.kernel.org/r/20250512215324.1457009-5-John.C.Harrison@Intel.com --- drivers/gpu/drm/xe/Kconfig.debug | 5 +- drivers/gpu/drm/xe/xe_guc_ct.c | 115 ++++++++++++++++++++++----- drivers/gpu/drm/xe/xe_guc_ct_types.h | 15 ++++ 3 files changed, 115 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/xe/Kconfig.debug b/drivers/gpu/drm/xe/Kconfig.debug index db063a513b1e..01735c6ece8b 100644 --- a/drivers/gpu/drm/xe/Kconfig.debug +++ b/drivers/gpu/drm/xe/Kconfig.debug @@ -90,10 +90,13 @@ config DRM_XE_DEBUG_GUC bool "Enable extra GuC related debug options" depends on DRM_XE_DEBUG default n + select STACKDEPOT help Choose this option when debugging guc issues. The GuC log buffer is increased to the maximum allowed, which should - be large enough for complex issues. + be large enough for complex issues. The tracking of FAST_REQ messages + is extended to include a record of the calling stack, which is then + dumped on a FAST_REQ error notification. Recommended for driver developers only. diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c index b09dd5788a27..822f4c33f730 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.c +++ b/drivers/gpu/drm/xe/xe_guc_ct.c @@ -628,6 +628,47 @@ static void g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len) spin_unlock_irq(&ct->fast_lock); } +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG) +static void fast_req_track(struct xe_guc_ct *ct, u16 fence, u16 action) +{ + unsigned int slot = fence % ARRAY_SIZE(ct->fast_req); +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC) + unsigned long entries[SZ_32]; + unsigned int n; + + n = stack_trace_save(entries, ARRAY_SIZE(entries), 1); + + /* May be called under spinlock, so avoid sleeping */ + ct->fast_req[slot].stack = stack_depot_save(entries, n, GFP_NOWAIT); +#endif + ct->fast_req[slot].fence = fence; + ct->fast_req[slot].action = action; +} +#else +static void fast_req_track(struct xe_guc_ct *ct, u16 fence, u16 action) +{ +} +#endif + +/* + * The CT protocol accepts a 16 bits fence. This field is fully owned by the + * driver, the GuC will just copy it to the reply message. Since we need to + * be able to distinguish between replies to REQUEST and FAST_REQUEST messages, + * we use one bit of the seqno as an indicator for that and a rolling counter + * for the remaining 15 bits. + */ +#define CT_SEQNO_MASK GENMASK(14, 0) +#define CT_SEQNO_UNTRACKED BIT(15) +static u16 next_ct_seqno(struct xe_guc_ct *ct, bool is_g2h_fence) +{ + u32 seqno = ct->fence_seqno++ & CT_SEQNO_MASK; + + if (!is_g2h_fence) + seqno |= CT_SEQNO_UNTRACKED; + + return seqno; +} + #define H2G_CT_HEADERS (GUC_CTB_HDR_LEN + 1) /* one DW CTB header and one DW HxG header */ static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len, @@ -704,6 +745,9 @@ static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len, FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION | GUC_HXG_EVENT_MSG_0_DATA0, action[0]); } else { + fast_req_track(ct, ct_fence_value, + FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, action[0])); + cmd[1] = FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_FAST_REQUEST) | FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION | @@ -736,25 +780,6 @@ corrupted: return -EPIPE; } -/* - * The CT protocol accepts a 16 bits fence. This field is fully owned by the - * driver, the GuC will just copy it to the reply message. Since we need to - * be able to distinguish between replies to REQUEST and FAST_REQUEST messages, - * we use one bit of the seqno as an indicator for that and a rolling counter - * for the remaining 15 bits. - */ -#define CT_SEQNO_MASK GENMASK(14, 0) -#define CT_SEQNO_UNTRACKED BIT(15) -static u16 next_ct_seqno(struct xe_guc_ct *ct, bool is_g2h_fence) -{ - u32 seqno = ct->fence_seqno++ & CT_SEQNO_MASK; - - if (!is_g2h_fence) - seqno |= CT_SEQNO_UNTRACKED; - - return seqno; -} - static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len, u32 g2h_len, u32 num_g2h, struct g2h_fence *g2h_fence) @@ -1146,6 +1171,55 @@ static int guc_crash_process_msg(struct xe_guc_ct *ct, u32 action) return 0; } +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG) +static void fast_req_report(struct xe_guc_ct *ct, u16 fence) +{ + u16 fence_min = U16_MAX, fence_max = 0; + struct xe_gt *gt = ct_to_gt(ct); + bool found = false; + unsigned int n; +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC) + char *buf; +#endif + + lockdep_assert_held(&ct->lock); + + for (n = 0; n < ARRAY_SIZE(ct->fast_req); n++) { + if (ct->fast_req[n].fence < fence_min) + fence_min = ct->fast_req[n].fence; + if (ct->fast_req[n].fence > fence_max) + fence_max = ct->fast_req[n].fence; + + if (ct->fast_req[n].fence != fence) + continue; + found = true; + +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC) + buf = kmalloc(SZ_4K, GFP_NOWAIT); + if (buf && stack_depot_snprint(ct->fast_req[n].stack, buf, SZ_4K, 0)) + xe_gt_err(gt, "Fence 0x%x was used by action %#04x sent at:\n%s", + fence, ct->fast_req[n].action, buf); + else + xe_gt_err(gt, "Fence 0x%x was used by action %#04x [failed to retrieve stack]\n", + fence, ct->fast_req[n].action); + kfree(buf); +#else + xe_gt_err(gt, "Fence 0x%x was used by action %#04x\n", + fence, ct->fast_req[n].action); +#endif + break; + } + + if (!found) + xe_gt_warn(gt, "Fence 0x%x not found - tracking buffer wrapped? [range = 0x%x -> 0x%x, next = 0x%X]\n", + fence, fence_min, fence_max, ct->fence_seqno); +} +#else +static void fast_req_report(struct xe_guc_ct *ct, u16 fence) +{ +} +#endif + static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len) { struct xe_gt *gt = ct_to_gt(ct); @@ -1174,6 +1248,9 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len) else xe_gt_err(gt, "unexpected response %u for FAST_REQ H2G fence 0x%x!\n", type, fence); + + fast_req_report(ct, fence); + CT_DEAD(ct, NULL, PARSE_G2H_RESPONSE); return -EPROTO; diff --git a/drivers/gpu/drm/xe/xe_guc_ct_types.h b/drivers/gpu/drm/xe/xe_guc_ct_types.h index 8e1b9d981d61..8b03b50313d9 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct_types.h +++ b/drivers/gpu/drm/xe/xe_guc_ct_types.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -104,6 +105,18 @@ struct xe_dead_ct { /** snapshot_log: copy of GuC log at point of error */ struct xe_guc_log_snapshot *snapshot_log; }; + +/** struct xe_fast_req_fence - Used to track FAST_REQ messages by fence to match error responses */ +struct xe_fast_req_fence { + /** @fence: sequence number sent in H2G and return in G2H error */ + u16 fence; + /** @action: H2G action code */ + u16 action; +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC) + /** @stack: call stack from when the H2G was sent */ + depot_stack_handle_t stack; +#endif +}; #endif /** @@ -152,6 +165,8 @@ struct xe_guc_ct { #if IS_ENABLED(CONFIG_DRM_XE_DEBUG) /** @dead: information for debugging dead CTs */ struct xe_dead_ct dead; + /** @fast_req: history of FAST_REQ messages for matching with G2H error responses */ + struct xe_fast_req_fence fast_req[SZ_32]; #endif }; -- 2.50.1 From 921ddb37d87c13eb811b8a3280377e4dab73eccf Mon Sep 17 00:00:00 2001 From: =?utf8?q?Piotr=20Pi=C3=B3rkowski?= Date: Tue, 13 May 2025 09:13:21 +0200 Subject: [PATCH 02/16] drm/xe/pf: Don't allow LMEM provisioning if LMTT isn't available on the device MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The LMEM provisioning is applicable only on platforms with LMTT. v2: - new commit description - use xe_gt_assert in xe_gt_sriov_pf_config_set_lmem instead return error, - disable pf_lmem_info if LMTT is not available v3: fix condition in xe_gt_assert v4: rebase Signed-off-by: Piotr Piórkowski Cc: Michal Wajdeczko Reviewed-by: Stuart Summers Signed-off-by: Michal Wajdeczko Link: https://lore.kernel.org/r/20250513071321.700464-1-piotr.piorkowski@intel.com --- drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 6 ++++-- drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c index 2420a548cacc..3556c41c041b 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c @@ -1520,6 +1520,8 @@ int xe_gt_sriov_pf_config_set_lmem(struct xe_gt *gt, unsigned int vfid, u64 size { int err; + xe_gt_assert(gt, xe_device_has_lmtt(gt_to_xe(gt))); + mutex_lock(xe_gt_sriov_pf_master_mutex(gt)); if (vfid) err = pf_provision_vf_lmem(gt, vfid, size); @@ -1629,7 +1631,7 @@ int xe_gt_sriov_pf_config_set_fair_lmem(struct xe_gt *gt, unsigned int vfid, xe_gt_assert(gt, num_vfs); xe_gt_assert(gt, !xe_gt_is_media_type(gt)); - if (!IS_DGFX(gt_to_xe(gt))) + if (!xe_device_has_lmtt(gt_to_xe(gt))) return 0; mutex_lock(xe_gt_sriov_pf_master_mutex(gt)); @@ -2163,7 +2165,7 @@ static int pf_validate_vf_config(struct xe_gt *gt, unsigned int vfid) valid_all = valid_all && valid_ggtt; valid_any = valid_any || (valid_ggtt && is_primary); - if (IS_DGFX(xe)) { + if (xe_device_has_lmtt(xe)) { bool valid_lmem = pf_get_vf_config_lmem(primary_gt, vfid); valid_any = valid_any || (valid_lmem && is_primary); diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c index 0fe47f41b63c..13970d5a2867 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c @@ -308,7 +308,7 @@ static void pf_add_config_attrs(struct xe_gt *gt, struct dentry *parent, unsigne if (!xe_gt_is_media_type(gt)) { debugfs_create_file_unsafe(vfid ? "ggtt_quota" : "ggtt_spare", 0644, parent, parent, &ggtt_fops); - if (IS_DGFX(gt_to_xe(gt))) + if (xe_device_has_lmtt(gt_to_xe(gt))) debugfs_create_file_unsafe(vfid ? "lmem_quota" : "lmem_spare", 0644, parent, parent, &lmem_fops); } @@ -558,7 +558,7 @@ void xe_gt_sriov_pf_debugfs_register(struct xe_gt *gt, struct dentry *root) drm_debugfs_create_files(pf_ggtt_info, ARRAY_SIZE(pf_ggtt_info), pfdentry, minor); - if (IS_DGFX(gt_to_xe(gt))) + if (xe_device_has_lmtt(gt_to_xe(gt))) drm_debugfs_create_files(pf_lmem_info, ARRAY_SIZE(pf_lmem_info), pfdentry, minor); -- 2.50.1 From a383cf218ef8bb35d4c03958bd956573b65cf778 Mon Sep 17 00:00:00 2001 From: Tejas Upadhyay Date: Tue, 6 May 2025 19:53:00 +0530 Subject: [PATCH 03/16] drm/xe/mocs: Check if all domains awake Check if all domains are awake specially for LNCF regs Fixes: 1182bc74b39b ("drm/xe: Fix MOCS debugfs LNCF readout") Improvements-suggested-by: Himal Prasad Ghimiray Reviewed-by: Badal Nilawar Link: https://patchwork.freedesktop.org/patch/msgid/20250506142300.1865783-1-tejas.upadhyay@intel.com Signed-off-by: Tejas Upadhyay --- drivers/gpu/drm/xe/xe_mocs.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_mocs.c b/drivers/gpu/drm/xe/xe_mocs.c index 31dade91a089..0c737413fcb6 100644 --- a/drivers/gpu/drm/xe/xe_mocs.c +++ b/drivers/gpu/drm/xe/xe_mocs.c @@ -775,22 +775,23 @@ void xe_mocs_init(struct xe_gt *gt) void xe_mocs_dump(struct xe_gt *gt, struct drm_printer *p) { struct xe_device *xe = gt_to_xe(gt); + enum xe_force_wake_domains domain; struct xe_mocs_info table; unsigned int fw_ref, flags; flags = get_mocs_settings(xe, &table); + domain = flags & HAS_LNCF_MOCS ? XE_FORCEWAKE_ALL : XE_FW_GT; xe_pm_runtime_get_noresume(xe); - fw_ref = xe_force_wake_get(gt_to_fw(gt), - flags & HAS_LNCF_MOCS ? - XE_FORCEWAKE_ALL : XE_FW_GT); - if (!fw_ref) + fw_ref = xe_force_wake_get(gt_to_fw(gt), domain); + + if (!xe_force_wake_ref_has_domain(fw_ref, domain)) goto err_fw; table.ops->dump(&table, flags, gt, p); - xe_force_wake_put(gt_to_fw(gt), fw_ref); err_fw: + xe_force_wake_put(gt_to_fw(gt), fw_ref); xe_pm_runtime_put(xe); } -- 2.50.1 From 17486cf3df5320752cc67ee8bcb2379d1b9de76c Mon Sep 17 00:00:00 2001 From: Aradhya Bhatia Date: Fri, 16 May 2025 14:19:02 +0000 Subject: [PATCH 04/16] drm/xe/guc: Make creation of SLPC debugfs files conditional Platforms that do not support SLPC are exempted from the GuC PC support. The GuC PC does not get initialized, and neither do its BOs get created. This causes a problem because the GuC PC debugfs file is still being created. Whenever the file is attempted to read, it causes a NULL pointer dereference on the supposed BO of the GuC PC. So, make the creation of SLPC debugfs files conditional to when SLPC features are supported. Fixes: aaab5404b16f ("drm/xe: Introduce GuC PC debugfs") Suggested-by: Matt Roper Reviewed-by: Tejas Upadhyay Reviewed-by: Stuart Summers Signed-off-by: Aradhya Bhatia Link: https://lore.kernel.org/r/20250516141902.5614-1-aradhya.bhatia@intel.com Signed-off-by: Matt Roper --- drivers/gpu/drm/xe/xe_guc_debugfs.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_debugfs.c b/drivers/gpu/drm/xe/xe_guc_debugfs.c index f33013f8a0f3..0b102ab46c4d 100644 --- a/drivers/gpu/drm/xe/xe_guc_debugfs.c +++ b/drivers/gpu/drm/xe/xe_guc_debugfs.c @@ -113,23 +113,34 @@ static const struct drm_info_list vf_safe_debugfs_list[] = { { "guc_ctb", .show = guc_debugfs_show, .data = guc_ctb }, }; +/* For GuC debugfs files that require the SLPC support */ +static const struct drm_info_list slpc_debugfs_list[] = { + { "guc_pc", .show = guc_debugfs_show, .data = guc_pc }, +}; + /* everything else should be added here */ static const struct drm_info_list pf_only_debugfs_list[] = { { "guc_log", .show = guc_debugfs_show, .data = guc_log }, { "guc_log_dmesg", .show = guc_debugfs_show, .data = guc_log_dmesg }, - { "guc_pc", .show = guc_debugfs_show, .data = guc_pc }, }; void xe_guc_debugfs_register(struct xe_guc *guc, struct dentry *parent) { - struct drm_minor *minor = guc_to_xe(guc)->drm.primary; + struct xe_device *xe = guc_to_xe(guc); + struct drm_minor *minor = xe->drm.primary; drm_debugfs_create_files(vf_safe_debugfs_list, ARRAY_SIZE(vf_safe_debugfs_list), parent, minor); - if (!IS_SRIOV_VF(guc_to_xe(guc))) + if (!IS_SRIOV_VF(xe)) { drm_debugfs_create_files(pf_only_debugfs_list, ARRAY_SIZE(pf_only_debugfs_list), parent, minor); + + if (!xe->info.skip_guc_pc) + drm_debugfs_create_files(slpc_debugfs_list, + ARRAY_SIZE(slpc_debugfs_list), + parent, minor); + } } -- 2.50.1 From a7f87deac2295d11865048bcb9c2de369b52ed93 Mon Sep 17 00:00:00 2001 From: Aradhya Bhatia Date: Fri, 16 May 2025 12:43:55 +0000 Subject: [PATCH 05/16] drm/xe: Default auto_link_downgrade status to false xe_pcode_read() can return back successfully without updating the variable 'val'. This can cause an arbitrary value to show up in the sysfs file. Allow the auto_link_downgrade_status to default to 0 to avoid any arbitrary value from coming up. Fixes: 0e414bf7ad01 ("drm/xe: Expose PCIe link downgrade attributes") Reviewed-by: Tejas Upadhyay Signed-off-by: Aradhya Bhatia Link: https://lore.kernel.org/r/20250516124355.4872-1-aradhya.bhatia@intel.com Signed-off-by: Matt Roper --- drivers/gpu/drm/xe/xe_device_sysfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.c b/drivers/gpu/drm/xe/xe_device_sysfs.c index 9628e9a0a0af..2e657692e5b5 100644 --- a/drivers/gpu/drm/xe/xe_device_sysfs.c +++ b/drivers/gpu/drm/xe/xe_device_sysfs.c @@ -124,7 +124,8 @@ auto_link_downgrade_status_show(struct device *dev, struct device_attribute *att { struct pci_dev *pdev = to_pci_dev(dev); struct xe_device *xe = pdev_to_xe_device(pdev); - u32 val; + /* default the auto_link_downgrade status to 0 */ + u32 val = 0; int ret; xe_pm_runtime_get(xe); -- 2.50.1 From d2662cf8f44a68deb6c76ad9f1d9f29dbf7ba601 Mon Sep 17 00:00:00 2001 From: Shuicheng Lin Date: Tue, 13 May 2025 15:30:10 +0000 Subject: [PATCH 06/16] drm/xe: Use xe_mmio_read32() to read mtcfg register The mtcfg register is a 32-bit register and should therefore be accessed using xe_mmio_read32(). Other 3 changes per codestyle suggestion: " xe_mmio.c:83: CHECK: Alignment should match open parenthesis xe_mmio.c:131: CHECK: Comparison to NULL could be written "!xe->mmio.regs" xe_mmio.c:315: CHECK: line length of 103 exceeds 100 columns " Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Reviewed-by: Tejas Upadhyay Cc: Matt Roper Signed-off-by: Shuicheng Lin Link: https://lore.kernel.org/r/20250513153010.3464767-1-shuicheng.lin@intel.com Signed-off-by: Matt Roper --- drivers/gpu/drm/xe/xe_mmio.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_mmio.c b/drivers/gpu/drm/xe/xe_mmio.c index 096c38cc51c8..7357458bc0d2 100644 --- a/drivers/gpu/drm/xe/xe_mmio.c +++ b/drivers/gpu/drm/xe/xe_mmio.c @@ -75,12 +75,12 @@ static void mmio_multi_tile_setup(struct xe_device *xe, size_t tile_mmio_size) * is fine as it's going to the root tile's mmio, that's * guaranteed to be initialized earlier in xe_mmio_probe_early() */ - mtcfg = xe_mmio_read64_2x32(mmio, XEHP_MTCFG_ADDR); + mtcfg = xe_mmio_read32(mmio, XEHP_MTCFG_ADDR); tile_count = REG_FIELD_GET(TILE_COUNT, mtcfg) + 1; if (tile_count < xe->info.tile_count) { drm_info(&xe->drm, "tile_count: %d, reduced_tile_count %d\n", - xe->info.tile_count, tile_count); + xe->info.tile_count, tile_count); xe->info.tile_count = tile_count; /* @@ -128,7 +128,7 @@ int xe_mmio_probe_early(struct xe_device *xe) */ xe->mmio.size = pci_resource_len(pdev, GTTMMADR_BAR); xe->mmio.regs = pci_iomap(pdev, GTTMMADR_BAR, 0); - if (xe->mmio.regs == NULL) { + if (!xe->mmio.regs) { drm_err(&xe->drm, "failed to map registers\n"); return -EIO; } @@ -312,8 +312,8 @@ u64 xe_mmio_read64_2x32(struct xe_mmio *mmio, struct xe_reg reg) return (u64)udw << 32 | ldw; } -static int __xe_mmio_wait32(struct xe_mmio *mmio, struct xe_reg reg, u32 mask, u32 val, u32 timeout_us, - u32 *out_val, bool atomic, bool expect_match) +static int __xe_mmio_wait32(struct xe_mmio *mmio, struct xe_reg reg, u32 mask, u32 val, + u32 timeout_us, u32 *out_val, bool atomic, bool expect_match) { ktime_t cur = ktime_get_raw(); const ktime_t end = ktime_add_us(cur, timeout_us); -- 2.50.1 From 49c6dc74b5968885f421f9f1b45eb4890b955870 Mon Sep 17 00:00:00 2001 From: Matt Atwood Date: Tue, 20 May 2025 12:57:49 -0700 Subject: [PATCH 07/16] drm/xe/ptl: Update the PTL pci id table Update to current bspec table. Bspec: 72574 Signed-off-by: Matt Atwood Reviewed-by: Tejas Upadhyay Reviewed-by: Clint Taylor Link: https://lore.kernel.org/r/20250520195749.371748-1-matthew.s.atwood@intel.com Signed-off-by: Matt Roper --- include/drm/intel/pciids.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/drm/intel/pciids.h b/include/drm/intel/pciids.h index d212848d07f3..a7ce9523c50d 100644 --- a/include/drm/intel/pciids.h +++ b/include/drm/intel/pciids.h @@ -861,6 +861,10 @@ MACRO__(0xB081, ## __VA_ARGS__), \ MACRO__(0xB082, ## __VA_ARGS__), \ MACRO__(0xB083, ## __VA_ARGS__), \ + MACRO__(0xB084, ## __VA_ARGS__), \ + MACRO__(0xB085, ## __VA_ARGS__), \ + MACRO__(0xB086, ## __VA_ARGS__), \ + MACRO__(0xB087, ## __VA_ARGS__), \ MACRO__(0xB08F, ## __VA_ARGS__), \ MACRO__(0xB090, ## __VA_ARGS__), \ MACRO__(0xB0A0, ## __VA_ARGS__), \ -- 2.50.1 From 20a07782dacf7348a259efcc4a2ab56ecbe7aeb1 Mon Sep 17 00:00:00 2001 From: Tomasz Lis Date: Tue, 20 May 2025 01:00:35 +0200 Subject: [PATCH 08/16] drm/xe/vf: Fail migration recovery if fixups needed but platform not supported MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The post-migration recovery needs to be fully implemented for a specific platform in order to make continuation of workloads possible. New platforms introduce changes which affect the recovery procedure, and without a clear verification of support this leads to errors with no straight forward error message explaining the cause. This patch fixes that issue - it introduces a message to be logged when the current driver is known to not support the current platform. Wedging the driver immediately also decreases the amount of additional errors which would come afterwards if the driver continued operation. v2: Show the message during probe as well as during recovery; do not perform any recovery steps if the recovery is bound to fail v3: Use SRIOV-specific logging, fix typos v4: XE_DEBUG_SRIOV to XE_DEBUG check switch, to make testing more straightforward Signed-off-by: Tomasz Lis Cc: Michal Wajdeczko Cc: Michał Winiarski Reviewed-by: Michal Wajdeczko Acked-by: Michał Winiarski Signed-off-by: Michal Wajdeczko Link: https://lore.kernel.org/r/20250519230035.3143966-1-tomasz.lis@intel.com --- drivers/gpu/drm/xe/xe_sriov_vf.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_sriov_vf.c b/drivers/gpu/drm/xe/xe_sriov_vf.c index 2674fa948fda..46466932375c 100644 --- a/drivers/gpu/drm/xe/xe_sriov_vf.c +++ b/drivers/gpu/drm/xe/xe_sriov_vf.c @@ -123,6 +123,15 @@ * | | | */ +static bool vf_migration_supported(struct xe_device *xe) +{ + /* + * TODO: Add conditions to allow specific platforms, when they're + * supported at production quality. + */ + return IS_ENABLED(CONFIG_DRM_XE_DEBUG); +} + static void migration_worker_func(struct work_struct *w); /** @@ -132,6 +141,9 @@ static void migration_worker_func(struct work_struct *w); void xe_sriov_vf_init_early(struct xe_device *xe) { INIT_WORK(&xe->sriov.vf.migration.worker, migration_worker_func); + + if (!vf_migration_supported(xe)) + xe_sriov_info(xe, "migration not supported by this module version\n"); } /** @@ -236,6 +248,11 @@ static void vf_post_migration_recovery(struct xe_device *xe) goto defer; if (unlikely(err)) goto fail; + if (!vf_migration_supported(xe)) { + xe_sriov_err(xe, "migration not supported by this module version\n"); + err = -ENOTRECOVERABLE; + goto fail; + } need_fixups = vf_post_migration_fixup_ggtt_nodes(xe); /* FIXME: add the recovery steps */ -- 2.50.1 From af53f0fd99c3bbb3afd29f1612c9e88c5a92cc01 Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Wed, 21 May 2025 12:51:47 -0400 Subject: [PATCH 09/16] drm/xe: Make xe_gt_freq part of the Documentation The documentation was created with the creation of the component, however it has never been actually shown in the actual Documentation. While doing this, fixes the identation style, to avoid new warnings while building htmldocs. Fixes: bef52b5c7a19 ("drm/xe: Create a xe_gt_freq component for raw management and sysfs") Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250521165146.39616-3-rodrigo.vivi@intel.com Signed-off-by: Rodrigo Vivi --- Documentation/gpu/xe/index.rst | 1 + Documentation/gpu/xe/xe_gt_freq.rst | 14 ++++++++++++++ drivers/gpu/drm/xe/xe_gt_freq.c | 2 ++ 3 files changed, 17 insertions(+) create mode 100644 Documentation/gpu/xe/xe_gt_freq.rst diff --git a/Documentation/gpu/xe/index.rst b/Documentation/gpu/xe/index.rst index b2369561f24e..42ba6c263cd0 100644 --- a/Documentation/gpu/xe/index.rst +++ b/Documentation/gpu/xe/index.rst @@ -16,6 +16,7 @@ DG2, etc is provided to prototype the driver. xe_migrate xe_cs xe_pm + xe_gt_freq xe_pcode xe_gt_mcr xe_wa diff --git a/Documentation/gpu/xe/xe_gt_freq.rst b/Documentation/gpu/xe/xe_gt_freq.rst new file mode 100644 index 000000000000..c0811200e327 --- /dev/null +++ b/Documentation/gpu/xe/xe_gt_freq.rst @@ -0,0 +1,14 @@ +.. SPDX-License-Identifier: (GPL-2.0+ OR MIT) + +========================== +Xe GT Frequency Management +========================== + +.. kernel-doc:: drivers/gpu/drm/xe/xe_gt_freq.c + :doc: Xe GT Frequency Management + +Internal API +============ + +.. kernel-doc:: drivers/gpu/drm/xe/xe_gt_freq.c + :internal: diff --git a/drivers/gpu/drm/xe/xe_gt_freq.c b/drivers/gpu/drm/xe/xe_gt_freq.c index 868a5d2c1a52..3293d89f1386 100644 --- a/drivers/gpu/drm/xe/xe_gt_freq.c +++ b/drivers/gpu/drm/xe/xe_gt_freq.c @@ -32,6 +32,7 @@ * Xe's Freq provides a sysfs API for frequency management: * * device/tile#/gt#/freq0/_freq *read-only* files: + * * - act_freq: The actual resolved frequency decided by PCODE. * - cur_freq: The current one requested by GuC PC to the PCODE. * - rpn_freq: The Render Performance (RP) N level, which is the minimal one. @@ -39,6 +40,7 @@ * - rp0_freq: The Render Performance (RP) 0 level, which is the maximum one. * * device/tile#/gt#/freq0/_freq *read-write* files: + * * - min_freq: Min frequency request. * - max_freq: Max frequency request. * If max <= min, then freq_min becomes a fixed frequency request. -- 2.50.1 From 39578fa40420fb11dbe4f42225a347e945d8fd0e Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Wed, 21 May 2025 12:51:48 -0400 Subject: [PATCH 10/16] drm/xe: Add missing documentation of rpa_freq While at it, already adjust the rpe_freq frequency, to highlight that both are calculated by PCODE at runtime. Fixes: c6aac2fa77a3 ("drm/xe: Introduce the RPa information") Cc: Vinay Belgaumkar Cc: Lucas De Marchi Reviewed-by: Lucas De Marchi Reviewed-by: Vinay Belgaumkar Link: https://lore.kernel.org/r/20250521165146.39616-4-rodrigo.vivi@intel.com Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_freq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_gt_freq.c b/drivers/gpu/drm/xe/xe_gt_freq.c index 3293d89f1386..60d9354e7dbf 100644 --- a/drivers/gpu/drm/xe/xe_gt_freq.c +++ b/drivers/gpu/drm/xe/xe_gt_freq.c @@ -36,7 +36,10 @@ * - act_freq: The actual resolved frequency decided by PCODE. * - cur_freq: The current one requested by GuC PC to the PCODE. * - rpn_freq: The Render Performance (RP) N level, which is the minimal one. + * - rpa_freq: The Render Performance (RP) A level, which is the achiveable one. + * Calculated by PCODE at runtime based on multiple running conditions * - rpe_freq: The Render Performance (RP) E level, which is the efficient one. + * Calculated by PCODE at runtime based on multiple running conditions * - rp0_freq: The Render Performance (RP) 0 level, which is the maximum one. * * device/tile#/gt#/freq0/_freq *read-write* files: -- 2.50.1 From 22eba3be8e1dc27de4aec3374bc9018dc8713670 Mon Sep 17 00:00:00 2001 From: Himal Prasad Ghimiray Date: Mon, 26 May 2025 22:09:07 +0530 Subject: [PATCH 11/16] drm/xe/svm: Avoid duplicate eviction on get_pages() failure xe_svm_range_get_pages() already calls drm_gpusvm_range_evict() internally when it fails with -EOPNOTSUPP. Remove the eviction call in the caller to prevent duplicate handling. Fixes: e0ff0d7cf928 ("drm/xe/svm: Refactor usage of drm_gpusvm* function in xe_svm") Cc: Matthew Brost Reviewed-by: Matthew Brost Link: https://lore.kernel.org/r/20250526163907.1011529-1-himal.prasad.ghimiray@intel.com Signed-off-by: Himal Prasad Ghimiray --- drivers/gpu/drm/xe/xe_svm.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c index 4432685936ed..871ac81bb04a 100644 --- a/drivers/gpu/drm/xe/xe_svm.c +++ b/drivers/gpu/drm/xe/xe_svm.c @@ -914,11 +914,6 @@ retry: if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) { ctx.timeslice_ms <<= 1; /* Double timeslice if we have to retry */ if (migrate_try_count > 0 || !ctx.devmem_only) { - if (err == -EOPNOTSUPP) { - range_debug(range, "PAGE FAULT - EVICT PAGES"); - drm_gpusvm_range_evict(&vm->svm.gpusvm, - &range->base); - } drm_dbg(&vm->xe->drm, "Get pages failed, falling back to retrying, asid=%u, gpusvm=%p, errno=%pe\n", vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err)); -- 2.50.1 From 2cb38bb0add9b81f89ccdb2db88af89e99925880 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Mon, 19 May 2025 22:09:14 +0200 Subject: [PATCH 12/16] drm/xe: Allow to trigger GT resets using debugfs writes Today we allow to trigger GT resest by reading dedicated debugfs files "force_reset" and "force_reset_sync" that we are exposing using drm_info_list[] and drm_debugfs_create_files(). To avoid triggering potentially disruptive actions during otherwise "safe" read operations, expose those two attributes using debugfs function where we can specify file permissions and provide custom "write" handler to trigger the GT resets also from there. This step would allow us to drop triggering GT resets during read operations, which we leave just to give users more time to switch. Signed-off-by: Michal Wajdeczko Cc: Lucas De Marchi Cc: Rodrigo Vivi Reviewed-by: Rodrigo Vivi Link: https://lore.kernel.org/r/20250519200914.216-1-michal.wajdeczko@intel.com --- drivers/gpu/drm/xe/xe_gt_debugfs.c | 96 +++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_debugfs.c b/drivers/gpu/drm/xe/xe_gt_debugfs.c index 119a55bb7580..848618acdca8 100644 --- a/drivers/gpu/drm/xe/xe_gt_debugfs.c +++ b/drivers/gpu/drm/xe/xe_gt_debugfs.c @@ -122,24 +122,6 @@ static int powergate_info(struct xe_gt *gt, struct drm_printer *p) return ret; } -static int force_reset(struct xe_gt *gt, struct drm_printer *p) -{ - xe_pm_runtime_get(gt_to_xe(gt)); - xe_gt_reset_async(gt); - xe_pm_runtime_put(gt_to_xe(gt)); - - return 0; -} - -static int force_reset_sync(struct xe_gt *gt, struct drm_printer *p) -{ - xe_pm_runtime_get(gt_to_xe(gt)); - xe_gt_reset(gt); - xe_pm_runtime_put(gt_to_xe(gt)); - - return 0; -} - static int sa_info(struct xe_gt *gt, struct drm_printer *p) { struct xe_tile *tile = gt_to_tile(gt); @@ -306,8 +288,6 @@ static int hwconfig(struct xe_gt *gt, struct drm_printer *p) * - without access to the PF specific data */ static const struct drm_info_list vf_safe_debugfs_list[] = { - {"force_reset", .show = xe_gt_debugfs_simple_show, .data = force_reset}, - {"force_reset_sync", .show = xe_gt_debugfs_simple_show, .data = force_reset_sync}, {"sa_info", .show = xe_gt_debugfs_simple_show, .data = sa_info}, {"topology", .show = xe_gt_debugfs_simple_show, .data = topology}, {"ggtt", .show = xe_gt_debugfs_simple_show, .data = ggtt}, @@ -332,6 +312,78 @@ static const struct drm_info_list pf_only_debugfs_list[] = { {"steering", .show = xe_gt_debugfs_simple_show, .data = steering}, }; +static ssize_t write_to_gt_call(const char __user *userbuf, size_t count, loff_t *ppos, + void (*call)(struct xe_gt *), struct xe_gt *gt) +{ + bool yes; + int ret; + + if (*ppos) + return -EINVAL; + ret = kstrtobool_from_user(userbuf, count, &yes); + if (ret < 0) + return ret; + if (yes) + call(gt); + return count; +} + +static void force_reset(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + + xe_pm_runtime_get(xe); + xe_gt_reset_async(gt); + xe_pm_runtime_put(xe); +} + +static ssize_t force_reset_write(struct file *file, + const char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct seq_file *s = file->private_data; + struct xe_gt *gt = s->private; + + return write_to_gt_call(userbuf, count, ppos, force_reset, gt); +} + +static int force_reset_show(struct seq_file *s, void *unused) +{ + struct xe_gt *gt = s->private; + + force_reset(gt); /* to be deprecated! */ + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(force_reset); + +static void force_reset_sync(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + + xe_pm_runtime_get(xe); + xe_gt_reset(gt); + xe_pm_runtime_put(xe); +} + +static ssize_t force_reset_sync_write(struct file *file, + const char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct seq_file *s = file->private_data; + struct xe_gt *gt = s->private; + + return write_to_gt_call(userbuf, count, ppos, force_reset_sync, gt); +} + +static int force_reset_sync_show(struct seq_file *s, void *unused) +{ + struct xe_gt *gt = s->private; + + force_reset_sync(gt); /* to be deprecated! */ + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(force_reset_sync); + void xe_gt_debugfs_register(struct xe_gt *gt) { struct xe_device *xe = gt_to_xe(gt); @@ -355,6 +407,10 @@ void xe_gt_debugfs_register(struct xe_gt *gt) */ root->d_inode->i_private = gt; + /* VF safe */ + debugfs_create_file("force_reset", 0600, root, gt, &force_reset_fops); + debugfs_create_file("force_reset_sync", 0600, root, gt, &force_reset_sync_fops); + drm_debugfs_create_files(vf_safe_debugfs_list, ARRAY_SIZE(vf_safe_debugfs_list), root, minor); -- 2.50.1 From 338ec84deefdaf8e479455ee7d55434571799cf0 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 16 May 2025 16:38:11 +0100 Subject: [PATCH 13/16] drm/xe/bo: optimise CCS case for WB pages MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Dealing with CCS state is significant on LNL+, where we end up clearing the compression state on every page alloc using the blitter for user buffers, including also saving and restoring it when moving between domains, plus we need to alloc extra pages to hold the raw CCS state for the save step. However all compression PAT modes, on platforms like LNL, also require coh_none, meaning that only WC memory can use compression in the first place. With this we can be sneaky and completely ignore CCS for WB buffers, which is likely the common case anyway. This would then skip all blitter moves/clears between sys <-> tt and then also means we can drop the extra CCS pages. This should be safe since there is no way to interact with the compression state (potentially uncleared) without using a PAT enabled index (which is rejected at bind), including if trying to be malicious and copy the raw CCS state from userpace, which should give back all zeroes if the src surface (indirect) is lacking compressed PAT index. Signed-off-by: Matthew Auld Cc: Satyanarayana K V P Cc: Thomas Hellström Cc: Matthew Brost Reviewed-by: José Roberto de Souza Reviewed-by: Satyanarayana K V P Link: https://lore.kernel.org/r/20250516153810.223530-2-matthew.auld@intel.com --- drivers/gpu/drm/xe/xe_bo.c | 8 ++++++++ drivers/gpu/drm/xe/xe_pat.c | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index d99d91fe8aa9..3fafdcb8d95b 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -2982,6 +2982,14 @@ bool xe_bo_needs_ccs_pages(struct xe_bo *bo) if (IS_DGFX(xe) && (bo->flags & XE_BO_FLAG_SYSTEM)) return false; + /* + * Compression implies coh_none, therefore we know for sure that WB + * memory can't currently use compression, which is likely one of the + * common cases. + */ + if (bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB) + return false; + return true; } diff --git a/drivers/gpu/drm/xe/xe_pat.c b/drivers/gpu/drm/xe/xe_pat.c index 30fdbdb9341e..38a6a49c1b2a 100644 --- a/drivers/gpu/drm/xe/xe_pat.c +++ b/drivers/gpu/drm/xe/xe_pat.c @@ -103,7 +103,8 @@ static const struct xe_pat_table_entry xelpg_pat_table[] = { * * Note: There is an implicit assumption in the driver that compression and * coh_1way+ are mutually exclusive. If this is ever not true then userptr - * and imported dma-buf from external device will have uncleared ccs state. + * and imported dma-buf from external device will have uncleared ccs state. See + * also xe_bo_needs_ccs_pages(). */ #define XE2_PAT(no_promote, comp_en, l3clos, l3_policy, l4_policy, __coh_mode) \ { \ -- 2.50.1 From 96af397aa1a2d1032a6e28ff3f4bc0ab4be40e1d Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Wed, 14 May 2025 16:24:25 +0100 Subject: [PATCH 14/16] drm/xe/vm: move rebind_work init earlier In xe_vm_close_and_put() we need to be able to call flush_work(rebind_work), however during vm creation we can call this on the error path, before having actually set up the worker, leading to a splat from flush_work(). It looks like we can simply move the worker init step earlier to fix this. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Matthew Auld Cc: Matthew Brost Cc: # v6.8+ Reviewed-by: Matthew Brost Link: https://lore.kernel.org/r/20250514152424.149591-3-matthew.auld@intel.com --- drivers/gpu/drm/xe/xe_vm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 5a978da411b0..168756fb140b 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -1704,8 +1704,10 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags) * scheduler drops all the references of it, hence protecting the VM * for this case is necessary. */ - if (flags & XE_VM_FLAG_LR_MODE) + if (flags & XE_VM_FLAG_LR_MODE) { + INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func); xe_pm_runtime_get_noresume(xe); + } vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm); if (!vm_resv_obj) { @@ -1750,10 +1752,8 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags) vm->batch_invalidate_tlb = true; } - if (vm->flags & XE_VM_FLAG_LR_MODE) { - INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func); + if (vm->flags & XE_VM_FLAG_LR_MODE) vm->batch_invalidate_tlb = false; - } /* Fill pt_root after allocating scratch tables */ for_each_tile(tile, xe, id) { -- 2.50.1 From 4f296d77cf49fcb5f90b4674123ad7f3a0676165 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Wed, 14 May 2025 16:24:26 +0100 Subject: [PATCH 15/16] drm/xe/vm: move xe_svm_init() earlier In xe_vm_close_and_put() we need to be able to call xe_svm_fini(), however during vm creation we can call this on the error path, before having actually initialised the svm state, leading to various splats followed by a fatal NPD. Fixes: 6fd979c2f331 ("drm/xe: Add SVM init / close / fini to faulting VMs") Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4967 Signed-off-by: Matthew Auld Cc: Matthew Brost Reviewed-by: Matthew Brost Link: https://lore.kernel.org/r/20250514152424.149591-4-matthew.auld@intel.com --- drivers/gpu/drm/xe/xe_vm.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 168756fb140b..7140d8856bad 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -1709,10 +1709,16 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags) xe_pm_runtime_get_noresume(xe); } + if (flags & XE_VM_FLAG_FAULT_MODE) { + err = xe_svm_init(vm); + if (err) + goto err_no_resv; + } + vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm); if (!vm_resv_obj) { err = -ENOMEM; - goto err_no_resv; + goto err_svm_fini; } drm_gpuvm_init(&vm->gpuvm, "Xe VM", DRM_GPUVM_RESV_PROTECTED, &xe->drm, @@ -1783,12 +1789,6 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags) } } - if (flags & XE_VM_FLAG_FAULT_MODE) { - err = xe_svm_init(vm); - if (err) - goto err_close; - } - if (number_tiles > 1) vm->composite_fence_ctx = dma_fence_context_alloc(1); @@ -1802,6 +1802,11 @@ err_close: xe_vm_close_and_put(vm); return ERR_PTR(err); +err_svm_fini: + if (flags & XE_VM_FLAG_FAULT_MODE) { + vm->size = 0; /* close the vm */ + xe_svm_fini(vm); + } err_no_resv: mutex_destroy(&vm->snap_mutex); for_each_tile(tile, xe, id) -- 2.50.1 From fbeaad071a98fef87deccee81d564de1c8e8e16d Mon Sep 17 00:00:00 2001 From: Niranjana Vishwanathapura Date: Wed, 28 May 2025 22:20:32 -0700 Subject: [PATCH 16/16] drm/xe: Create LRC BO without VM Specifying VM during lrc->bo creation requires VM's reference to be held for the lifetime of lrc->bo as it will use VM's dma reservation object. Using VM's dma reservation object for lrc->bo doesn't provide any advantage. Hence do not pass VM while creating lrc->bo. v2: Use xe_bo_unpin_map_no_vm (Matthew Brost) Fixes: 264eecdba211 ("drm/xe: Decouple xe_exec_queue and xe_lrc") Signed-off-by: Niranjana Vishwanathapura Reviewed-by: Matthew Brost Reviewed-by: Maarten Lankhorst Signed-off-by: Matthew Brost Link: https://lore.kernel.org/r/20250529052031.2429120-2-niranjana.vishwanathapura@intel.com --- drivers/gpu/drm/xe/xe_exec_queue.c | 9 --------- drivers/gpu/drm/xe/xe_lrc.c | 23 ++++------------------- 2 files changed, 4 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index ce78cee5dec6..32c6923df7c3 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -132,12 +132,6 @@ static int __xe_exec_queue_init(struct xe_exec_queue *q) flags |= XE_LRC_CREATE_RUNALONE; } - if (vm) { - err = xe_vm_lock(vm, true); - if (err) - return err; - } - for (i = 0; i < q->width; ++i) { q->lrc[i] = xe_lrc_create(q->hwe, q->vm, SZ_16K, q->msix_vec, flags); if (IS_ERR(q->lrc[i])) { @@ -146,9 +140,6 @@ static int __xe_exec_queue_init(struct xe_exec_queue *q) } } - if (vm) - xe_vm_unlock(vm); - err = q->ops->init(q); if (err) goto err_lrc; diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c index 61a2e87990a9..63d74e27f54c 100644 --- a/drivers/gpu/drm/xe/xe_lrc.c +++ b/drivers/gpu/drm/xe/xe_lrc.c @@ -909,10 +909,7 @@ static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm) static void xe_lrc_finish(struct xe_lrc *lrc) { xe_hw_fence_ctx_finish(&lrc->fence_ctx); - xe_bo_lock(lrc->bo, false); - xe_bo_unpin(lrc->bo); - xe_bo_unlock(lrc->bo); - xe_bo_put(lrc->bo); + xe_bo_unpin_map_no_vm(lrc->bo); xe_bo_unpin_map_no_vm(lrc->bb_per_ctx_bo); } @@ -1007,7 +1004,7 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address * via VM bind calls. */ - lrc->bo = xe_bo_create_pin_map(xe, tile, vm, lrc_size, + lrc->bo = xe_bo_create_pin_map(xe, tile, NULL, lrc_size, ttm_bo_type_kernel, bo_flags); if (IS_ERR(lrc->bo)) @@ -1795,9 +1792,6 @@ struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc) if (!snapshot) return NULL; - if (lrc->bo->vm) - xe_vm_get(lrc->bo->vm); - snapshot->context_desc = xe_lrc_ggtt_addr(lrc); snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc); snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc); @@ -1819,14 +1813,12 @@ struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc) void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot) { struct xe_bo *bo; - struct xe_vm *vm; struct iosys_map src; if (!snapshot) return; bo = snapshot->lrc_bo; - vm = bo->vm; snapshot->lrc_bo = NULL; snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL); @@ -1846,8 +1838,6 @@ void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot) xe_bo_unlock(bo); put_bo: xe_bo_put(bo); - if (vm) - xe_vm_put(vm); } void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p) @@ -1900,14 +1890,9 @@ void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot) return; kvfree(snapshot->lrc_snapshot); - if (snapshot->lrc_bo) { - struct xe_vm *vm; - - vm = snapshot->lrc_bo->vm; + if (snapshot->lrc_bo) xe_bo_put(snapshot->lrc_bo); - if (vm) - xe_vm_put(vm); - } + kfree(snapshot); } -- 2.50.1