From f040327238b1a8311598c40ac94464e77fff368c Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Tue, 1 Oct 2024 09:43:49 +0100 Subject: [PATCH 01/16] drm/xe/guc_submit: fix xa_store() error checking Looks like we are meant to use xa_err() to extract the error encoded in the ptr. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Matthew Auld Cc: Matthew Brost Cc: Badal Nilawar Cc: # v6.8+ Reviewed-by: Badal Nilawar Link: https://patchwork.freedesktop.org/patch/msgid/20241001084346.98516-7-matthew.auld@intel.com --- drivers/gpu/drm/xe/xe_guc_submit.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 80062e1d3f66..8a5c21a87977 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -393,7 +393,6 @@ static void __release_guc_id(struct xe_guc *guc, struct xe_exec_queue *q, u32 xa static int alloc_guc_id(struct xe_guc *guc, struct xe_exec_queue *q) { int ret; - void *ptr; int i; /* @@ -413,12 +412,10 @@ static int alloc_guc_id(struct xe_guc *guc, struct xe_exec_queue *q) q->guc->id = ret; for (i = 0; i < q->width; ++i) { - ptr = xa_store(&guc->submission_state.exec_queue_lookup, - q->guc->id + i, q, GFP_NOWAIT); - if (IS_ERR(ptr)) { - ret = PTR_ERR(ptr); + ret = xa_err(xa_store(&guc->submission_state.exec_queue_lookup, + q->guc->id + i, q, GFP_NOWAIT)); + if (ret) goto err_release; - } } return 0; -- 2.51.0 From 11bfc4a2cfeaa012113d9b64fc30a5e6e742fc19 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Tue, 1 Oct 2024 09:43:50 +0100 Subject: [PATCH 02/16] drm/xe/ct: drop irq usage of xa_erase() Unclear why disabling interrupts is needed here. Nothing seems to be touching fence_lookup and its corresponding lock from an irq so there should be no risk of deadlock. Signed-off-by: Matthew Auld Cc: Matthew Brost Cc: Badal Nilawar Reviewed-by: Badal Nilawar Link: https://patchwork.freedesktop.org/patch/msgid/20241001084346.98516-8-matthew.auld@intel.com --- drivers/gpu/drm/xe/xe_guc_ct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c index d3de2e6d690f..3fd1d4e97528 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.c +++ b/drivers/gpu/drm/xe/xe_guc_ct.c @@ -890,7 +890,7 @@ retry_same_fence: goto retry_same_fence; if (!g2h_fence_needs_alloc(&g2h_fence)) - xa_erase_irq(&ct->fence_lookup, g2h_fence.seqno); + xa_erase(&ct->fence_lookup, g2h_fence.seqno); return ret; } @@ -907,7 +907,7 @@ retry_same_fence: if (!ret) { xe_gt_err(gt, "Timed out wait for G2H, fence %u, action %04x, done %s", g2h_fence.seqno, action[0], str_yes_no(g2h_fence.done)); - xa_erase_irq(&ct->fence_lookup, g2h_fence.seqno); + xa_erase(&ct->fence_lookup, g2h_fence.seqno); mutex_unlock(&ct->lock); return -ETIME; } -- 2.51.0 From 91b2c42c214f570efaff80a666e30b8f6ce4f12b Mon Sep 17 00:00:00 2001 From: Francois Dugast Date: Fri, 27 Sep 2024 17:12:06 +0200 Subject: [PATCH 03/16] drm/xe: Use fault injection infrastructure to find issues at probe time The kernel fault injection infrastructure is used to test proper error handling during probe. The return code of the functions using ALLOW_ERROR_INJECTION() can be conditionnally modified at runtime by tuning some debugfs entries. This requires CONFIG_FUNCTION_ERROR_INJECTION (among others). One way to use fault injection at probe time by making each of those functions fail one at a time is: FAILTYPE=fail_function DEVICE="0000:00:08.0" # depends on the system ERRNO=-12 # -ENOMEM, can depend on the function echo N > /sys/kernel/debug/$FAILTYPE/task-filter echo 100 > /sys/kernel/debug/$FAILTYPE/probability echo 0 > /sys/kernel/debug/$FAILTYPE/interval echo -1 > /sys/kernel/debug/$FAILTYPE/times echo 0 > /sys/kernel/debug/$FAILTYPE/space echo 1 > /sys/kernel/debug/$FAILTYPE/verbose modprobe xe echo $DEVICE > /sys/bus/pci/drivers/xe/unbind grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | \ cut -d ' ' -f 1 | while read -r FUNCTION ; do echo "Injecting fault in $FUNCTION" echo "" > /sys/kernel/debug/$FAILTYPE/inject echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval echo $DEVICE > /sys/bus/pci/drivers/xe/bind done rmmod xe It will also be integrated into IGT for systematic execution by CI. v2: Wrappers are not needed in the cases covered by this patch, so remove them and use ALLOW_ERROR_INJECTION() directly. v3: Document the use of fault injection at probe time in xe_pci_probe and refer to it where ALLOW_ERROR_INJECTION() is used. Signed-off-by: Francois Dugast Cc: Lucas De Marchi Cc: Matthew Brost Cc: Rodrigo Vivi Cc: Michal Wajdeczko Cc: Jani Nikula Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240927151207.399354-1-francois.dugast@intel.com Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device.c | 3 +++ drivers/gpu/drm/xe/xe_ggtt.c | 2 ++ drivers/gpu/drm/xe/xe_guc_ads.c | 3 +++ drivers/gpu/drm/xe/xe_guc_ct.c | 2 ++ drivers/gpu/drm/xe/xe_guc_log.c | 3 +++ drivers/gpu/drm/xe/xe_guc_relay.c | 2 ++ drivers/gpu/drm/xe/xe_pci.c | 19 +++++++++++++++++++ drivers/gpu/drm/xe/xe_pm.c | 2 ++ drivers/gpu/drm/xe/xe_sriov.c | 3 +++ drivers/gpu/drm/xe/xe_tile.c | 3 +++ drivers/gpu/drm/xe/xe_uc_fw.c | 2 ++ drivers/gpu/drm/xe/xe_wa.c | 2 ++ drivers/gpu/drm/xe/xe_wopcm.c | 3 +++ 13 files changed, 49 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 8e9b551c7033..ad70a1bdd476 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -6,6 +6,7 @@ #include "xe_device.h" #include +#include #include #include @@ -382,6 +383,7 @@ struct xe_device *xe_device_create(struct pci_dev *pdev, err: return ERR_PTR(err); } +ALLOW_ERROR_INJECTION(xe_device_create, ERRNO); /* See xe_pci_probe() */ static bool xe_driver_flr_disabled(struct xe_device *xe) { @@ -550,6 +552,7 @@ static int wait_for_lmem_ready(struct xe_device *xe) return 0; } +ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO); /* See xe_pci_probe() */ static void update_device_info(struct xe_device *xe) { diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c index f68af56c3f86..47bfd9d2635d 100644 --- a/drivers/gpu/drm/xe/xe_ggtt.c +++ b/drivers/gpu/drm/xe/xe_ggtt.c @@ -5,6 +5,7 @@ #include "xe_ggtt.h" +#include #include #include @@ -264,6 +265,7 @@ int xe_ggtt_init_early(struct xe_ggtt *ggtt) return 0; } +ALLOW_ERROR_INJECTION(xe_ggtt_init_early, ERRNO); /* See xe_pci_probe() */ static void xe_ggtt_invalidate(struct xe_ggtt *ggtt); diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c index 66d4e5e95abd..04485461aa20 100644 --- a/drivers/gpu/drm/xe/xe_guc_ads.c +++ b/drivers/gpu/drm/xe/xe_guc_ads.c @@ -5,6 +5,8 @@ #include "xe_guc_ads.h" +#include + #include #include @@ -418,6 +420,7 @@ int xe_guc_ads_init(struct xe_guc_ads *ads) return 0; } +ALLOW_ERROR_INJECTION(xe_guc_ads_init, ERRNO); /* See xe_pci_probe() */ /** * xe_guc_ads_init_post_hwconfig - initialize ADS post hwconfig load diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c index 3fd1d4e97528..5aef2be2d027 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.c +++ b/drivers/gpu/drm/xe/xe_guc_ct.c @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -209,6 +210,7 @@ int xe_guc_ct_init(struct xe_guc_ct *ct) ct->state = XE_GUC_CT_STATE_DISABLED; return 0; } +ALLOW_ERROR_INJECTION(xe_guc_ct_init, ERRNO); /* See xe_pci_probe() */ #define desc_read(xe_, guc_ctb__, field_) \ xe_map_rd_field(xe_, &guc_ctb__->desc, 0, \ diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c index a37ee3419428..651543721ce5 100644 --- a/drivers/gpu/drm/xe/xe_guc_log.c +++ b/drivers/gpu/drm/xe/xe_guc_log.c @@ -5,6 +5,8 @@ #include "xe_guc_log.h" +#include + #include #include "xe_bo.h" @@ -96,3 +98,4 @@ int xe_guc_log_init(struct xe_guc_log *log) return 0; } +ALLOW_ERROR_INJECTION(xe_guc_log_init, ERRNO); /* See xe_pci_probe() */ diff --git a/drivers/gpu/drm/xe/xe_guc_relay.c b/drivers/gpu/drm/xe/xe_guc_relay.c index ade6162dc259..8f62de026724 100644 --- a/drivers/gpu/drm/xe/xe_guc_relay.c +++ b/drivers/gpu/drm/xe/xe_guc_relay.c @@ -5,6 +5,7 @@ #include #include +#include #include @@ -355,6 +356,7 @@ int xe_guc_relay_init(struct xe_guc_relay *relay) return drmm_add_action_or_reset(&xe->drm, __fini_relay, relay); } +ALLOW_ERROR_INJECTION(xe_guc_relay_init, ERRNO); /* See xe_pci_probe() */ static u32 to_relay_error(int err) { diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index edaeefd2d648..af7b2f2ff13a 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -765,6 +765,25 @@ static void xe_pci_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); } +/* + * Probe the PCI device, initialize various parts of the driver. + * + * Fault injection is used to test the error paths of some initialization + * functions called either directly from xe_pci_probe() or indirectly for + * example through xe_device_probe(). Those functions use the kernel fault + * injection capabilities infrastructure, see + * Documentation/fault-injection/fault-injection.rst for details. The macro + * ALLOW_ERROR_INJECTION() is used to conditionally skip function execution + * at runtime and use a provided return value. The first requirement for + * error injectable functions is proper handling of the error code by the + * caller for recovery, which is always the case here. The second + * requirement is that no state is changed before the first error return. + * It is not strictly fullfilled for all initialization functions using the + * ALLOW_ERROR_INJECTION() macro but this is acceptable because for those + * error cases at probe time, the error code is simply propagated up by the + * caller. Therefore there is no consequence on those specific callers when + * function error injection skips the whole function. + */ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { const struct xe_device_desc *desc = (const void *)ent->driver_data; diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c index 33eb039053e4..40f7c844ed44 100644 --- a/drivers/gpu/drm/xe/xe_pm.c +++ b/drivers/gpu/drm/xe/xe_pm.c @@ -5,6 +5,7 @@ #include "xe_pm.h" +#include #include #include @@ -263,6 +264,7 @@ int xe_pm_init_early(struct xe_device *xe) return 0; } +ALLOW_ERROR_INJECTION(xe_pm_init_early, ERRNO); /* See xe_pci_probe() */ /** * xe_pm_init - Initialize Xe Power Management diff --git a/drivers/gpu/drm/xe/xe_sriov.c b/drivers/gpu/drm/xe/xe_sriov.c index 69a066ef20c0..ef10782af656 100644 --- a/drivers/gpu/drm/xe/xe_sriov.c +++ b/drivers/gpu/drm/xe/xe_sriov.c @@ -3,6 +3,8 @@ * Copyright © 2023 Intel Corporation */ +#include + #include #include "regs/xe_regs.h" @@ -119,6 +121,7 @@ int xe_sriov_init(struct xe_device *xe) return drmm_add_action_or_reset(&xe->drm, fini_sriov, xe); } +ALLOW_ERROR_INJECTION(xe_sriov_init, ERRNO); /* See xe_pci_probe() */ /** * xe_sriov_print_info - Print basic SR-IOV information. diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c index dda5268507d8..07cf7cfe4abd 100644 --- a/drivers/gpu/drm/xe/xe_tile.c +++ b/drivers/gpu/drm/xe/xe_tile.c @@ -3,6 +3,8 @@ * Copyright © 2023 Intel Corporation */ +#include + #include #include "xe_device.h" @@ -129,6 +131,7 @@ int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id) return 0; } +ALLOW_ERROR_INJECTION(xe_tile_init_early, ERRNO); /* See xe_pci_probe() */ static int tile_ttm_mgr_init(struct xe_tile *tile) { diff --git a/drivers/gpu/drm/xe/xe_uc_fw.c b/drivers/gpu/drm/xe/xe_uc_fw.c index d339f636f4d5..fb0eda3d5682 100644 --- a/drivers/gpu/drm/xe/xe_uc_fw.c +++ b/drivers/gpu/drm/xe/xe_uc_fw.c @@ -4,6 +4,7 @@ */ #include +#include #include #include @@ -796,6 +797,7 @@ int xe_uc_fw_init(struct xe_uc_fw *uc_fw) return err; } +ALLOW_ERROR_INJECTION(xe_uc_fw_init, ERRNO); /* See xe_pci_probe() */ static u32 uc_fw_ggtt_offset(struct xe_uc_fw *uc_fw) { diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c index 22c148b1e996..94ea76b098ed 100644 --- a/drivers/gpu/drm/xe/xe_wa.c +++ b/drivers/gpu/drm/xe/xe_wa.c @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -850,6 +851,7 @@ int xe_wa_init(struct xe_gt *gt) return 0; } +ALLOW_ERROR_INJECTION(xe_wa_init, ERRNO); /* See xe_pci_probe() */ void xe_wa_dump(struct xe_gt *gt, struct drm_printer *p) { diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c index 93c82825d896..ada0d0aa6b74 100644 --- a/drivers/gpu/drm/xe/xe_wopcm.c +++ b/drivers/gpu/drm/xe/xe_wopcm.c @@ -5,6 +5,8 @@ #include "xe_wopcm.h" +#include + #include "regs/xe_guc_regs.h" #include "xe_device.h" #include "xe_force_wake.h" @@ -268,3 +270,4 @@ check: return ret; } +ALLOW_ERROR_INJECTION(xe_wopcm_init, ERRNO); /* See xe_pci_probe() */ -- 2.51.0 From 491418a258322bbd7f045e36884d2849b673f23d Mon Sep 17 00:00:00 2001 From: Vinay Belgaumkar Date: Wed, 25 Sep 2024 13:49:18 -0700 Subject: [PATCH 04/16] drm/xe: Restore GT freq on GSC load error As part of a Wa_22019338487, ensure that GT freq is restored even when GSC reload is not successful. Fixes: 3b1592fb7835 ("drm/xe/lnl: Apply Wa_22019338487") Signed-off-by: Vinay Belgaumkar Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240925204918.1989574-1-vinay.belgaumkar@intel.com Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index 274737417b0f..c6b6c585c57d 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -907,7 +907,9 @@ int xe_gt_sanitize_freq(struct xe_gt *gt) int ret = 0; if ((!xe_uc_fw_is_available(>->uc.gsc.fw) || - xe_uc_fw_is_loaded(>->uc.gsc.fw)) && XE_WA(gt, 22019338487)) + xe_uc_fw_is_loaded(>->uc.gsc.fw) || + xe_uc_fw_is_in_error_state(>->uc.gsc.fw)) && + XE_WA(gt, 22019338487)) ret = xe_guc_pc_restore_stashed_freq(>->uc.guc.pc); return ret; -- 2.51.0 From 93d93813422758f6c99289de446b19184019ef5a Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Wed, 2 Oct 2024 16:06:21 -0700 Subject: [PATCH 05/16] drm/xe: Make wedged_mode debugfs writable The intent of this debugfs entry is to allow modification of wedging behavior, either from IGT tests or during manual debug; it should be marked as writable to properly reflect this. In practice this hasn't caused a problem because we always access wedged_mode as root, which ignores file permissions, but it's still misleading to have the entry incorrectly marked as RO. Cc: Rodrigo Vivi Fixes: 6b8ef44cc0a9 ("drm/xe: Introduce the wedged_mode debugfs") Signed-off-by: Matt Roper Reviewed-by: Gustavo Sousa Link: https://patchwork.freedesktop.org/patch/msgid/20241002230620.1249258-2-matthew.d.roper@intel.com --- drivers/gpu/drm/xe/xe_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c index 668615c6b172..fe4319eb13fd 100644 --- a/drivers/gpu/drm/xe/xe_debugfs.c +++ b/drivers/gpu/drm/xe/xe_debugfs.c @@ -187,7 +187,7 @@ void xe_debugfs_register(struct xe_device *xe) debugfs_create_file("forcewake_all", 0400, root, xe, &forcewake_all_fops); - debugfs_create_file("wedged_mode", 0400, root, xe, + debugfs_create_file("wedged_mode", 0600, root, xe, &wedged_mode_fops); for (mem_type = XE_PL_VRAM0; mem_type <= XE_PL_VRAM1; ++mem_type) { -- 2.51.0 From 95336cfd5b2ce81f839614dd897e26cffd5204e0 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Wed, 25 Sep 2024 10:44:45 +0200 Subject: [PATCH 06/16] drm/xe: Add memirq report page address helpers MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Both xe_memirq_{source,status}_ptr functions are now strictly for obtaining an address of the memory based interrupt report pages used by the HW engines. When initializing the GuC that does not require special per instance page preparation, we don't need to abuse these public functions and pass a NULL instead of valid hwe pointer. Also, without further fixes, this actually may lead to NPD crash once the hw_reports_to_instance_zero() will be true. Add internal helpers that will provide report page addresses based solely on the instance number, which will be always 0 for both GuCs. Fixes: ef6103d20f97 ("drm/xe: memirq infra changes for MSI-X") Signed-off-by: Michal Wajdeczko Cc: Piotr Piórkowski Cc: Ilia Levi Reviewed-by: Ilia Levi Link: https://patchwork.freedesktop.org/patch/msgid/20240925084445.1495-1-michal.wajdeczko@intel.com --- drivers/gpu/drm/xe/xe_memirq.c | 35 ++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_memirq.c b/drivers/gpu/drm/xe/xe_memirq.c index e3610cb90bb9..f833da88150a 100644 --- a/drivers/gpu/drm/xe/xe_memirq.c +++ b/drivers/gpu/drm/xe/xe_memirq.c @@ -261,6 +261,15 @@ int xe_memirq_init(struct xe_memirq *memirq) return 0; } +static u32 __memirq_source_page(struct xe_memirq *memirq, u16 instance) +{ + memirq_assert(memirq, instance <= XE_HW_ENGINE_MAX_INSTANCE); + memirq_assert(memirq, memirq->bo); + + instance = hw_reports_to_instance_zero(memirq) ? instance : 0; + return xe_bo_ggtt_addr(memirq->bo) + XE_MEMIRQ_SOURCE_OFFSET(instance); +} + /** * xe_memirq_source_ptr - Get GGTT's offset of the `Interrupt Source Report Page`_. * @memirq: the &xe_memirq to query @@ -273,14 +282,18 @@ int xe_memirq_init(struct xe_memirq *memirq) */ u32 xe_memirq_source_ptr(struct xe_memirq *memirq, struct xe_hw_engine *hwe) { - u16 instance; - memirq_assert(memirq, xe_device_uses_memirq(memirq_to_xe(memirq))); - memirq_assert(memirq, memirq->bo); - instance = hw_reports_to_instance_zero(memirq) ? hwe->instance : 0; + return __memirq_source_page(memirq, hwe->instance); +} - return xe_bo_ggtt_addr(memirq->bo) + XE_MEMIRQ_SOURCE_OFFSET(instance); +static u32 __memirq_status_page(struct xe_memirq *memirq, u16 instance) +{ + memirq_assert(memirq, instance <= XE_HW_ENGINE_MAX_INSTANCE); + memirq_assert(memirq, memirq->bo); + + instance = hw_reports_to_instance_zero(memirq) ? instance : 0; + return xe_bo_ggtt_addr(memirq->bo) + XE_MEMIRQ_STATUS_OFFSET(instance); } /** @@ -295,14 +308,9 @@ u32 xe_memirq_source_ptr(struct xe_memirq *memirq, struct xe_hw_engine *hwe) */ u32 xe_memirq_status_ptr(struct xe_memirq *memirq, struct xe_hw_engine *hwe) { - u16 instance; - memirq_assert(memirq, xe_device_uses_memirq(memirq_to_xe(memirq))); - memirq_assert(memirq, memirq->bo); - - instance = hw_reports_to_instance_zero(memirq) ? hwe->instance : 0; - return xe_bo_ggtt_addr(memirq->bo) + XE_MEMIRQ_STATUS_OFFSET(instance); + return __memirq_status_page(memirq, hwe->instance); } /** @@ -343,10 +351,9 @@ int xe_memirq_init_guc(struct xe_memirq *memirq, struct xe_guc *guc) int err; memirq_assert(memirq, xe_device_uses_memirq(memirq_to_xe(memirq))); - memirq_assert(memirq, memirq->bo); - source = xe_memirq_source_ptr(memirq, NULL) + offset; - status = xe_memirq_status_ptr(memirq, NULL) + offset * SZ_16; + source = __memirq_source_page(memirq, 0) + offset; + status = __memirq_status_page(memirq, 0) + offset * SZ_16; err = xe_guc_self_cfg64(guc, GUC_KLV_SELF_CFG_MEMIRQ_SOURCE_ADDR_KEY, source); -- 2.51.0 From 43971e30fd8ae24d8c4b6ce1203c1773bde781a4 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 19 Sep 2024 19:15:24 +0200 Subject: [PATCH 07/16] drm/xe/guc: Add yet another helper macro for threshold MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit We already have and use MAKE_GUC_KLV_VF_CFG_THRESHOLD_KEY, but in upcoming patch we want to generate the key for the length. Add MAKE_GUC_KLV_VF_CFG_THRESHOLD_LEN for this purpose. Signed-off-by: Michal Wajdeczko Reviewed-by: Michał Winiarski Link: https://patchwork.freedesktop.org/patch/msgid/20240919171528.1451-2-michal.wajdeczko@intel.com --- drivers/gpu/drm/xe/xe_guc_klv_thresholds_set.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_guc_klv_thresholds_set.h b/drivers/gpu/drm/xe/xe_guc_klv_thresholds_set.h index da0fedbbdbaf..da10cf0389cb 100644 --- a/drivers/gpu/drm/xe/xe_guc_klv_thresholds_set.h +++ b/drivers/gpu/drm/xe/xe_guc_klv_thresholds_set.h @@ -17,6 +17,13 @@ #define MAKE_GUC_KLV_VF_CFG_THRESHOLD_KEY(TAG) \ MAKE_GUC_KLV_KEY(CONCATENATE(VF_CFG_THRESHOLD_, TAG)) +/** + * MAKE_GUC_KLV_VF_CFG_THRESHOLD_LEN - Prepare the name of the KLV length constant. + * @TAG: unique tag of the GuC threshold KLV key. + */ +#define MAKE_GUC_KLV_VF_CFG_THRESHOLD_LEN(TAG) \ + MAKE_GUC_KLV_LEN(CONCATENATE(VF_CFG_THRESHOLD_, TAG)) + /** * xe_guc_klv_threshold_key_to_index - Find index of the tracked GuC threshold. * @key: GuC threshold KLV key. -- 2.51.0 From 99ce45cc25ebfb81328fe520ed5773c2e4929a8d Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 19 Sep 2024 19:15:25 +0200 Subject: [PATCH 08/16] drm/xe/pf: Update success code of pf_validate_vf_config() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This function may return negative error codes on invalid or incomplete VF configuration, but unlike other int functions, it was returning 1 instead of 0 on success, which might be little inconvinient if we would like to use it directly in other functions. Signed-off-by: Michal Wajdeczko Reviewed-by: Michał Winiarski Link: https://patchwork.freedesktop.org/patch/msgid/20240919171528.1451-3-michal.wajdeczko@intel.com --- drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c index 8250ef71e685..27a593b1d52a 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c @@ -2042,7 +2042,7 @@ static int pf_validate_vf_config(struct xe_gt *gt, unsigned int vfid) valid_all = valid_all && valid_lmem; } - return valid_all ? 1 : valid_any ? -ENOKEY : -ENODATA; + return valid_all ? 0 : valid_any ? -ENOKEY : -ENODATA; } /** -- 2.51.0 From bdc2c4d5756c8baaca820fec24fcc6355946da61 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 19 Sep 2024 19:15:26 +0200 Subject: [PATCH 09/16] drm/xe/pf: Allow to encode subset of VF configuration KLVs MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit We want to reuse format of the GuC KLVs while saving and restoring VF configuration by the PF driver, but some of those KLVs (like doorbell begin index or GGTT starting offset) are not strictly needed to correctly restore VF configuration. Modify functions to omit encoding of some of the KLVs with GuC only details. Signed-off-by: Michal Wajdeczko Reviewed-by: Michał Winiarski Link: https://patchwork.freedesktop.org/patch/msgid/20240919171528.1451-4-michal.wajdeczko@intel.com --- drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 32 +++++++++++++--------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c index 27a593b1d52a..28e2b26c2d88 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c @@ -229,14 +229,16 @@ static struct xe_gt_sriov_config *pf_pick_vf_config(struct xe_gt *gt, unsigned i } /* Return: number of configuration dwords written */ -static u32 encode_config_ggtt(u32 *cfg, const struct xe_gt_sriov_config *config) +static u32 encode_config_ggtt(u32 *cfg, const struct xe_gt_sriov_config *config, bool details) { u32 n = 0; if (xe_ggtt_node_allocated(config->ggtt_region)) { - cfg[n++] = PREP_GUC_KLV_TAG(VF_CFG_GGTT_START); - cfg[n++] = lower_32_bits(config->ggtt_region->base.start); - cfg[n++] = upper_32_bits(config->ggtt_region->base.start); + if (details) { + cfg[n++] = PREP_GUC_KLV_TAG(VF_CFG_GGTT_START); + cfg[n++] = lower_32_bits(config->ggtt_region->base.start); + cfg[n++] = upper_32_bits(config->ggtt_region->base.start); + } cfg[n++] = PREP_GUC_KLV_TAG(VF_CFG_GGTT_SIZE); cfg[n++] = lower_32_bits(config->ggtt_region->base.size); @@ -247,20 +249,24 @@ static u32 encode_config_ggtt(u32 *cfg, const struct xe_gt_sriov_config *config) } /* Return: number of configuration dwords written */ -static u32 encode_config(u32 *cfg, const struct xe_gt_sriov_config *config) +static u32 encode_config(u32 *cfg, const struct xe_gt_sriov_config *config, bool details) { u32 n = 0; - n += encode_config_ggtt(cfg, config); + n += encode_config_ggtt(cfg, config, details); - cfg[n++] = PREP_GUC_KLV_TAG(VF_CFG_BEGIN_CONTEXT_ID); - cfg[n++] = config->begin_ctx; + if (details) { + cfg[n++] = PREP_GUC_KLV_TAG(VF_CFG_BEGIN_CONTEXT_ID); + cfg[n++] = config->begin_ctx; + } cfg[n++] = PREP_GUC_KLV_TAG(VF_CFG_NUM_CONTEXTS); cfg[n++] = config->num_ctxs; - cfg[n++] = PREP_GUC_KLV_TAG(VF_CFG_BEGIN_DOORBELL_ID); - cfg[n++] = config->begin_db; + if (details) { + cfg[n++] = PREP_GUC_KLV_TAG(VF_CFG_BEGIN_DOORBELL_ID); + cfg[n++] = config->begin_db; + } cfg[n++] = PREP_GUC_KLV_TAG(VF_CFG_NUM_DOORBELLS); cfg[n++] = config->num_dbs; @@ -301,7 +307,7 @@ static int pf_push_full_vf_config(struct xe_gt *gt, unsigned int vfid) if (!cfg) return -ENOMEM; - num_dwords = encode_config(cfg, config); + num_dwords = encode_config(cfg, config, true); xe_gt_assert(gt, num_dwords <= max_cfg_dwords); if (xe_gt_is_media_type(gt)) { @@ -309,10 +315,10 @@ static int pf_push_full_vf_config(struct xe_gt *gt, unsigned int vfid) struct xe_gt_sriov_config *other = pf_pick_vf_config(primary, vfid); /* media-GT will never include a GGTT config */ - xe_gt_assert(gt, !encode_config_ggtt(cfg + num_dwords, config)); + xe_gt_assert(gt, !encode_config_ggtt(cfg + num_dwords, config, true)); /* the GGTT config must be taken from the primary-GT instead */ - num_dwords += encode_config_ggtt(cfg + num_dwords, other); + num_dwords += encode_config_ggtt(cfg + num_dwords, other, true); } xe_gt_assert(gt, num_dwords <= max_cfg_dwords); -- 2.51.0 From e9a14537feb9f4223548b569748098c1ad7360d0 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 19 Sep 2024 19:15:27 +0200 Subject: [PATCH 10/16] drm/xe/pf: Add functions to save and restore VF configuration blob MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit We already have support to save and restore GuC VF state, but that will only work when the target VF configuration (provisioning) will be exactly the same as the source VF configuration. To help with assuring that both configurations match, allow to encode whole VF configuration that can be saved as blob and restored later. In the future we may want to use such captured configuration blobs as templates to make sure we provision VFs with exactly the same configuration that was previously tested or recommended, or when debugfs knobs are not be available. Signed-off-by: Michal Wajdeczko Reviewed-by: Michał Winiarski Link: https://patchwork.freedesktop.org/patch/msgid/20240919171528.1451-5-michal.wajdeczko@intel.com --- drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 170 +++++++++++++++++++++ drivers/gpu/drm/xe/xe_gt_sriov_pf_config.h | 4 + 2 files changed, 174 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c index 28e2b26c2d88..a863e50b756e 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c @@ -34,6 +34,8 @@ #include "xe_ttm_vram_mgr.h" #include "xe_wopcm.h" +#define make_u64_from_u32(hi, lo) ((u64)((u64)(u32)(hi) << 32 | (u32)(lo))) + /* * Return: number of KLVs that were successfully parsed and saved, * negative error code on failure. @@ -2074,6 +2076,174 @@ bool xe_gt_sriov_pf_config_is_empty(struct xe_gt *gt, unsigned int vfid) return empty; } +/** + * xe_gt_sriov_pf_config_save - Save a VF provisioning config as binary blob. + * @gt: the &xe_gt + * @vfid: the VF identifier (can't be PF) + * @buf: the buffer to save a config to (or NULL if query the buf size) + * @size: the size of the buffer (or 0 if query the buf size) + * + * This function can only be called on PF. + * + * Return: mininum size of the buffer or the number of bytes saved, + * or a negative error code on failure. + */ +ssize_t xe_gt_sriov_pf_config_save(struct xe_gt *gt, unsigned int vfid, void *buf, size_t size) +{ + struct xe_gt_sriov_config *config; + ssize_t ret; + + xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt))); + xe_gt_assert(gt, vfid); + xe_gt_assert(gt, !(!buf ^ !size)); + + mutex_lock(xe_gt_sriov_pf_master_mutex(gt)); + ret = pf_validate_vf_config(gt, vfid); + if (!size) { + ret = ret ? 0 : SZ_4K; + } else if (!ret) { + if (size < SZ_4K) { + ret = -ENOBUFS; + } else { + config = pf_pick_vf_config(gt, vfid); + ret = encode_config(buf, config, false) * sizeof(u32); + } + } + mutex_unlock(xe_gt_sriov_pf_master_mutex(gt)); + + return ret; +} + +static int pf_restore_vf_config_klv(struct xe_gt *gt, unsigned int vfid, + u32 key, u32 len, const u32 *value) +{ + switch (key) { + case GUC_KLV_VF_CFG_NUM_CONTEXTS_KEY: + if (len != GUC_KLV_VF_CFG_NUM_CONTEXTS_LEN) + return -EBADMSG; + return pf_provision_vf_ctxs(gt, vfid, value[0]); + + case GUC_KLV_VF_CFG_NUM_DOORBELLS_KEY: + if (len != GUC_KLV_VF_CFG_NUM_DOORBELLS_LEN) + return -EBADMSG; + return pf_provision_vf_dbs(gt, vfid, value[0]); + + case GUC_KLV_VF_CFG_EXEC_QUANTUM_KEY: + if (len != GUC_KLV_VF_CFG_EXEC_QUANTUM_LEN) + return -EBADMSG; + return pf_provision_exec_quantum(gt, vfid, value[0]); + + case GUC_KLV_VF_CFG_PREEMPT_TIMEOUT_KEY: + if (len != GUC_KLV_VF_CFG_PREEMPT_TIMEOUT_LEN) + return -EBADMSG; + return pf_provision_preempt_timeout(gt, vfid, value[0]); + + /* auto-generate case statements */ +#define define_threshold_key_to_provision_case(TAG, ...) \ + case MAKE_GUC_KLV_VF_CFG_THRESHOLD_KEY(TAG): \ + BUILD_BUG_ON(MAKE_GUC_KLV_VF_CFG_THRESHOLD_LEN(TAG) != 1u); \ + if (len != MAKE_GUC_KLV_VF_CFG_THRESHOLD_LEN(TAG)) \ + return -EBADMSG; \ + return pf_provision_threshold(gt, vfid, \ + MAKE_XE_GUC_KLV_THRESHOLD_INDEX(TAG), \ + value[0]); + + MAKE_XE_GUC_KLV_THRESHOLDS_SET(define_threshold_key_to_provision_case) +#undef define_threshold_key_to_provision_case + } + + if (xe_gt_is_media_type(gt)) + return -EKEYREJECTED; + + switch (key) { + case GUC_KLV_VF_CFG_GGTT_SIZE_KEY: + if (len != GUC_KLV_VF_CFG_GGTT_SIZE_LEN) + return -EBADMSG; + return pf_provision_vf_ggtt(gt, vfid, make_u64_from_u32(value[1], value[0])); + + case GUC_KLV_VF_CFG_LMEM_SIZE_KEY: + if (!IS_DGFX(gt_to_xe(gt))) + return -EKEYREJECTED; + if (len != GUC_KLV_VF_CFG_LMEM_SIZE_LEN) + return -EBADMSG; + return pf_provision_vf_lmem(gt, vfid, make_u64_from_u32(value[1], value[0])); + } + + return -EKEYREJECTED; +} + +static int pf_restore_vf_config(struct xe_gt *gt, unsigned int vfid, + const u32 *klvs, size_t num_dwords) +{ + int err; + + while (num_dwords >= GUC_KLV_LEN_MIN) { + u32 key = FIELD_GET(GUC_KLV_0_KEY, klvs[0]); + u32 len = FIELD_GET(GUC_KLV_0_LEN, klvs[0]); + + klvs += GUC_KLV_LEN_MIN; + num_dwords -= GUC_KLV_LEN_MIN; + + if (num_dwords < len) + err = -EBADMSG; + else + err = pf_restore_vf_config_klv(gt, vfid, key, len, klvs); + + if (err) { + xe_gt_sriov_dbg(gt, "restore failed on key %#x (%pe)\n", key, ERR_PTR(err)); + return err; + } + + klvs += len; + num_dwords -= len; + } + + return pf_validate_vf_config(gt, vfid); +} + +/** + * xe_gt_sriov_pf_config_restore - Restore a VF provisioning config from binary blob. + * @gt: the &xe_gt + * @vfid: the VF identifier (can't be PF) + * @buf: the buffer with config data + * @size: the size of the config data + * + * This function can only be called on PF. + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_gt_sriov_pf_config_restore(struct xe_gt *gt, unsigned int vfid, + const void *buf, size_t size) +{ + int err; + + xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt))); + xe_gt_assert(gt, vfid); + + if (!size) + return -ENODATA; + + if (size % sizeof(u32)) + return -EINVAL; + + if (IS_ENABLED(CONFIG_DRM_XE_DEBUG_SRIOV)) { + struct drm_printer p = xe_gt_info_printer(gt); + + drm_printf(&p, "restoring VF%u config:\n", vfid); + xe_guc_klv_print(buf, size / sizeof(u32), &p); + } + + mutex_lock(xe_gt_sriov_pf_master_mutex(gt)); + err = pf_send_vf_cfg_reset(gt, vfid); + if (!err) { + pf_release_vf_config(gt, vfid); + err = pf_restore_vf_config(gt, vfid, buf, size / sizeof(u32)); + } + mutex_unlock(xe_gt_sriov_pf_master_mutex(gt)); + + return err; +} + /** * xe_gt_sriov_pf_config_restart - Restart SR-IOV configurations after a GT reset. * @gt: the &xe_gt diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.h b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.h index 42e64769f666..b74ec38baa18 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.h +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.h @@ -54,6 +54,10 @@ int xe_gt_sriov_pf_config_sanitize(struct xe_gt *gt, unsigned int vfid, long tim int xe_gt_sriov_pf_config_release(struct xe_gt *gt, unsigned int vfid, bool force); int xe_gt_sriov_pf_config_push(struct xe_gt *gt, unsigned int vfid, bool refresh); +ssize_t xe_gt_sriov_pf_config_save(struct xe_gt *gt, unsigned int vfid, void *buf, size_t size); +int xe_gt_sriov_pf_config_restore(struct xe_gt *gt, unsigned int vfid, + const void *buf, size_t size); + bool xe_gt_sriov_pf_config_is_empty(struct xe_gt *gt, unsigned int vfid); void xe_gt_sriov_pf_config_restart(struct xe_gt *gt); -- 2.51.0 From d42b0435254f0965ab5484c69cd45b4097f2f47d Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 19 Sep 2024 19:15:28 +0200 Subject: [PATCH 11/16] drm/xe/pf: Allow to save and restore VF config blob from debugfs MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit For feature enabling and testing purposes, allow to capture and replace full VF configuration using debugfs blob file, but only under strict debug config. Signed-off-by: Michal Wajdeczko Reviewed-by: Michał Winiarski Link: https://patchwork.freedesktop.org/patch/msgid/20240919171528.1451-6-michal.wajdeczko@intel.com --- drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c | 78 +++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c index ccbcf6e572d0..91fc42e386d8 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c @@ -417,6 +417,81 @@ static const struct file_operations guc_state_ops = { .llseek = default_llseek, }; +/* + * /sys/kernel/debug/dri/0/ + * ├── gt0 + * │   ├── vf1 + * │   │   ├── config_blob + */ +static ssize_t config_blob_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + struct dentry *dent = file_dentry(file); + struct dentry *parent = dent->d_parent; + struct xe_gt *gt = extract_gt(parent); + unsigned int vfid = extract_vfid(parent); + ssize_t ret; + void *tmp; + + ret = xe_gt_sriov_pf_config_save(gt, vfid, NULL, 0); + if (!ret) + return -ENODATA; + if (ret < 0) + return ret; + + tmp = kzalloc(ret, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + ret = xe_gt_sriov_pf_config_save(gt, vfid, tmp, ret); + if (ret > 0) + ret = simple_read_from_buffer(buf, count, pos, tmp, ret); + + kfree(tmp); + return ret; +} + +static ssize_t config_blob_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + struct dentry *dent = file_dentry(file); + struct dentry *parent = dent->d_parent; + struct xe_gt *gt = extract_gt(parent); + unsigned int vfid = extract_vfid(parent); + ssize_t ret; + void *tmp; + + if (*pos) + return -EINVAL; + + if (!count) + return -ENODATA; + + if (count > SZ_4K) + return -EINVAL; + + tmp = kzalloc(count, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + if (copy_from_user(tmp, buf, count)) { + ret = -EFAULT; + } else { + ret = xe_gt_sriov_pf_config_restore(gt, vfid, tmp, count); + if (!ret) + ret = count; + } + kfree(tmp); + return ret; +} + +static const struct file_operations config_blob_ops = { + .owner = THIS_MODULE, + .read = config_blob_read, + .write = config_blob_write, + .llseek = default_llseek, +}; + /** * xe_gt_sriov_pf_debugfs_register - Register SR-IOV PF specific entries in GT debugfs. * @gt: the &xe_gt to register @@ -471,6 +546,9 @@ void xe_gt_sriov_pf_debugfs_register(struct xe_gt *gt, struct dentry *root) debugfs_create_file("guc_state", IS_ENABLED(CONFIG_DRM_XE_DEBUG_SRIOV) ? 0600 : 0400, vfdentry, NULL, &guc_state_ops); + debugfs_create_file("config_blob", + IS_ENABLED(CONFIG_DRM_XE_DEBUG_SRIOV) ? 0600 : 0400, + vfdentry, NULL, &config_blob_ops); } } } -- 2.51.0 From 0114f66370bfe139d6407a0b6b8f309af4c12148 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 2 Oct 2024 17:46:01 -0700 Subject: [PATCH 12/16] drm/xe/guc: Remove spurious line feed in debug print Including line feeds at the start of a debug print messes up the output when sent to dmesg. The break appears between all the useful prefix information and the actual string being printed. In this case, each block of data has a very clear start line and an extra delimeter is really not necessary. So don't do it. v2: Fix typo in commit message (review feedback from Michal W.) Signed-off-by: John Harrison Reviewed-by: Michal Wajdeczko Reviewed-by: Julia Filipchuk Link: https://patchwork.freedesktop.org/patch/msgid/20241003004611.2323493-2-John.C.Harrison@Intel.com --- drivers/gpu/drm/xe/xe_guc_ct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c index 5aef2be2d027..c0dded21621f 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.c +++ b/drivers/gpu/drm/xe/xe_guc_ct.c @@ -1533,7 +1533,7 @@ void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot, drm_puts(p, "H2G CTB (all sizes in DW):\n"); guc_ctb_snapshot_print(&snapshot->h2g, p); - drm_puts(p, "\nG2H CTB (all sizes in DW):\n"); + drm_puts(p, "G2H CTB (all sizes in DW):\n"); guc_ctb_snapshot_print(&snapshot->g2h, p); drm_printf(p, "\tg2h outstanding: %d\n", -- 2.51.0 From 9d86d080cfb3ab935c842ac5525a90430a14c998 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 2 Oct 2024 17:46:02 -0700 Subject: [PATCH 13/16] drm/xe/devcoredump: Use drm_puts and already cached local variables There are a bunch of calls to drm_printf with static strings. Switch them to drm_puts instead. There are also a bunch of 'coredump->snapshot.XXX' references when 'coredump->snapshot' has alread been cached locally as 'ss'. So use 'ss->XXX' instead. Signed-off-by: John Harrison Reviewed-by: Julia Filipchuk Link: https://patchwork.freedesktop.org/patch/msgid/20241003004611.2323493-3-John.C.Harrison@Intel.com --- drivers/gpu/drm/xe/xe_devcoredump.c | 40 ++++++++++++++--------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c index bdb76e834e4c..d23719d5c2a3 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.c +++ b/drivers/gpu/drm/xe/xe_devcoredump.c @@ -85,9 +85,9 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count, p = drm_coredump_printer(&iter); - drm_printf(&p, "**** Xe Device Coredump ****\n"); - drm_printf(&p, "kernel: " UTS_RELEASE "\n"); - drm_printf(&p, "module: " KBUILD_MODNAME "\n"); + drm_puts(&p, "**** Xe Device Coredump ****\n"); + drm_puts(&p, "kernel: " UTS_RELEASE "\n"); + drm_puts(&p, "module: " KBUILD_MODNAME "\n"); ts = ktime_to_timespec64(ss->snapshot_time); drm_printf(&p, "Snapshot time: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec); @@ -96,20 +96,20 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count, drm_printf(&p, "Process: %s\n", ss->process_name); xe_device_snapshot_print(xe, &p); - drm_printf(&p, "\n**** GuC CT ****\n"); - xe_guc_ct_snapshot_print(coredump->snapshot.ct, &p); - xe_guc_exec_queue_snapshot_print(coredump->snapshot.ge, &p); + drm_puts(&p, "\n**** GuC CT ****\n"); + xe_guc_ct_snapshot_print(ss->ct, &p); + xe_guc_exec_queue_snapshot_print(ss->ge, &p); - drm_printf(&p, "\n**** Job ****\n"); - xe_sched_job_snapshot_print(coredump->snapshot.job, &p); + drm_puts(&p, "\n**** Job ****\n"); + xe_sched_job_snapshot_print(ss->job, &p); - drm_printf(&p, "\n**** HW Engines ****\n"); + drm_puts(&p, "\n**** HW Engines ****\n"); for (i = 0; i < XE_NUM_HW_ENGINES; i++) - if (coredump->snapshot.hwe[i]) - xe_hw_engine_snapshot_print(coredump->snapshot.hwe[i], - &p); - drm_printf(&p, "\n**** VM state ****\n"); - xe_vm_snapshot_print(coredump->snapshot.vm, &p); + if (ss->hwe[i]) + xe_hw_engine_snapshot_print(ss->hwe[i], &p); + + drm_puts(&p, "\n**** VM state ****\n"); + xe_vm_snapshot_print(ss->vm, &p); return count - iter.remain; } @@ -247,18 +247,18 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump, if (xe_force_wake_get(gt_to_fw(q->gt), XE_FORCEWAKE_ALL)) xe_gt_info(ss->gt, "failed to get forcewake for coredump capture\n"); - coredump->snapshot.ct = xe_guc_ct_snapshot_capture(&guc->ct, true); - coredump->snapshot.ge = xe_guc_exec_queue_snapshot_capture(q); - coredump->snapshot.job = xe_sched_job_snapshot_capture(job); - coredump->snapshot.vm = xe_vm_snapshot_capture(q->vm); + ss->ct = xe_guc_ct_snapshot_capture(&guc->ct, true); + ss->ge = xe_guc_exec_queue_snapshot_capture(q); + ss->job = xe_sched_job_snapshot_capture(job); + ss->vm = xe_vm_snapshot_capture(q->vm); for_each_hw_engine(hwe, q->gt, id) { if (hwe->class != q->hwe->class || !(BIT(hwe->logical_instance) & adj_logical_mask)) { - coredump->snapshot.hwe[id] = NULL; + ss->hwe[id] = NULL; continue; } - coredump->snapshot.hwe[id] = xe_hw_engine_snapshot_capture(hwe); + ss->hwe[id] = xe_hw_engine_snapshot_capture(hwe); } queue_work(system_unbound_wq, &ss->work); -- 2.51.0 From c28fd6c358db44c87a1408f27ba412c94e25e6c2 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 2 Oct 2024 17:46:03 -0700 Subject: [PATCH 14/16] drm/xe/devcoredump: Improve section headings and add tile info The xe_guc_exec_queue_snapshot is not really a GuC internal thing and is definitely not a GuC CT thing. So give it its own section heading. The snapshot itself is really a capture of the submission backend's internal state. Although all it currently prints out is the submission contexts. So label it as 'Contexts'. If more general state is added later then it could be change to 'Submission backend' or some such. Further, everything from the GuC CT section onwards is GT specific but there was no indication of which GT it was related to (and that is impossible to work out from the other fields that are given). So add a GT section heading. Also include the tile id of the GT, because again significant information. Lastly, drop a couple of unnecessary line feeds within sections. v2: Add GT section heading, add tile id to device section. Signed-off-by: John Harrison Reviewed-by: Julia Filipchuk Link: https://patchwork.freedesktop.org/patch/msgid/20241003004611.2323493-4-John.C.Harrison@Intel.com --- drivers/gpu/drm/xe/xe_devcoredump.c | 5 +++++ drivers/gpu/drm/xe/xe_devcoredump_types.h | 3 ++- drivers/gpu/drm/xe/xe_device.c | 1 + drivers/gpu/drm/xe/xe_guc_submit.c | 2 +- drivers/gpu/drm/xe/xe_hw_engine.c | 1 - 5 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c index d23719d5c2a3..2690f1d1cde4 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.c +++ b/drivers/gpu/drm/xe/xe_devcoredump.c @@ -96,8 +96,13 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count, drm_printf(&p, "Process: %s\n", ss->process_name); xe_device_snapshot_print(xe, &p); + drm_printf(&p, "\n**** GT #%d ****\n", ss->gt->info.id); + drm_printf(&p, "\tTile: %d\n", ss->gt->tile->id); + drm_puts(&p, "\n**** GuC CT ****\n"); xe_guc_ct_snapshot_print(ss->ct, &p); + + drm_puts(&p, "\n**** Contexts ****\n"); xe_guc_exec_queue_snapshot_print(ss->ge, &p); drm_puts(&p, "\n**** Job ****\n"); diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h index 440d05d77a5a..3cc2f095fdfb 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h @@ -37,7 +37,8 @@ struct xe_devcoredump_snapshot { /* GuC snapshots */ /** @ct: GuC CT snapshot */ struct xe_guc_ct_snapshot *ct; - /** @ge: Guc Engine snapshot */ + + /** @ge: GuC Submission Engine snapshot */ struct xe_guc_submit_exec_queue_snapshot *ge; /** @hwe: HW Engine snapshot array */ diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 1940cd3229f4..cd241a8e1838 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -964,6 +964,7 @@ void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p) for_each_gt(gt, xe, id) { drm_printf(p, "GT id: %u\n", id); + drm_printf(p, "\tTile: %u\n", gt->tile->id); drm_printf(p, "\tType: %s\n", gt->info.type == XE_GT_TYPE_MAIN ? "main" : "media"); drm_printf(p, "\tIP ver: %u.%u.%u\n", diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 8a5c21a87977..fc4008018d1f 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -2237,7 +2237,7 @@ xe_guc_exec_queue_snapshot_print(struct xe_guc_submit_exec_queue_snapshot *snaps if (!snapshot) return; - drm_printf(p, "\nGuC ID: %d\n", snapshot->guc.id); + drm_printf(p, "GuC ID: %d\n", snapshot->guc.id); drm_printf(p, "\tName: %s\n", snapshot->name); drm_printf(p, "\tClass: %d\n", snapshot->class); drm_printf(p, "\tLogical mask: 0x%x\n", snapshot->logical_mask); diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c index ea6d9ef7fab6..6c9c27304cdc 100644 --- a/drivers/gpu/drm/xe/xe_hw_engine.c +++ b/drivers/gpu/drm/xe/xe_hw_engine.c @@ -1084,7 +1084,6 @@ void xe_hw_engine_snapshot_print(struct xe_hw_engine_snapshot *snapshot, if (snapshot->hwe->class == XE_ENGINE_CLASS_COMPUTE) drm_printf(p, "\tRCU_MODE: 0x%08x\n", snapshot->reg.rcu_mode); - drm_puts(p, "\n"); } /** -- 2.51.0 From ec1455ce7e35a31289d2dbc1070b980538698921 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 2 Oct 2024 17:46:04 -0700 Subject: [PATCH 15/16] drm/xe/devcoredump: Add ASCII85 dump helper function There is a need to include the GuC log and other large binary objects in core dumps and via dmesg. So add a helper for dumping to a printer function via conversion to ASCII85 encoding. Another issue with dumping such a large buffer is that it can be slow, especially if dumping to dmesg over a serial port. So add a yield to prevent the 'task has been stuck for 120s' kernel hang check feature from firing. v2: Add a prefix to the output string. Fix memory allocation bug. v3: Correct a string size calculation and clean up a define (review feedback from Julia F). Signed-off-by: John Harrison Reviewed-by: Julia Filipchuk Link: https://patchwork.freedesktop.org/patch/msgid/20241003004611.2323493-5-John.C.Harrison@Intel.com --- drivers/gpu/drm/xe/xe_devcoredump.c | 87 +++++++++++++++++++++++++++++ drivers/gpu/drm/xe/xe_devcoredump.h | 6 ++ 2 files changed, 93 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c index 2690f1d1cde4..0884c49942fe 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.c +++ b/drivers/gpu/drm/xe/xe_devcoredump.c @@ -6,6 +6,7 @@ #include "xe_devcoredump.h" #include "xe_devcoredump_types.h" +#include #include #include @@ -315,3 +316,89 @@ int xe_devcoredump_init(struct xe_device *xe) } #endif + +/** + * xe_print_blob_ascii85 - print a BLOB to some useful location in ASCII85 + * + * The output is split to multiple lines because some print targets, e.g. dmesg + * cannot handle arbitrarily long lines. Note also that printing to dmesg in + * piece-meal fashion is not possible, each separate call to drm_puts() has a + * line-feed automatically added! Therefore, the entire output line must be + * constructed in a local buffer first, then printed in one atomic output call. + * + * There is also a scheduler yield call to prevent the 'task has been stuck for + * 120s' kernel hang check feature from firing when printing to a slow target + * such as dmesg over a serial port. + * + * TODO: Add compression prior to the ASCII85 encoding to shrink huge buffers down. + * + * @p: the printer object to output to + * @prefix: optional prefix to add to output string + * @blob: the Binary Large OBject to dump out + * @offset: offset in bytes to skip from the front of the BLOB, must be a multiple of sizeof(u32) + * @size: the size in bytes of the BLOB, must be a multiple of sizeof(u32) + */ +void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, + const void *blob, size_t offset, size_t size) +{ + const u32 *blob32 = (const u32 *)blob; + char buff[ASCII85_BUFSZ], *line_buff; + size_t line_pos = 0; + +#define DMESG_MAX_LINE_LEN 800 +#define MIN_SPACE (ASCII85_BUFSZ + 2) /* 85 + "\n\0" */ + + if (size & 3) + drm_printf(p, "Size not word aligned: %zu", size); + if (offset & 3) + drm_printf(p, "Offset not word aligned: %zu", size); + + line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL); + if (IS_ERR_OR_NULL(line_buff)) { + drm_printf(p, "Failed to allocate line buffer: %pe", line_buff); + return; + } + + blob32 += offset / sizeof(*blob32); + size /= sizeof(*blob32); + + if (prefix) { + strscpy(line_buff, prefix, DMESG_MAX_LINE_LEN - MIN_SPACE - 2); + line_pos = strlen(line_buff); + + line_buff[line_pos++] = ':'; + line_buff[line_pos++] = ' '; + } + + while (size--) { + u32 val = *(blob32++); + + strscpy(line_buff + line_pos, ascii85_encode(val, buff), + DMESG_MAX_LINE_LEN - line_pos); + line_pos += strlen(line_buff + line_pos); + + if ((line_pos + MIN_SPACE) >= DMESG_MAX_LINE_LEN) { + line_buff[line_pos++] = '\n'; + line_buff[line_pos++] = 0; + + drm_puts(p, line_buff); + + line_pos = 0; + + /* Prevent 'stuck thread' time out errors */ + cond_resched(); + } + } + + if (line_pos) { + line_buff[line_pos++] = '\n'; + line_buff[line_pos++] = 0; + + drm_puts(p, line_buff); + } + + kfree(line_buff); + +#undef MIN_SPACE +#undef DMESG_MAX_LINE_LEN +} diff --git a/drivers/gpu/drm/xe/xe_devcoredump.h b/drivers/gpu/drm/xe/xe_devcoredump.h index e2fa65ce0932..a4eebc285fc8 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.h +++ b/drivers/gpu/drm/xe/xe_devcoredump.h @@ -6,6 +6,9 @@ #ifndef _XE_DEVCOREDUMP_H_ #define _XE_DEVCOREDUMP_H_ +#include + +struct drm_printer; struct xe_device; struct xe_sched_job; @@ -23,4 +26,7 @@ static inline int xe_devcoredump_init(struct xe_device *xe) } #endif +void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, + const void *blob, size_t offset, size_t size); + #endif -- 2.51.0 From a59a403419aa03d5e44c8cf014e415490395b17f Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 2 Oct 2024 17:46:05 -0700 Subject: [PATCH 16/16] drm/xe/guc: Copy GuC log prior to dumping Add an extra stage to the GuC log print to copy the log buffer into regular host memory first, rather than printing the live GPU buffer object directly. Doing so helps prevent inconsistencies due to the log being updated as it is being dumped. It also allows the use of the ASCII85 helper function for printing the log in a more compact form than a straight hex dump. v2: Use %zx instead of %lx for size_t prints. v3: Replace hexdump code with ascii85 call (review feedback from Matthew B). Move chunking code into next patch as that reduces the deltas of both. v4: Add a prefix to the ASCII85 output to aid tool parsing. Signed-off-by: John Harrison Reviewed-by: Julia Filipchuk Link: https://patchwork.freedesktop.org/patch/msgid/20241003004611.2323493-6-John.C.Harrison@Intel.com --- drivers/gpu/drm/xe/xe_guc_log.c | 40 +++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c index 651543721ce5..bb9446b2d29d 100644 --- a/drivers/gpu/drm/xe/xe_guc_log.c +++ b/drivers/gpu/drm/xe/xe_guc_log.c @@ -8,9 +8,12 @@ #include #include +#include #include "xe_bo.h" +#include "xe_devcoredump.h" #include "xe_gt.h" +#include "xe_gt_printk.h" #include "xe_map.h" #include "xe_module.h" @@ -51,32 +54,35 @@ static size_t guc_log_size(void) CAPTURE_BUFFER_SIZE; } +/** + * xe_guc_log_print - dump a copy of the GuC log to some useful location + * @log: GuC log structure + * @p: the printer object to output to + */ void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p) { struct xe_device *xe = log_to_xe(log); size_t size; - int i, j; + void *copy; - xe_assert(xe, log->bo); + if (!log->bo) { + drm_puts(p, "GuC log buffer not allocated"); + return; + } size = log->bo->size; -#define DW_PER_READ 128 - xe_assert(xe, !(size % (DW_PER_READ * sizeof(u32)))); - for (i = 0; i < size / sizeof(u32); i += DW_PER_READ) { - u32 read[DW_PER_READ]; - - xe_map_memcpy_from(xe, read, &log->bo->vmap, i * sizeof(u32), - DW_PER_READ * sizeof(u32)); -#define DW_PER_PRINT 4 - for (j = 0; j < DW_PER_READ / DW_PER_PRINT; ++j) { - u32 *print = read + j * DW_PER_PRINT; - - drm_printf(p, "0x%08x 0x%08x 0x%08x 0x%08x\n", - *(print + 0), *(print + 1), - *(print + 2), *(print + 3)); - } + copy = vmalloc(size); + if (!copy) { + drm_printf(p, "Failed to allocate %zu", size); + return; } + + xe_map_memcpy_from(xe, copy, &log->bo->vmap, 0, size); + + xe_print_blob_ascii85(p, "Log data", copy, 0, size); + + vfree(copy); } int xe_guc_log_init(struct xe_guc_log *log) -- 2.51.0