From c0d0a9ff6d5b5b23ddabde8bcbafb28fa454ae00 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Thu, 8 May 2025 16:30:36 +0800 Subject: [PATCH 01/16] block: remove test of incorrect io priority level Ever since commit eca2040972b4("scsi: block: ioprio: Clean up interface definition"), the macro IOPRIO_PRIO_LEVEL() will mask the level value to something between 0 and 7 so necessarily, level will always be lower than IOPRIO_NR_LEVELS(8). Remove this obsolete check. Reported-by: Kexin Wei Cc: Damien Le Moal Signed-off-by: Aaron Lu Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20250508083018.GA769554@bytedance Signed-off-by: Jens Axboe --- block/ioprio.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/block/ioprio.c b/block/ioprio.c index 73301a261429..f0ee2798539c 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -46,12 +46,8 @@ int ioprio_check_cap(int ioprio) */ if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) return -EPERM; - fallthrough; - /* rt has prio field too */ - case IOPRIO_CLASS_BE: - if (level >= IOPRIO_NR_LEVELS) - return -EINVAL; break; + case IOPRIO_CLASS_BE: case IOPRIO_CLASS_IDLE: break; case IOPRIO_CLASS_NONE: -- 2.51.0 From 0e33e0f339b91eecd9558311449a3d1e728722d4 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 30 Apr 2025 12:46:56 -0400 Subject: [PATCH 02/16] drm/amdgpu/hdp5: use memcfg register to post the write for HDP flush Reading back the remapped HDP flush register seems to cause problems on some platforms. All we need is a read, so read back the memcfg register. Fixes: cf424020e040 ("drm/amdgpu/hdp5.0: do a posting read when flushing HDP") Reported-by: Alexey Klimov Link: https://lists.freedesktop.org/archives/amd-gfx/2025-April/123150.html Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4119 Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3908 Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher (cherry picked from commit a5cb344033c7598762e89255e8ff52827abb57a4) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/hdp_v5_0.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/hdp_v5_0.c b/drivers/gpu/drm/amd/amdgpu/hdp_v5_0.c index 43195c079748..086a647308df 100644 --- a/drivers/gpu/drm/amd/amdgpu/hdp_v5_0.c +++ b/drivers/gpu/drm/amd/amdgpu/hdp_v5_0.c @@ -32,7 +32,12 @@ static void hdp_v5_0_flush_hdp(struct amdgpu_device *adev, { if (!ring || !ring->funcs->emit_wreg) { WREG32((adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0); - RREG32((adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2); + /* We just need to read back a register to post the write. + * Reading back the remapped register causes problems on + * some platforms so just read back the memory size register. + */ + if (adev->nbio.funcs->get_memsize) + adev->nbio.funcs->get_memsize(adev); } else { amdgpu_ring_emit_wreg(ring, (adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0); } -- 2.51.0 From dbc988c689333faeeed44d5561f372ff20395304 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 30 Apr 2025 12:47:37 -0400 Subject: [PATCH 03/16] drm/amdgpu/hdp5.2: use memcfg register to post the write for HDP flush Reading back the remapped HDP flush register seems to cause problems on some platforms. All we need is a read, so read back the memcfg register. Fixes: f756dbac1ce1 ("drm/amdgpu/hdp5.2: do a posting read when flushing HDP") Reported-by: Alexey Klimov Link: https://lists.freedesktop.org/archives/amd-gfx/2025-April/123150.html Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4119 Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3908 Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher (cherry picked from commit 4a89b7698e771914b4d5b571600c76e2fdcbe2a9) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/hdp_v5_2.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/hdp_v5_2.c b/drivers/gpu/drm/amd/amdgpu/hdp_v5_2.c index fcb8dd2876bc..40940b4ab400 100644 --- a/drivers/gpu/drm/amd/amdgpu/hdp_v5_2.c +++ b/drivers/gpu/drm/amd/amdgpu/hdp_v5_2.c @@ -33,7 +33,17 @@ static void hdp_v5_2_flush_hdp(struct amdgpu_device *adev, if (!ring || !ring->funcs->emit_wreg) { WREG32_NO_KIQ((adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0); - RREG32_NO_KIQ((adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2); + if (amdgpu_sriov_vf(adev)) { + /* this is fine because SR_IOV doesn't remap the register */ + RREG32_NO_KIQ((adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2); + } else { + /* We just need to read back a register to post the write. + * Reading back the remapped register causes problems on + * some platforms so just read back the memory size register. + */ + if (adev->nbio.funcs->get_memsize) + adev->nbio.funcs->get_memsize(adev); + } } else { amdgpu_ring_emit_wreg(ring, (adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, -- 2.51.0 From ca28e80abe4219c8f1a2961ae05102d70af6dc87 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 30 Apr 2025 12:48:51 -0400 Subject: [PATCH 04/16] drm/amdgpu/hdp6: use memcfg register to post the write for HDP flush Reading back the remapped HDP flush register seems to cause problems on some platforms. All we need is a read, so read back the memcfg register. Fixes: abe1cbaec6cf ("drm/amdgpu/hdp6.0: do a posting read when flushing HDP") Reported-by: Alexey Klimov Link: https://lists.freedesktop.org/archives/amd-gfx/2025-April/123150.html Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4119 Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3908 Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher (cherry picked from commit 84141ff615951359c9a99696fd79a36c465ed847) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/hdp_v6_0.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/hdp_v6_0.c b/drivers/gpu/drm/amd/amdgpu/hdp_v6_0.c index a88d25a06c29..6ccd31c8bc69 100644 --- a/drivers/gpu/drm/amd/amdgpu/hdp_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/hdp_v6_0.c @@ -35,7 +35,12 @@ static void hdp_v6_0_flush_hdp(struct amdgpu_device *adev, { if (!ring || !ring->funcs->emit_wreg) { WREG32((adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0); - RREG32((adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2); + /* We just need to read back a register to post the write. + * Reading back the remapped register causes problems on + * some platforms so just read back the memory size register. + */ + if (adev->nbio.funcs->get_memsize) + adev->nbio.funcs->get_memsize(adev); } else { amdgpu_ring_emit_wreg(ring, (adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0); } -- 2.51.0 From 5a11a2767731139bf87e667331aa2209e33a1d19 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 30 Apr 2025 12:50:02 -0400 Subject: [PATCH 05/16] drm/amdgpu/hdp7: use memcfg register to post the write for HDP flush Reading back the remapped HDP flush register seems to cause problems on some platforms. All we need is a read, so read back the memcfg register. Fixes: 689275140cb8 ("drm/amdgpu/hdp7.0: do a posting read when flushing HDP") Reported-by: Alexey Klimov Link: https://lists.freedesktop.org/archives/amd-gfx/2025-April/123150.html Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4119 Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3908 Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher (cherry picked from commit dbc064adfcf9095e7d895bea87b2f75c1ab23236) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/hdp_v7_0.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/hdp_v7_0.c b/drivers/gpu/drm/amd/amdgpu/hdp_v7_0.c index 49f7eb4fbd11..2c9239a22f39 100644 --- a/drivers/gpu/drm/amd/amdgpu/hdp_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/hdp_v7_0.c @@ -32,7 +32,12 @@ static void hdp_v7_0_flush_hdp(struct amdgpu_device *adev, { if (!ring || !ring->funcs->emit_wreg) { WREG32((adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0); - RREG32((adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2); + /* We just need to read back a register to post the write. + * Reading back the remapped register causes problems on + * some platforms so just read back the memory size register. + */ + if (adev->nbio.funcs->get_memsize) + adev->nbio.funcs->get_memsize(adev); } else { amdgpu_ring_emit_wreg(ring, (adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0); } -- 2.51.0 From 391008f34e711253c5983b0bf52277cc43723127 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Tue, 8 Apr 2025 08:59:15 -0700 Subject: [PATCH 06/16] drm/xe: Add page queue multiplier For an unknown reason the math to determine the PF queue size does is not correct - compute UMD applications are overflowing the PF queue which is fatal. A multippier of 8 fixes the problem. Fixes: 3338e4f90c14 ("drm/xe: Use topology to determine page fault queue size") Cc: stable@vger.kernel.org Signed-off-by: Matthew Brost Reviewed-by: Jagmeet Randhawa Link: https://lore.kernel.org/r/20250408155915.78770-1-matthew.brost@intel.com (cherry picked from commit 29582e0ea75c95668d168b12406e3c56cf5a73c4) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_gt_pagefault.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c index c5ad9a0a89c2..0c22b3a36655 100644 --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c @@ -435,9 +435,16 @@ static int xe_alloc_pf_queue(struct xe_gt *gt, struct pf_queue *pf_queue) num_eus = bitmap_weight(gt->fuse_topo.eu_mask_per_dss, XE_MAX_EU_FUSE_BITS) * num_dss; - /* user can issue separate page faults per EU and per CS */ + /* + * user can issue separate page faults per EU and per CS + * + * XXX: Multiplier required as compute UMD are getting PF queue errors + * without it. Follow on why this multiplier is required. + */ +#define PF_MULTIPLIER 8 pf_queue->num_dw = - (num_eus + XE_NUM_HW_ENGINES) * PF_MSG_LEN_DW; + (num_eus + XE_NUM_HW_ENGINES) * PF_MSG_LEN_DW * PF_MULTIPLIER; +#undef PF_MULTIPLIER pf_queue->gt = gt; pf_queue->data = devm_kcalloc(xe->drm.dev, pf_queue->num_dw, -- 2.51.0 From 51c0ee84e4dc339287b2d7335f2b54d747794c83 Mon Sep 17 00:00:00 2001 From: Tejas Upadhyay Date: Mon, 28 Apr 2025 13:53:57 +0530 Subject: [PATCH 07/16] drm/xe/tests/mocs: Hold XE_FORCEWAKE_ALL for LNCF regs LNCF registers report wrong values when XE_FORCEWAKE_GT only is held. Holding XE_FORCEWAKE_ALL ensures correct operations on LNCF regs. V2(Himal): - Use xe_force_wake_ref_has_domain Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1999 Fixes: a6a4ea6d7d37 ("drm/xe: Add mocs kunit") Reviewed-by: Himal Prasad Ghimiray Link: https://patchwork.freedesktop.org/patch/msgid/20250428082357.1730068-1-tejas.upadhyay@intel.com Signed-off-by: Tejas Upadhyay (cherry picked from commit 70a2585e582058e94fe4381a337be42dec800337) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/tests/xe_mocs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/tests/xe_mocs.c b/drivers/gpu/drm/xe/tests/xe_mocs.c index ef1e5256c56a..0e502feaca81 100644 --- a/drivers/gpu/drm/xe/tests/xe_mocs.c +++ b/drivers/gpu/drm/xe/tests/xe_mocs.c @@ -46,8 +46,11 @@ static void read_l3cc_table(struct xe_gt *gt, unsigned int fw_ref, i; u32 reg_val; - fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); - KUNIT_ASSERT_NE_MSG(test, fw_ref, 0, "Forcewake Failed.\n"); + fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL); + if (!xe_force_wake_ref_has_domain(fw_ref, XE_FORCEWAKE_ALL)) { + xe_force_wake_put(gt_to_fw(gt), fw_ref); + KUNIT_ASSERT_TRUE_MSG(test, true, "Forcewake Failed.\n"); + } for (i = 0; i < info->num_mocs_regs; i++) { if (!(i & 1)) { -- 2.51.0 From 03552d8ac0afcc080c339faa0b726e2c0e9361cb Mon Sep 17 00:00:00 2001 From: Daniele Ceraolo Spurio Date: Fri, 2 May 2025 08:51:04 -0700 Subject: [PATCH 08/16] drm/xe/gsc: do not flush the GSC worker from the reset path The workqueue used for the reset worker is marked as WQ_MEM_RECLAIM, while the GSC one isn't (and can't be as we need to do memory allocations in the gsc worker). Therefore, we can't flush the latter from the former. The reason why we had such a flush was to avoid interrupting either the GSC FW load or in progress GSC proxy operations. GSC proxy operations fall into 2 categories: 1) GSC proxy init: this only happens once immediately after GSC FW load and does not support being interrupted. The only way to recover from an interruption of the proxy init is to do an FLR and re-load the GSC. 2) GSC proxy request: this can happen in response to a request that the driver sends to the GSC. If this is interrupted, the GSC FW will timeout and the driver request will be failed, but overall the GSC will keep working fine. Flushing the work allowed us to avoid interruption in both cases (unless the hang came from the GSC engine itself, in which case we're toast anyway). However, a failure on a proxy request is tolerable if we're in a scenario where we're triggering a GT reset (i.e., something is already gone pretty wrong), so what we really need to avoid is interrupting the init flow, which we can do by polling on the register that reports when the proxy init is complete (as that ensure us that all the load and init operations have been completed). Note that during suspend we still want to do a flush of the worker to make sure it completes any operations involving the HW before the power is cut. v2: fix spelling in commit msg, rename waiter function (Julia) Fixes: dd0e89e5edc2 ("drm/xe/gsc: GSC FW load") Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4830 Signed-off-by: Daniele Ceraolo Spurio Cc: John Harrison Cc: Alan Previn Cc: # v6.8+ Reviewed-by: Julia Filipchuk Link: https://lore.kernel.org/r/20250502155104.2201469-1-daniele.ceraolospurio@intel.com (cherry picked from commit 12370bfcc4f0bdf70279ec5b570eb298963422b5) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_gsc.c | 22 ++++++++++++++++++++++ drivers/gpu/drm/xe/xe_gsc.h | 1 + drivers/gpu/drm/xe/xe_gsc_proxy.c | 11 +++++++++++ drivers/gpu/drm/xe/xe_gsc_proxy.h | 1 + drivers/gpu/drm/xe/xe_gt.c | 2 +- drivers/gpu/drm/xe/xe_uc.c | 8 +++++++- drivers/gpu/drm/xe/xe_uc.h | 1 + 7 files changed, 44 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gsc.c b/drivers/gpu/drm/xe/xe_gsc.c index fd41113f8572..0bcf97063ff6 100644 --- a/drivers/gpu/drm/xe/xe_gsc.c +++ b/drivers/gpu/drm/xe/xe_gsc.c @@ -555,6 +555,28 @@ void xe_gsc_wait_for_worker_completion(struct xe_gsc *gsc) flush_work(&gsc->work); } +void xe_gsc_stop_prepare(struct xe_gsc *gsc) +{ + struct xe_gt *gt = gsc_to_gt(gsc); + int ret; + + if (!xe_uc_fw_is_loadable(&gsc->fw) || xe_uc_fw_is_in_error_state(&gsc->fw)) + return; + + xe_force_wake_assert_held(gt_to_fw(gt), XE_FW_GSC); + + /* + * If the GSC FW load or the proxy init are interrupted, the only way + * to recover it is to do an FLR and reload the GSC from scratch. + * Therefore, let's wait for the init to complete before stopping + * operations. The proxy init is the last step, so we can just wait on + * that + */ + ret = xe_gsc_wait_for_proxy_init_done(gsc); + if (ret) + xe_gt_err(gt, "failed to wait for GSC init completion before uc stop\n"); +} + /* * wa_14015076503: if the GSC FW is loaded, we need to alert it before doing a * GSC engine reset by writing a notification bit in the GS1 register and then diff --git a/drivers/gpu/drm/xe/xe_gsc.h b/drivers/gpu/drm/xe/xe_gsc.h index d99f66c38075..b8b8e0810ad9 100644 --- a/drivers/gpu/drm/xe/xe_gsc.h +++ b/drivers/gpu/drm/xe/xe_gsc.h @@ -16,6 +16,7 @@ struct xe_hw_engine; int xe_gsc_init(struct xe_gsc *gsc); int xe_gsc_init_post_hwconfig(struct xe_gsc *gsc); void xe_gsc_wait_for_worker_completion(struct xe_gsc *gsc); +void xe_gsc_stop_prepare(struct xe_gsc *gsc); void xe_gsc_load_start(struct xe_gsc *gsc); void xe_gsc_hwe_irq_handler(struct xe_hw_engine *hwe, u16 intr_vec); diff --git a/drivers/gpu/drm/xe/xe_gsc_proxy.c b/drivers/gpu/drm/xe/xe_gsc_proxy.c index 8cf70b228ff3..d0519cd6704a 100644 --- a/drivers/gpu/drm/xe/xe_gsc_proxy.c +++ b/drivers/gpu/drm/xe/xe_gsc_proxy.c @@ -71,6 +71,17 @@ bool xe_gsc_proxy_init_done(struct xe_gsc *gsc) HECI1_FWSTS1_PROXY_STATE_NORMAL; } +int xe_gsc_wait_for_proxy_init_done(struct xe_gsc *gsc) +{ + struct xe_gt *gt = gsc_to_gt(gsc); + + /* Proxy init can take up to 500ms, so wait double that for safety */ + return xe_mmio_wait32(>->mmio, HECI_FWSTS1(MTL_GSC_HECI1_BASE), + HECI1_FWSTS1_CURRENT_STATE, + HECI1_FWSTS1_PROXY_STATE_NORMAL, + USEC_PER_SEC, NULL, false); +} + static void __gsc_proxy_irq_rmw(struct xe_gsc *gsc, u32 clr, u32 set) { struct xe_gt *gt = gsc_to_gt(gsc); diff --git a/drivers/gpu/drm/xe/xe_gsc_proxy.h b/drivers/gpu/drm/xe/xe_gsc_proxy.h index fdef56995cd4..765602221dbc 100644 --- a/drivers/gpu/drm/xe/xe_gsc_proxy.h +++ b/drivers/gpu/drm/xe/xe_gsc_proxy.h @@ -12,6 +12,7 @@ struct xe_gsc; int xe_gsc_proxy_init(struct xe_gsc *gsc); bool xe_gsc_proxy_init_done(struct xe_gsc *gsc); +int xe_gsc_wait_for_proxy_init_done(struct xe_gsc *gsc); int xe_gsc_proxy_start(struct xe_gsc *gsc); int xe_gsc_proxy_request_handler(struct xe_gsc *gsc); diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index 10a9e3c72b36..66198cf2662c 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -857,7 +857,7 @@ void xe_gt_suspend_prepare(struct xe_gt *gt) fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL); - xe_uc_stop_prepare(>->uc); + xe_uc_suspend_prepare(>->uc); xe_force_wake_put(gt_to_fw(gt), fw_ref); } diff --git a/drivers/gpu/drm/xe/xe_uc.c b/drivers/gpu/drm/xe/xe_uc.c index c14bd2282044..3a8751a8b92d 100644 --- a/drivers/gpu/drm/xe/xe_uc.c +++ b/drivers/gpu/drm/xe/xe_uc.c @@ -244,7 +244,7 @@ void xe_uc_gucrc_disable(struct xe_uc *uc) void xe_uc_stop_prepare(struct xe_uc *uc) { - xe_gsc_wait_for_worker_completion(&uc->gsc); + xe_gsc_stop_prepare(&uc->gsc); xe_guc_stop_prepare(&uc->guc); } @@ -278,6 +278,12 @@ again: goto again; } +void xe_uc_suspend_prepare(struct xe_uc *uc) +{ + xe_gsc_wait_for_worker_completion(&uc->gsc); + xe_guc_stop_prepare(&uc->guc); +} + int xe_uc_suspend(struct xe_uc *uc) { /* GuC submission not enabled, nothing to do */ diff --git a/drivers/gpu/drm/xe/xe_uc.h b/drivers/gpu/drm/xe/xe_uc.h index 3813c1ede450..c23e6f5e2514 100644 --- a/drivers/gpu/drm/xe/xe_uc.h +++ b/drivers/gpu/drm/xe/xe_uc.h @@ -18,6 +18,7 @@ int xe_uc_reset_prepare(struct xe_uc *uc); void xe_uc_stop_prepare(struct xe_uc *uc); void xe_uc_stop(struct xe_uc *uc); int xe_uc_start(struct xe_uc *uc); +void xe_uc_suspend_prepare(struct xe_uc *uc); int xe_uc_suspend(struct xe_uc *uc); int xe_uc_sanitize_reset(struct xe_uc *uc); void xe_uc_declare_wedged(struct xe_uc *uc); -- 2.51.0 From 9d271a4f5ba52520e448ab223b1a91c6e35f17c7 Mon Sep 17 00:00:00 2001 From: Shuicheng Lin Date: Wed, 7 May 2025 02:23:02 +0000 Subject: [PATCH 09/16] drm/xe: Release force wake first then runtime power xe_force_wake_get() is dependent on xe_pm_runtime_get(), so for the release path, xe_force_wake_put() should be called first then xe_pm_runtime_put(). Combine the error path and normal path together with goto. Fixes: 85d547608ef5 ("drm/xe/xe_gt_debugfs: Update handling of xe_force_wake_get return") Cc: Himal Prasad Ghimiray Cc: Rodrigo Vivi Signed-off-by: Shuicheng Lin Reviewed-by: Himal Prasad Ghimiray Link: https://lore.kernel.org/r/20250507022302.2187527-1-shuicheng.lin@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 432cd94efdca06296cc5e76d673546f58aa90ee1) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_gt_debugfs.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_debugfs.c b/drivers/gpu/drm/xe/xe_gt_debugfs.c index 2d63a69cbfa3..f7005a3643e6 100644 --- a/drivers/gpu/drm/xe/xe_gt_debugfs.c +++ b/drivers/gpu/drm/xe/xe_gt_debugfs.c @@ -92,22 +92,23 @@ static int hw_engines(struct xe_gt *gt, struct drm_printer *p) struct xe_hw_engine *hwe; enum xe_hw_engine_id id; unsigned int fw_ref; + int ret = 0; xe_pm_runtime_get(xe); fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL); if (!xe_force_wake_ref_has_domain(fw_ref, XE_FORCEWAKE_ALL)) { - xe_pm_runtime_put(xe); - xe_force_wake_put(gt_to_fw(gt), fw_ref); - return -ETIMEDOUT; + ret = -ETIMEDOUT; + goto fw_put; } for_each_hw_engine(hwe, gt, id) xe_hw_engine_print(hwe, p); +fw_put: xe_force_wake_put(gt_to_fw(gt), fw_ref); xe_pm_runtime_put(xe); - return 0; + return ret; } static int powergate_info(struct xe_gt *gt, struct drm_printer *p) -- 2.51.0 From 564467e9d06c6352fac9100e8957d40a1a50234c Mon Sep 17 00:00:00 2001 From: Shuicheng Lin Date: Fri, 2 May 2025 17:00:52 +0000 Subject: [PATCH 10/16] drm/xe: Add config control for svm flush work Without CONFIG_DRM_XE_GPUSVM set, GPU SVM is not initialized thus below warning pops. Refine the flush work code to be controlled by the config to avoid below warning: " [ 453.132028] ------------[ cut here ]------------ [ 453.132527] WARNING: CPU: 9 PID: 4491 at kernel/workqueue.c:4205 __flush_work+0x379/0x3a0 [ 453.133355] Modules linked in: xe drm_ttm_helper ttm gpu_sched drm_buddy drm_suballoc_helper drm_gpuvm drm_exec [ 453.134352] CPU: 9 UID: 0 PID: 4491 Comm: xe_exec_mix_mod Tainted: G U W 6.15.0-rc3+ #7 PREEMPT(full) [ 453.135405] Tainted: [U]=USER, [W]=WARN ... [ 453.136921] RIP: 0010:__flush_work+0x379/0x3a0 [ 453.137417] Code: 8b 45 00 48 8b 55 08 89 c7 48 c1 e8 04 83 e7 08 83 e0 0f 83 cf 02 89 c6 48 0f ba 6d 00 03 e9 d5 fe ff ff 0f 0b e9 db fd ff ff <0f> 0b 45 31 e4 e9 d1 fd ff ff 0f 0b e9 03 ff ff ff 0f 0b e9 d6 fe [ 453.139250] RSP: 0018:ffffc90000c67b18 EFLAGS: 00010246 [ 453.139782] RAX: 0000000000000000 RBX: ffff888108a24000 RCX: 0000000000002000 [ 453.140521] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff8881016d61c8 [ 453.141253] RBP: ffff8881016d61c8 R08: 0000000000000000 R09: 0000000000000000 [ 453.141985] R10: 0000000000000000 R11: 0000000008a24000 R12: 0000000000000001 [ 453.142709] R13: 0000000000000002 R14: 0000000000000000 R15: ffff888107db8c00 [ 453.143450] FS: 00007f44853d4c80(0000) GS:ffff8882f469b000(0000) knlGS:0000000000000000 [ 453.144276] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 453.144853] CR2: 00007f4487629228 CR3: 00000001016aa000 CR4: 00000000000406f0 [ 453.145594] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 453.146320] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 453.147061] Call Trace: [ 453.147336] [ 453.147579] ? tick_nohz_tick_stopped+0xd/0x30 [ 453.148067] ? xas_load+0x9/0xb0 [ 453.148435] ? xa_load+0x6f/0xb0 [ 453.148781] __xe_vm_bind_ioctl+0xbd5/0x1500 [xe] [ 453.149338] ? dev_printk_emit+0x48/0x70 [ 453.149762] ? _dev_printk+0x57/0x80 [ 453.150148] ? drm_ioctl+0x17c/0x440 [ 453.150544] ? __drm_dev_vprintk+0x36/0x90 [ 453.150983] ? __pfx_xe_vm_bind_ioctl+0x10/0x10 [xe] [ 453.151575] ? drm_ioctl_kernel+0x9f/0xf0 [ 453.151998] ? __pfx_xe_vm_bind_ioctl+0x10/0x10 [xe] [ 453.152560] drm_ioctl_kernel+0x9f/0xf0 [ 453.152968] drm_ioctl+0x20f/0x440 [ 453.153332] ? __pfx_xe_vm_bind_ioctl+0x10/0x10 [xe] [ 453.153893] ? ioctl_has_perm.constprop.0.isra.0+0xae/0x100 [ 453.154489] ? memory_bm_test_bit+0x5/0x60 [ 453.154935] xe_drm_ioctl+0x47/0x70 [xe] [ 453.155419] __x64_sys_ioctl+0x8d/0xc0 [ 453.155824] do_syscall_64+0x47/0x110 [ 453.156228] entry_SYSCALL_64_after_hwframe+0x76/0x7e " v2 (Matt): refine commit message to have more details add Fixes tag move the code to xe_svm.h which already have the config remove a blank line per codestyle suggestion Fixes: 63f6e480d115 ("drm/xe: Add SVM garbage collector") Cc: Matthew Brost Signed-off-by: Shuicheng Lin Reviewed-by: Matthew Brost Signed-off-by: Matthew Brost Link: https://lore.kernel.org/r/20250502170052.1787973-1-shuicheng.lin@intel.com (cherry picked from commit 9d80698bcd97a5ad1088bcbb055e73fd068895e2) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_svm.c | 12 ++++++++++++ drivers/gpu/drm/xe/xe_svm.h | 8 ++++++++ drivers/gpu/drm/xe/xe_vm.c | 3 +-- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c index 0b6547c06961..24c578e1170e 100644 --- a/drivers/gpu/drm/xe/xe_svm.c +++ b/drivers/gpu/drm/xe/xe_svm.c @@ -947,3 +947,15 @@ int xe_devm_add(struct xe_tile *tile, struct xe_vram_region *vr) return 0; } #endif + +/** + * xe_svm_flush() - SVM flush + * @vm: The VM. + * + * Flush all SVM actions. + */ +void xe_svm_flush(struct xe_vm *vm) +{ + if (xe_vm_in_fault_mode(vm)) + flush_work(&vm->svm.garbage_collector.work); +} diff --git a/drivers/gpu/drm/xe/xe_svm.h b/drivers/gpu/drm/xe/xe_svm.h index e059590e5076..be306fe7aaa4 100644 --- a/drivers/gpu/drm/xe/xe_svm.h +++ b/drivers/gpu/drm/xe/xe_svm.h @@ -72,6 +72,9 @@ bool xe_svm_has_mapping(struct xe_vm *vm, u64 start, u64 end); int xe_svm_bo_evict(struct xe_bo *bo); void xe_svm_range_debug(struct xe_svm_range *range, const char *operation); + +void xe_svm_flush(struct xe_vm *vm); + #else static inline bool xe_svm_range_pages_valid(struct xe_svm_range *range) { @@ -124,6 +127,11 @@ static inline void xe_svm_range_debug(struct xe_svm_range *range, const char *operation) { } + +static inline void xe_svm_flush(struct xe_vm *vm) +{ +} + #endif /** diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 60303998bd61..367c84b90e9e 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -3312,8 +3312,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file) } /* Ensure all UNMAPs visible */ - if (xe_vm_in_fault_mode(vm)) - flush_work(&vm->svm.garbage_collector.work); + xe_svm_flush(vm); err = down_write_killable(&vm->lock); if (err) -- 2.51.0 From 732b87a409667a370b87955c518e5d004de740b5 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Wed, 7 May 2025 18:19:53 +0300 Subject: [PATCH 11/16] drm/i915/dp: Fix determining SST/MST mode during MTP TU state computation Determining the SST/MST mode during state computation must be done based on the output type stored in the CRTC state, which in turn is set once based on the modeset connector's SST vs. MST type and will not change as long as the connector is using the CRTC. OTOH the MST mode indicated by the given connector's intel_dp::is_mst flag can change independently of the above output type, based on what sink is at any moment plugged to the connector. Fix the state computation accordingly. Cc: Jani Nikula Fixes: f6971d7427c2 ("drm/i915/mst: adapt intel_dp_mtp_tu_compute_config() for 128b/132b SST") Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4607 Reviewed-by: Jani Nikula Signed-off-by: Imre Deak Link: https://lore.kernel.org/r/20250507151953.251846-1-imre.deak@intel.com (cherry picked from commit 0f45696ddb2b901fbf15cb8d2e89767be481d59f) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_dp_mst.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.c b/drivers/gpu/drm/i915/display/intel_dp_mst.c index 02f95108c637..6dc2d31ccb5a 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/display/intel_dp_mst.c @@ -242,7 +242,7 @@ int intel_dp_mtp_tu_compute_config(struct intel_dp *intel_dp, to_intel_connector(conn_state->connector); const struct drm_display_mode *adjusted_mode = &crtc_state->hw.adjusted_mode; - bool is_mst = intel_dp->is_mst; + bool is_mst = intel_crtc_has_type(crtc_state, INTEL_OUTPUT_DP_MST); int bpp_x16, slots = -EINVAL; int dsc_slice_count = 0; int max_dpt_bpp_x16; -- 2.51.0 From 92835cebab120f8a5f023a26a792a2ac3f816c4f Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 8 May 2025 14:12:03 -0400 Subject: [PATCH 12/16] io_uring/sqpoll: Increase task_work submission batch size Our QA team reported a 10%-23%, throughput reduction on an io_uring sqpoll testcase doing IO to a null_blk, that I traced back to a reduction of the device submission queue depth utilization. It turns out that, after commit af5d68f8892f ("io_uring/sqpoll: manage task_work privately"), we capped the number of task_work entries that can be completed from a single spin of sqpoll to only 8 entries, before the sqpoll goes around to (potentially) sleep. While this cap doesn't drive the submission side directly, it impacts the completion behavior, which affects the number of IO queued by fio per sqpoll cycle on the submission side, and io_uring ends up seeing less ios per sqpoll cycle. As a result, block layer plugging is less effective, and we see more time spent inside the block layer in profilings charts, and increased submission latency measured by fio. There are other places that have increased overhead once sqpoll sleeps more often, such as the sqpoll utilization calculation. But, in this microbenchmark, those were not representative enough in perf charts, and their removal didn't yield measurable changes in throughput. The major overhead comes from the fact we plug less, and less often, when submitting to the block layer. My benchmark is: fio --ioengine=io_uring --direct=1 --iodepth=128 --runtime=300 --bs=4k \ --invalidate=1 --time_based --ramp_time=10 --group_reporting=1 \ --filename=/dev/nullb0 --name=RandomReads-direct-nullb-sqpoll-4k-1 \ --rw=randread --numjobs=1 --sqthread_poll In one machine, tested on top of Linux 6.15-rc1, we have the following baseline: READ: bw=4994MiB/s (5236MB/s), 4994MiB/s-4994MiB/s (5236MB/s-5236MB/s), io=439GiB (471GB), run=90001-90001msec With this patch: READ: bw=5762MiB/s (6042MB/s), 5762MiB/s-5762MiB/s (6042MB/s-6042MB/s), io=506GiB (544GB), run=90001-90001msec which is a 15% improvement in measured bandwidth. The average submission latency is noticeably lowered too. As measured by fio: Baseline: lat (usec): min=20, max=241, avg=99.81, stdev=3.38 Patched: lat (usec): min=26, max=226, avg=86.48, stdev=4.82 If we look at blktrace, we can also see the plugging behavior is improved. In the baseline, we end up limited to plugging 8 requests in the block layer regardless of the device queue depth size, while after patching we can drive more io, and we manage to utilize the full device queue. In the baseline, after a stabilization phase, an ordinary submission looks like: 254,0 1 49942 0.016028795 5977 U N [iou-sqp-5976] 7 After patching, I see consistently more requests per unplug. 254,0 1 4996 0.001432872 3145 U N [iou-sqp-3144] 32 Ideally, the cap size would at least be the deep enough to fill the device queue, but we can't predict that behavior, or assume all IO goes to a single device, and thus can't guess the ideal batch size. We also don't want to let the tw run unbounded, though I'm not sure it would really be a problem. Instead, let's just give it a more sensible value that will allow for more efficient batching. I've tested with different cap values, and initially proposed to increase the cap to 1024. Jens argued it is too big of a bump and I observed that, with 32, I'm no longer able to observe this bottleneck in any of my machines. Fixes: af5d68f8892f ("io_uring/sqpoll: manage task_work privately") Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20250508181203.3785544-1-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/sqpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index d037cc68e9d3..03c699493b5a 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -20,7 +20,7 @@ #include "sqpoll.h" #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 -#define IORING_TW_CAP_ENTRIES_VALUE 8 +#define IORING_TW_CAP_ENTRIES_VALUE 32 enum { IO_SQ_THREAD_SHOULD_STOP = 0, -- 2.51.0 From fea4e317f9e7e1f449ce90dedc27a2d2a95bee5a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 8 May 2025 15:41:32 -0700 Subject: [PATCH 13/16] x86/mm: Eliminate window where TLB flushes may be inadvertently skipped tl;dr: There is a window in the mm switching code where the new CR3 is set and the CPU should be getting TLB flushes for the new mm. But should_flush_tlb() has a bug and suppresses the flush. Fix it by widening the window where should_flush_tlb() sends an IPI. Long Version: === History === There were a few things leading up to this. First, updating mm_cpumask() was observed to be too expensive, so it was made lazier. But being lazy caused too many unnecessary IPIs to CPUs due to the now-lazy mm_cpumask(). So code was added to cull mm_cpumask() periodically[2]. But that culling was a bit too aggressive and skipped sending TLB flushes to CPUs that need them. So here we are again. === Problem === The too-aggressive code in should_flush_tlb() strikes in this window: // Turn on IPIs for this CPU/mm combination, but only // if should_flush_tlb() agrees: cpumask_set_cpu(cpu, mm_cpumask(next)); next_tlb_gen = atomic64_read(&next->context.tlb_gen); choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); load_new_mm_cr3(need_flush); // ^ After 'need_flush' is set to false, IPIs *MUST* // be sent to this CPU and not be ignored. this_cpu_write(cpu_tlbstate.loaded_mm, next); // ^ Not until this point does should_flush_tlb() // become true! should_flush_tlb() will suppress TLB flushes between load_new_mm_cr3() and writing to 'loaded_mm', which is a window where they should not be suppressed. Whoops. === Solution === Thankfully, the fuzzy "just about to write CR3" window is already marked with loaded_mm==LOADED_MM_SWITCHING. Simply checking for that state in should_flush_tlb() is sufficient to ensure that the CPU is targeted with an IPI. This will cause more TLB flush IPIs. But the window is relatively small and I do not expect this to cause any kind of measurable performance impact. Update the comment where LOADED_MM_SWITCHING is written since it grew yet another user. Peter Z also raised a concern that should_flush_tlb() might not observe 'loaded_mm' and 'is_lazy' in the same order that switch_mm_irqs_off() writes them. Add a barrier to ensure that they are observed in the order they are written. Signed-off-by: Dave Hansen Acked-by: Rik van Riel Link: https://lore.kernel.org/oe-lkp/202411282207.6bd28eae-lkp@intel.com/ [1] Fixes: 6db2526c1d69 ("x86/mm/tlb: Only trim the mm_cpumask once a second") [2] Reported-by: Stephen Dolan Cc: stable@vger.kernel.org Acked-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Signed-off-by: Linus Torvalds --- arch/x86/mm/tlb.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index eb83348f9305..b6d6750e4bd1 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -899,8 +899,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, cond_mitigation(tsk); /* - * Let nmi_uaccess_okay() and finish_asid_transition() - * know that CR3 is changing. + * Indicate that CR3 is about to change. nmi_uaccess_okay() + * and others are sensitive to the window where mm_cpumask(), + * CR3 and cpu_tlbstate.loaded_mm are not all in sync. */ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); barrier(); @@ -1204,8 +1205,16 @@ done: static bool should_flush_tlb(int cpu, void *data) { + struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu); struct flush_tlb_info *info = data; + /* + * Order the 'loaded_mm' and 'is_lazy' against their + * write ordering in switch_mm_irqs_off(). Ensure + * 'is_lazy' is at least as new as 'loaded_mm'. + */ + smp_rmb(); + /* Lazy TLB will get flushed at the next context switch. */ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) return false; @@ -1214,8 +1223,15 @@ static bool should_flush_tlb(int cpu, void *data) if (!info->mm) return true; + /* + * While switching, the remote CPU could have state from + * either the prev or next mm. Assume the worst and flush. + */ + if (loaded_mm == LOADED_MM_SWITCHING) + return true; + /* The target mm is loaded, and the CPU is not lazy. */ - if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm) + if (loaded_mm == info->mm) return true; /* In cpumask, but not the loaded mm? Periodically remove by flushing. */ -- 2.51.0 From 9dda18a32b4a6693fccd3f7c0738af646147b1cf Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 10 Apr 2025 05:22:21 -0700 Subject: [PATCH 14/16] tracing: fprobe: Fix RCU warning message in list traversal When CONFIG_PROVE_RCU_LIST is enabled, fprobe triggers the following warning: WARNING: suspicious RCU usage kernel/trace/fprobe.c:457 RCU-list traversed in non-reader section!! other info that might help us debug this: #1: ffffffff863c4e08 (fprobe_mutex){+.+.}-{4:4}, at: fprobe_module_callback+0x7b/0x8c0 Call Trace: fprobe_module_callback notifier_call_chain blocking_notifier_call_chain This warning occurs because fprobe_remove_node_in_module() traverses an RCU list using RCU primitives without holding an RCU read lock. However, the function is only called from fprobe_module_callback(), which holds the fprobe_mutex lock that provides sufficient protection for safely traversing the list. Fix the warning by specifying the locking design to the CONFIG_PROVE_RCU_LIST mechanism. Add the lockdep_is_held() argument to hlist_for_each_entry_rcu() to inform the RCU checker that fprobe_mutex provides the required protection. Link: https://lore.kernel.org/all/20250410-fprobe-v1-1-068ef5f41436@debian.org/ Fixes: a3dc2983ca7b90 ("tracing: fprobe: Cleanup fprobe hash when module unloading") Signed-off-by: Breno Leitao Tested-by: Antonio Quartulli Tested-by: Matthieu Baerts (NGI0) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/fprobe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 95c6e3473a76..ba7ff14f5339 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -454,7 +454,8 @@ static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head * struct fprobe_hlist_node *node; int ret = 0; - hlist_for_each_entry_rcu(node, head, hlist) { + hlist_for_each_entry_rcu(node, head, hlist, + lockdep_is_held(&fprobe_mutex)) { if (!within_module(node->addr, mod)) continue; if (delete_fprobe_node(node)) -- 2.51.0 From e41b5af4519f90f9a751805ede2102ae36caf5d0 Mon Sep 17 00:00:00 2001 From: Paul Cacheux Date: Sun, 4 May 2025 20:27:52 +0200 Subject: [PATCH 15/16] tracing: add missing trace_probe_log_clear for eprobes Make sure trace_probe_log_clear is called in the tracing eprobe code path, matching the trace_probe_log_init call. Link: https://lore.kernel.org/all/20250504-fix-trace-probe-log-race-v3-1-9e99fec7eddc@gmail.com/ Signed-off-by: Paul Cacheux Acked-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_eprobe.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index c08355c3ef32..916555f0de81 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -969,10 +969,13 @@ static int __trace_eprobe_create(int argc, const char *argv[]) goto error; } } + trace_probe_log_clear(); return ret; + parse_error: ret = -EINVAL; error: + trace_probe_log_clear(); trace_event_probe_cleanup(ep); return ret; } -- 2.51.0 From fd837de3c9cb1a162c69bc1fb1f438467fe7f2f5 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 10 May 2025 12:44:41 +0900 Subject: [PATCH 16/16] tracing: probes: Fix a possible race in trace_probe_log APIs Since the shared trace_probe_log variable can be accessed and modified via probe event create operation of kprobe_events, uprobe_events, and dynamic_events, it should be protected. In the dynamic_events, all operations are serialized by `dyn_event_ops_mutex`. But kprobe_events and uprobe_events interfaces are not serialized. To solve this issue, introduces dyn_event_create(), which runs create() operation under the mutex, for kprobe_events and uprobe_events. This also uses lockdep to check the mutex is held when using trace_probe_log* APIs. Link: https://lore.kernel.org/all/174684868120.551552.3068655787654268804.stgit@devnote2/ Reported-by: Paul Cacheux Closes: https://lore.kernel.org/all/20250510074456.805a16872b591e2971a4d221@kernel.org/ Fixes: ab105a4fb894 ("tracing: Use tracing error_log with probe events") Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_dynevent.c | 16 +++++++++++++++- kernel/trace/trace_dynevent.h | 1 + kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_probe.c | 9 +++++++++ kernel/trace/trace_uprobe.c | 2 +- 5 files changed, 27 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index a322e4f249a5..5d64a18cacac 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -16,7 +16,7 @@ #include "trace_output.h" /* for trace_event_sem */ #include "trace_dynevent.h" -static DEFINE_MUTEX(dyn_event_ops_mutex); +DEFINE_MUTEX(dyn_event_ops_mutex); static LIST_HEAD(dyn_event_ops_list); bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call) @@ -116,6 +116,20 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type return ret; } +/* + * Locked version of event creation. The event creation must be protected by + * dyn_event_ops_mutex because of protecting trace_probe_log. + */ +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type) +{ + int ret; + + mutex_lock(&dyn_event_ops_mutex); + ret = type->create(raw_command); + mutex_unlock(&dyn_event_ops_mutex); + return ret; +} + static int create_dyn_event(const char *raw_command) { struct dyn_event_operations *ops; diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 936477a111d3..beee3f8d7544 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -100,6 +100,7 @@ void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos); void dyn_event_seq_stop(struct seq_file *m, void *v); int dyn_events_release_all(struct dyn_event_operations *type); int dyn_event_release(const char *raw_command, struct dyn_event_operations *type); +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type); /* * for_each_dyn_event - iterate over the dyn_event list diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2703b96d8990..3e5c47b6d7b2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1089,7 +1089,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_kprobe_ops); - ret = trace_kprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_kprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 2eeecb6c95ee..424751cdf31f 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -154,9 +154,12 @@ fail: } static struct trace_probe_log trace_probe_log; +extern struct mutex dyn_event_ops_mutex; void trace_probe_log_init(const char *subsystem, int argc, const char **argv) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.subsystem = subsystem; trace_probe_log.argc = argc; trace_probe_log.argv = argv; @@ -165,11 +168,15 @@ void trace_probe_log_init(const char *subsystem, int argc, const char **argv) void trace_probe_log_clear(void) { + lockdep_assert_held(&dyn_event_ops_mutex); + memset(&trace_probe_log, 0, sizeof(trace_probe_log)); } void trace_probe_log_set_index(int index) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.index = index; } @@ -178,6 +185,8 @@ void __trace_probe_log_err(int offset, int err_type) char *command, *p; int i, len = 0, pos = 0; + lockdep_assert_held(&dyn_event_ops_mutex); + if (!trace_probe_log.argv) return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 3386439ec9f6..35cf76c75dd7 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -741,7 +741,7 @@ static int create_or_delete_trace_uprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_uprobe_ops); - ret = trace_uprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_uprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } -- 2.51.0