From a2f925a2f62254119cdaa360cfc9c0424bccd531 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 28 Feb 2025 13:26:04 +0100 Subject: [PATCH 01/16] Revert "ata: libata-core: Add ATA_QUIRK_NOLPM for Samsung SSD 870 QVO drives" This reverts commit cc77e2ce187d26cc66af3577bf896d7410eb25ab. It was reported that adding ATA_QUIRK_NOLPM for Samsung SSD 870 QVO drives breaks entering lower package states for certain systems. It turns out that Samsung SSD 870 QVO actually has working LPM when using a recent SSD firmware version. The author of commit cc77e2ce187d ("ata: libata-core: Add ATA_QUIRK_NOLPM for Samsung SSD 870 QVO drives") reported himself that only older SSD firmware versions have broken LPM: https://lore.kernel.org/stable/93c10d38-718c-459d-84a5-4d87680b4da7@debian.org/ Unfortunately, he did not specify which older firmware version he was using which had broken LPM. Let's revert this quirk, which has FW version field specified as NULL (which means that it applies for all Samsung SSD 870 QVO firmware versions) for now. Once the author reports which older firmware version(s) that are broken, we can create a more fine grained quirk, which populates the FW version field accordingly. Fixes: cc77e2ce187d ("ata: libata-core: Add ATA_QUIRK_NOLPM for Samsung SSD 870 QVO drives") Reported-by: Dieter Mummenschanz Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219747 Link: https://lore.kernel.org/r/20250228122603.91814-2-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 63ec2f218431..c085dd81ebe7 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4143,10 +4143,6 @@ static const struct ata_dev_quirks_entry __ata_dev_quirks[] = { { "Samsung SSD 860*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | ATA_QUIRK_NO_NCQ_ON_ATI }, - { "Samsung SSD 870 QVO*", NULL, ATA_QUIRK_NO_NCQ_TRIM | - ATA_QUIRK_ZERO_AFTER_TRIM | - ATA_QUIRK_NO_NCQ_ON_ATI | - ATA_QUIRK_NOLPM }, { "Samsung SSD 870*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | ATA_QUIRK_NO_NCQ_ON_ATI }, -- 2.51.0 From 2565e42539b120b81a68a58da961ce5d1e34eac8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:11 +0100 Subject: [PATCH 02/16] perf/core: Fix pmus_lock vs. pmus_srcu ordering Commit a63fbed776c7 ("perf/tracing/cpuhotplug: Fix locking order") placed pmus_lock inside pmus_srcu, this makes perf_pmu_unregister() trip lockdep. Move the locking about such that only pmu_idr and pmus (list) are modified while holding pmus_lock. This avoids doing synchronize_srcu() while holding pmus_lock and all is well again. 
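An illustrative sketch of the resulting teardown ordering (names as in the
diff below; this is a simplified outline, not the literal patch):

void perf_pmu_unregister(struct pmu *pmu)
{
        /* Unpublish the PMU while holding pmus_lock... */
        mutex_lock(&pmus_lock);
        list_del_rcu(&pmu->entry);
        idr_remove(&pmu_idr, pmu->type);
        mutex_unlock(&pmus_lock);

        /*
         * ...then wait for SRCU/RCU readers with the lock dropped, so
         * synchronize_srcu() never runs under pmus_lock.
         */
        synchronize_srcu(&pmus_srcu);
        synchronize_rcu();

        /* Remaining teardown (percpu data, device, contexts) goes here. */
}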
Fixes: a63fbed776c7 ("perf/tracing/cpuhotplug: Fix locking order") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241104135517.679556858@infradead.org --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 6364319e2f88..11793d690cbb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11939,6 +11939,8 @@ void perf_pmu_unregister(struct pmu *pmu) { mutex_lock(&pmus_lock); list_del_rcu(&pmu->entry); + idr_remove(&pmu_idr, pmu->type); + mutex_unlock(&pmus_lock); /* * We dereference the pmu list under both SRCU and regular RCU, so @@ -11948,7 +11950,6 @@ void perf_pmu_unregister(struct pmu *pmu) synchronize_rcu(); free_percpu(pmu->pmu_disable_count); - idr_remove(&pmu_idr, pmu->type); if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { if (pmu->nr_addr_filters) device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); @@ -11956,7 +11957,6 @@ void perf_pmu_unregister(struct pmu *pmu) put_device(pmu->dev); } free_pmu_context(pmu); - mutex_unlock(&pmus_lock); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); -- 2.51.0 From 003659fec9f6d8c04738cb74b5384398ae8a7e88 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:12 +0100 Subject: [PATCH 03/16] perf/core: Fix perf_pmu_register() vs. perf_init_event() There is a fairly obvious race between perf_init_event() doing idr_find() and perf_pmu_register() doing idr_alloc() with an incompletely initialized PMU pointer. Avoid by doing idr_alloc() on a NULL pointer to register the id, and swizzling the real struct pmu pointer at the end using idr_replace(). Also making sure to not set struct pmu members after publishing the struct pmu, duh. [ introduce idr_cmpxchg() in order to better handle the idr_replace() error case -- if it were to return an unexpected pointer, it will already have replaced the value and there is no going back. 
] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241104135517.858805880@infradead.org --- kernel/events/core.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 11793d690cbb..823aa0824916 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11830,6 +11830,21 @@ free_dev: static struct lock_class_key cpuctx_mutex; static struct lock_class_key cpuctx_lock; +static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new) +{ + void *tmp, *val = idr_find(idr, id); + + if (val != old) + return false; + + tmp = idr_replace(idr, new, id); + if (IS_ERR(tmp)) + return false; + + WARN_ON_ONCE(tmp != val); + return true; +} + int perf_pmu_register(struct pmu *pmu, const char *name, int type) { int cpu, ret, max = PERF_TYPE_MAX; @@ -11856,7 +11871,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) if (type >= 0) max = type; - ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); + ret = idr_alloc(&pmu_idr, NULL, max, 0, GFP_KERNEL); if (ret < 0) goto free_pdc; @@ -11864,6 +11879,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) type = ret; pmu->type = type; + atomic_set(&pmu->exclusive_cnt, 0); if (pmu_bus_running && !pmu->dev) { ret = pmu_dev_alloc(pmu); @@ -11912,14 +11928,22 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; + /* + * Now that the PMU is complete, make it visible to perf_try_init_event(). + */ + if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu)) + goto free_context; list_add_rcu(&pmu->entry, &pmus); - atomic_set(&pmu->exclusive_cnt, 0); + ret = 0; unlock: mutex_unlock(&pmus_lock); return ret; +free_context: + free_percpu(pmu->cpu_pmu_context); + free_dev: if (pmu->dev && pmu->dev != PMU_NULL_DEV) { device_del(pmu->dev); -- 2.51.0 From 061c991697062f3bf87b72ed553d1d33a0e370dd Mon Sep 17 00:00:00 2001 From: Saket Kumar Bhaskar Date: Mon, 3 Mar 2025 14:54:51 +0530 Subject: [PATCH 04/16] perf/hw_breakpoint: Return EOPNOTSUPP for unsupported breakpoint type Currently, __reserve_bp_slot() returns -ENOSPC for unsupported breakpoint types on the architecture. For example, powerpc does not support hardware instruction breakpoints. This causes the perf_skip BPF selftest to fail, as neither ENOENT nor EOPNOTSUPP is returned by perf_event_open for unsupported breakpoint types. As a result, the test that should be skipped for this arch is not correctly identified. To resolve this, hw_breakpoint_event_init() should exit early by checking for unsupported breakpoint types using hw_breakpoint_slots_cached() and return the appropriate error (-EOPNOTSUPP). Signed-off-by: Saket Kumar Bhaskar Signed-off-by: Ingo Molnar Cc: Marco Elver Cc: Dmitry Vyukov Cc: Ian Rogers Cc: Frederic Weisbecker Link: https://lore.kernel.org/r/20250303092451.1862862-1-skb99@linux.ibm.com --- kernel/events/hw_breakpoint.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index bc4a61029b6d..8ec2cb688903 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -950,9 +950,10 @@ static int hw_breakpoint_event_init(struct perf_event *bp) return -ENOENT; /* - * no branch sampling for breakpoint events + * Check if breakpoint type is supported before proceeding. + * Also, no branch sampling for breakpoint events. 
*/ - if (has_branch_stack(bp)) + if (!hw_breakpoint_slots_cached(find_slot_idx(bp->attr.bp_type)) || has_branch_stack(bp)) return -EOPNOTSUPP; err = register_perf_hw_breakpoint(bp); -- 2.51.0 From c70ca298036c58a88686ff388d3d367e9d21acf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:13 +0100 Subject: [PATCH 05/16] perf/core: Simplify the perf_event_alloc() error path The error cleanup sequence in perf_event_alloc() is a subset of the existing _free_event() function (it must of course be). Split this out into __free_event() and simplify the error path. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Ravi Bangoria Link: https://lore.kernel.org/r/20241104135517.967889521@infradead.org --- include/linux/perf_event.h | 16 +++-- kernel/events/core.c | 138 ++++++++++++++++++------------------- 2 files changed, 78 insertions(+), 76 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c4525bae2fe9..8c0117bcbdb9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -673,13 +673,15 @@ struct swevent_hlist { struct rcu_head rcu_head; }; -#define PERF_ATTACH_CONTEXT 0x01 -#define PERF_ATTACH_GROUP 0x02 -#define PERF_ATTACH_TASK 0x04 -#define PERF_ATTACH_TASK_DATA 0x08 -#define PERF_ATTACH_ITRACE 0x10 -#define PERF_ATTACH_SCHED_CB 0x20 -#define PERF_ATTACH_CHILD 0x40 +#define PERF_ATTACH_CONTEXT 0x0001 +#define PERF_ATTACH_GROUP 0x0002 +#define PERF_ATTACH_TASK 0x0004 +#define PERF_ATTACH_TASK_DATA 0x0008 +#define PERF_ATTACH_ITRACE 0x0010 +#define PERF_ATTACH_SCHED_CB 0x0020 +#define PERF_ATTACH_CHILD 0x0040 +#define PERF_ATTACH_EXCLUSIVE 0x0080 +#define PERF_ATTACH_CALLCHAIN 0x0100 struct bpf_prog; struct perf_cgroup; diff --git a/kernel/events/core.c b/kernel/events/core.c index 6ccf363d73cc..1b8b1c8f41ff 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5289,6 +5289,8 @@ static int exclusive_event_init(struct perf_event *event) return -EBUSY; } + event->attach_state |= PERF_ATTACH_EXCLUSIVE; + return 0; } @@ -5296,14 +5298,13 @@ static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!is_exclusive_pmu(pmu)) - return; - /* see comment in exclusive_event_init() */ if (event->attach_state & PERF_ATTACH_TASK) atomic_dec(&pmu->exclusive_cnt); else atomic_inc(&pmu->exclusive_cnt); + + event->attach_state &= ~PERF_ATTACH_EXCLUSIVE; } static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) @@ -5362,40 +5363,20 @@ static void perf_pending_task_sync(struct perf_event *event) rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE); } -static void _free_event(struct perf_event *event) +/* vs perf_event_alloc() error */ +static void __free_event(struct perf_event *event) { - irq_work_sync(&event->pending_irq); - irq_work_sync(&event->pending_disable_irq); - perf_pending_task_sync(event); + if (event->attach_state & PERF_ATTACH_CALLCHAIN) + put_callchain_buffers(); - unaccount_event(event); + kfree(event->addr_filter_ranges); - security_perf_event_free(event); - - if (event->rb) { - /* - * Can happen when we close an event with re-directed output. - * - * Since we have a 0 refcount, perf_mmap_close() will skip - * over us; possibly making our ring_buffer_put() the last. 
- */ - mutex_lock(&event->mmap_mutex); - ring_buffer_attach(event, NULL); - mutex_unlock(&event->mmap_mutex); - } + if (event->attach_state & PERF_ATTACH_EXCLUSIVE) + exclusive_event_destroy(event); if (is_cgroup_event(event)) perf_detach_cgroup(event); - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } - - perf_event_free_bpf_prog(event); - perf_addr_filters_splice(event, NULL); - kfree(event->addr_filter_ranges); - if (event->destroy) event->destroy(event); @@ -5406,22 +5387,58 @@ static void _free_event(struct perf_event *event) if (event->hw.target) put_task_struct(event->hw.target); - if (event->pmu_ctx) + if (event->pmu_ctx) { + /* + * put_pmu_ctx() needs an event->ctx reference, because of + * epc->ctx. + */ + WARN_ON_ONCE(!event->ctx); + WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx); put_pmu_ctx(event->pmu_ctx); + } /* - * perf_event_free_task() relies on put_ctx() being 'last', in particular - * all task references must be cleaned up. + * perf_event_free_task() relies on put_ctx() being 'last', in + * particular all task references must be cleaned up. */ if (event->ctx) put_ctx(event->ctx); - exclusive_event_destroy(event); - module_put(event->pmu->module); + if (event->pmu) + module_put(event->pmu->module); call_rcu(&event->rcu_head, free_event_rcu); } +/* vs perf_event_alloc() success */ +static void _free_event(struct perf_event *event) +{ + irq_work_sync(&event->pending_irq); + irq_work_sync(&event->pending_disable_irq); + perf_pending_task_sync(event); + + unaccount_event(event); + + security_perf_event_free(event); + + if (event->rb) { + /* + * Can happen when we close an event with re-directed output. + * + * Since we have a 0 refcount, perf_mmap_close() will skip + * over us; possibly making our ring_buffer_put() the last. + */ + mutex_lock(&event->mmap_mutex); + ring_buffer_attach(event, NULL); + mutex_unlock(&event->mmap_mutex); + } + + perf_event_free_bpf_prog(event); + perf_addr_filters_splice(event, NULL); + + __free_event(event); +} + /* * Used to free events which have a known refcount of 1, such as in error paths * where the event isn't exposed yet and inherited events. @@ -12093,8 +12110,10 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) event->destroy(event); } - if (ret) + if (ret) { + event->pmu = NULL; module_put(pmu->module); + } return ret; } @@ -12422,7 +12441,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * See perf_output_read(). 
*/ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) - goto err_ns; + goto err; if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; @@ -12430,7 +12449,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, pmu = perf_init_event(event); if (IS_ERR(pmu)) { err = PTR_ERR(pmu); - goto err_ns; + goto err; } /* @@ -12440,25 +12459,25 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, */ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) { err = -EINVAL; - goto err_pmu; + goto err; } if (event->attr.aux_output && (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || event->attr.aux_pause || event->attr.aux_resume)) { err = -EOPNOTSUPP; - goto err_pmu; + goto err; } if (event->attr.aux_pause && event->attr.aux_resume) { err = -EINVAL; - goto err_pmu; + goto err; } if (event->attr.aux_start_paused) { if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) { err = -EOPNOTSUPP; - goto err_pmu; + goto err; } event->hw.aux_paused = 1; } @@ -12466,12 +12485,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) - goto err_pmu; + goto err; } err = exclusive_event_init(event); if (err) - goto err_pmu; + goto err; if (has_addr_filter(event)) { event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, @@ -12479,7 +12498,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, GFP_KERNEL); if (!event->addr_filter_ranges) { err = -ENOMEM; - goto err_per_task; + goto err; } /* @@ -12504,41 +12523,22 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(attr->sample_max_stack); if (err) - goto err_addr_filters; + goto err; + event->attach_state |= PERF_ATTACH_CALLCHAIN; } } err = security_perf_event_alloc(event); if (err) - goto err_callchain_buffer; + goto err; /* symmetric to unaccount_event() in _free_event() */ account_event(event); return event; -err_callchain_buffer: - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } -err_addr_filters: - kfree(event->addr_filter_ranges); - -err_per_task: - exclusive_event_destroy(event); - -err_pmu: - if (is_cgroup_event(event)) - perf_detach_cgroup(event); - if (event->destroy) - event->destroy(event); - module_put(pmu->module); -err_ns: - if (event->hw.target) - put_task_struct(event->hw.target); - call_rcu(&event->rcu_head, free_event_rcu); - +err: + __free_event(event); return ERR_PTR(err); } -- 2.51.0 From 8f4c4963d28349cbf1920ab71edea8276f6ac4c5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:14 +0100 Subject: [PATCH 06/16] perf/core: Simplify the perf_pmu_register() error path The error path of perf_pmu_register() is of course very similar to a subset of perf_pmu_unregister(). Extract this common part in perf_pmu_free() and simplify things. 
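The precondition for sharing one helper is that every teardown step must
tolerate state that was never set up; hence pmu_dev_alloc() now also clears
pmu->dev on failure. A commented sketch of the shared helper (mirroring the
hunk below, with comments added for illustration):

static void perf_pmu_free(struct pmu *pmu)
{
        /* free_percpu(NULL) is a no-op, so this is safe pre-allocation. */
        free_percpu(pmu->pmu_disable_count);

        /* Only tear down the device if it was actually created. */
        if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
                if (pmu->nr_addr_filters)
                        device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
                device_del(pmu->dev);
                put_device(pmu->dev);
        }

        free_percpu(pmu->cpu_pmu_context);
}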
[ mingo: Forward ported it ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Ravi Bangoria Link: https://lore.kernel.org/r/20241104135518.090915501@infradead.org --- kernel/events/core.c | 67 ++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 37 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 1b8b1c8f41ff..ee5cdd692383 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11675,11 +11675,6 @@ static int perf_event_idx_default(struct perf_event *event) return 0; } -static void free_pmu_context(struct pmu *pmu) -{ - free_percpu(pmu->cpu_pmu_context); -} - /* * Let userspace know that this PMU supports address range filtering: */ @@ -11885,6 +11880,7 @@ del_dev: free_dev: put_device(pmu->dev); + pmu->dev = NULL; goto out; } @@ -11906,25 +11902,38 @@ static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new) return true; } +static void perf_pmu_free(struct pmu *pmu) +{ + free_percpu(pmu->pmu_disable_count); + if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); + device_del(pmu->dev); + put_device(pmu->dev); + } + free_percpu(pmu->cpu_pmu_context); +} + int perf_pmu_register(struct pmu *pmu, const char *name, int type) { int cpu, ret, max = PERF_TYPE_MAX; + pmu->type = -1; + mutex_lock(&pmus_lock); ret = -ENOMEM; pmu->pmu_disable_count = alloc_percpu(int); if (!pmu->pmu_disable_count) goto unlock; - pmu->type = -1; if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) { ret = -EINVAL; - goto free_pdc; + goto free; } if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { ret = -EINVAL; - goto free_pdc; + goto free; } pmu->name = name; @@ -11934,24 +11943,23 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) ret = idr_alloc(&pmu_idr, NULL, max, 0, GFP_KERNEL); if (ret < 0) - goto free_pdc; + goto free; WARN_ON(type >= 0 && ret != type); - type = ret; - pmu->type = type; + pmu->type = ret; atomic_set(&pmu->exclusive_cnt, 0); if (pmu_bus_running && !pmu->dev) { ret = pmu_dev_alloc(pmu); if (ret) - goto free_idr; + goto free; } ret = -ENOMEM; pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context); if (!pmu->cpu_pmu_context) - goto free_dev; + goto free; for_each_possible_cpu(cpu) { struct perf_cpu_pmu_context *cpc; @@ -11992,8 +12000,10 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) /* * Now that the PMU is complete, make it visible to perf_try_init_event(). 
*/ - if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu)) - goto free_context; + if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu)) { + ret = -EINVAL; + goto free; + } list_add_rcu(&pmu->entry, &pmus); ret = 0; @@ -12002,20 +12012,10 @@ unlock: return ret; -free_context: - free_percpu(pmu->cpu_pmu_context); - -free_dev: - if (pmu->dev && pmu->dev != PMU_NULL_DEV) { - device_del(pmu->dev); - put_device(pmu->dev); - } - -free_idr: - idr_remove(&pmu_idr, pmu->type); - -free_pdc: - free_percpu(pmu->pmu_disable_count); +free: + if (pmu->type >= 0) + idr_remove(&pmu_idr, pmu->type); + perf_pmu_free(pmu); goto unlock; } EXPORT_SYMBOL_GPL(perf_pmu_register); @@ -12034,14 +12034,7 @@ void perf_pmu_unregister(struct pmu *pmu) synchronize_srcu(&pmus_srcu); synchronize_rcu(); - free_percpu(pmu->pmu_disable_count); - if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { - if (pmu->nr_addr_filters) - device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); - device_del(pmu->dev); - put_device(pmu->dev); - } - free_pmu_context(pmu); + perf_pmu_free(pmu); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); -- 2.51.0 From 6c8b0b835f003647e593c08331a4dd2150d5eb0e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:15 +0100 Subject: [PATCH 07/16] perf/core: Simplify perf_pmu_register() Using the previously introduced perf_pmu_free() and a new IDR helper, simplify the perf_pmu_register error paths. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Ravi Bangoria Link: https://lore.kernel.org/r/20241104135518.198937277@infradead.org --- include/linux/idr.h | 17 +++++++++++ kernel/events/core.c | 71 ++++++++++++++++++-------------------------- 2 files changed, 46 insertions(+), 42 deletions(-) diff --git a/include/linux/idr.h b/include/linux/idr.h index da5f5fa4a3a6..cd729be369b3 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -15,6 +15,7 @@ #include #include #include +#include struct idr { struct radix_tree_root idr_rt; @@ -124,6 +125,22 @@ void *idr_get_next_ul(struct idr *, unsigned long *nextid); void *idr_replace(struct idr *, void *, unsigned long id); void idr_destroy(struct idr *); +struct __class_idr { + struct idr *idr; + int id; +}; + +#define idr_null ((struct __class_idr){ NULL, -1 }) +#define take_idr_id(id) __get_and_null(id, idr_null) + +DEFINE_CLASS(idr_alloc, struct __class_idr, + if (_T.id >= 0) idr_remove(_T.idr, _T.id), + ((struct __class_idr){ + .idr = idr, + .id = idr_alloc(idr, ptr, start, end, gfp), + }), + struct idr *idr, void *ptr, int start, int end, gfp_t gfp); + /** * idr_init_base() - Initialise an IDR. * @idr: IDR handle. 
diff --git a/kernel/events/core.c b/kernel/events/core.c index ee5cdd692383..215dad53aa1b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11914,52 +11914,49 @@ static void perf_pmu_free(struct pmu *pmu) free_percpu(pmu->cpu_pmu_context); } -int perf_pmu_register(struct pmu *pmu, const char *name, int type) +DEFINE_FREE(pmu_unregister, struct pmu *, if (_T) perf_pmu_free(_T)) + +int perf_pmu_register(struct pmu *_pmu, const char *name, int type) { - int cpu, ret, max = PERF_TYPE_MAX; + int cpu, max = PERF_TYPE_MAX; - pmu->type = -1; + struct pmu *pmu __free(pmu_unregister) = _pmu; + guard(mutex)(&pmus_lock); - mutex_lock(&pmus_lock); - ret = -ENOMEM; pmu->pmu_disable_count = alloc_percpu(int); if (!pmu->pmu_disable_count) - goto unlock; + return -ENOMEM; - if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) { - ret = -EINVAL; - goto free; - } + if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) + return -EINVAL; - if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { - ret = -EINVAL; - goto free; - } + if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, + "Can not register a pmu with an invalid scope.\n")) + return -EINVAL; pmu->name = name; if (type >= 0) max = type; - ret = idr_alloc(&pmu_idr, NULL, max, 0, GFP_KERNEL); - if (ret < 0) - goto free; + CLASS(idr_alloc, pmu_type)(&pmu_idr, NULL, max, 0, GFP_KERNEL); + if (pmu_type.id < 0) + return pmu_type.id; - WARN_ON(type >= 0 && ret != type); + WARN_ON(type >= 0 && pmu_type.id != type); - pmu->type = ret; + pmu->type = pmu_type.id; atomic_set(&pmu->exclusive_cnt, 0); if (pmu_bus_running && !pmu->dev) { - ret = pmu_dev_alloc(pmu); + int ret = pmu_dev_alloc(pmu); if (ret) - goto free; + return ret; } - ret = -ENOMEM; pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context); if (!pmu->cpu_pmu_context) - goto free; + return -ENOMEM; for_each_possible_cpu(cpu) { struct perf_cpu_pmu_context *cpc; @@ -12000,32 +11997,22 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) /* * Now that the PMU is complete, make it visible to perf_try_init_event(). */ - if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu)) { - ret = -EINVAL; - goto free; - } + if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu)) + return -EINVAL; list_add_rcu(&pmu->entry, &pmus); - ret = 0; -unlock: - mutex_unlock(&pmus_lock); - - return ret; - -free: - if (pmu->type >= 0) - idr_remove(&pmu_idr, pmu->type); - perf_pmu_free(pmu); - goto unlock; + take_idr_id(pmu_type); + _pmu = no_free_ptr(pmu); // let it rip + return 0; } EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { - mutex_lock(&pmus_lock); - list_del_rcu(&pmu->entry); - idr_remove(&pmu_idr, pmu->type); - mutex_unlock(&pmus_lock); + scoped_guard (mutex, &pmus_lock) { + list_del_rcu(&pmu->entry); + idr_remove(&pmu_idr, pmu->type); + } /* * We dereference the pmu list under both SRCU and regular RCU, so -- 2.51.0 From caf8b765d453198d4ca5305d9e207535934b6e3b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:16 +0100 Subject: [PATCH 08/16] perf/core: Simplify perf_init_event() Use the guard() and scoped_guard() infrastructure to simplify the control flow. 
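For readers unfamiliar with <linux/cleanup.h>: guard(type)(lock) acquires the
lock and releases it automatically when the enclosing scope is left (including
via an early return), and scoped_guard(type, lock) { ... } does the same for
just the following block. A minimal sketch of the idiom as used here -- the
helper name is made up for illustration:

static bool pmu_type_is_registered(int type)
{
        /* srcu_read_lock() now, srcu_read_unlock() when the function returns */
        guard(srcu)(&pmus_srcu);

        /* rcu_read_lock()/rcu_read_unlock() around just this block */
        scoped_guard (rcu) {
                if (idr_find(&pmu_idr, type))
                        return true;
        }

        return false;
}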
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Ravi Bangoria Cc: Thomas Gleixner Link: https://lore.kernel.org/r/20241104135518.302444446@infradead.org --- kernel/events/core.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 215dad53aa1b..fd352361259c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -12101,10 +12101,10 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) static struct pmu *perf_init_event(struct perf_event *event) { bool extended_type = false; - int idx, type, ret; struct pmu *pmu; + int type, ret; - idx = srcu_read_lock(&pmus_srcu); + guard(srcu)(&pmus_srcu); /* * Save original type before calling pmu->event_init() since certain @@ -12117,7 +12117,7 @@ static struct pmu *perf_init_event(struct perf_event *event) pmu = event->parent->pmu; ret = perf_try_init_event(pmu, event); if (!ret) - goto unlock; + return pmu; } /* @@ -12136,13 +12136,12 @@ static struct pmu *perf_init_event(struct perf_event *event) } again: - rcu_read_lock(); - pmu = idr_find(&pmu_idr, type); - rcu_read_unlock(); + scoped_guard (rcu) + pmu = idr_find(&pmu_idr, type); if (pmu) { if (event->attr.type != type && type != PERF_TYPE_RAW && !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) - goto fail; + return ERR_PTR(-ENOENT); ret = perf_try_init_event(pmu, event); if (ret == -ENOENT && event->attr.type != type && !extended_type) { @@ -12151,27 +12150,21 @@ again: } if (ret) - pmu = ERR_PTR(ret); + return ERR_PTR(ret); - goto unlock; + return pmu; } list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { ret = perf_try_init_event(pmu, event); if (!ret) - goto unlock; + return pmu; - if (ret != -ENOENT) { - pmu = ERR_PTR(ret); - goto unlock; - } + if (ret != -ENOENT) + return ERR_PTR(ret); } -fail: - pmu = ERR_PTR(-ENOENT); -unlock: - srcu_read_unlock(&pmus_srcu, idx); - return pmu; + return ERR_PTR(-ENOENT); } static void attach_sb_event(struct perf_event *event) -- 2.51.0 From 8f2221f52eced88e74c7ae22b4b2d67dc7a96bd2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:17 +0100 Subject: [PATCH 09/16] perf/core: Simplify perf_event_alloc() Using the previous simplifications, transition perf_event_alloc() to the cleanup way of things -- reducing error path magic. [ mingo: Ported it to recent kernels. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Ravi Bangoria Link: https://lore.kernel.org/r/20241104135518.410755241@infradead.org --- kernel/events/core.c | 59 +++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 37 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index fd352361259c..348a379d4f05 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5410,6 +5410,8 @@ static void __free_event(struct perf_event *event) call_rcu(&event->rcu_head, free_event_rcu); } +DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T)) + /* vs perf_event_alloc() success */ static void _free_event(struct perf_event *event) { @@ -12291,7 +12293,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, void *context, int cgroup_fd) { struct pmu *pmu; - struct perf_event *event; struct hw_perf_event *hwc; long err = -EINVAL; int node; @@ -12306,8 +12307,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } node = (cpu >= 0) ? 
cpu_to_node(cpu) : -1; - event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, - node); + struct perf_event *event __free(__free_event) = + kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node); if (!event) return ERR_PTR(-ENOMEM); @@ -12414,65 +12415,53 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * See perf_output_read(). */ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) - goto err; + return ERR_PTR(-EINVAL); if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; pmu = perf_init_event(event); - if (IS_ERR(pmu)) { - err = PTR_ERR(pmu); - goto err; - } + if (IS_ERR(pmu)) + return (void*)pmu; /* * Disallow uncore-task events. Similarly, disallow uncore-cgroup * events (they don't make sense as the cgroup will be different * on other CPUs in the uncore mask). */ - if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) { - err = -EINVAL; - goto err; - } + if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) + return ERR_PTR(-EINVAL); if (event->attr.aux_output && (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || - event->attr.aux_pause || event->attr.aux_resume)) { - err = -EOPNOTSUPP; - goto err; - } + event->attr.aux_pause || event->attr.aux_resume)) + return ERR_PTR(-EOPNOTSUPP); - if (event->attr.aux_pause && event->attr.aux_resume) { - err = -EINVAL; - goto err; - } + if (event->attr.aux_pause && event->attr.aux_resume) + return ERR_PTR(-EINVAL); if (event->attr.aux_start_paused) { - if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) { - err = -EOPNOTSUPP; - goto err; - } + if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) + return ERR_PTR(-EOPNOTSUPP); event->hw.aux_paused = 1; } if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) - goto err; + return ERR_PTR(err); } err = exclusive_event_init(event); if (err) - goto err; + return ERR_PTR(err); if (has_addr_filter(event)) { event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, sizeof(struct perf_addr_filter_range), GFP_KERNEL); - if (!event->addr_filter_ranges) { - err = -ENOMEM; - goto err; - } + if (!event->addr_filter_ranges) + return ERR_PTR(-ENOMEM); /* * Clone the parent's vma offsets: they are valid until exec() @@ -12496,23 +12485,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(attr->sample_max_stack); if (err) - goto err; + return ERR_PTR(err); event->attach_state |= PERF_ATTACH_CALLCHAIN; } } err = security_perf_event_alloc(event); if (err) - goto err; + return ERR_PTR(err); /* symmetric to unaccount_event() in _free_event() */ account_event(event); - return event; - -err: - __free_event(event); - return ERR_PTR(err); + return_ptr(event); } static int perf_copy_attr(struct perf_event_attr __user *uattr, -- 2.51.0 From 4baeb0687abf5eca3f7ab8b147c27cce82ec49ea Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:18 +0100 Subject: [PATCH 10/16] perf/core: Merge struct pmu::pmu_disable_count into struct perf_cpu_pmu_context::pmu_disable_count Because it makes no sense to have two per-cpu allocations per pmu. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Ravi Bangoria Link: https://lore.kernel.org/r/20241104135518.518730578@infradead.org --- include/linux/perf_event.h | 2 +- kernel/events/core.c | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8c0117bcbdb9..5f293e679ab6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -343,7 +343,6 @@ struct pmu { */ unsigned int scope; - int __percpu *pmu_disable_count; struct perf_cpu_pmu_context __percpu *cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; @@ -1031,6 +1030,7 @@ struct perf_cpu_pmu_context { int active_oncpu; int exclusive; + int pmu_disable_count; raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; diff --git a/kernel/events/core.c b/kernel/events/core.c index 348a379d4f05..8321b719b6ab 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1219,21 +1219,22 @@ static int perf_mux_hrtimer_restart_ipi(void *arg) void perf_pmu_disable(struct pmu *pmu) { - int *count = this_cpu_ptr(pmu->pmu_disable_count); + int *count = &this_cpu_ptr(pmu->cpu_pmu_context)->pmu_disable_count; if (!(*count)++) pmu->pmu_disable(pmu); } void perf_pmu_enable(struct pmu *pmu) { - int *count = this_cpu_ptr(pmu->pmu_disable_count); + int *count = &this_cpu_ptr(pmu->cpu_pmu_context)->pmu_disable_count; if (!--(*count)) pmu->pmu_enable(pmu); } static void perf_assert_pmu_disabled(struct pmu *pmu) { - WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); + int *count = &this_cpu_ptr(pmu->cpu_pmu_context)->pmu_disable_count; + WARN_ON_ONCE(*count == 0); } static inline void perf_pmu_read(struct perf_event *event) @@ -11906,7 +11907,6 @@ static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new) static void perf_pmu_free(struct pmu *pmu) { - free_percpu(pmu->pmu_disable_count); if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { if (pmu->nr_addr_filters) device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); @@ -11925,10 +11925,6 @@ int perf_pmu_register(struct pmu *_pmu, const char *name, int type) struct pmu *pmu __free(pmu_unregister) = _pmu; guard(mutex)(&pmus_lock); - pmu->pmu_disable_count = alloc_percpu(int); - if (!pmu->pmu_disable_count) - return -ENOMEM; - if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) return -EINVAL; -- 2.51.0 From b2996f56556e389a13377158904c218da6fffa91 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:19 +0100 Subject: [PATCH 11/16] perf/core: Add this_cpc() helper As a preparation for adding yet another indirection. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Ravi Bangoria Link: https://lore.kernel.org/r/20241104135518.650051565@infradead.org --- kernel/events/core.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 8321b719b6ab..0c7015f3ec03 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1217,23 +1217,28 @@ static int perf_mux_hrtimer_restart_ipi(void *arg) return perf_mux_hrtimer_restart(arg); } +static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu) +{ + return this_cpu_ptr(pmu->cpu_pmu_context); +} + void perf_pmu_disable(struct pmu *pmu) { - int *count = &this_cpu_ptr(pmu->cpu_pmu_context)->pmu_disable_count; + int *count = &this_cpc(pmu)->pmu_disable_count; if (!(*count)++) pmu->pmu_disable(pmu); } void perf_pmu_enable(struct pmu *pmu) { - int *count = &this_cpu_ptr(pmu->cpu_pmu_context)->pmu_disable_count; + int *count = &this_cpc(pmu)->pmu_disable_count; if (!--(*count)) pmu->pmu_enable(pmu); } static void perf_assert_pmu_disabled(struct pmu *pmu) { - int *count = &this_cpu_ptr(pmu->cpu_pmu_context)->pmu_disable_count; + int *count = &this_cpc(pmu)->pmu_disable_count; WARN_ON_ONCE(*count == 0); } @@ -2355,7 +2360,7 @@ static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; // XXX cpc serialization, probably per-cpu IRQ disabled @@ -2496,9 +2501,8 @@ __perf_remove_from_context(struct perf_event *event, pmu_ctx->rotate_necessary = 0; if (ctx->task && ctx->is_active) { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu); - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } @@ -2636,7 +2640,7 @@ static int event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); int ret = 0; WARN_ON_ONCE(event->ctx != ctx); @@ -2743,7 +2747,7 @@ error: static int group_can_go_on(struct perf_event *event, int can_add_hw) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); /* * Groups consisting entirely of software events can always go on. 
@@ -3366,9 +3370,8 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, struct pmu *pmu = pmu_ctx->pmu; if (ctx->task && !(ctx->is_active & EVENT_ALL)) { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); - cpc = this_cpu_ptr(pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } @@ -3615,7 +3618,7 @@ static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in struct perf_cpu_pmu_context *cpc; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + cpc = this_cpc(pmu_ctx->pmu); if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) pmu_ctx->pmu->sched_task(pmu_ctx, sched_in); @@ -3724,7 +3727,7 @@ static DEFINE_PER_CPU(int, perf_sched_cb_usages); void perf_sched_cb_dec(struct pmu *pmu) { - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); this_cpu_dec(perf_sched_cb_usages); barrier(); @@ -3736,7 +3739,7 @@ void perf_sched_cb_dec(struct pmu *pmu) void perf_sched_cb_inc(struct pmu *pmu) { - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); if (!cpc->sched_cb_usage++) list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); @@ -3853,7 +3856,7 @@ static void __link_epc(struct perf_event_pmu_context *pmu_ctx) if (!pmu_ctx->ctx->task) return; - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + cpc = this_cpc(pmu_ctx->pmu); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = pmu_ctx; } @@ -3982,10 +3985,9 @@ static int merge_sched_in(struct perf_event *event, void *data) perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } else { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu); event->pmu_ctx->rotate_necessary = 1; - cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context); perf_mux_hrtimer_restart(cpc); group_update_userpage(event); } -- 2.51.0 From adc38b4ca1ed25ed2f1300e4d87c483bf51bfd50 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:21 +0100 Subject: [PATCH 12/16] perf/core: Introduce perf_free_addr_filters() Replace _free_event()'s use of perf_addr_filters_splice()s use with an explicit perf_free_addr_filters() with the explicit propery that it is able to be called a second time without ill effect. Most notable, referencing event->pmu must be avoided when there are no filters left (from eg a previous call). 
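Concretely, the wrapper added below bails out based on its own list state
before anything downstream can look at event->pmu (the splice path goes
through has_addr_filter(), which dereferences event->pmu). Roughly:

static void perf_free_addr_filters(struct perf_event *event)
{
        /*
         * Nothing was ever installed, or an earlier call already emptied
         * the list: return before event->pmu is touched.
         */
        if (list_empty(&event->addr_filters.list))
                return;

        perf_addr_filters_splice(event, NULL);
}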
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Ravi Bangoria Link: https://lore.kernel.org/r/20241104135518.868460518@infradead.org --- kernel/events/core.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 0c7015f3ec03..525c64ee7925 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5339,8 +5339,7 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } -static void perf_addr_filters_splice(struct perf_event *event, - struct list_head *head); +static void perf_free_addr_filters(struct perf_event *event); static void perf_pending_task_sync(struct perf_event *event) { @@ -5439,7 +5438,7 @@ static void _free_event(struct perf_event *event) } perf_event_free_bpf_prog(event); - perf_addr_filters_splice(event, NULL); + perf_free_addr_filters(event); __free_event(event); } @@ -11004,6 +11003,17 @@ static void perf_addr_filters_splice(struct perf_event *event, free_filters_list(&list); } +static void perf_free_addr_filters(struct perf_event *event) +{ + /* + * Used during free paths, there is no concurrency. + */ + if (list_empty(&event->addr_filters.list)) + return; + + perf_addr_filters_splice(event, NULL); +} + /* * Scan through mm's vmas and see if one of them matches the * @filter; if so, adjust filter's address range. -- 2.51.0 From c5b96789575b670b1e776071bb243e0ed3d3abaa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:22 +0100 Subject: [PATCH 13/16] perf/bpf: Robustify perf_event_free_bpf_prog() Ensure perf_event_free_bpf_prog() is safe to call a second time; notably without making any references to event->pmu when there is no prog left. Note: perf_event_detach_bpf_prog() might leave a stale event->prog Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Ravi Bangoria Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Andrii Nakryiko Link: https://lore.kernel.org/r/20241104135518.978956692@infradead.org --- kernel/events/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index 525c64ee7925..ab4e497087da 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10905,6 +10905,9 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, void perf_event_free_bpf_prog(struct perf_event *event) { + if (!event->prog) + return; + if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); return; -- 2.51.0 From 954878377bc81459b95937a05f01e8ebf6a05083 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:23 +0100 Subject: [PATCH 14/16] perf/core: Simplify the perf_mmap() control flow Identity-transform: if (c) { X1; } else { Y; goto l; } X2; l: into the simpler: if (c) { X1; X2; } else { Y; } [ mingo: Forward ported it ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Ravi Bangoria Link: https://lore.kernel.org/r/20241104135519.095904637@infradead.org --- kernel/events/core.c | 75 +++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index ab4e497087da..d1b04c850881 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6701,6 +6701,42 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_pgoff == 0) { nr_pages = (vma_size / PAGE_SIZE) - 1; + + /* + * If we have rb pages ensure they're a power-of-two number, so we + * 
can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; + + if (vma_size != PAGE_SIZE * (1 + nr_pages)) + return -EINVAL; + + WARN_ON_ONCE(event->ctx->parent_ctx); +again: + mutex_lock(&event->mmap_mutex); + if (event->rb) { + if (data_page_nr(event->rb) != nr_pages) { + ret = -EINVAL; + goto unlock; + } + + if (!atomic_inc_not_zero(&event->rb->mmap_count)) { + /* + * Raced against perf_mmap_close(); remove the + * event and try again. + */ + ring_buffer_attach(event, NULL); + mutex_unlock(&event->mmap_mutex); + goto again; + } + + /* We need the rb to map pages. */ + rb = event->rb; + goto unlock; + } + + user_extra = nr_pages + 1; } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already @@ -6760,47 +6796,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) atomic_set(&rb->aux_mmap_count, 1); user_extra = nr_pages; - - goto accounting; - } - - /* - * If we have rb pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - return -EINVAL; - - if (vma_size != PAGE_SIZE * (1 + nr_pages)) - return -EINVAL; - - WARN_ON_ONCE(event->ctx->parent_ctx); -again: - mutex_lock(&event->mmap_mutex); - if (event->rb) { - if (data_page_nr(event->rb) != nr_pages) { - ret = -EINVAL; - goto unlock; - } - - if (!atomic_inc_not_zero(&event->rb->mmap_count)) { - /* - * Raced against perf_mmap_close(); remove the - * event and try again. - */ - ring_buffer_attach(event, NULL); - mutex_unlock(&event->mmap_mutex); - goto again; - } - - /* We need the rb to map pages. */ - rb = event->rb; - goto unlock; } - user_extra = nr_pages + 1; - -accounting: user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* -- 2.51.0 From 0c8a4e4139adf09b27fb910edbc596ea2d31a5db Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:25 +0100 Subject: [PATCH 15/16] perf/core: Further simplify perf_mmap() Perform CSE and such. 
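Concretely, the common subexpressions are the page count and its sanity
checks, previously scattered across the rb and AUX branches, now computed
once up front (simplified excerpt of the effect, per the diff below):

        nr_pages = vma_size / PAGE_SIZE;        /* once, for both mapping types */
        if (nr_pages > INT_MAX)
                return -ENOMEM;
        if (vma_size != PAGE_SIZE * nr_pages)
                return -EINVAL;
        user_extra = nr_pages;

        if (vma->vm_pgoff == 0)
                nr_pages -= 1;                  /* rb: first page is the user/control page */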
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Ravi Bangoria Link: https://lore.kernel.org/r/20241104135519.354909594@infradead.org --- kernel/events/core.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index d1b04c850881..4cd3494c65e2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6698,9 +6698,18 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return ret; vma_size = vma->vm_end - vma->vm_start; + nr_pages = vma_size / PAGE_SIZE; + + if (nr_pages > INT_MAX) + return -ENOMEM; + + if (vma_size != PAGE_SIZE * nr_pages) + return -EINVAL; + + user_extra = nr_pages; if (vma->vm_pgoff == 0) { - nr_pages = (vma_size / PAGE_SIZE) - 1; + nr_pages -= 1; /* * If we have rb pages ensure they're a power-of-two number, so we @@ -6709,9 +6718,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (nr_pages != 0 && !is_power_of_2(nr_pages)) return -EINVAL; - if (vma_size != PAGE_SIZE * (1 + nr_pages)) - return -EINVAL; - WARN_ON_ONCE(event->ctx->parent_ctx); again: mutex_lock(&event->mmap_mutex); @@ -6735,8 +6741,6 @@ again: rb = event->rb; goto unlock; } - - user_extra = nr_pages + 1; } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already @@ -6748,10 +6752,6 @@ again: if (!event->rb) return -EINVAL; - nr_pages = vma_size / PAGE_SIZE; - if (nr_pages > INT_MAX) - return -ENOMEM; - mutex_lock(&event->mmap_mutex); ret = -EINVAL; @@ -6795,7 +6795,6 @@ again: } atomic_set(&rb->aux_mmap_count, 1); - user_extra = nr_pages; } user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); -- 2.51.0 From 8eaec7bb723c9a0addfc0457e2f28e41735607af Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:26 +0100 Subject: [PATCH 16/16] perf/core: Remove retry loop from perf_mmap() AFAICT there is no actual benefit from the mutex drop on re-try. The 'worst' case scenario is that we instantly re-gain the mutex without perf_mmap_close() getting it. So might as well make that the normal case. Reflow the code to make the ring buffer detach case naturally flow into the no ring buffer case. [ mingo: Forward ported it ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Ravi Bangoria Link: https://lore.kernel.org/r/20241104135519.463607258@infradead.org --- kernel/events/core.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4cd3494c65e2..ca4c1242c29b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6719,28 +6719,33 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; WARN_ON_ONCE(event->ctx->parent_ctx); -again: mutex_lock(&event->mmap_mutex); + if (event->rb) { if (data_page_nr(event->rb) != nr_pages) { ret = -EINVAL; goto unlock; } - if (!atomic_inc_not_zero(&event->rb->mmap_count)) { + if (atomic_inc_not_zero(&event->rb->mmap_count)) { /* - * Raced against perf_mmap_close(); remove the - * event and try again. + * Success -- managed to mmap() the same buffer + * multiple times. */ - ring_buffer_attach(event, NULL); - mutex_unlock(&event->mmap_mutex); - goto again; + ret = 0; + /* We need the rb to map pages. */ + rb = event->rb; + goto unlock; } - /* We need the rb to map pages. 
*/ - rb = event->rb; - goto unlock; + /* + * Raced against perf_mmap_close()'s + * atomic_dec_and_mutex_lock() remove the + * event and continue as if !event->rb + */ + ring_buffer_attach(event, NULL); } + } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already -- 2.51.0