From 30d68cb0c37ebe2dc63aa1d46a28b9163e61caa2 Mon Sep 17 00:00:00 2001 From: Frederick Lawler Date: Thu, 27 Mar 2025 11:09:11 -0500 Subject: [PATCH 01/16] ima: process_measurement() needlessly takes inode_lock() on MAY_READ On IMA policy update, if a measure rule exists in the policy, IMA_MEASURE is set for ima_policy_flags which makes the violation_check variable always true. Coupled with a no-action on MAY_READ for a FILE_CHECK call, we're always taking the inode_lock(). This becomes a performance problem for extremely heavy read-only workloads. Therefore, prevent this only in the case there's no action to be taken. Signed-off-by: Frederick Lawler Acked-by: Roberto Sassu Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index f3e7ac513db3..f99ab1a3b0f0 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -245,7 +245,9 @@ static int process_measurement(struct file *file, const struct cred *cred, &allowed_algos); violation_check = ((func == FILE_CHECK || func == MMAP_CHECK || func == MMAP_CHECK_REQPROT) && - (ima_policy_flag & IMA_MEASURE)); + (ima_policy_flag & IMA_MEASURE) && + ((action & IMA_MEASURE) || + (file->f_mode & FMODE_WRITE))); if (!action && !violation_check) return 0; -- 2.51.0 From 0db61388b389f43c1ba2f1cee3613feb4fd12150 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 22 Apr 2025 15:33:18 -0700 Subject: [PATCH 02/16] perf/core: Change to POLLERR for pinned events with error Commit: f4b07fd62d4d11d5 ("perf/core: Use POLLHUP for pinned events in error") started to emit POLLHUP for pinned events in an error state. But the POLLHUP is also used to signal events that the attached task is terminated. To distinguish pinned per-task events in the error state it would need to check if the task is live. Change it to POLLERR to make it clear. 
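As a hedged userspace illustration (not part of the patch): after this change a monitoring tool can tell the two conditions apart directly from poll(2). Here 'fd' is assumed to come from perf_event_open() with attr.pinned = 1.

  #include <poll.h>
  #include <stdio.h>

  static void wait_for_pinned_event(int fd)
  {
  	struct pollfd pfd = { .fd = fd, .events = POLLIN };

  	if (poll(&pfd, 1, -1) < 0)
  		return;

  	if (pfd.revents & POLLERR)
  		fprintf(stderr, "pinned event entered an error state\n");	/* new behavior */
  	else if (pfd.revents & POLLHUP)
  		fprintf(stderr, "monitored task terminated\n");
  	else if (pfd.revents & POLLIN)
  		fprintf(stderr, "samples available\n");
  }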
Suggested-by: Gabriel Marin Signed-off-by: Namhyung Kim Signed-off-by: Ingo Molnar Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250422223318.180343-1-namhyung@kernel.org --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index e93c19565914..95e703891b24 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3943,7 +3943,7 @@ static int merge_sched_in(struct perf_event *event, void *data) perf_event_set_state(event, PERF_EVENT_STATE_ERROR); if (*perf_event_fasync(event)) - event->pending_kill = POLL_HUP; + event->pending_kill = POLL_ERR; perf_event_wakeup(event); } else { @@ -6075,7 +6075,7 @@ static __poll_t perf_poll(struct file *file, poll_table *wait) if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR && event->attr.pinned)) - return events; + return EPOLLERR; /* * Pin the event->rb by taking event->mmap_mutex; otherwise -- 2.51.0 From 1a97fea9db9e9b9c4839d4232dde9f505ff5b4cc Mon Sep 17 00:00:00 2001 From: Luo Gengkun Date: Wed, 23 Apr 2025 06:47:24 +0000 Subject: [PATCH 03/16] perf/x86: Fix non-sampling (counting) events on certain x86 platforms Perf doesn't work at perf stat for hardware events on certain x86 platforms: $perf stat -- sleep 1 Performance counter stats for 'sleep 1': 16.44 msec task-clock # 0.016 CPUs utilized 2 context-switches # 121.691 /sec 0 cpu-migrations # 0.000 /sec 54 page-faults # 3.286 K/sec cycles instructions branches branch-misses The reason is that the check in x86_pmu_hw_config() for sampling events is unexpectedly applied to counting events as well. It should only impact x86 platforms with limit_period used for non-PEBS events. For Intel platforms, it should only impact some older platforms, e.g., HSW, BDW and NHM. Fixes: 88ec7eedbbd2 ("perf/x86: Fix low freqency setting issue") Signed-off-by: Luo Gengkun Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Link: https://lore.kernel.org/r/20250423064724.3716211-1-luogengkun@huaweicloud.com --- arch/x86/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6866cc5acb0b..3a4f031d2f44 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -629,7 +629,7 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.type == event->pmu->type) event->hw.config |= x86_pmu_get_event_config(event); - if (!event->attr.freq && x86_pmu.limit_period) { + if (is_sampling_event(event) && !event->attr.freq && x86_pmu.limit_period) { s64 left = event->attr.sample_period; x86_pmu.limit_period(event, &left); if (left > event->attr.sample_period) -- 2.51.0 From 75aea4b0656ead0facd13d2aae4cb77326e53d2f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 24 Apr 2025 06:47:14 -0700 Subject: [PATCH 04/16] perf/x86/intel: Only check the group flag for X86 leader A warning in intel_pmu_lbr_counters_reorder() may be triggered by below perf command. perf record -e "{cpu-clock,cycles/call-graph="lbr"/}" -- sleep 1 It's because the group is mistakenly treated as a branch counter group. The hw.flags of the leader are used to determine whether a group is a branch counters group. However, the hw.flags is only available for a hardware event. The field to store the flags is a union type. 
For a software event, it's a hrtimer. The corresponding bit may be set if the leader is a software event. For a branch counter group and other groups that have a group flag (e.g., topdown, PEBS counters snapshotting, and ACR), the leader must be a X86 event. Check the X86 event before checking the flag. The patch only fixes the issue for the branch counter group. The following patch will fix the other groups. There may be an alternative way to fix the issue by moving the hw.flags out of the union type. It should work for now. But it's still possible that the flags will be used by other types of events later. As long as that type of event is used as a leader, a similar issue will be triggered. So the alternative way is dropped. Fixes: 33744916196b ("perf/x86/intel: Support branch counters logging") Closes: https://lore.kernel.org/lkml/20250412091423.1839809-1-luogengkun@huaweicloud.com/ Reported-by: Luo Gengkun Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250424134718.311934-2-kan.liang@linux.intel.com --- arch/x86/events/core.c | 2 +- arch/x86/events/perf_event.h | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 3a4f031d2f44..139ad80d1df3 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -754,7 +754,7 @@ void x86_pmu_enable_all(int added) } } -static inline int is_x86_event(struct perf_event *event) +int is_x86_event(struct perf_event *event) { int i; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2c0ce0e9545e..4237c379cdc5 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -110,9 +110,16 @@ static inline bool is_topdown_event(struct perf_event *event) return is_metric_event(event) || is_slots_event(event); } +int is_x86_event(struct perf_event *event); + +static inline bool check_leader_group(struct perf_event *leader, int flags) +{ + return is_x86_event(leader) ? !!(leader->hw.flags & flags) : false; +} + static inline bool is_branch_counters_group(struct perf_event *event) { - return event->group_leader->hw.flags & PERF_X86_EVENT_BRANCH_COUNTERS; + return check_leader_group(event->group_leader, PERF_X86_EVENT_BRANCH_COUNTERS); } static inline bool is_pebs_counter_event_group(struct perf_event *event) -- 2.51.0 From e9988ad7b1744991118ac348a804f9395368a284 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 24 Apr 2025 06:47:15 -0700 Subject: [PATCH 05/16] perf/x86/intel: Check the X86 leader for pebs_counter_event_group The PEBS counters snapshotting group also requires a group flag in the leader. The leader must be a X86 event. 
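To illustrate why the leader's hw.flags cannot be trusted for a software leader, here is a deliberately simplified, standalone sketch; it is not the real struct hw_perf_event layout, only a stand-in for the union described above.

  #include <stdio.h>
  #include <string.h>

  /* Simplified stand-in: the hardware view's flags share storage with the
   * hrtimer state used by software events. */
  struct fake_hw_event {
  	union {
  		struct { unsigned long flags; } hw;	/* hardware-event view */
  		struct { void *timer_fn; } hrtimer;	/* software-event view */
  	};
  };

  int main(void)
  {
  	struct fake_hw_event ev;

  	memset(&ev, 0, sizeof(ev));
  	ev.hrtimer.timer_fn = (void *)0x5a5a5a5aUL;	/* software-event state */

  	/* Interpreting the same bytes as hardware flags yields bogus bits. */
  	printf("bogus flags: %#lx\n", ev.hw.flags);
  	return 0;
  }

Guarding the flag test with is_x86_event(), as check_leader_group() does in the previous patch, avoids interpreting those bytes at all.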
Fixes: e02e9b0374c3 ("perf/x86/intel: Support PEBS counters snapshotting") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250424134718.311934-3-kan.liang@linux.intel.com --- arch/x86/events/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 4237c379cdc5..46d120597bab 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -124,7 +124,7 @@ static inline bool is_branch_counters_group(struct perf_event *event) static inline bool is_pebs_counter_event_group(struct perf_event *event) { - return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR; + return check_leader_group(event->group_leader, PERF_X86_EVENT_PEBS_CNTR); } struct amd_nb { -- 2.51.0 From 7da9960b59fb7e590eb8538c9428db55a4ea2d23 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 24 Apr 2025 06:47:18 -0700 Subject: [PATCH 06/16] perf/x86/intel/ds: Fix counter backwards of non-precise events counters-snapshotting The counter backwards may be observed in the PMI handler when counters-snapshotting some non-precise events in the freq mode. For the non-precise events, it's possible the counters-snapshotting records a positive value for an overflowed PEBS event. Then the HW auto-reload mechanism reset the counter to 0 immediately. Because the pebs_event_reset is cleared in the freq mode, which doesn't set the PERF_X86_EVENT_AUTO_RELOAD. In the PMI handler, 0 will be read rather than the positive value recorded in the counters-snapshotting record. The counters-snapshotting case has to be specially handled. Since the event value has been updated when processing the counters-snapshotting record, only needs to set the new period for the counter via x86_pmu_set_period(). Fixes: e02e9b0374c3 ("perf/x86/intel: Support PEBS counters snapshotting") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250424134718.311934-6-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 18c3ab579b8b..9b20acc0e932 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2379,8 +2379,25 @@ __intel_pmu_pebs_last_event(struct perf_event *event, */ intel_pmu_save_and_restart_reload(event, count); } - } else - intel_pmu_save_and_restart(event); + } else { + /* + * For a non-precise event, it's possible the + * counters-snapshotting records a positive value for the + * overflowed event. Then the HW auto-reload mechanism + * reset the counter to 0 immediately, because the + * pebs_event_reset is cleared if the PERF_X86_EVENT_AUTO_RELOAD + * is not set. The counter backwards may be observed in a + * PMI handler. + * + * Since the event value has been updated when processing the + * counters-snapshotting record, only needs to set the new + * period for the counter. + */ + if (is_pebs_counter_event_group(event)) + static_call(x86_pmu_set_period)(event); + else + intel_pmu_save_and_restart(event); + } } static __always_inline void -- 2.51.0 From f51972e6f8b9a737b2b3eb588069acb538fa72de Mon Sep 17 00:00:00 2001 From: Qing Wang Date: Sat, 5 Apr 2025 22:16:35 +0800 Subject: [PATCH 07/16] perf/core: Fix broken throttling when max_samples_per_tick=1 According to the throttling mechanism, the pmu interrupts number can not exceed the max_samples_per_tick in one tick. 
But this mechanism is ineffective when max_samples_per_tick=1, because the throttling check is skipped during the first interrupt and only performed when the second interrupt arrives. Perhaps this bug may cause little influence in one tick, but if in a larger time scale, the problem can not be underestimated. When max_samples_per_tick = 1: Allowed-interrupts-per-second max-samples-per-second default-HZ ARCH 200 100 100 X86 500 250 250 ARM64 ... Obviously, the pmu interrupt number far exceed the user's expect. Fixes: e050e3f0a71b ("perf: Fix broken interrupt rate throttling") Signed-off-by: Qing Wang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250405141635.243786-3-wangqing7171@gmail.com --- kernel/events/core.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 3c69a1a3f41c..05136e835042 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10065,14 +10065,14 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle) hwc->interrupts = 1; } else { hwc->interrupts++; - if (unlikely(throttle && - hwc->interrupts > max_samples_per_tick)) { - __this_cpu_inc(perf_throttled_count); - tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(event, 0); - ret = 1; - } + } + + if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) { + __this_cpu_inc(perf_throttled_count); + tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(event, 0); + ret = 1; } if (event->attr.freq) { -- 2.51.0 From efd448540e6243dbdaf0a7e1bcf49734e73f3c93 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 24 Apr 2025 06:47:16 -0700 Subject: [PATCH 08/16] perf/x86/intel: Check the X86 leader for ACR group The auto counter reload group also requires a group flag in the leader. The leader must be a X86 event. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250424134718.311934-4-kan.liang@linux.intel.com --- arch/x86/events/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 9c5cab80866c..e8bce89821be 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -129,7 +129,7 @@ static inline bool is_pebs_counter_event_group(struct perf_event *event) static inline bool is_acr_event_group(struct perf_event *event) { - return event->group_leader->hw.flags & PERF_X86_EVENT_ACR; + return check_leader_group(event->group_leader, PERF_X86_EVENT_ACR); } struct amd_nb { -- 2.51.0 From 3e830f657f69ab6a4822d72ec2f364c6d51beef8 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 24 Apr 2025 06:47:17 -0700 Subject: [PATCH 09/16] perf/x86: Optimize the is_x86_event The current is_x86_event has to go through the hybrid_pmus list to find the matched pmu, then check if it's a X86 PMU and a X86 event. It's not necessary. The X86 PMU has a unique type ID on a non-hybrid machine, and a unique capability type. They are good enough to do the check. 
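As a standalone mock of that reasoning (the real change follows in the diff below; the constants mirror PERF_TYPE_RAW and PERF_PMU_CAP_EXTENDED_HW_TYPE, but the structs here are illustrative only):

  #include <stdbool.h>
  #include <stdio.h>

  #define MOCK_PERF_TYPE_RAW		4	/* mirrors PERF_TYPE_RAW */
  #define MOCK_CAP_EXTENDED_HW_TYPE	0x0100	/* mirrors PERF_PMU_CAP_EXTENDED_HW_TYPE */

  struct mock_pmu { int type; unsigned int capabilities; };
  struct mock_event { struct mock_pmu *pmu; };

  /* Identify an x86 event from its pmu's type/capabilities instead of
   * walking a hybrid pmu list. */
  static bool mock_is_x86_event(const struct mock_event *event)
  {
  	return event->pmu->type == MOCK_PERF_TYPE_RAW ||
  	       (event->pmu->capabilities & MOCK_CAP_EXTENDED_HW_TYPE);
  }

  int main(void)
  {
  	struct mock_pmu hybrid_cpu_pmu = { .type = 10, .capabilities = MOCK_CAP_EXTENDED_HW_TYPE };
  	struct mock_event ev = { .pmu = &hybrid_cpu_pmu };

  	printf("x86 event: %s\n", mock_is_x86_event(&ev) ? "yes" : "no");
  	return 0;
  }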
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250424134718.311934-5-kan.liang@linux.intel.com --- arch/x86/events/core.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index b2762f268dd0..92c3fb61f2d6 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -762,15 +762,16 @@ void x86_pmu_enable_all(int added) int is_x86_event(struct perf_event *event) { - int i; - - if (!is_hybrid()) - return event->pmu == &pmu; - - for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { - if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu) - return true; - } + /* + * For a non-hybrid platforms, the type of X86 pmu is + * always PERF_TYPE_RAW. + * For a hybrid platform, the PERF_PMU_CAP_EXTENDED_HW_TYPE + * is a unique capability for the X86 PMU. + * Use them to detect a X86 event. + */ + if (event->pmu->type == PERF_TYPE_RAW || + event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE) + return true; return false; } -- 2.51.0 From 22d38babb3adcb1227ecfb91d9423008a46548fe Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Apr 2025 18:11:25 +0200 Subject: [PATCH 10/16] perf: Fix failing inherit_event() doing extra refcount decrement on parent When inherit_event() fails after the child allocation but before the parent refcount has been incremented, calling put_event() wrongly decrements the reference to the parent, risking to free it too early. Also pmu_get_event() can't be holding a reference to the child concurrently at this point since it is under pmus_srcu critical section. Fix it with restoring the deleted free_event() function and call it on the failing child in order to free it directly under the verified assumption that its refcount is only 1. The refcount to the parent is then voluntarily omitted. Fixes: da916e96e2de ("perf: Make perf_pmu_unregister() useable") Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250424161128.29176-2-frederic@kernel.org --- kernel/events/core.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 05136e835042..882db7bca782 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5627,6 +5627,22 @@ static void _free_event(struct perf_event *event) __free_event(event); } +/* + * Used to free events which have a known refcount of 1, such as in error paths + * of inherited events. + */ +static void free_event(struct perf_event *event) +{ + if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, + "unexpected event refcount: %ld; ptr=%p\n", + atomic_long_read(&event->refcount), event)) { + /* leak to avoid use-after-free */ + return; + } + + _free_event(event); +} + /* * Remove user event from the owner task. 
*/ @@ -14184,7 +14200,7 @@ inherit_event(struct perf_event *parent_event, pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { - put_event(child_event); + free_event(child_event); return ERR_CAST(pmu_ctx); } child_event->pmu_ctx = pmu_ctx; @@ -14199,7 +14215,7 @@ inherit_event(struct perf_event *parent_event, if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { mutex_unlock(&parent_event->child_mutex); - put_event(child_event); + free_event(child_event); return NULL; } -- 2.51.0 From d20eb2d5fe8f8818abcfdadf5ac5109938f1318e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 28 Apr 2025 13:11:47 +0200 Subject: [PATCH 11/16] perf: Fix irq work dereferencing garbage The following commit: da916e96e2de ("perf: Make perf_pmu_unregister() useable") has introduced two significant event's parent lifecycle changes: 1) An event that has exited now has EVENT_TOMBSTONE as a parent. This can result in a situation where the delayed wakeup irq_work can accidentally dereference EVENT_TOMBSTONE on: CPU 0 CPU 1 ----- ----- __schedule() local_irq_disable() rq_lock() perf_event_overflow() irq_work_queue(&child->pending_irq) perf_event_task_sched_out() raw_spin_lock(&ctx->lock) ctx_sched_out() ctx->is_active = 0 event_sched_out(child) raw_spin_unlock(&ctx->lock) perf_event_release_kernel(parent) perf_remove_from_context(child) raw_spin_lock_irq(&ctx->lock) // Sees !ctx->is_active // Removes from context inline __perf_remove_from_context(child) perf_child_detach(child) event->parent = EVENT_TOMBSTONE raw_spin_rq_unlock_irq(rq); perf_pending_irq() perf_event_wakeup(child) ring_buffer_wakeup(child) rcu_dereference(child->parent->rb) <--- CRASH This also concerns the call to kill_fasync() on parent->fasync. 2) The final parent reference count decrement can now happen before the the final child reference count decrement. ie: the parent can now be freed before its child. On PREEMPT_RT, this can result in a situation where the delayed wakeup irq_work can accidentally dereference a freed parent: CPU 0 CPU 1 CPU 2 ----- ----- ------ perf_pmu_unregister() pmu_detach_events() pmu_get_event() atomic_long_inc_not_zero(&child->refcount) perf_event_overflow() irq_work_queue(&child->pending_irq); irq_work_run() wake_irq_workd() preempt_schedule_irq() =========> SWITCH to workd irq_work_run_list() perf_pending_irq() perf_event_wakeup(child) ring_buffer_wakeup(child) event = child->parent perf_event_release_kernel(parent) // Not last ref, PMU holds it put_event(child) // Last ref put_event(parent) free_event() call_rcu(...) rcu_core() free_event_rcu() rcu_dereference(event->rb) <--- CRASH This also concerns the call to kill_fasync() on parent->fasync. The "easy" solution to 1) is to check that event->parent is not EVENT_TOMBSTONE on perf_event_wakeup() (including both ring buffer and fasync uses). The "easy" solution to 2) is to turn perf_event_wakeup() to wholefully run under rcu_read_lock(). However because of 2), sanity would prescribe to make event::parent an __rcu pointer and annotate each and every users to prove they are reliable. Propose an alternate solution and restore the stable pointer to the parent until all its children have called _free_event() themselves to avoid any further accident. Also revert the EVENT_TOMBSTONE design that is mostly here to determine which caller of perf_event_exit_event() must perform the refcount decrement on a child event matching the increment in inherit_event(). 
Arrange instead for checking the attach state of an event prior to its removal and decrement the refcount of the child accordingly. Fixes: da916e96e2de ("perf: Make perf_pmu_unregister() useable") Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) --- kernel/events/core.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 882db7bca782..e0ca4a88beb5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -208,7 +208,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, } #define TASK_TOMBSTONE ((void *)-1L) -#define EVENT_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) { @@ -2338,12 +2337,6 @@ static void perf_child_detach(struct perf_event *event) sync_child_event(event); list_del_init(&event->child_list); - /* - * Cannot set to NULL, as that would confuse the situation vs - * not being a child event. See for example unaccount_event(). - */ - event->parent = EVENT_TOMBSTONE; - put_event(parent_event); } static bool is_orphaned_event(struct perf_event *event) @@ -5705,7 +5698,7 @@ static void put_event(struct perf_event *event) _free_event(event); /* Matches the refcount bump in inherit_event() */ - if (parent && parent != EVENT_TOMBSTONE) + if (parent) put_event(parent); } @@ -9998,7 +9991,7 @@ void perf_event_text_poke(const void *addr, const void *old_bytes, void perf_event_itrace_started(struct perf_event *event) { - event->attach_state |= PERF_ATTACH_ITRACE; + WRITE_ONCE(event->attach_state, event->attach_state | PERF_ATTACH_ITRACE); } static void perf_log_itrace_start(struct perf_event *event) @@ -13922,10 +13915,7 @@ perf_event_exit_event(struct perf_event *event, { struct perf_event *parent_event = event->parent; unsigned long detach_flags = DETACH_EXIT; - bool is_child = !!parent_event; - - if (parent_event == EVENT_TOMBSTONE) - parent_event = NULL; + unsigned int attach_state; if (parent_event) { /* @@ -13942,6 +13932,8 @@ perf_event_exit_event(struct perf_event *event, */ detach_flags |= DETACH_GROUP | DETACH_CHILD; mutex_lock(&parent_event->child_mutex); + /* PERF_ATTACH_ITRACE might be set concurrently */ + attach_state = READ_ONCE(event->attach_state); } if (revoke) @@ -13951,18 +13943,25 @@ perf_event_exit_event(struct perf_event *event, /* * Child events can be freed. */ - if (is_child) { - if (parent_event) { - mutex_unlock(&parent_event->child_mutex); + if (parent_event) { + mutex_unlock(&parent_event->child_mutex); + + /* + * Match the refcount initialization. Make sure it doesn't happen + * twice if pmu_detach_event() calls it on an already exited task. + */ + if (attach_state & PERF_ATTACH_CHILD) { /* * Kick perf_poll() for is_event_hup(); */ perf_event_wakeup(parent_event); /* * pmu_detach_event() will have an extra refcount. + * perf_pending_task() might have one too. */ put_event(event); } + return; } -- 2.51.0 From f400565faa50737ac1d550d2c75128c0dad75765 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Apr 2025 18:11:27 +0200 Subject: [PATCH 12/16] perf: Remove too early and redundant CPU hotplug handling The CPU hotplug handlers are called twice: at prepare and online stage. Their role is to: 1) Enable/disable a CPU context. This is irrelevant and even buggy at the prepare stage because the CPU is still offline. 
On early secondary CPU up, creating an event attached to that CPU might silently fail because the CPU context is observed as online but the context installation's IPI failure is ignored. 2) Update the scope cpumasks and re-migrate the events accordingly in the CPU down case. This is irrelevant at the prepare stage. 3) Remove the events attached to the context of the offlining CPU. It even uses an (unnecessary) IPI for it. This is also irrelevant at the prepare stage. Also none of the *_PREPARE and *_STARTING architecture perf related CPU hotplug callbacks rely on CPUHP_PERF_PREPARE. CPUHP_AP_PERF_ONLINE is enough and the right place to perform the work. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250424161128.29176-4-frederic@kernel.org --- include/linux/cpuhotplug.h | 1 - kernel/cpu.c | 5 ----- 2 files changed, 6 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 1987400000b4..df366ee15456 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -60,7 +60,6 @@ enum cpuhp_state { /* PREPARE section invoked on a control CPU */ CPUHP_OFFLINE = 0, CPUHP_CREATE_THREADS, - CPUHP_PERF_PREPARE, CPUHP_PERF_X86_PREPARE, CPUHP_PERF_X86_AMD_UNCORE_PREP, CPUHP_PERF_POWER, diff --git a/kernel/cpu.c b/kernel/cpu.c index b08bb34b1718..a59e009e0be4 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2069,11 +2069,6 @@ static struct cpuhp_step cpuhp_hp_states[] = { .teardown.single = NULL, .cant_stop = true, }, - [CPUHP_PERF_PREPARE] = { - .name = "perf:prepare", - .startup.single = perf_event_init_cpu, - .teardown.single = perf_event_exit_cpu, - }, [CPUHP_RANDOM_PREPARE] = { .name = "random:prepare", .startup.single = random_prepare_cpu, -- 2.51.0 From 881097c0549f3818f5aa31af8ccb49213bd99bed Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Apr 2025 18:11:28 +0200 Subject: [PATCH 13/16] perf: Fix confusing aux iteration While an event tears down all links to it as an aux, the iteration happens on the event's group leader instead of the group itself. If the event is a group leader, it has no effect because the event is also its own group leader. But otherwise there would be a risk to detach all the siblings events from the wrong group leader. It just happens to work because each sibling's aux link is tested against the right event before proceeding. Also the ctx lock is the same for the events and their group leader so the iteration is safe. Yet the iteration is confusing. Clarify the actual intent. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250424161128.29176-5-frederic@kernel.org --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index e0ca4a88beb5..b8461074600b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2171,7 +2171,7 @@ static void perf_put_aux_event(struct perf_event *event) * If the event is an aux_event, tear down all links to * it from other events. */ - for_each_sibling_event(iter, event->group_leader) { + for_each_sibling_event(iter, event) { if (iter->aux_event != event) continue; -- 2.51.0 From 18049c8cff9cc89daadc4df6975f7d9069638926 Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Thu, 8 May 2025 16:26:42 -0700 Subject: [PATCH 14/16] perf/aux: Allocate non-contiguous AUX pages by default perf always allocates contiguous AUX pages based on aux_watermark. 
However, this contiguous allocation doesn't benefit all PMUs. For instance, ARM SPE and TRBE operate with virtual pages, and Coresight ETR allocates a separate buffer. For these PMUs, allocating contiguous AUX pages unnecessarily exacerbates memory fragmentation. This fragmentation can prevent their use on long-running devices. This patch modifies the perf driver to be memory-friendly by default, by allocating non-contiguous AUX pages. For PMUs requiring contiguous pages (Intel BTS and some Intel PT), the existing PERF_PMU_CAP_AUX_NO_SG capability can be used. For PMUs that don't require but can benefit from contiguous pages (some Intel PT), a new capability, PERF_PMU_CAP_AUX_PREFER_LARGE, is added to maintain their existing behavior. Signed-off-by: Yabin Cui Signed-off-by: Ingo Molnar Reviewed-by: James Clark Reviewed-by: Anshuman Khandual Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Alexander Shishkin Cc: Mark Rutland Cc: Namhyung Kim Link: https://lore.kernel.org/r/20250508232642.148767-1-yabinc@google.com --- arch/x86/events/intel/pt.c | 2 ++ include/linux/perf_event.h | 1 + kernel/events/ring_buffer.c | 29 ++++++++++++++++++++--------- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index fa37565f6418..25ead919fc48 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1863,6 +1863,8 @@ static __init int pt_init(void) if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG; + else + pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_PREFER_LARGE; pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE | diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 947ad12dfdbe..a96c00e2ceca 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -303,6 +303,7 @@ struct perf_event_pmu_context; #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 #define PERF_PMU_CAP_AUX_PAUSE 0x0200 +#define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400 /** * pmu::scope diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 5130b119d0ae..d2aef87c7e9f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -679,7 +679,15 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, { bool overwrite = !(flags & RING_BUFFER_WRITABLE); int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); - int ret = -ENOMEM, max_order; + bool use_contiguous_pages = event->pmu->capabilities & ( + PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_PREFER_LARGE); + /* + * Initialize max_order to 0 for page allocation. This allocates single + * pages to minimize memory fragmentation. This is overridden if the + * PMU needs or prefers contiguous pages (use_contiguous_pages = true). + */ + int max_order = 0; + int ret = -ENOMEM; if (!has_aux(event)) return -EOPNOTSUPP; @@ -689,8 +697,8 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, if (!overwrite) { /* - * Watermark defaults to half the buffer, and so does the - * max_order, to aid PMU drivers in double buffering. + * Watermark defaults to half the buffer, to aid PMU drivers + * in double buffering. */ if (!watermark) watermark = min_t(unsigned long, @@ -698,16 +706,19 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, (unsigned long)nr_pages << (PAGE_SHIFT - 1)); /* - * Use aux_watermark as the basis for chunking to - * help PMU drivers honor the watermark. 
+ * If using contiguous pages, use aux_watermark as the basis + * for chunking to help PMU drivers honor the watermark. */ - max_order = get_order(watermark); + if (use_contiguous_pages) + max_order = get_order(watermark); } else { /* - * We need to start with the max_order that fits in nr_pages, - * not the other way around, hence ilog2() and not get_order. + * If using contiguous pages, we need to start with the + * max_order that fits in nr_pages, not the other way around, + * hence ilog2() and not get_order. */ - max_order = ilog2(nr_pages); + if (use_contiguous_pages) + max_order = ilog2(nr_pages); watermark = 0; } -- 2.51.0 From 75a9001bab36f0456f6aae1ab0aa487db456464a Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 6 May 2025 17:49:07 +0800 Subject: [PATCH 15/16] perf/x86/intel/ds: Remove redundant assignments to sample.period The perf_sample_data_init() has already set the period of sample, so no need to do it again. Signed-off-by: Changbin Du Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250506094907.2724-1-changbin.du@huawei.com --- arch/x86/events/intel/ds.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 61ee698deaab..319d0d4ce30c 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1831,8 +1831,6 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, perf_sample_data_init(data, 0, event->hw.last_period); - data->period = event->hw.last_period; - /* * Use latency for weight (only avail with PEBS-LL) */ @@ -2085,7 +2083,6 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, sample_type = event->attr.sample_type; format_group = basic->format_group; perf_sample_data_init(data, 0, event->hw.last_period); - data->period = event->hw.last_period; setup_pebs_time(event, data, basic->tsc); -- 2.51.0 From ca559503b89c30bc49178d0e4a1e0b23f991fb9f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 16 May 2025 11:28:38 -0700 Subject: [PATCH 16/16] perf/core: Add the is_event_in_freq_mode() helper to simplify the code Add a helper to check if an event is in freq mode to improve readability. No functional changes. 
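For reference, the condition the helper wraps corresponds to the following userspace request; this is a hedged sketch of a perf_event_open() attribute setup, not part of the patch.

  #include <linux/perf_event.h>
  #include <string.h>

  /* An event is in freq mode when attr.freq is set together with a non-zero
   * attr.sample_freq, asking the kernel to auto-adjust the sampling period
   * toward roughly sample_freq samples per second. */
  static void request_freq_mode(struct perf_event_attr *attr)
  {
  	memset(attr, 0, sizeof(*attr));
  	attr->size = sizeof(*attr);
  	attr->type = PERF_TYPE_HARDWARE;
  	attr->config = PERF_COUNT_HW_CPU_CYCLES;
  	attr->freq = 1;			/* interpret sample_freq, not sample_period */
  	attr->sample_freq = 4000;	/* target ~4000 samples/sec */
  }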
Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250516182853.2610284-2-kan.liang@linux.intel.com --- kernel/events/core.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index b8461074600b..952340f1df9d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2351,6 +2351,11 @@ event_filter_match(struct perf_event *event) perf_cgroup_match(event); } +static inline bool is_event_in_freq_mode(struct perf_event *event) +{ + return event->attr.freq && event->attr.sample_freq; +} + static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { @@ -2388,7 +2393,7 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx) if (!is_software_event(event)) cpc->active_oncpu--; - if (event->attr.freq && event->attr.sample_freq) { + if (is_event_in_freq_mode(event)) { ctx->nr_freq--; epc->nr_freq--; } @@ -2686,7 +2691,7 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx) if (!is_software_event(event)) cpc->active_oncpu++; - if (event->attr.freq && event->attr.sample_freq) { + if (is_event_in_freq_mode(event)) { ctx->nr_freq++; epc->nr_freq++; } @@ -4252,11 +4257,11 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list) if (hwc->interrupts == MAX_INTERRUPTS) { hwc->interrupts = 0; perf_log_throttle(event, 1); - if (!event->attr.freq || !event->attr.sample_freq) + if (!is_event_in_freq_mode(event)) event->pmu->start(event, 0); } - if (!event->attr.freq || !event->attr.sample_freq) + if (!is_event_in_freq_mode(event)) continue; /* @@ -12848,7 +12853,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, hwc = &event->hw; hwc->sample_period = attr->sample_period; - if (attr->freq && attr->sample_freq) + if (is_event_in_freq_mode(event)) hwc->sample_period = 1; hwc->last_period = hwc->sample_period; -- 2.51.0