From a2e27e1bb19eb7c1790af7c8b6f7298ec524c1bb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Dec 2024 15:12:09 -0500 Subject: [PATCH 01/16] tracing: Switch trace_events_synth.c code over to use guard() There are a couple functions in trace_events_synth.c that have "goto out" or equivalent on error in order to release locks that were taken. This can be error prone or just simply make the code more complex. Switch every location that ends with unlocking a mutex on error over to using the guard(mutex)() infrastructure to let the compiler worry about releasing locks. This makes the code easier to read and understand. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Link: https://lore.kernel.org/20241219201346.371082515@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_synth.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index c82b401a294d..e3f7d09e5512 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -49,16 +49,11 @@ static char *last_cmd; static int errpos(const char *str) { - int ret = 0; - - mutex_lock(&lastcmd_mutex); + guard(mutex)(&lastcmd_mutex); if (!str || !last_cmd) - goto out; + return 0; - ret = err_pos(last_cmd, str); - out: - mutex_unlock(&lastcmd_mutex); - return ret; + return err_pos(last_cmd, str); } static void last_cmd_set(const char *str) @@ -74,14 +69,12 @@ static void last_cmd_set(const char *str) static void synth_err(u8 err_type, u16 err_pos) { - mutex_lock(&lastcmd_mutex); + guard(mutex)(&lastcmd_mutex); if (!last_cmd) - goto out; + return; tracing_log_err(NULL, "synthetic_events", last_cmd, err_text, err_type, err_pos); - out: - mutex_unlock(&lastcmd_mutex); } static int create_synth_event(const char *raw_command); -- 2.50.1 From 930d2b32c0af6895ba4c6ca6404e7f7b6dc214ed Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Dec 2024 17:25:41 -0500 Subject: [PATCH 02/16] tracing: Switch trace_osnoise.c code over to use guard() and __free() The osnoise_hotplug_workfn() grabs two mutexes and cpu_read_lock(). It has various gotos to handle unlocking them. Switch them over to guard() and let the compiler worry about it. The osnoise_cpus_read() has a temporary mask_str allocated and there's some gotos to make sure it gets freed on error paths. Switch that over to __free() to let the compiler worry about it. 
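For context, guard(mutex)(), provided by the cleanup.h infrastructure, ties the mutex_unlock() to the end of the enclosing scope, so every return path drops the lock automatically. A minimal sketch of the pattern these conversions follow (the names below are illustrative only, not taken from the tracing code):

    #include <linux/cleanup.h>
    #include <linux/mutex.h>

    static DEFINE_MUTEX(example_mutex);
    static int example_state;

    static int example_set(int val)
    {
            guard(mutex)(&example_mutex);   /* mutex_unlock() runs at every return */

            if (val < 0)
                    return -EINVAL;         /* no "goto out" needed on the error path */

            example_state = val;
            return 0;                       /* unlocked automatically here as well */
    }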
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Link: https://lore.kernel.org/20241225222931.517329690@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 40 ++++++++++++------------------------ 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index b9f96c77527d..b25c30b05dd0 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2083,26 +2083,21 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) { unsigned int cpu = smp_processor_id(); - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (!osnoise_has_registered_instances()) - goto out_unlock_trace; + return; - mutex_lock(&interface_lock); - cpus_read_lock(); + guard(mutex)(&interface_lock); + guard(cpus_read_lock)(); if (!cpu_online(cpu)) - goto out_unlock; + return; + if (!cpumask_test_cpu(cpu, &osnoise_cpumask)) - goto out_unlock; + return; start_kthread(cpu); - -out_unlock: - cpus_read_unlock(); - mutex_unlock(&interface_lock); -out_unlock_trace: - mutex_unlock(&trace_types_lock); } static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn); @@ -2300,31 +2295,22 @@ static ssize_t osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - char *mask_str; + char *mask_str __free(kfree) = NULL; int len; - mutex_lock(&interface_lock); + guard(mutex)(&interface_lock); len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1; mask_str = kmalloc(len, GFP_KERNEL); - if (!mask_str) { - count = -ENOMEM; - goto out_unlock; - } + if (!mask_str) + return -ENOMEM; len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)); - if (len >= count) { - count = -EINVAL; - goto out_free; - } + if (len >= count) + return -EINVAL; count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); -out_free: - kfree(mask_str); -out_unlock: - mutex_unlock(&interface_lock); - return count; } -- 2.50.1 From 6c05353e4ff5875807f1a00f8d95e68b3d1e4d7f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Dec 2024 17:25:42 -0500 Subject: [PATCH 03/16] tracing: Switch trace_stack.c code over to use guard() The function stack_trace_sysctl() uses a goto on the error path to jump to the mutex_unlock() code. Replace the logic to use guard() and let the compiler worry about it. 
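Scope-based cleanup covers allocations as well: a pointer annotated with __free(kfree) is handed to kfree() automatically when it goes out of scope, which is what the osnoise_cpus_read() conversion above relies on. A minimal sketch, again with illustrative names:

    #include <linux/slab.h>

    static int example_format(size_t count)
    {
            char *buf __free(kfree) = kmalloc(64, GFP_KERNEL);

            if (!buf)
                    return -ENOMEM;         /* nothing was allocated, nothing to free */

            if (count < 64)
                    return -EINVAL;         /* buf is kfree()d automatically here */

            /* ... fill and copy out buf ... */
            return 0;                       /* ... and here */
    }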
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Link: https://lore.kernel.org/20241225222931.684913592@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_stack.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 7f9572a37333..14c6f272c4d8 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -520,20 +520,18 @@ stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer, int was_enabled; int ret; - mutex_lock(&stack_sysctl_mutex); + guard(mutex)(&stack_sysctl_mutex); was_enabled = !!stack_tracer_enabled; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (was_enabled == !!stack_tracer_enabled)) - goto out; + return ret; if (stack_tracer_enabled) register_ftrace_function(&trace_ops); else unregister_ftrace_function(&trace_ops); - out: - mutex_unlock(&stack_sysctl_mutex); return ret; } -- 2.50.1 From 08b767317192e7a20d6d95ff7eca6d9bbc48c192 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Dec 2024 15:12:12 -0500 Subject: [PATCH 04/16] tracing: Switch trace_stat.c code over to use guard() There are a couple functions in trace_stat.c that have "goto out" or equivalent on error in order to release locks that were taken. This can be error prone or just simply make the code more complex. Switch every location that ends with unlocking a mutex on error over to using the guard(mutex)() infrastructure to let the compiler worry about releasing locks. This makes the code easier to read and understand. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Link: https://lore.kernel.org/20241219201346.870318466@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_stat.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index bb247beec447..b3b5586f104d 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -128,7 +128,7 @@ static int stat_seq_init(struct stat_session *session) int ret = 0; int i; - mutex_lock(&session->stat_mutex); + guard(mutex)(&session->stat_mutex); __reset_stat_session(session); if (!ts->stat_cmp) @@ -136,11 +136,11 @@ static int stat_seq_init(struct stat_session *session) stat = ts->stat_start(ts); if (!stat) - goto exit; + return 0; ret = insert_stat(root, stat, ts->stat_cmp); if (ret) - goto exit; + return ret; /* * Iterate over the tracer stat entries and store them in an rbtree. @@ -157,13 +157,10 @@ static int stat_seq_init(struct stat_session *session) goto exit_free_rbtree; } -exit: - mutex_unlock(&session->stat_mutex); return ret; exit_free_rbtree: __reset_stat_session(session); - mutex_unlock(&session->stat_mutex); return ret; } @@ -308,7 +305,7 @@ static int init_stat_file(struct stat_session *session) int register_stat_tracer(struct tracer_stat *trace) { struct stat_session *session, *node; - int ret = -EINVAL; + int ret; if (!trace) return -EINVAL; @@ -316,18 +313,18 @@ int register_stat_tracer(struct tracer_stat *trace) if (!trace->stat_start || !trace->stat_next || !trace->stat_show) return -EINVAL; + guard(mutex)(&all_stat_sessions_mutex); + /* Already registered? 
*/ - mutex_lock(&all_stat_sessions_mutex); list_for_each_entry(node, &all_stat_sessions, session_list) { if (node->ts == trace) - goto out; + return -EINVAL; } - ret = -ENOMEM; /* Init the session */ session = kzalloc(sizeof(*session), GFP_KERNEL); if (!session) - goto out; + return -ENOMEM; session->ts = trace; INIT_LIST_HEAD(&session->session_list); @@ -336,16 +333,13 @@ int register_stat_tracer(struct tracer_stat *trace) ret = init_stat_file(session); if (ret) { destroy_session(session); - goto out; + return ret; } - ret = 0; /* Register */ list_add_tail(&session->session_list, &all_stat_sessions); - out: - mutex_unlock(&all_stat_sessions_mutex); - return ret; + return 0; } void unregister_stat_tracer(struct tracer_stat *trace) -- 2.50.1 From 9e49ca756d207f4313fb7af48648a67da8e4e250 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Dec 2024 10:33:13 -0500 Subject: [PATCH 05/16] tracing/string: Create and use __free(argv_free) in trace_dynevent.c The function dyn_event_release() uses argv_split() which must be freed via argv_free(). It contains several error paths that do a goto out to call argv_free() for cleanup. This makes the code complex and error prone. Create a new __free() directive __free(argv_free) that will call argv_free() for data allocated with argv_split(), and use it in the dyn_event_release() function. Cc: Kees Cook Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Andy Shevchenko Cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/20241220103313.4a74ec8e@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- include/linux/string.h | 3 +++ kernel/trace/trace_dynevent.c | 23 +++++++---------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/include/linux/string.h b/include/linux/string.h index 493ac4862c77..86d5d352068b 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -4,6 +4,7 @@ #include #include +#include /* for DEFINE_FREE() */ #include /* for inline */ #include /* for size_t */ #include /* for NULL */ @@ -312,6 +313,8 @@ extern void *kmemdup_array(const void *src, size_t count, size_t element_size, g extern char **argv_split(gfp_t gfp, const char *str, int *argcp); extern void argv_free(char **argv); +DEFINE_FREE(argv_free, char **, if (!IS_ERR_OR_NULL(_T)) argv_free(_T)) + /* lib/cmdline.c */ extern int get_option(char **str, int *pint); extern char *get_options(const char *str, int nints, int *ints); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 4376887e0d8a..a322e4f249a5 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -74,24 +74,19 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type struct dyn_event *pos, *n; char *system = NULL, *event, *p; int argc, ret = -ENOENT; - char **argv; + char **argv __free(argv_free) = argv_split(GFP_KERNEL, raw_command, &argc); - argv = argv_split(GFP_KERNEL, raw_command, &argc); if (!argv) return -ENOMEM; if (argv[0][0] == '-') { - if (argv[0][1] != ':') { - ret = -EINVAL; - goto out; - } + if (argv[0][1] != ':') + return -EINVAL; event = &argv[0][2]; } else { event = strchr(argv[0], ':'); - if (!event) { - ret = -EINVAL; - goto out; - } + if (!event) + return -EINVAL; event++; } @@ -101,10 +96,8 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type event = p + 1; *p = '\0'; } - if (!system && event[0] == '\0') { - ret = -EINVAL; - goto out; - } + if (!system && event[0] == '\0') + return -EINVAL; 
mutex_lock(&event_mutex); for_each_dyn_event_safe(pos, n) { @@ -120,8 +113,6 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type } tracing_reset_all_online_cpus(); mutex_unlock(&event_mutex); -out: - argv_free(argv); return ret; } -- 2.50.1 From cff6d93eab00bacf8b6bffdef775fc2de0273c96 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Thu, 12 Dec 2024 13:12:37 +0000 Subject: [PATCH 06/16] tracepoint: Reduce duplication of __DO_TRACE_CALL The logic for invoking __DO_TRACE_CALL was extracted to a static inline function called __rust_do_trace_##name so that Rust can call it directly. This logic does not include the static branch, to avoid a function call when the tracepoint is disabled. Since the C code needs to perform the same logic after checking the static key, this logic is currently duplicated. Thus, remove this duplication by having C call the static inline function too. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241212131237.1988409-1-aliceryhl@google.com Signed-off-by: Alice Ryhl Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 76d9055b2cff..a351763e6965 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -218,7 +218,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DEFINE_RUST_DO_TRACE(name, proto, args) \ notrace void rust_do_trace_##name(proto) \ { \ - __rust_do_trace_##name(args); \ + __do_trace_##name(args); \ } /* @@ -268,7 +268,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ - static inline void __rust_do_trace_##name(proto) \ + static inline void __do_trace_##name(proto) \ { \ if (cond) { \ guard(preempt_notrace)(); \ @@ -277,12 +277,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) } \ static inline void trace_##name(proto) \ { \ - if (static_branch_unlikely(&__tracepoint_##name.key)) { \ - if (cond) { \ - guard(preempt_notrace)(); \ - __DO_TRACE_CALL(name, TP_ARGS(args)); \ - } \ - } \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ + __do_trace_##name(args); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ @@ -291,7 +287,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE_SYSCALL(name, proto, args, data_proto) \ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ - static inline void __rust_do_trace_##name(proto) \ + static inline void __do_trace_##name(proto) \ { \ guard(rcu_tasks_trace)(); \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ @@ -299,10 +295,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) static inline void trace_##name(proto) \ { \ might_fault(); \ - if (static_branch_unlikely(&__tracepoint_##name.key)) { \ - guard(rcu_tasks_trace)(); \ - __DO_TRACE_CALL(name, TP_ARGS(args)); \ - } \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ + __do_trace_##name(args); \ if (IS_ENABLED(CONFIG_LOCKDEP)) { \ WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ -- 2.50.1 From 22bec11a569983f39c6061cb82279e7de9e3bdfc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 
Jan 2025 11:11:43 -0500 Subject: [PATCH 07/16] tracing: Fix using ret variable in tracing_set_tracer() When the function tracing_set_tracer() switched over to using the guard() infrastructure, it did not need to save the 'ret' variable and would just return the value when an error arose, instead of setting ret and jumping to an out label. When CONFIG_TRACER_SNAPSHOT is enabled, it had code that expected the "ret" variable to be initialized to zero and had set 'ret' while holding an arch_spin_lock() (not used by guard), and then upon releasing the lock it would check 'ret' and exit if set. But because ret was only set when an error occurred while holding the locks, 'ret' would be used uninitialized if there was no error. The code in the CONFIG_TRACER_SNAPSHOT block should be self-contained. Make sure 'ret' is also set when no error occurred. Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250106111143.2f90ff65@gandalf.local.home Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202412271654.nJVBuwmF-lkp@intel.com/ Fixes: d33b10c0c73ad ("tracing: Switch trace.c code over to use guard()") Signed-off-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) --- kernel/trace/trace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0aaf442271e9..5aeb898054e7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6104,8 +6104,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (t->use_max_tr) { local_irq_disable(); arch_spin_lock(&tr->max_lock); - if (tr->cond_snapshot) - ret = -EBUSY; + ret = tr->cond_snapshot ? -EBUSY : 0; arch_spin_unlock(&tr->max_lock); local_irq_enable(); if (ret) -- 2.50.1 From 1bd13edbbed6e7e396f1aab92b224a4775218e68 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 27 Dec 2024 13:07:57 +0900 Subject: [PATCH 08/16] tracing/hist: Add poll(POLLIN) support on hist file Add poll syscall support on the `hist` file. The waiter will be woken up with POLLIN when the histogram is updated. Currently, there is no way to wait for a specific event in userspace. So the user needs to peek at the `trace` file periodically, or wait on `trace_pipe`. But it is not a good idea to peek at the `trace` for an event that randomly happens. And a read of `trace_pipe` does not return until a page is filled with events. This allows a user to wait for a specific event on the `hist` file. The user can set a histogram trigger on the event that they want to monitor and poll() on its `hist` file. Since this poll() returns POLLIN, the next poll() will return soon unless a read() happens on that hist file. NOTE: To read the hist file again, you must set the file offset to 0, but just for monitoring the event, you may not need to read the histogram.
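As a rough userspace illustration of the intended usage (the helper name and any path passed to it are examples, not part of the kernel interface), a monitor can block in poll() until the histogram receives a new hit:

    #include <fcntl.h>
    #include <poll.h>
    #include <unistd.h>

    /* Wait for the next update of a hist file; 0 on update, -1 on error/timeout. */
    int wait_hist_update(const char *hist_path, int timeout_ms)
    {
            struct pollfd pfd = { .events = POLLIN };
            int ret;

            pfd.fd = open(hist_path, O_RDONLY);
            if (pfd.fd < 0)
                    return -1;

            /* POLLIN stays set until the hist file is read(), as noted above. */
            ret = poll(&pfd, 1, timeout_ms);
            close(pfd.fd);

            return (ret > 0 && (pfd.revents & POLLIN)) ? 0 : -1;
    }

For example, after setting a hist trigger on an event, passing that event's hist file path with a timeout of -1 blocks until the event fires.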
Cc: Shuah Khan Cc: Mathieu Desnoyers Link: https://lore.kernel.org/173527247756.464571.14236296701625509931.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 14 +++++++ kernel/trace/trace_events.c | 14 +++++++ kernel/trace/trace_events_hist.c | 70 ++++++++++++++++++++++++++++++-- 3 files changed, 95 insertions(+), 3 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 91b8ffbdfa8c..02cde1174487 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -673,6 +673,20 @@ struct trace_event_file { atomic_t tm_ref; /* trigger-mode reference counter */ }; +#ifdef CONFIG_HIST_TRIGGERS +extern struct irq_work hist_poll_work; +extern wait_queue_head_t hist_poll_wq; + +static inline void hist_poll_wakeup(void) +{ + if (wq_has_sleeper(&hist_poll_wq)) + irq_work_queue(&hist_poll_work); +} + +#define hist_poll_wait(file, wait) \ + poll_wait(file, &hist_poll_wq, wait) +#endif + #define __TRACE_EVENT_FLAGS(name, value) \ static int __init trace_init_flags_##name(void) \ { \ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 047d2775184b..2b9222e7bd5a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3094,6 +3094,20 @@ static bool event_in_systems(struct trace_event_call *call, return !*p || isspace(*p) || *p == ','; } +#ifdef CONFIG_HIST_TRIGGERS +/* + * Wake up waiter on the hist_poll_wq from irq_work because the hist trigger + * may happen in any context. + */ +static void hist_poll_event_irq_work(struct irq_work *work) +{ + wake_up_all(&hist_poll_wq); +} + +DEFINE_IRQ_WORK(hist_poll_work, hist_poll_event_irq_work); +DECLARE_WAIT_QUEUE_HEAD(hist_poll_wq); +#endif + static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 879b58892b9d..af4be28f01e0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5311,6 +5311,8 @@ static void event_hist_trigger(struct event_trigger_data *data, if (resolve_var_refs(hist_data, key, var_ref_vals, true)) hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals); + + hist_poll_wakeup(); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -5590,15 +5592,36 @@ static void hist_trigger_show(struct seq_file *m, n_entries, (u64)atomic64_read(&hist_data->map->drops)); } +struct hist_file_data { + struct file *file; + u64 last_read; +}; + +static u64 get_hist_hit_count(struct trace_event_file *event_file) +{ + struct hist_trigger_data *hist_data; + struct event_trigger_data *data; + u64 ret = 0; + + list_for_each_entry(data, &event_file->triggers, list) { + if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = data->private_data; + ret += atomic64_read(&hist_data->map->hits); + } + } + return ret; +} + static int hist_show(struct seq_file *m, void *v) { + struct hist_file_data *hist_file = m->private; struct event_trigger_data *data; struct trace_event_file *event_file; int n = 0; guard(mutex)(&event_mutex); - event_file = event_file_file(m->private); + event_file = event_file_file(hist_file->file); if (unlikely(!event_file)) return -ENODEV; @@ -5606,27 +5629,68 @@ static int hist_show(struct seq_file *m, void *v) if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) hist_trigger_show(m, data, n++); } + hist_file->last_read = 
get_hist_hit_count(event_file); + return 0; } +static __poll_t event_hist_poll(struct file *file, struct poll_table_struct *wait) +{ + struct trace_event_file *event_file; + struct seq_file *m = file->private_data; + struct hist_file_data *hist_file = m->private; + + guard(mutex)(&event_mutex); + + event_file = event_file_data(file); + if (!event_file) + return EPOLLERR; + + hist_poll_wait(file, wait); + + if (hist_file->last_read != get_hist_hit_count(event_file)) + return EPOLLIN | EPOLLRDNORM; + + return 0; +} + +static int event_hist_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct hist_file_data *hist_file = m->private; + + kfree(hist_file); + return tracing_single_release_file_tr(inode, file); +} + static int event_hist_open(struct inode *inode, struct file *file) { + struct hist_file_data *hist_file; int ret; ret = tracing_open_file_tr(inode, file); if (ret) return ret; + hist_file = kzalloc(sizeof(*hist_file), GFP_KERNEL); + if (!hist_file) + return -ENOMEM; + hist_file->file = file; + /* Clear private_data to avoid warning in single_open() */ file->private_data = NULL; - return single_open(file, hist_show, file); + ret = single_open(file, hist_show, hist_file); + if (ret) + kfree(hist_file); + return ret; } const struct file_operations event_hist_fops = { .open = event_hist_open, .read = seq_read, .llseek = seq_lseek, - .release = tracing_single_release_file_tr, + .release = event_hist_release, + .poll = event_hist_poll, }; #ifdef CONFIG_HIST_TRIGGERS_DEBUG -- 2.50.1 From 66fc6f521a0b91051ce6968a216a30bc52267bf8 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 27 Dec 2024 13:08:07 +0900 Subject: [PATCH 09/16] tracing/hist: Support POLLPRI event for poll on histogram Since POLLIN will not be flushed until the hist file is read, the user needs to repeatedly read() and poll() on the hist file for monitoring the event continuously. But the read() is somewhat redundant when the user is only monitoring for event updates. Add POLLPRI poll event on the hist file so the event returns when a histogram is updated after open(), poll() or read(). Thus it is possible to wait for the next event without having to issue a read(). Cc: Shuah Khan Cc: Mathieu Desnoyers Link: https://lore.kernel.org/173527248770.464571.2536902137325258133.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index af4be28f01e0..261163b00137 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5595,6 +5595,7 @@ static void hist_trigger_show(struct seq_file *m, struct hist_file_data { struct file *file; u64 last_read; + u64 last_act; }; static u64 get_hist_hit_count(struct trace_event_file *event_file) @@ -5630,6 +5631,11 @@ static int hist_show(struct seq_file *m, void *v) hist_trigger_show(m, data, n++); } hist_file->last_read = get_hist_hit_count(event_file); + /* + * Update last_act too so that poll()/POLLPRI can wait for the next + * event after any syscall on hist file. 
+ */ + hist_file->last_act = hist_file->last_read; return 0; } @@ -5639,6 +5645,8 @@ static __poll_t event_hist_poll(struct file *file, struct poll_table_struct *wai struct trace_event_file *event_file; struct seq_file *m = file->private_data; struct hist_file_data *hist_file = m->private; + __poll_t ret = 0; + u64 cnt; guard(mutex)(&event_mutex); @@ -5648,10 +5656,15 @@ static __poll_t event_hist_poll(struct file *file, struct poll_table_struct *wai hist_poll_wait(file, wait); - if (hist_file->last_read != get_hist_hit_count(event_file)) - return EPOLLIN | EPOLLRDNORM; + cnt = get_hist_hit_count(event_file); + if (hist_file->last_read != cnt) + ret |= EPOLLIN | EPOLLRDNORM; + if (hist_file->last_act != cnt) { + hist_file->last_act = cnt; + ret |= EPOLLPRI; + } - return 0; + return ret; } static int event_hist_release(struct inode *inode, struct file *file) @@ -5665,6 +5678,7 @@ static int event_hist_release(struct inode *inode, struct file *file) static int event_hist_open(struct inode *inode, struct file *file) { + struct trace_event_file *event_file; struct hist_file_data *hist_file; int ret; @@ -5672,16 +5686,25 @@ static int event_hist_open(struct inode *inode, struct file *file) if (ret) return ret; + guard(mutex)(&event_mutex); + + event_file = event_file_data(file); + if (!event_file) + return -ENODEV; + hist_file = kzalloc(sizeof(*hist_file), GFP_KERNEL); if (!hist_file) return -ENOMEM; + hist_file->file = file; + hist_file->last_act = get_hist_hit_count(event_file); /* Clear private_data to avoid warning in single_open() */ file->private_data = NULL; ret = single_open(file, hist_show, hist_file); if (ret) kfree(hist_file); + return ret; } -- 2.50.1 From 80c3e28528ff9f269937fcfe73895213a2e14905 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sun, 29 Dec 2024 22:24:39 +0900 Subject: [PATCH 10/16] selftests/tracing: Add hist poll() support test Add a testcase for poll() on hist file. This introduces a helper binary to the ftracetest, because there is no good way to reliably execute poll() on hist file. Cc: Shuah Khan Cc: Tom Zanussi Cc: Mathieu Desnoyers Link: https://lore.kernel.org/173547867935.569911.10127126796879854182.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Shuah Khan Signed-off-by: Steven Rostedt (Google) --- tools/testing/selftests/ftrace/Makefile | 2 + tools/testing/selftests/ftrace/poll.c | 74 +++++++++++++++++++ .../test.d/trigger/trigger-hist-poll.tc | 74 +++++++++++++++++++ 3 files changed, 150 insertions(+) create mode 100644 tools/testing/selftests/ftrace/poll.c create mode 100644 tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-poll.tc diff --git a/tools/testing/selftests/ftrace/Makefile b/tools/testing/selftests/ftrace/Makefile index a1e955d2de4c..49d96bb16355 100644 --- a/tools/testing/selftests/ftrace/Makefile +++ b/tools/testing/selftests/ftrace/Makefile @@ -6,4 +6,6 @@ TEST_PROGS := ftracetest-ktap TEST_FILES := test.d settings EXTRA_CLEAN := $(OUTPUT)/logs/* +TEST_GEN_PROGS = poll + include ../lib.mk diff --git a/tools/testing/selftests/ftrace/poll.c b/tools/testing/selftests/ftrace/poll.c new file mode 100644 index 000000000000..53258f7515e7 --- /dev/null +++ b/tools/testing/selftests/ftrace/poll.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Simple poll on a file. + * + * Copyright (c) 2024 Google LLC. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#define BUFSIZE 4096 + +/* + * Usage: + * poll [-I|-P] [-t timeout] FILE + */ +int main(int argc, char *argv[]) +{ + struct pollfd pfd = {.events = POLLIN}; + char buf[BUFSIZE]; + int timeout = -1; + int ret, opt; + + while ((opt = getopt(argc, argv, "IPt:")) != -1) { + switch (opt) { + case 'I': + pfd.events = POLLIN; + break; + case 'P': + pfd.events = POLLPRI; + break; + case 't': + timeout = atoi(optarg); + break; + default: + fprintf(stderr, "Usage: %s [-I|-P] [-t timeout] FILE\n", + argv[0]); + return -1; + } + } + if (optind >= argc) { + fprintf(stderr, "Error: Polling file is not specified\n"); + return -1; + } + + pfd.fd = open(argv[optind], O_RDONLY); + if (pfd.fd < 0) { + fprintf(stderr, "failed to open %s", argv[optind]); + perror("open"); + return -1; + } + + /* Reset poll by read if POLLIN is specified. */ + if (pfd.events & POLLIN) + do {} while (read(pfd.fd, buf, BUFSIZE) == BUFSIZE); + + ret = poll(&pfd, 1, timeout); + if (ret < 0 && errno != EINTR) { + perror("poll"); + return -1; + } + close(pfd.fd); + + /* If timeout happned (ret == 0), exit code is 1 */ + if (ret == 0) + return 1; + + return 0; +} diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-poll.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-poll.tc new file mode 100644 index 000000000000..8d275e3238d9 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-poll.tc @@ -0,0 +1,74 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: event trigger - test poll wait on histogram +# requires: set_event events/sched/sched_process_free/trigger events/sched/sched_process_free/hist +# flags: instance + +POLL=${FTRACETEST_ROOT}/poll + +if [ ! -x ${POLL} ]; then + echo "poll program is not compiled!" + exit_unresolved +fi + +EVENT=events/sched/sched_process_free/ + +# Check poll ops is supported. Before implementing poll on hist file, it +# returns soon with POLLIN | POLLOUT, but not POLLPRI. + +# This must wait >1 sec and return 1 (timeout). +set +e +${POLL} -I -t 1000 ${EVENT}/hist +ret=$? +set -e +if [ ${ret} != 1 ]; then + echo "poll on hist file is not supported" + exit_unsupported +fi + +# Test POLLIN +echo > trace +echo 'hist:key=comm if comm =="sleep"' > ${EVENT}/trigger +echo 1 > ${EVENT}/enable + +# This sleep command will exit after 2 seconds. +sleep 2 & +BGPID=$! +# if timeout happens, poll returns 1. +${POLL} -I -t 4000 ${EVENT}/hist +echo 0 > tracing_on + +if [ -d /proc/${BGPID} ]; then + echo "poll exits too soon" + kill -KILL ${BGPID} ||: + exit_fail +fi + +if ! grep -qw "sleep" trace; then + echo "poll exits before event happens" + exit_fail +fi + +# Test POLLPRI +echo > trace +echo 1 > tracing_on + +# This sleep command will exit after 2 seconds. +sleep 2 & +BGPID=$! +# if timeout happens, poll returns 1. +${POLL} -P -t 4000 ${EVENT}/hist +echo 0 > tracing_on + +if [ -d /proc/${BGPID} ]; then + echo "poll exits too soon" + kill -KILL ${BGPID} ||: + exit_fail +fi + +if ! grep -qw "sleep" trace; then + echo "poll exits before event happens" + exit_fail +fi + +exit_pass -- 2.50.1 From 4c86bc531e60900053384867c082675bba82c29f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 16 Jan 2025 09:33:35 -0500 Subject: [PATCH 11/16] tracing: Add :mod: command to enabled module events Add a :mod: command to enable only events from a given module from the set_events file. 
echo '*:mod:' > set_events Or echo ':mod:' > set_events Will enable all events for that module. Specific events can also be enabled via: echo ':mod:' > set_events Or echo '::mod:' > set_events Or echo '*::mod:' > set_events The ":mod:" keyword is consistent with the function tracing filter to enable functions from a given module. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250116143533.214496360@goodmis.org Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/events.rst | 22 +++++++++++++ kernel/trace/trace.c | 2 ++ kernel/trace/trace_events.c | 59 ++++++++++++++++++++++++++-------- 3 files changed, 70 insertions(+), 13 deletions(-) diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst index 759907c20e75..3db57516eb86 100644 --- a/Documentation/trace/events.rst +++ b/Documentation/trace/events.rst @@ -55,6 +55,28 @@ command:: # echo 'irq:*' > /sys/kernel/tracing/set_event +The set_event file may also be used to enable events associated to only +a specific module:: + + # echo ':mod:' > /sys/kernel/tracing/set_event + +Will enable all events in the module ````. + +The text before ``:mod:`` will be parsed to specify specific events that the +module creates:: + + # echo ':mod:' > /sys/kernel/tracing/set_event + +The above will enable any system or event that ```` matches. If +```` is ``"*"`` then it will match all events. + +To enable only a specific event within a system:: + + # echo '::mod:' > /sys/kernel/tracing/set_event + +If ```` is ``"*"`` then it will match all events within the system +for a given module. + 2.2 Via the 'enable' toggle --------------------------- diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5aeb898054e7..cb85ee4a8807 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5514,6 +5514,8 @@ static const char readme_msg[] = "\t efield: For event probes ('e' types), the field is on of the fields\n" "\t of the /.\n" #endif + " set_event\t\t- Enables events by name written into it\n" + "\t\t\t Can enable module events via: :mod:\n" " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" " events//\t- Directory containing all trace events for :\n" diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2b9222e7bd5a..5c7d0e07618d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1153,17 +1153,36 @@ static void remove_event_file_dir(struct trace_event_file *file) */ static int __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, - const char *sub, const char *event, int set) + const char *sub, const char *event, int set, + const char *mod) { struct trace_event_file *file; struct trace_event_call *call; + char *module __free(kfree) = NULL; const char *name; int ret = -EINVAL; int eret = 0; + if (mod) { + char *p; + + module = kstrdup(mod, GFP_KERNEL); + if (!module) + return -ENOMEM; + + /* Replace all '-' with '_' as that's what modules do */ + for (p = strchr(module, '-'); p; p = strchr(p + 1, '-')) + *p = '_'; + } + list_for_each_entry(file, &tr->events, list) { call = file->event_call; + + /* If a module is specified, skip events that are not that module */ + if (module && (!call->module || strcmp(module_name(call->module), module))) + continue; + name = trace_event_name(call); if (!name || !call->class || !call->class->reg) @@ -1200,12 +1219,13 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, 
const char *match, } static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, - const char *sub, const char *event, int set) + const char *sub, const char *event, int set, + const char *mod) { int ret; mutex_lock(&event_mutex); - ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set); + ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set, mod); mutex_unlock(&event_mutex); return ret; @@ -1213,11 +1233,20 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { - char *event = NULL, *sub = NULL, *match; + char *event = NULL, *sub = NULL, *match, *mod; int ret; if (!tr) return -ENOENT; + + /* Modules events can be appened with :mod: */ + mod = strstr(buf, ":mod:"); + if (mod) { + *mod = '\0'; + /* move to the module name */ + mod += 5; + } + /* * The buf format can be : * *: means any event by that name. @@ -1240,9 +1269,13 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) sub = NULL; if (!strlen(event) || strcmp(event, "*") == 0) event = NULL; + } else if (mod) { + /* Allow wildcard for no length or star */ + if (!strlen(match) || strcmp(match, "*") == 0) + match = NULL; } - ret = __ftrace_set_clr_event(tr, match, sub, event, set); + ret = __ftrace_set_clr_event(tr, match, sub, event, set, mod); /* Put back the colon to allow this to be called again */ if (buf) @@ -1270,7 +1303,7 @@ int trace_set_clr_event(const char *system, const char *event, int set) if (!tr) return -ENODEV; - return __ftrace_set_clr_event(tr, NULL, system, event, set); + return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL); } EXPORT_SYMBOL_GPL(trace_set_clr_event); @@ -1296,7 +1329,7 @@ int trace_array_set_clr_event(struct trace_array *tr, const char *system, return -ENOENT; set = (enable == true) ? 
1 : 0; - return __ftrace_set_clr_event(tr, NULL, system, event, set); + return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL); } EXPORT_SYMBOL_GPL(trace_array_set_clr_event); @@ -1646,7 +1679,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (system) name = system->name; - ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val); + ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val, NULL); if (ret) goto out; @@ -4094,7 +4127,7 @@ int event_trace_del_tracer(struct trace_array *tr) __ftrace_clear_event_pids(tr, TRACE_PIDS | TRACE_NO_PIDS); /* Disable any running events */ - __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); + __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0, NULL); /* Make sure no more events are being executed */ tracepoint_synchronize_unregister(); @@ -4378,7 +4411,7 @@ static __init void event_trace_self_tests(void) pr_info("Testing event system %s: ", system->name); - ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error enabling system %s\n", system->name); @@ -4387,7 +4420,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); - ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error disabling system %s\n", system->name); @@ -4402,7 +4435,7 @@ static __init void event_trace_self_tests(void) pr_info("Running tests on all trace events:\n"); pr_info("Testing all events: "); - ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error enabling all events\n"); return; @@ -4411,7 +4444,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); /* reset sysname */ - ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error disabling all events\n"); return; -- 2.50.1 From b355247df104ef6644288884afd2c08b7bf49897 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 16 Jan 2025 09:33:36 -0500 Subject: [PATCH 12/16] tracing: Cache ":mod:" events for modules not loaded yet When the :mod: command is written into /sys/kernel/tracing/set_event (or that file within an instance), if the module specified after the ":mod:" is not yet loaded, it will store that string internally. When the module is loaded, it will enable the events as if the module was loaded when the string was written into the set_event file. This can also be useful to enable events that are in the init section of the module, as the events are enabled before the init section is executed. This also works on the kernel command line: trace_event=:mod: Will enable the events for when it is loaded. 
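A sketch of the expected flow, using the sample module exercised by the selftest added later in this series (output abbreviated; the exact events listed come from that module):

    # echo ':mod:trace-events-sample' > /sys/kernel/tracing/set_event
    # cat /sys/kernel/tracing/set_event
    *:*:mod:trace_events_sample
    # modprobe trace-events-sample
    # cat /sys/kernel/tracing/set_event
    sample-trace:foo_bar
    sample-trace:foo_bar_with_cond
    [...]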
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250116143533.514730995@goodmis.org Signed-off-by: Steven Rostedt (Google) --- .../admin-guide/kernel-parameters.txt | 8 + Documentation/trace/events.rst | 4 +- kernel/trace/ftrace.c | 17 -- kernel/trace/trace.c | 26 ++ kernel/trace/trace.h | 12 + kernel/trace/trace_events.c | 241 +++++++++++++++++- 6 files changed, 279 insertions(+), 29 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3872bc6ec49d..4f563cb0ca0f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6858,6 +6858,14 @@ comma-separated list of trace events to enable. See also Documentation/trace/events.rst + To enable modules, use :mod: keyword: + + trace_event=:mod: + + The value before :mod: will only enable specific events + that are part of the module. See the above mentioned + document for more information. + trace_instance=[instance-info] [FTRACE] Create a ring buffer instance early in boot up. This will be listed in: diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst index 3db57516eb86..2d88a2acacc0 100644 --- a/Documentation/trace/events.rst +++ b/Documentation/trace/events.rst @@ -60,7 +60,9 @@ a specific module:: # echo ':mod:' > /sys/kernel/tracing/set_event -Will enable all events in the module ````. +Will enable all events in the module ````. If the module is not yet +loaded, the string will be saved and when a module is that matches ```` +is loaded, then it will apply the enabling of events then. The text before ``:mod:`` will be parsed to specify specific events that the module creates:: diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9b17efb1a87d..cafcfc97ff2a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4930,23 +4930,6 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, return __ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable); } -static bool module_exists(const char *module) -{ - /* All modules have the symbol __this_module */ - static const char this_mod[] = "__this_module"; - char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; - unsigned long val; - int n; - - n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod); - - if (n > sizeof(modname) - 1) - return false; - - val = module_kallsyms_lookup_name(modname); - return val != 0; -} - static int cache_mod(struct trace_array *tr, const char *func, char *module, int enable) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cb85ee4a8807..87402b6e8c58 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9407,6 +9407,10 @@ trace_array_create_systems(const char *name, const char *systems, INIT_LIST_HEAD(&tr->hist_vars); INIT_LIST_HEAD(&tr->err_log); +#ifdef CONFIG_MODULES + INIT_LIST_HEAD(&tr->mod_events); +#endif + if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; @@ -9823,6 +9827,24 @@ late_initcall_sync(trace_eval_sync); #ifdef CONFIG_MODULES + +bool module_exists(const char *module) +{ + /* All modules have the symbol __this_module */ + static const char this_mod[] = "__this_module"; + char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; + unsigned long val; + int n; + + n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod); + + if (n > sizeof(modname) - 1) + return false; + + val = module_kallsyms_lookup_name(modname); + return val 
!= 0; +} + static void trace_module_add_evals(struct module *mod) { if (!mod->num_trace_evals) @@ -10535,6 +10557,10 @@ __init static int tracer_alloc_buffers(void) #endif ftrace_init_global_array_ops(&global_trace); +#ifdef CONFIG_MODULES + INIT_LIST_HEAD(&global_trace.mod_events); +#endif + init_trace_flags_index(&global_trace); register_tracer(&nop_trace); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9691b47b5f3d..05ea0ebf5eba 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -400,6 +400,9 @@ struct trace_array { cpumask_var_t pipe_cpumask; int ref; int trace_ref; +#ifdef CONFIG_MODULES + struct list_head mod_events; +#endif #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; @@ -434,6 +437,15 @@ enum { TRACE_ARRAY_FL_BOOT = BIT(1), }; +#ifdef CONFIG_MODULES +bool module_exists(const char *module); +#else +static inline bool module_exists(const char *module) +{ + return false; +} +#endif + extern struct list_head ftrace_trace_arrays; extern struct mutex trace_types_lock; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5c7d0e07618d..f762e554fad4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -857,6 +857,120 @@ static int ftrace_event_enable_disable(struct trace_event_file *file, return __ftrace_event_enable_disable(file, enable, 0); } +#if CONFIG_MODULES +struct event_mod_load { + struct list_head list; + char *module; + char *match; + char *system; + char *event; +}; + +static void free_event_mod(struct event_mod_load *event_mod) +{ + list_del(&event_mod->list); + kfree(event_mod->module); + kfree(event_mod->match); + kfree(event_mod->system); + kfree(event_mod->event); + kfree(event_mod); +} + +static void clear_mod_events(struct trace_array *tr) +{ + struct event_mod_load *event_mod, *n; + + list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) { + free_event_mod(event_mod); + } +} + +static int remove_cache_mod(struct trace_array *tr, const char *mod, + const char *match, const char *system, const char *event) +{ + struct event_mod_load *event_mod, *n; + int ret = -EINVAL; + + list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) { + if (strcmp(event_mod->module, mod) != 0) + continue; + + if (match && strcmp(event_mod->match, match) != 0) + continue; + + if (system && + (!event_mod->system || strcmp(event_mod->system, system) != 0)) + continue; + + if (event && + (!event_mod->event || strcmp(event_mod->event, event) != 0)) + continue; + + free_event_mod(event_mod); + ret = 0; + } + + return ret; +} + +static int cache_mod(struct trace_array *tr, const char *mod, int set, + const char *match, const char *system, const char *event) +{ + struct event_mod_load *event_mod; + + /* If the module exists, then this just failed to find an event */ + if (module_exists(mod)) + return -EINVAL; + + /* See if this is to remove a cached filter */ + if (!set) + return remove_cache_mod(tr, mod, match, system, event); + + event_mod = kzalloc(sizeof(*event_mod), GFP_KERNEL); + if (!event_mod) + return -ENOMEM; + + INIT_LIST_HEAD(&event_mod->list); + event_mod->module = kstrdup(mod, GFP_KERNEL); + if (!event_mod->module) + goto out_free; + + if (match) { + event_mod->match = kstrdup(match, GFP_KERNEL); + if (!event_mod->match) + goto out_free; + } + + if (system) { + event_mod->system = kstrdup(system, GFP_KERNEL); + if (!event_mod->system) + goto out_free; + } + + if (event) { + event_mod->event = kstrdup(event, GFP_KERNEL); + if (!event_mod->event) + 
goto out_free; + } + + list_add(&event_mod->list, &tr->mod_events); + + return 0; + + out_free: + free_event_mod(event_mod); + + return -ENOMEM; +} +#else /* CONFIG_MODULES */ +static inline void clear_mod_events(struct trace_array *tr) { } +static int cache_mod(struct trace_array *tr, const char *mod, int set, + const char *match, const char *system, const char *event) +{ + return -EINVAL; +} +#endif + static void ftrace_clear_events(struct trace_array *tr) { struct trace_event_file *file; @@ -865,6 +979,7 @@ static void ftrace_clear_events(struct trace_array *tr) list_for_each_entry(file, &tr->events, list) { ftrace_event_enable_disable(file, 0); } + clear_mod_events(tr); mutex_unlock(&event_mutex); } @@ -1215,6 +1330,13 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, ret = eret; } + /* + * If this is a module setting and nothing was found, + * check if the module was loaded. If it wasn't cache it. + */ + if (module && ret == -EINVAL && !eret) + ret = cache_mod(tr, module, set, match, sub, event); + return ret; } @@ -1416,37 +1538,71 @@ static void *t_start(struct seq_file *m, loff_t *pos) return file; } +enum set_event_iter_type { + SET_EVENT_FILE, + SET_EVENT_MOD, +}; + +struct set_event_iter { + enum set_event_iter_type type; + union { + struct trace_event_file *file; + struct event_mod_load *event_mod; + }; +}; + static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct trace_event_file *file = v; + struct set_event_iter *iter = v; + struct trace_event_file *file; struct trace_array *tr = m->private; (*pos)++; - list_for_each_entry_continue(file, &tr->events, list) { - if (file->flags & EVENT_FILE_FL_ENABLED) - return file; + if (iter->type == SET_EVENT_FILE) { + file = iter->file; + list_for_each_entry_continue(file, &tr->events, list) { + if (file->flags & EVENT_FILE_FL_ENABLED) { + iter->file = file; + return iter; + } + } +#ifdef CONFIG_MODULES + iter->type = SET_EVENT_MOD; + iter->event_mod = list_entry(&tr->mod_events, struct event_mod_load, list); +#endif } +#ifdef CONFIG_MODULES + list_for_each_entry_continue(iter->event_mod, &tr->mod_events, list) + return iter; +#endif + return NULL; } static void *s_start(struct seq_file *m, loff_t *pos) { - struct trace_event_file *file; struct trace_array *tr = m->private; + struct set_event_iter *iter; loff_t l; + iter = kzalloc(sizeof(iter), GFP_KERNEL); + if (!iter) + return NULL; + mutex_lock(&event_mutex); - file = list_entry(&tr->events, struct trace_event_file, list); + iter->type = SET_EVENT_FILE; + iter->file = list_entry(&tr->events, struct trace_event_file, list); + for (l = 0; l <= *pos; ) { - file = s_next(m, file, &l); - if (!file) + iter = s_next(m, iter, &l); + if (!iter) break; } - return file; + return iter; } static int t_show(struct seq_file *m, void *v) @@ -1466,6 +1622,45 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } +#ifdef CONFIG_MODULES +static int s_show(struct seq_file *m, void *v) +{ + struct set_event_iter *iter = v; + const char *system; + const char *event; + + if (iter->type == SET_EVENT_FILE) + return t_show(m, iter->file); + + /* When match is set, system and event are not */ + if (iter->event_mod->match) { + seq_printf(m, "%s:mod:%s", iter->event_mod->match, + iter->event_mod->module); + return 0; + } + + system = iter->event_mod->system ? : "*"; + event = iter->event_mod->event ? 
: "*"; + + seq_printf(m, "%s:%s:mod:%s\n", system, event, iter->event_mod->module); + + return 0; +} +#else /* CONFIG_MODULES */ +static int s_show(struct seq_file *m, void *v) +{ + struct set_event_iter *iter = v; + + return t_show(m, iter->file); +} +#endif + +static void s_stop(struct seq_file *m, void *p) +{ + kfree(p); + t_stop(m, NULL); +} + static void * __next(struct seq_file *m, void *v, loff_t *pos, int type) { @@ -2253,8 +2448,8 @@ static const struct seq_operations show_event_seq_ops = { static const struct seq_operations show_set_event_seq_ops = { .start = s_start, .next = s_next, - .show = t_show, - .stop = t_stop, + .show = s_show, + .stop = s_stop, }; static const struct seq_operations show_set_pid_seq_ops = { @@ -3385,6 +3580,28 @@ EXPORT_SYMBOL_GPL(trace_remove_event_call); event++) #ifdef CONFIG_MODULES +static void update_cache(struct trace_array *tr, struct module *mod) +{ + struct event_mod_load *event_mod, *n; + + list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) { + if (strcmp(event_mod->module, mod->name) != 0) + continue; + + __ftrace_set_clr_event_nolock(tr, event_mod->match, + event_mod->system, + event_mod->event, 1, mod->name); + free_event_mod(event_mod); + } +} + +static void update_cache_events(struct module *mod) +{ + struct trace_array *tr; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) + update_cache(tr, mod); +} static void trace_module_add_events(struct module *mod) { @@ -3407,6 +3624,8 @@ static void trace_module_add_events(struct module *mod) __register_event(*call, mod); __add_event_to_tracers(*call); } + + update_cache_events(mod); } static void trace_module_remove_events(struct module *mod) -- 2.50.1 From 542079b4b12e89f82c8a689b6e9b119ab7d52018 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 16 Jan 2025 09:33:37 -0500 Subject: [PATCH 13/16] selftests/ftrace: Add test that tests event :mod: commands Now that here's a :mod: command that can be sent into set_event, add a test that tests its use. Both setting events for a loaded module, as well as caching what events to set for a module that is not loaded yet. Cc: Shuah Khan Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: linux-kselftest@vger.kernel.org Link: https://lore.kernel.org/20250116143533.819228058@goodmis.org Signed-off-by: Steven Rostedt (Google) --- .../ftrace/test.d/event/event-mod.tc | 191 ++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/event/event-mod.tc diff --git a/tools/testing/selftests/ftrace/test.d/event/event-mod.tc b/tools/testing/selftests/ftrace/test.d/event/event-mod.tc new file mode 100644 index 000000000000..175243cd9ab7 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/event/event-mod.tc @@ -0,0 +1,191 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: event tracing - enable/disable with module event +# requires: set_event "Can enable module events via: :mod:":README +# flags: instance + +rmmod trace-events-sample ||: +if ! modprobe trace-events-sample ; then + echo "No trace-events sample module - please make CONFIG_SAMPLE_TRACE_EVENTS=m" + exit_unresolved; +fi +trap "rmmod trace-events-sample" EXIT + +# Set events for the module +echo ":mod:trace-events-sample" > set_event + +test_all_enabled() { + + # Check if more than one is enabled + grep -q sample-trace:foo_bar set_event + grep -q sample-trace:foo_bar_with_cond set_event + grep -q sample-trace:foo_bar_with_fn set_event + + # All of them should be enabled. 
Check via the enable file + val=`cat events/sample-trace/enable` + if [ $val -ne 1 ]; then + exit_fail + fi +} + +clear_events() { + echo > set_event + val=`cat events/enable` + if [ "$val" != "0" ]; then + exit_fail + fi + count=`cat set_event | wc -l` + if [ $count -ne 0 ]; then + exit_fail + fi +} + +test_all_enabled + +echo clear all events +echo 0 > events/enable + +echo Confirm the events are disabled +val=`cat events/sample-trace/enable` +if [ $val -ne 0 ]; then + exit_fail +fi + +echo And the set_event file is empty + +cnt=`wc -l set_event` +if [ $cnt -ne 0 ]; then + exit_fail +fi + +echo now enable all events +echo 1 > events/enable + +echo Confirm the events are enabled again +val=`cat events/sample-trace/enable` +if [ $val -ne 1 ]; then + exit_fail +fi + +echo disable just the module events +echo '!:mod:trace-events-sample' >> set_event + +echo Should have mix of events enabled +val=`cat events/enable` +if [ "$val" != "X" ]; then + exit_fail +fi + +echo Confirm the module events are disabled +val=`cat events/sample-trace/enable` +if [ $val -ne 0 ]; then + exit_fail +fi + +echo 0 > events/enable + +echo now enable the system events +echo 'sample-trace:mod:trace-events-sample' > set_event + +test_all_enabled + +echo clear all events +echo 0 > events/enable + +echo Confirm the events are disabled +val=`cat events/sample-trace/enable` +if [ $val -ne 0 ]; then + exit_fail +fi + +echo Test enabling foo_bar only +echo 'foo_bar:mod:trace-events-sample' > set_event + +grep -q sample-trace:foo_bar set_event + +echo make sure nothing is found besides foo_bar +if grep -q -v sample-trace:foo_bar set_event ; then + exit_fail +fi + +echo Append another using the system and event name +echo 'sample-trace:foo_bar_with_cond:mod:trace-events-sample' >> set_event + +grep -q sample-trace:foo_bar set_event +grep -q sample-trace:foo_bar_with_cond set_event + +count=`cat set_event | wc -l` + +if [ $count -ne 2 ]; then + exit_fail +fi + +clear_events + +rmmod trace-events-sample + +echo ':mod:trace-events-sample' > set_event + +echo make sure that the module shows up, and '-' is converted to '_' +grep -q '\*:\*:mod:trace_events_sample' set_event + +modprobe trace-events-sample + +test_all_enabled + +clear_events + +rmmod trace-events-sample + +echo Enable just the system events +echo 'sample-trace:mod:trace-events-sample' > set_event +grep -q 'sample-trace:mod:trace_events_sample' set_event + +modprobe trace-events-sample + +test_all_enabled + +clear_events + +rmmod trace-events-sample + +echo Enable event with just event name +echo 'foo_bar:mod:trace-events-sample' > set_event +grep -q 'foo_bar:mod:trace_events_sample' set_event + +echo Enable another event with both system and event name +echo 'sample-trace:foo_bar_with_cond:mod:trace-events-sample' >> set_event +grep -q 'sample-trace:foo_bar_with_cond:mod:trace_events_sample' set_event +echo Make sure the other event was still there +grep -q 'foo_bar:mod:trace_events_sample' set_event + +modprobe trace-events-sample + +echo There should be no :mod: cached events +if grep -q ':mod:' set_event; then + exit_fail +fi + +echo two events should be enabled +count=`cat set_event | wc -l` +if [ $count -ne 2 ]; then + exit_fail +fi + +echo only two events should be enabled +val=`cat events/sample-trace/enable` +if [ "$val" != "X" ]; then + exit_fail +fi + +val=`cat events/sample-trace/foo_bar/enable` +if [ "$val" != "1" ]; then + exit_fail +fi + +val=`cat events/sample-trace/foo_bar_with_cond/enable` +if [ "$val" != "1" ]; then + exit_fail +fi + +clear_trace 
-- 2.50.1 From a925df6f5036013b6592eef28f7ec4a45bf465a9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Jan 2025 12:57:45 -0500 Subject: [PATCH 14/16] tracing: Fix #if CONFIG_MODULES to #ifdef CONFIG_MODULES A typo was introduced when adding the ":mod:" command that did a "#if CONFIG_MODULES" instead of a "#ifdef CONFIG_MODULES". Fix it. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Linus Torvalds Link: https://lore.kernel.org/20250120125745.4ac90ca6@gandalf.local.home Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501190121.E2CIJuUj-lkp@intel.com/ Fixes: b355247df104e ("tracing: Cache ":mod:" events for modules not loaded yet") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f762e554fad4..bb1406719c3f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -857,7 +857,7 @@ static int ftrace_event_enable_disable(struct trace_event_file *file, return __ftrace_event_enable_disable(file, enable, 0); } -#if CONFIG_MODULES +#ifdef CONFIG_MODULES struct event_mod_load { struct list_head list; char *module; -- 2.50.1 From 22412b72cafd1b2570c2f9f14b7a133bdff8b80c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Jan 2025 17:27:56 -0500 Subject: [PATCH 15/16] tracing: Rename update_cache() to update_mod_cache() The static function in trace_events.c called update_cache() is too generic and conflicts with the function defined in arch/openrisc/include/asm/pgtable.h Rename it to update_mod_cache() to make it less generic. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250120172756.4ecfb43f@batman.local.home Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501210550.Ufrj5CRn-lkp@intel.com/ Fixes: b355247df104e ("tracing: Cache ":mod:" events for modules not loaded yet") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index bb1406719c3f..51c5014877e8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3580,7 +3580,7 @@ EXPORT_SYMBOL_GPL(trace_remove_event_call); event++) #ifdef CONFIG_MODULES -static void update_cache(struct trace_array *tr, struct module *mod) +static void update_mod_cache(struct trace_array *tr, struct module *mod) { struct event_mod_load *event_mod, *n; @@ -3600,7 +3600,7 @@ static void update_cache_events(struct module *mod) struct trace_array *tr; list_for_each_entry(tr, &ftrace_trace_arrays, list) - update_cache(tr, mod); + update_mod_cache(tr, mod); } static void trace_module_add_events(struct module *mod) -- 2.50.1 From f95ee542947d748d4ca01b4d3103dbdc4fdc8889 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 21 Jan 2025 15:12:36 -0500 Subject: [PATCH 16/16] tracing: Fix allocation of printing set_event file content The adding of cached events for modules not loaded yet required a descriptor to separate the iteration of events with the iteration of cached events for a module. But the allocation used the size of the pointer and not the size of the contents to allocate its data and caused a slab-out-of-bounds. 
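In isolation, the one-character difference the fix addresses (a generic illustration, not the exact tracing code):

    struct set_event_iter *iter;

    iter = kzalloc(sizeof(iter), GFP_KERNEL);       /* bug: allocates the size of a pointer (8 bytes on 64-bit) */
    iter = kzalloc(sizeof(*iter), GFP_KERNEL);      /* fix: allocates the full structure */

With the former, accesses beyond the first few bytes of the structure land past the end of the allocation, which is the reported slab-out-of-bounds.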
Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Linus Torvalds Link: https://lore.kernel.org/20250121151236.47fcf433@gandalf.local.home Reported-by: Sasha Levin Closes: https://lore.kernel.org/all/Z4_OHKESRSiJcr-b@lappy/ Fixes: b355247df104e ("tracing: Cache ":mod:" events for modules not loaded yet") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 51c5014877e8..5217dcddcb4c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1588,7 +1588,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) struct set_event_iter *iter; loff_t l; - iter = kzalloc(sizeof(iter), GFP_KERNEL); + iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return NULL; -- 2.50.1