From 5f3719f697c3fdfae5cd6805f10ac7a04b0f4e43 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Mar 2025 11:45:47 -0500 Subject: [PATCH 01/16] tracing: Update modules to persistent instances when loaded When a module is loaded and a persistent buffer is actively tracing, add it to the list of modules in the persistent memory. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250305164609.469844721@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 27 +++++++++++++++++++++++++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_events.c | 40 ++++++++++++++++++++++++++----------- 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d22f8d34b18d..66c1683fa1dd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10087,6 +10087,32 @@ static void trace_module_remove_evals(struct module *mod) static inline void trace_module_remove_evals(struct module *mod) { } #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ +static bool trace_array_active(struct trace_array *tr) +{ + if (tr->current_trace != &nop_trace) + return true; + + /* 0 is no events, 1 is all disabled */ + return trace_events_enabled(tr, NULL) > 1; +} + +static void trace_module_record(struct module *mod) +{ + struct trace_array *tr; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + /* Update any persistent trace array that has already been started */ + if ((tr->flags & (TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT)) == + TRACE_ARRAY_FL_BOOT) { + /* Only update if the trace array is active */ + if (trace_array_active(tr)) { + guard(mutex)(&scratch_mutex); + save_mod(mod, tr); + } + } + } +} + static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -10095,6 +10121,7 @@ static int trace_module_notify(struct notifier_block *self, switch (val) { case MODULE_STATE_COMING: trace_module_add_evals(mod); + 
trace_module_record(mod); break; case MODULE_STATE_GOING: trace_module_remove_evals(mod); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3a020fb82a34..90493220c362 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -786,6 +786,8 @@ extern void trace_find_cmdline(int pid, char comm[]); extern int trace_find_tgid(int pid); extern void trace_event_follow_fork(struct trace_array *tr, bool enable); +extern int trace_events_enabled(struct trace_array *tr, const char *system); + #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; extern unsigned long ftrace_number_of_pages; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 513de9ceb80e..7b3ef1d26167 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1818,28 +1818,28 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } -static ssize_t -system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) +/* + * Returns: + * 0 : no events exist? 
+ * 1 : all events are disabled + * 2 : all events are enabled + * 3 : some events are enabled and some are disabled + */ +int trace_events_enabled(struct trace_array *tr, const char *system) { - const char set_to_char[4] = { '?', '0', '1', 'X' }; - struct trace_subsystem_dir *dir = filp->private_data; - struct event_subsystem *system = dir->subsystem; struct trace_event_call *call; struct trace_event_file *file; - struct trace_array *tr = dir->tr; - char buf[2]; int set = 0; - int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); + list_for_each_entry(file, &tr->events, list) { call = file->event_call; if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || !trace_event_name(call) || !call->class || !call->class->reg) continue; - if (system && strcmp(call->class->system, system->name) != 0) + if (system && strcmp(call->class->system, system) != 0) continue; /* @@ -1855,7 +1855,23 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, if (set == 3) break; } - mutex_unlock(&event_mutex); + + return set; +} + +static ssize_t +system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char set_to_char[4] = { '?', '0', '1', 'X' }; + struct trace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; + struct trace_array *tr = dir->tr; + char buf[2]; + int set; + int ret; + + set = trace_events_enabled(tr, system ? system->name : NULL); buf[0] = set_to_char[set]; buf[1] = '\n'; -- 2.51.0 From 74e2498ccf7b303e7fdd881f58a849e884afb486 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 19 Feb 2025 00:08:58 +0900 Subject: [PATCH 02/16] mm/memblock: Add reserved memory release function Add reserve_mem_release_by_name() to release a reserved memory region with a given name. This allows us to release reserved memory which is defined by kernel cmdline, after boot. 
Signed-off-by: Masami Hiramatsu (Google) Acked-by: Mike Rapoport (Microsoft) Cc: Andrew Morton Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: linux-mm@kvack.org Link: https://lore.kernel.org/173989133862.230693.14094993331347437600.stgit@devnote2 Signed-off-by: Steven Rostedt (Google) --- include/linux/mm.h | 1 + mm/memblock.c | 66 +++++++++++++++++++++++++++++++++++++--------- 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b1068ddcbb7..1ee9e7447485 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4123,6 +4123,7 @@ void vma_pgtable_walk_begin(struct vm_area_struct *vma); void vma_pgtable_walk_end(struct vm_area_struct *vma); int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size); +int reserve_mem_release_by_name(const char *name); #ifdef CONFIG_64BIT int do_mseal(unsigned long start, size_t len_in, unsigned long flags); diff --git a/mm/memblock.c b/mm/memblock.c index 95af35fd1389..8cd95f60015d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -2283,6 +2284,7 @@ struct reserve_mem_table { }; static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES]; static int reserved_mem_count; +static DEFINE_MUTEX(reserve_mem_lock); /* Add wildcard region with a lookup name */ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size, @@ -2296,6 +2298,21 @@ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size, strscpy(map->name, name); } +static struct reserve_mem_table *reserve_mem_find_by_name_nolock(const char *name) +{ + struct reserve_mem_table *map; + int i; + + for (i = 0; i < reserved_mem_count; i++) { + map = &reserved_mem_table[i]; + if (!map->size) + continue; + if (strcmp(name, map->name) == 0) + return map; + } + return NULL; +} + /** * reserve_mem_find_by_name - Find reserved memory region with a given name * @name: The name that is attached 
to a reserved memory region @@ -2309,22 +2326,47 @@ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size, int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size) { struct reserve_mem_table *map; - int i; - for (i = 0; i < reserved_mem_count; i++) { - map = &reserved_mem_table[i]; - if (!map->size) - continue; - if (strcmp(name, map->name) == 0) { - *start = map->start; - *size = map->size; - return 1; - } - } - return 0; + guard(mutex)(&reserve_mem_lock); + map = reserve_mem_find_by_name_nolock(name); + if (!map) + return 0; + + *start = map->start; + *size = map->size; + return 1; } EXPORT_SYMBOL_GPL(reserve_mem_find_by_name); +/** + * reserve_mem_release_by_name - Release reserved memory region with a given name + * @name: The name that is attached to a reserved memory region + * + * Forcibly release the pages in the reserved memory region so that that memory + * can be used as free memory. After release, the reserved region size becomes 0. + * + * Returns: 1 if released or 0 if not found. + */ +int reserve_mem_release_by_name(const char *name) +{ + char buf[RESERVE_MEM_NAME_SIZE + 12]; + struct reserve_mem_table *map; + void *start, *end; + + guard(mutex)(&reserve_mem_lock); + map = reserve_mem_find_by_name_nolock(name); + if (!map) + return 0; + + start = phys_to_virt(map->start); + end = start + map->size - 1; + snprintf(buf, sizeof(buf), "reserve_mem:%s", name); + free_reserved_area(start, end, 0, buf); + map->size = 0; + + return 1; +} + /* * Parse reserve_mem=nn:align:name */ -- 2.51.0 From fb6d03238e35f96cc1d6a5411ee1d684221d1c39 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 19 Feb 2025 00:09:08 +0900 Subject: [PATCH 03/16] tracing: Freeable reserved ring buffer Make the ring buffer on reserved memory freeable. This allows us to free the trace instance on the reserved memory without changing cmdline and rebooting. 
Even if we can not change the kernel cmdline for security reason, we can release the reserved memory for the ring buffer as free (available) memory. For example, boot kernel with reserved memory; "reserve_mem=20M:2M:trace trace_instance=boot_mapped^traceoff@trace" ~ # free total used free shared buff/cache available Mem: 1995548 50544 1927568 14964 17436 1911480 Swap: 0 0 0 ~ # rmdir /sys/kernel/tracing/instances/boot_mapped/ [ 23.704023] Freeing reserve_mem:trace memory: 20476K ~ # free total used free shared buff/cache available Mem: 2016024 41844 1956740 14968 17440 1940572 Swap: 0 0 0 Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Mike Rapoport Link: https://lore.kernel.org/173989134814.230693.18199312930337815629.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 13 ++++++++++++- kernel/trace/trace.h | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 66c1683fa1dd..e7a14bb7d7aa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9479,6 +9479,9 @@ static void free_trace_buffers(struct trace_array *tr) #ifdef CONFIG_TRACER_MAX_TRACE free_trace_buffer(&tr->max_buffer); #endif + + if (tr->range_addr_start) + vunmap((void *)tr->range_addr_start); } static void init_trace_flags_index(struct trace_array *tr) @@ -9640,6 +9643,7 @@ trace_array_create_systems(const char *name, const char *systems, free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree_const(tr->system_names); + kfree(tr->range_name); kfree(tr->name); kfree(tr); @@ -9766,6 +9770,11 @@ static int __remove_instance(struct trace_array *tr) free_trace_buffers(tr); clear_tracing_err_log(tr); + if (tr->range_name) { + reserve_mem_release_by_name(tr->range_name); + kfree(tr->range_name); + } + for (i = 0; i < tr->nr_topts; i++) { kfree(tr->topts[i].topts); } @@ -10590,6 +10599,7 @@ __init static void enable_instances(void) 
bool traceoff = false; char *flag_delim; char *addr_delim; + char *rname __free(kfree) = NULL; tok = strsep(&curr_str, ","); @@ -10646,6 +10656,7 @@ __init static void enable_instances(void) pr_warn("Failed to map boot instance %s to %s\n", name, tok); continue; } + rname = kstrdup(tok, GFP_KERNEL); } if (start) { @@ -10682,7 +10693,7 @@ __init static void enable_instances(void) */ if (start) { tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT; - tr->ref++; + tr->range_name = no_free_ptr(rname); } while ((tok = strsep(&curr_str, ","))) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 90493220c362..0d6efb8a1179 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -348,6 +348,7 @@ struct trace_array { unsigned int mapped; unsigned long range_addr_start; unsigned long range_addr_size; + char *range_name; long text_delta; void *scratch; /* pointer in persistent memory */ int scratch_size; -- 2.51.0 From f00c9201f942dddb58617c881ffd4e4a1a1c49ab Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 18 Mar 2025 22:39:13 +0900 Subject: [PATCH 04/16] tracing: Fix a compilation error without CONFIG_MODULES There are some code which depends on CONFIG_MODULES. #ifdef to enclose it. 
Cc: Mathieu Desnoyers Link: https://lore.kernel.org/174230515367.2909896.8132122175220657625.stgit@mhiramat.tok.corp.google.com Fixes: dca91c1c5468 ("tracing: Have persistent trace instances save module addresses") Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e7a14bb7d7aa..7f8b0c43d2a5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6001,6 +6001,7 @@ struct trace_scratch { static DEFINE_MUTEX(scratch_mutex); +#ifdef CONFIG_MODULES static int save_mod(struct module *mod, void *data) { struct trace_array *tr = data; @@ -6025,6 +6026,12 @@ static int save_mod(struct module *mod, void *data) return 0; } +#else +static int save_mod(struct module *mod, void *data) +{ + return 0; +} +#endif static void update_last_data(struct trace_array *tr) { -- 2.51.0 From 5dbeb56bb9589e1051f6af6877cd375f3a901afb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Mar 2025 07:16:18 -0400 Subject: [PATCH 05/16] tracing: Initialize scratch_size to zero to prevent UB In allocate_trace_buffer() the following code: buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0, tr->range_addr_start, tr->range_addr_size, struct_size(tscratch, entries, 128)); tscratch = ring_buffer_meta_scratch(buf->buffer, &scratch_size); setup_trace_scratch(tr, tscratch, scratch_size); Has undefined behavior if ring_buffer_alloc_range() fails because "scratch_size" is not initialized. If the allocation fails, then buf->buffer will be NULL. The ring_buffer_meta_scratch() will return NULL immediately if it is passed a NULL buffer and it will not update scratch_size. Then setup_trace_scratch() will return immediately if tscratch is NULL. Although there's no real issue here, it is considered undefined behavior to pass an uninitialized variable to a function as input, and UBSan may complain about it. 
Just initialize scratch_size to zero to make the code defined behavior and a little more robust. Link: https://lore.kernel.org/all/44c5deaa-b094-4852-90f9-52f3fb10e67a@stanley.mountain/ Reported-by: Dan Carpenter Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7f8b0c43d2a5..78ae76666695 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9398,7 +9398,7 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size { enum ring_buffer_flags rb_flags; struct trace_scratch *tscratch; - unsigned int scratch_size; + unsigned int scratch_size = 0; rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; -- 2.51.0 From 486fbcb3806c0c7a5dbeea326c4a146fd4ed4eff Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Mar 2025 10:30:03 +0900 Subject: [PATCH 06/16] tracing: Skip update_last_data() if cleared and remove active check for save_mod() If the last boot data is already cleared, there is no reason to update it again. Skip if the TRACE_ARRAY_FL_LAST_BOOT is cleared. Also, for calling save_mod() when module loading, we don't need to check the trace is active or not because any module address can be on the stacktrace. 
Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/174165660328.1173316.15529357882704817499.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 78ae76666695..382c7a562303 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6040,6 +6040,12 @@ static void update_last_data(struct trace_array *tr) if (!(tr->flags & TRACE_ARRAY_FL_BOOT)) return; + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return; + + /* Only if the buffer has previous boot data clear and update it. */ + tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT; + /* Reset the module list and reload them */ if (tr->scratch) { struct trace_scratch *tscratch = tr->scratch; @@ -6052,9 +6058,6 @@ static void update_last_data(struct trace_array *tr) module_for_each_mod(save_mod, tr); } - if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) - return; - /* * Need to clear all CPU buffers as there cannot be events * from the previous boot mixed with events with this boot @@ -6077,7 +6080,6 @@ static void update_last_data(struct trace_array *tr) #else tscratch->kaslr_addr = 0; #endif - tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT; } /** @@ -10103,15 +10105,6 @@ static void trace_module_remove_evals(struct module *mod) static inline void trace_module_remove_evals(struct module *mod) { } #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ -static bool trace_array_active(struct trace_array *tr) -{ - if (tr->current_trace != &nop_trace) - return true; - - /* 0 is no events, 1 is all disabled */ - return trace_events_enabled(tr, NULL) > 1; -} - static void trace_module_record(struct module *mod) { struct trace_array *tr; @@ -10120,11 +10113,8 @@ static void trace_module_record(struct module *mod) /* Update any persistent trace array that has already been started */ if ((tr->flags & (TRACE_ARRAY_FL_BOOT | 
TRACE_ARRAY_FL_LAST_BOOT)) == TRACE_ARRAY_FL_BOOT) { - /* Only update if the trace array is active */ - if (trace_array_active(tr)) { - guard(mutex)(&scratch_mutex); - save_mod(mod, tr); - } + guard(mutex)(&scratch_mutex); + save_mod(mod, tr); } } } -- 2.51.0 From de48d7fff7b4668a61c3c1d13ca0f6a6b3995519 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Mon, 17 Mar 2025 09:55:24 +0800 Subject: [PATCH 07/16] ring-buffer: Remove the unused variable bmeta MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Variable bmeta is not effectively used, so delete it. kernel/trace/ring_buffer.c:1952:27: warning: variable ‘bmeta’ set but not used. Link: https://lore.kernel.org/20250317015524.3902-1-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=19524 Signed-off-by: Jiapeng Chong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b981eb20c206..f25966b3a1fc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1949,7 +1949,6 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int scratch_size) { struct ring_buffer_cpu_meta *meta; - struct ring_buffer_meta *bmeta; unsigned long *subbuf_mask; unsigned long delta; void *subbuf; @@ -1964,8 +1963,6 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int sc if (rb_meta_init(buffer, scratch_size)) valid = true; - bmeta = buffer->meta; - for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *next_meta; -- 2.51.0 From 35a380ddbc653c07ee64e2a74c274b9835b0efc2 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 24 Mar 2025 23:34:52 +0900 Subject: [PATCH 08/16] tracing: Show last module text symbols in the stacktrace Since the previous boot trace buffer 
can include module text addresses in the stacktrace, convert them using the module address information, in the same way as kernel text addresses. Cc: Mathieu Desnoyers Link: https://lore.kernel.org/174282689201.356346.17647540360450727687.stgit@mhiramat.tok.corp.google.com Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 133 ++++++++++++++++++++++++++++++++++-- kernel/trace/trace.h | 8 +++ kernel/trace/trace_output.c | 4 +- 3 files changed, 138 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 382c7a562303..fa17397fdc1f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -49,6 +49,7 @@ #include #include #include +#include #include /* COMMAND_LINE_SIZE and kaslr_offset() */ @@ -6001,6 +6002,59 @@ struct trace_scratch { static DEFINE_MUTEX(scratch_mutex); +static int cmp_mod_entry(const void *key, const void *pivot) +{ + unsigned long addr = (unsigned long)key; + const struct trace_mod_entry *ent = pivot; + + if (addr >= ent[0].mod_addr && addr < ent[1].mod_addr) + return 0; + else + return addr - ent->mod_addr; +} + +/** + * trace_adjust_address() - Adjust prev boot address to current address. + * @tr: Persistent ring buffer's trace_array. + * @addr: Address in @tr which is adjusted. + */ +unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr) +{ + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + int idx = 0, nr_entries; + + /* If we don't have last boot delta, return the address */ + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return addr; + + /* tr->module_delta must be protected by rcu. */ + guard(rcu)(); + tscratch = tr->scratch; + /* if there is no tscratch, module_delta must be NULL. 
*/ + module_delta = READ_ONCE(tr->module_delta); + if (!module_delta || tscratch->entries[0].mod_addr > addr) + return addr + tr->text_delta; + + /* Note that entries must be sorted. */ + nr_entries = tscratch->nr_entries; + if (nr_entries == 1 || + tscratch->entries[nr_entries - 1].mod_addr < addr) + idx = nr_entries - 1; + else { + entry = __inline_bsearch((void *)addr, + tscratch->entries, + nr_entries - 1, + sizeof(tscratch->entries[0]), + cmp_mod_entry); + if (entry) + idx = entry - tscratch->entries; + } + + return addr + module_delta->delta[idx]; +} + #ifdef CONFIG_MODULES static int save_mod(struct module *mod, void *data) { @@ -6035,6 +6089,7 @@ static int save_mod(struct module *mod, void *data) static void update_last_data(struct trace_array *tr) { + struct trace_module_delta *module_delta; struct trace_scratch *tscratch; if (!(tr->flags & TRACE_ARRAY_FL_BOOT)) @@ -6073,6 +6128,9 @@ static void update_last_data(struct trace_array *tr) return; tscratch = tr->scratch; + module_delta = READ_ONCE(tr->module_delta); + WRITE_ONCE(tr->module_delta, NULL); + kfree_rcu(module_delta, rcu); /* Set the persistent ring buffer meta data to this address */ #ifdef CONFIG_RANDOMIZE_BASE @@ -9355,10 +9413,51 @@ static struct dentry *trace_instance_dir; static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); +#ifdef CONFIG_MODULES +static int make_mod_delta(struct module *mod, void *data) +{ + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + struct trace_array *tr = data; + int i; + + tscratch = tr->scratch; + module_delta = READ_ONCE(tr->module_delta); + for (i = 0; i < tscratch->nr_entries; i++) { + entry = &tscratch->entries[i]; + if (strcmp(mod->name, entry->mod_name)) + continue; + if (mod->state == MODULE_STATE_GOING) + module_delta->delta[i] = 0; + else + module_delta->delta[i] = (unsigned long)mod->mem[MOD_TEXT].base + - entry->mod_addr; + break; + } + return 0; +} +#else +static 
int make_mod_delta(struct module *mod, void *data) +{ + return 0; +} +#endif + +static int mod_addr_comp(const void *a, const void *b, const void *data) +{ + const struct trace_mod_entry *e1 = a; + const struct trace_mod_entry *e2 = b; + + return e1->mod_addr > e2->mod_addr ? 1 : -1; +} + static void setup_trace_scratch(struct trace_array *tr, struct trace_scratch *tscratch, unsigned int size) { + struct trace_module_delta *module_delta; struct trace_mod_entry *entry; + int i, nr_entries; if (!tscratch) return; @@ -9375,7 +9474,7 @@ static void setup_trace_scratch(struct trace_array *tr, goto reset; /* Check if each module name is a valid string */ - for (int i = 0; i < tscratch->nr_entries; i++) { + for (i = 0; i < tscratch->nr_entries; i++) { int n; entry = &tscratch->entries[i]; @@ -9389,6 +9488,25 @@ static void setup_trace_scratch(struct trace_array *tr, if (n == MODULE_NAME_LEN) goto reset; } + + /* Sort the entries so that we can find appropriate module from address. */ + nr_entries = tscratch->nr_entries; + sort_r(tscratch->entries, nr_entries, sizeof(struct trace_mod_entry), + mod_addr_comp, NULL, NULL); + + if (IS_ENABLED(CONFIG_MODULES)) { + module_delta = kzalloc(struct_size(module_delta, delta, nr_entries), GFP_KERNEL); + if (!module_delta) { + pr_info("module_delta allocation failed. Not able to decode module address."); + goto reset; + } + init_rcu_head(&module_delta->rcu); + } else + module_delta = NULL; + WRITE_ONCE(tr->module_delta, module_delta); + + /* Scan modules to make text delta for modules. 
*/ + module_for_each_mod(make_mod_delta, tr); return; reset: /* Invalid trace modules */ @@ -10105,16 +10223,20 @@ static void trace_module_remove_evals(struct module *mod) static inline void trace_module_remove_evals(struct module *mod) { } #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ -static void trace_module_record(struct module *mod) +static void trace_module_record(struct module *mod, bool add) { struct trace_array *tr; + unsigned long flags; list_for_each_entry(tr, &ftrace_trace_arrays, list) { + flags = tr->flags & (TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT); /* Update any persistent trace array that has already been started */ - if ((tr->flags & (TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT)) == - TRACE_ARRAY_FL_BOOT) { + if (flags == TRACE_ARRAY_FL_BOOT && add) { guard(mutex)(&scratch_mutex); save_mod(mod, tr); + } else if (flags & TRACE_ARRAY_FL_LAST_BOOT) { + /* Update delta if the module loaded in previous boot */ + make_mod_delta(mod, tr); } } } @@ -10127,10 +10249,11 @@ static int trace_module_notify(struct notifier_block *self, switch (val) { case MODULE_STATE_COMING: trace_module_add_evals(mod); - trace_module_record(mod); + trace_module_record(mod, true); break; case MODULE_STATE_GOING: trace_module_remove_evals(mod); + trace_module_record(mod, false); break; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0d6efb8a1179..ab7c7a1930cc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -312,6 +312,11 @@ struct trace_func_repeats { u64 ts_last_call; }; +struct trace_module_delta { + struct rcu_head rcu; + long delta[]; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. 
@@ -350,6 +355,7 @@ struct trace_array { unsigned long range_addr_size; char *range_name; long text_delta; + struct trace_module_delta *module_delta; void *scratch; /* pointer in persistent memory */ int scratch_size; @@ -466,6 +472,8 @@ extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); extern bool trace_clock_in_ns(struct trace_array *tr); +extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr); + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 03d56f711ad1..1ad54fcf25cb 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -5,6 +5,7 @@ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt * */ +#include "trace.h" #include #include #include @@ -1248,7 +1249,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; unsigned long *p; unsigned long *end; - long delta = iter->tr->text_delta; trace_assign_type(field, iter->ent); end = (unsigned long *)((long)iter->ent + iter->ent_size); @@ -1265,7 +1265,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_seq_puts(s, "[FTRACE TRAMPOLINE]\n"); continue; } - seq_print_ip_sym(s, (*p) + delta, flags); + seq_print_ip_sym(s, trace_adjust_address(iter->tr, *p), flags); trace_seq_putc(s, '\n'); } -- 2.51.0 From 028a58ec154257e618c27fb0eba8d9e30379bc3d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Mar 2025 22:03:04 -0400 Subject: [PATCH 09/16] tracing: Use _text and the kernel offset in last_boot_info Instead of using kaslr_offset() just record the location of "_text". This makes it possible for user space to use either the System.map or /proc/kallsyms as what to map all addresses to functions with. 
Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250326220304.38dbedcd@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fa17397fdc1f..14c38fcd6f9e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -51,7 +51,7 @@ #include #include -#include /* COMMAND_LINE_SIZE and kaslr_offset() */ +#include /* COMMAND_LINE_SIZE */ #include "trace.h" #include "trace_output.h" @@ -5995,7 +5995,7 @@ struct trace_mod_entry { }; struct trace_scratch { - unsigned long kaslr_addr; + unsigned long text_addr; unsigned long nr_entries; struct trace_mod_entry entries[]; }; @@ -6133,11 +6133,7 @@ static void update_last_data(struct trace_array *tr) kfree_rcu(module_delta, rcu); /* Set the persistent ring buffer meta data to this address */ -#ifdef CONFIG_RANDOMIZE_BASE - tscratch->kaslr_addr = kaslr_offset(); -#else - tscratch->kaslr_addr = 0; -#endif + tscratch->text_addr = (unsigned long)_text; } /** @@ -6996,7 +6992,7 @@ static void show_last_boot_header(struct seq_file *m, struct trace_array *tr) * should not be the same as the current boot. 
*/ if (tscratch && (tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) - seq_printf(m, "%lx\t[kernel]\n", tscratch->kaslr_addr); + seq_printf(m, "%lx\t[kernel]\n", tscratch->text_addr); else seq_puts(m, "# Current\n"); } @@ -9465,10 +9461,8 @@ static void setup_trace_scratch(struct trace_array *tr, tr->scratch = tscratch; tr->scratch_size = size; -#ifdef CONFIG_RANDOMIZE_BASE - if (tscratch->kaslr_addr) - tr->text_delta = kaslr_offset() - tscratch->kaslr_addr; -#endif + if (tscratch->text_addr) + tr->text_delta = (unsigned long)_text - tscratch->text_addr; if (struct_size(tscratch, entries, tscratch->nr_entries) > size) goto reset; -- 2.51.0 From 897c0b4e27135132dc5b348c1a3773d059668489 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Thu, 6 Mar 2025 17:20:59 +0100 Subject: [PATCH 10/16] MAINTAINERS: Update the MODULE SUPPORT section Change my role for MODULE SUPPORT from a reviewer to a maintainer. We started to rotate its maintainership and I currently look after the modules tree. This not being reflected in MAINTAINERS proved to confuse folks. Add lib/tests/module/ and tools/testing/selftests/module/ to maintained files. They were introduced previously by commit 84b4a51fce4c ("selftests: add new kallsyms selftests"). 
Acked-by: Steven Rostedt (Google) Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20250306162117.18876-1-petr.pavlu@suse.com Signed-off-by: Petr Pavlu --- MAINTAINERS | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index ed7aa6867674..cacaf564a188 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15982,7 +15982,7 @@ F: include/dt-bindings/clock/mobileye,eyeq5-clk.h MODULE SUPPORT M: Luis Chamberlain -R: Petr Pavlu +M: Petr Pavlu R: Sami Tolvanen R: Daniel Gomez L: linux-modules@vger.kernel.org @@ -15993,8 +15993,10 @@ F: include/linux/kmod.h F: include/linux/module*.h F: kernel/module/ F: lib/test_kmod.c +F: lib/tests/module/ F: scripts/module* F: tools/testing/selftests/kmod/ +F: tools/testing/selftests/module/ MONOLITHIC POWER SYSTEM PMIC DRIVER M: Saravanan Sekar -- 2.51.0 From 112e43e9fd3b999513b1914e2bf523ae509f4c7d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 28 Mar 2025 11:22:54 -0700 Subject: [PATCH 11/16] Revert "Merge tag 'irq-msi-2025-03-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip" This reverts commit 36f5f026df6c1cd8a20373adc4388d2b3401ce91, reversing changes made to 43a7eec035a5b64546c8adefdc9cf96a116da14b. Thomas says: "I just noticed that for some incomprehensible reason, probably sheer incompetemce when trying to utilize b4, I managed to merge an outdated _and_ buggy version of that series. Can you please revert that merge completely?" Done. 
Requested-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- drivers/ntb/msi.c | 22 ++-- drivers/pci/controller/pci-hyperv.c | 14 ++- drivers/pci/msi/api.c | 6 +- drivers/pci/msi/msi.c | 167 ++++++++++------------------ drivers/pci/pci.h | 9 -- drivers/pci/tph.c | 44 +++++++- drivers/soc/ti/ti_sci_inta_msi.c | 10 +- drivers/ufs/host/ufs-qcom.c | 75 ++++++------- include/linux/cleanup.h | 17 --- include/linux/irqdomain.h | 2 - include/linux/msi.h | 12 +- kernel/irq/msi.c | 161 ++++++++++++++++++--------- 12 files changed, 287 insertions(+), 252 deletions(-) diff --git a/drivers/ntb/msi.c b/drivers/ntb/msi.c index 368f6d894bba..6295e55ef85e 100644 --- a/drivers/ntb/msi.c +++ b/drivers/ntb/msi.c @@ -106,10 +106,10 @@ int ntb_msi_setup_mws(struct ntb_dev *ntb) if (!ntb->msi) return -EINVAL; - scoped_guard (msi_descs_lock, &ntb->pdev->dev) { - desc = msi_first_desc(&ntb->pdev->dev, MSI_DESC_ASSOCIATED); - addr = desc->msg.address_lo + ((uint64_t)desc->msg.address_hi << 32); - } + msi_lock_descs(&ntb->pdev->dev); + desc = msi_first_desc(&ntb->pdev->dev, MSI_DESC_ASSOCIATED); + addr = desc->msg.address_lo + ((uint64_t)desc->msg.address_hi << 32); + msi_unlock_descs(&ntb->pdev->dev); for (peer = 0; peer < ntb_peer_port_count(ntb); peer++) { peer_widx = ntb_peer_highest_mw_idx(ntb, peer); @@ -289,7 +289,7 @@ int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb, irq_handler_t handler, if (!ntb->msi) return -EINVAL; - guard(msi_descs_lock)(dev); + msi_lock_descs(dev); msi_for_each_desc(entry, dev, MSI_DESC_ASSOCIATED) { if (irq_has_action(entry->irq)) continue; @@ -307,11 +307,17 @@ int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb, irq_handler_t handler, ret = ntbm_msi_setup_callback(ntb, entry, msi_desc); if (ret) { devm_free_irq(&ntb->dev, entry->irq, dev_id); - return ret; + goto unlock; } - return entry->irq; + + ret = entry->irq; + goto unlock; } - return -ENODEV; + ret = -ENODEV; + +unlock: + msi_unlock_descs(dev); + return ret; } 
EXPORT_SYMBOL(ntbm_msi_request_threaded_irq); diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 178da6b9fc33..44d7f4339306 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -3975,18 +3975,24 @@ static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg) { struct irq_data *irq_data; struct msi_desc *entry; + int ret = 0; if (!pdev->msi_enabled && !pdev->msix_enabled) return 0; - guard(msi_descs_lock)(&pdev->dev); + msi_lock_descs(&pdev->dev); msi_for_each_desc(entry, &pdev->dev, MSI_DESC_ASSOCIATED) { irq_data = irq_get_irq_data(entry->irq); - if (WARN_ON_ONCE(!irq_data)) - return -EINVAL; + if (WARN_ON_ONCE(!irq_data)) { + ret = -EINVAL; + break; + } + hv_compose_msi_msg(irq_data, &entry->msg); } - return 0; + msi_unlock_descs(&pdev->dev); + + return ret; } /* diff --git a/drivers/pci/msi/api.c b/drivers/pci/msi/api.c index d89f491afdf0..b956ce591f96 100644 --- a/drivers/pci/msi/api.c +++ b/drivers/pci/msi/api.c @@ -53,9 +53,10 @@ void pci_disable_msi(struct pci_dev *dev) if (!pci_msi_enabled() || !dev || !dev->msi_enabled) return; - guard(msi_descs_lock)(&dev->dev); + msi_lock_descs(&dev->dev); pci_msi_shutdown(dev); pci_free_msi_irqs(dev); + msi_unlock_descs(&dev->dev); } EXPORT_SYMBOL(pci_disable_msi); @@ -195,9 +196,10 @@ void pci_disable_msix(struct pci_dev *dev) if (!pci_msi_enabled() || !dev || !dev->msix_enabled) return; - guard(msi_descs_lock)(&dev->dev); + msi_lock_descs(&dev->dev); pci_msix_shutdown(dev); pci_free_msi_irqs(dev); + msi_unlock_descs(&dev->dev); } EXPORT_SYMBOL(pci_disable_msix); diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c index 7058d59e7c5f..6569ba3577fe 100644 --- a/drivers/pci/msi/msi.c +++ b/drivers/pci/msi/msi.c @@ -335,11 +335,41 @@ static int msi_verify_entries(struct pci_dev *dev) return !entry ? 
0 : -EIO; } -static int __msi_capability_init(struct pci_dev *dev, int nvec, struct irq_affinity_desc *masks) +/** + * msi_capability_init - configure device's MSI capability structure + * @dev: pointer to the pci_dev data structure of MSI device function + * @nvec: number of interrupts to allocate + * @affd: description of automatic IRQ affinity assignments (may be %NULL) + * + * Setup the MSI capability structure of the device with the requested + * number of interrupts. A return value of zero indicates the successful + * setup of an entry with the new MSI IRQ. A negative return value indicates + * an error, and a positive return value indicates the number of interrupts + * which could have been allocated. + */ +static int msi_capability_init(struct pci_dev *dev, int nvec, + struct irq_affinity *affd) { - int ret = msi_setup_msi_desc(dev, nvec, masks); + struct irq_affinity_desc *masks = NULL; struct msi_desc *entry, desc; + int ret; + /* Reject multi-MSI early on irq domain enabled architectures */ + if (nvec > 1 && !pci_msi_domain_supports(dev, MSI_FLAG_MULTI_PCI_MSI, ALLOW_LEGACY)) + return 1; + + /* + * Disable MSI during setup in the hardware, but mark it enabled + * so that setup code can evaluate it. 
+ */ + pci_msi_set_enable(dev, 0); + dev->msi_enabled = 1; + + if (affd) + masks = irq_create_affinity_masks(nvec, affd); + + msi_lock_descs(&dev->dev); + ret = msi_setup_msi_desc(dev, nvec, masks); if (ret) goto fail; @@ -368,48 +398,19 @@ static int __msi_capability_init(struct pci_dev *dev, int nvec, struct irq_affin pcibios_free_irq(dev); dev->irq = entry->irq; - return 0; + goto unlock; + err: pci_msi_unmask(&desc, msi_multi_mask(&desc)); pci_free_msi_irqs(dev); fail: dev->msi_enabled = 0; +unlock: + msi_unlock_descs(&dev->dev); + kfree(masks); return ret; } -/** - * msi_capability_init - configure device's MSI capability structure - * @dev: pointer to the pci_dev data structure of MSI device function - * @nvec: number of interrupts to allocate - * @affd: description of automatic IRQ affinity assignments (may be %NULL) - * - * Setup the MSI capability structure of the device with the requested - * number of interrupts. A return value of zero indicates the successful - * setup of an entry with the new MSI IRQ. A negative return value indicates - * an error, and a positive return value indicates the number of interrupts - * which could have been allocated. - */ -static int msi_capability_init(struct pci_dev *dev, int nvec, - struct irq_affinity *affd) -{ - /* Reject multi-MSI early on irq domain enabled architectures */ - if (nvec > 1 && !pci_msi_domain_supports(dev, MSI_FLAG_MULTI_PCI_MSI, ALLOW_LEGACY)) - return 1; - - /* - * Disable MSI during setup in the hardware, but mark it enabled - * so that setup code can evaluate it. - */ - pci_msi_set_enable(dev, 0); - dev->msi_enabled = 1; - - struct irq_affinity_desc *masks __free(kfree) = - affd ? 
irq_create_affinity_masks(nvec, affd) : NULL; - - guard(msi_descs_lock)(&dev->dev); - return __msi_capability_init(dev, nvec, masks); -} - int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, struct irq_affinity *affd) { @@ -662,41 +663,40 @@ static void msix_mask_all(void __iomem *base, int tsize) writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL); } -static int __msix_setup_interrupts(struct pci_dev *dev, struct msix_entry *entries, - int nvec, struct irq_affinity_desc *masks) +static int msix_setup_interrupts(struct pci_dev *dev, struct msix_entry *entries, + int nvec, struct irq_affinity *affd) { - int ret = msix_setup_msi_descs(dev, entries, nvec, masks); + struct irq_affinity_desc *masks = NULL; + int ret; + + if (affd) + masks = irq_create_affinity_masks(nvec, affd); + msi_lock_descs(&dev->dev); + ret = msix_setup_msi_descs(dev, entries, nvec, masks); if (ret) - goto fail; + goto out_free; ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSIX); if (ret) - goto fail; + goto out_free; /* Check if all MSI entries honor device restrictions */ ret = msi_verify_entries(dev); if (ret) - goto fail; + goto out_free; msix_update_entries(dev, entries); - return 0; + goto out_unlock; -fail: +out_free: pci_free_msi_irqs(dev); +out_unlock: + msi_unlock_descs(&dev->dev); + kfree(masks); return ret; } -static int msix_setup_interrupts(struct pci_dev *dev, struct msix_entry *entries, - int nvec, struct irq_affinity *affd) -{ - struct irq_affinity_desc *masks __free(kfree) = - affd ? 
irq_create_affinity_masks(nvec, affd) : NULL; - - guard(msi_descs_lock)(&dev->dev); - return __msix_setup_interrupts(dev, entries, nvec, masks); -} - /** * msix_capability_init - configure device's MSI-X capability * @dev: pointer to the pci_dev data structure of MSI-X device function @@ -870,13 +870,13 @@ void __pci_restore_msix_state(struct pci_dev *dev) write_msg = arch_restore_msi_irqs(dev); - scoped_guard (msi_descs_lock, &dev->dev) { - msi_for_each_desc(entry, &dev->dev, MSI_DESC_ALL) { - if (write_msg) - __pci_write_msi_msg(entry, &entry->msg); - pci_msix_write_vector_ctrl(entry, entry->pci.msix_ctrl); - } + msi_lock_descs(&dev->dev); + msi_for_each_desc(entry, &dev->dev, MSI_DESC_ALL) { + if (write_msg) + __pci_write_msi_msg(entry, &entry->msg); + pci_msix_write_vector_ctrl(entry, entry->pci.msix_ctrl); } + msi_unlock_descs(&dev->dev); pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0); } @@ -915,53 +915,6 @@ void pci_free_msi_irqs(struct pci_dev *dev) } } -#ifdef CONFIG_PCIE_TPH -/** - * pci_msix_write_tph_tag - Update the TPH tag for a given MSI-X vector - * @pdev: The PCIe device to update - * @index: The MSI-X index to update - * @tag: The tag to write - * - * Returns: 0 on success, error code on failure - */ -int pci_msix_write_tph_tag(struct pci_dev *pdev, unsigned int index, u16 tag) -{ - struct msi_desc *msi_desc; - struct irq_desc *irq_desc; - unsigned int virq; - - if (!pdev->msix_enabled) - return -ENXIO; - - guard(msi_descs_lock)(&pdev->dev); - virq = msi_get_virq(&pdev->dev, index); - if (!virq) - return -ENXIO; - /* - * This is a horrible hack, but short of implementing a PCI - * specific interrupt chip callback and a huge pile of - * infrastructure, this is the minor nuissance. It provides the - * protection against concurrent operations on this entry and keeps - * the control word cache in sync. 
- */ - irq_desc = irq_to_desc(virq); - if (!irq_desc) - return -ENXIO; - - guard(raw_spinlock_irq)(&irq_desc->lock); - msi_desc = irq_data_get_msi_desc(&irq_desc->irq_data); - if (!msi_desc || msi_desc->pci.msi_attrib.is_virtual) - return -ENXIO; - - msi_desc->pci.msix_ctrl &= ~PCI_MSIX_ENTRY_CTRL_ST; - msi_desc->pci.msix_ctrl |= FIELD_PREP(PCI_MSIX_ENTRY_CTRL_ST, tag); - pci_msix_write_vector_ctrl(msi_desc, msi_desc->pci.msix_ctrl); - /* Flush the write */ - readl(pci_msix_desc_addr(msi_desc)); - return 0; -} -#endif - /* Misc. infrastructure */ struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 2e9cf26a9ee9..01e51db8d285 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -989,15 +989,6 @@ int pcim_request_region_exclusive(struct pci_dev *pdev, int bar, const char *name); void pcim_release_region(struct pci_dev *pdev, int bar); -#ifdef CONFIG_PCI_MSI -int pci_msix_write_tph_tag(struct pci_dev *pdev, unsigned int index, u16 tag); -#else -static inline int pci_msix_write_tph_tag(struct pci_dev *pdev, unsigned int index, u16 tag) -{ - return -ENODEV; -} -#endif - /* * Config Address for PCI Configuration Mechanism #1 * diff --git a/drivers/pci/tph.c b/drivers/pci/tph.c index 77fce5e1b830..07de59ca2ebf 100644 --- a/drivers/pci/tph.c +++ b/drivers/pci/tph.c @@ -204,6 +204,48 @@ static u8 get_rp_completer_type(struct pci_dev *pdev) return FIELD_GET(PCI_EXP_DEVCAP2_TPH_COMP_MASK, reg); } +/* Write ST to MSI-X vector control reg - Return 0 if OK, otherwise -errno */ +static int write_tag_to_msix(struct pci_dev *pdev, int msix_idx, u16 tag) +{ +#ifdef CONFIG_PCI_MSI + struct msi_desc *msi_desc = NULL; + void __iomem *vec_ctrl; + u32 val; + int err = 0; + + msi_lock_descs(&pdev->dev); + + /* Find the msi_desc entry with matching msix_idx */ + msi_for_each_desc(msi_desc, &pdev->dev, MSI_DESC_ASSOCIATED) { + if (msi_desc->msi_index == msix_idx) + break; + } + + if (!msi_desc) { + err = -ENXIO; + goto 
err_out; + } + + /* Get the vector control register (offset 0xc) pointed by msix_idx */ + vec_ctrl = pdev->msix_base + msix_idx * PCI_MSIX_ENTRY_SIZE; + vec_ctrl += PCI_MSIX_ENTRY_VECTOR_CTRL; + + val = readl(vec_ctrl); + val &= ~PCI_MSIX_ENTRY_CTRL_ST; + val |= FIELD_PREP(PCI_MSIX_ENTRY_CTRL_ST, tag); + writel(val, vec_ctrl); + + /* Read back to flush the update */ + val = readl(vec_ctrl); + +err_out: + msi_unlock_descs(&pdev->dev); + return err; +#else + return -ENODEV; +#endif +} + /* Write tag to ST table - Return 0 if OK, otherwise -errno */ static int write_tag_to_st_table(struct pci_dev *pdev, int index, u16 tag) { @@ -304,7 +346,7 @@ int pcie_tph_set_st_entry(struct pci_dev *pdev, unsigned int index, u16 tag) switch (loc) { case PCI_TPH_LOC_MSIX: - err = pci_msix_write_tph_tag(pdev, index, tag); + err = write_tag_to_msix(pdev, index, tag); break; case PCI_TPH_LOC_CAP: err = write_tag_to_st_table(pdev, index, tag); diff --git a/drivers/soc/ti/ti_sci_inta_msi.c b/drivers/soc/ti/ti_sci_inta_msi.c index 193266f5e3f9..c36364522157 100644 --- a/drivers/soc/ti/ti_sci_inta_msi.c +++ b/drivers/soc/ti/ti_sci_inta_msi.c @@ -103,15 +103,19 @@ int ti_sci_inta_msi_domain_alloc_irqs(struct device *dev, if (ret) return ret; - guard(msi_descs_lock)(dev); + msi_lock_descs(dev); nvec = ti_sci_inta_msi_alloc_descs(dev, res); - if (nvec <= 0) - return nvec; + if (nvec <= 0) { + ret = nvec; + goto unlock; + } /* Use alloc ALL as it's unclear whether there are gaps in the indices */ ret = msi_domain_alloc_irqs_all_locked(dev, MSI_DEFAULT_DOMAIN, nvec); if (ret) dev_err(dev, "Failed to allocate IRQs %d\n", ret); +unlock: + msi_unlock_descs(dev); return ret; } EXPORT_SYMBOL_GPL(ti_sci_inta_msi_domain_alloc_irqs); diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 6913dda3a744..1b37449fbffc 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -1806,19 +1806,15 @@ static void ufs_qcom_write_msi_msg(struct msi_desc *desc, struct 
msi_msg *msg) ufshcd_mcq_config_esi(hba, msg); } -struct ufs_qcom_irq { - unsigned int irq; - unsigned int idx; - struct ufs_hba *hba; -}; - static irqreturn_t ufs_qcom_mcq_esi_handler(int irq, void *data) { - struct ufs_qcom_irq *qi = data; - struct ufs_hba *hba = qi->hba; - struct ufs_hw_queue *hwq = &hba->uhq[qi->idx]; + struct msi_desc *desc = data; + struct device *dev = msi_desc_to_dev(desc); + struct ufs_hba *hba = dev_get_drvdata(dev); + u32 id = desc->msi_index; + struct ufs_hw_queue *hwq = &hba->uhq[id]; - ufshcd_mcq_write_cqis(hba, 0x1, qi->idx); + ufshcd_mcq_write_cqis(hba, 0x1, id); ufshcd_mcq_poll_cqe_lock(hba, hwq); return IRQ_HANDLED; @@ -1827,7 +1823,8 @@ static irqreturn_t ufs_qcom_mcq_esi_handler(int irq, void *data) static int ufs_qcom_config_esi(struct ufs_hba *hba) { struct ufs_qcom_host *host = ufshcd_get_variant(hba); - struct ufs_qcom_irq *qi; + struct msi_desc *desc; + struct msi_desc *failed_desc = NULL; int nr_irqs, ret; if (host->esi_enabled) @@ -1838,47 +1835,47 @@ static int ufs_qcom_config_esi(struct ufs_hba *hba) * 2. Poll queues do not need ESI. 
*/ nr_irqs = hba->nr_hw_queues - hba->nr_queues[HCTX_TYPE_POLL]; - qi = devm_kcalloc(hba->dev, nr_irqs, sizeof(*qi), GFP_KERNEL); - if (qi) - return -ENOMEM; - ret = platform_device_msi_init_and_alloc_irqs(hba->dev, nr_irqs, ufs_qcom_write_msi_msg); if (ret) { dev_err(hba->dev, "Failed to request Platform MSI %d\n", ret); - goto cleanup; + return ret; } - for (int idx = 0; idx < nr_irqs; idx++) { - qi[idx].irq = msi_get_virq(hba->dev, idx); - qi[idx].idx = idx; - qi[idx].hba = hba; - - ret = devm_request_irq(hba->dev, qi[idx].irq, ufs_qcom_mcq_esi_handler, - IRQF_SHARED, "qcom-mcq-esi", qi + idx); + msi_lock_descs(hba->dev); + msi_for_each_desc(desc, hba->dev, MSI_DESC_ALL) { + ret = devm_request_irq(hba->dev, desc->irq, + ufs_qcom_mcq_esi_handler, + IRQF_SHARED, "qcom-mcq-esi", desc); if (ret) { dev_err(hba->dev, "%s: Fail to request IRQ for %d, err = %d\n", - __func__, qi[idx].irq, ret); - qi[idx].irq = 0; - goto cleanup; + __func__, desc->irq, ret); + failed_desc = desc; + break; } } + msi_unlock_descs(hba->dev); - if (host->hw_ver.major == 6 && host->hw_ver.minor == 0 && - host->hw_ver.step == 0) { - ufshcd_rmwl(hba, ESI_VEC_MASK, - FIELD_PREP(ESI_VEC_MASK, MAX_ESI_VEC - 1), - REG_UFS_CFG3); + if (ret) { + /* Rewind */ + msi_lock_descs(hba->dev); + msi_for_each_desc(desc, hba->dev, MSI_DESC_ALL) { + if (desc == failed_desc) + break; + devm_free_irq(hba->dev, desc->irq, hba); + } + msi_unlock_descs(hba->dev); + platform_device_msi_free_irqs_all(hba->dev); + } else { + if (host->hw_ver.major == 6 && host->hw_ver.minor == 0 && + host->hw_ver.step == 0) + ufshcd_rmwl(hba, ESI_VEC_MASK, + FIELD_PREP(ESI_VEC_MASK, MAX_ESI_VEC - 1), + REG_UFS_CFG3); + ufshcd_mcq_enable_esi(hba); + host->esi_enabled = true; } - ufshcd_mcq_enable_esi(hba); - host->esi_enabled = true; - return 0; -cleanup: - for (int idx = 0; qi[idx].irq; idx++) - devm_free_irq(hba->dev, qi[idx].irq, hba); - platform_device_msi_free_irqs_all(hba->dev); - devm_kfree(hba->dev, qi); return ret; } diff --git 
a/include/linux/cleanup.h b/include/linux/cleanup.h index 2b32a5759b22..7e57047e1564 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -216,23 +216,6 @@ const volatile void * __must_check_fn(const volatile void *val) #define return_ptr(p) return no_free_ptr(p) -/* - * Only for situations where an allocation is handed in to another function - * and consumed by that function on success. - * - * struct foo *f __free(kfree) = kzalloc(sizeof(*f), GFP_KERNEL); - * - * setup(f); - * if (some_condition) - * return -EINVAL; - * .... - * ret = bar(f); - * if (!ret) - * retain_ptr(f); - * return ret; - */ -#define retain_ptr(p) \ - __get_and_null(p, NULL) /* * DEFINE_CLASS(name, type, exit, init, init_args...): diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 5126482515cb..33ff41eef8f7 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -281,8 +281,6 @@ static inline struct fwnode_handle *irq_domain_alloc_fwnode(phys_addr_t *pa) void irq_domain_free_fwnode(struct fwnode_handle *fwnode); -DEFINE_FREE(irq_domain_free_fwnode, struct fwnode_handle *, if (_T) irq_domain_free_fwnode(_T)) - struct irq_domain_chip_generic_info; /** diff --git a/include/linux/msi.h b/include/linux/msi.h index e71c991d7f61..86e42742fd0f 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -80,6 +80,7 @@ struct device_attribute; struct irq_domain; struct irq_affinity_desc; +void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg); #ifdef CONFIG_GENERIC_MSI_IRQ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg); #else @@ -228,11 +229,8 @@ struct msi_dev_domain { int msi_setup_device_data(struct device *dev); -void __msi_lock_descs(struct device *dev); -void __msi_unlock_descs(struct device *dev); - -DEFINE_LOCK_GUARD_1(msi_descs_lock, struct device, __msi_lock_descs(_T->lock), - __msi_unlock_descs(_T->lock)); +void msi_lock_descs(struct device *dev); +void msi_unlock_descs(struct device *dev); struct 
msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid, enum msi_desc_filter filter); @@ -636,6 +634,8 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid); bool msi_match_device_irq_domain(struct device *dev, unsigned int domid, enum irq_domain_bus_token bus_token); +int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last); int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last); int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int nirqs); @@ -644,6 +644,8 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u const struct irq_affinity_desc *affdesc, union msi_instance_cookie *cookie); +void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last); void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last); void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 1951a08f0421..5c8d43cdb0a3 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -270,11 +270,16 @@ fail: return ret; } +void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) +{ + *msg = entry->msg; +} + void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) { struct msi_desc *entry = irq_get_msi_desc(irq); - *msg = entry->msg; + __get_cached_msi_msg(entry, msg); } EXPORT_SYMBOL_GPL(get_cached_msi_msg); @@ -338,30 +343,26 @@ int msi_setup_device_data(struct device *dev) } /** - * __msi_lock_descs - Lock the MSI descriptor storage of a device + * msi_lock_descs - Lock the MSI descriptor storage of a device * @dev: Device to operate on - * - * Internal function for guard(msi_descs_lock). Don't use in code. 
*/ -void __msi_lock_descs(struct device *dev) +void msi_lock_descs(struct device *dev) { mutex_lock(&dev->msi.data->mutex); } -EXPORT_SYMBOL_GPL(__msi_lock_descs); +EXPORT_SYMBOL_GPL(msi_lock_descs); /** - * __msi_unlock_descs - Unlock the MSI descriptor storage of a device + * msi_unlock_descs - Unlock the MSI descriptor storage of a device * @dev: Device to operate on - * - * Internal function for guard(msi_descs_lock). Don't use in code. */ -void __msi_unlock_descs(struct device *dev) +void msi_unlock_descs(struct device *dev) { /* Invalidate the index which was cached by the iterator */ dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX; mutex_unlock(&dev->msi.data->mutex); } -EXPORT_SYMBOL_GPL(__msi_unlock_descs); +EXPORT_SYMBOL_GPL(msi_unlock_descs); static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid, enum msi_desc_filter filter) @@ -447,6 +448,7 @@ EXPORT_SYMBOL_GPL(msi_next_desc); unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index) { struct msi_desc *desc; + unsigned int ret = 0; bool pcimsi = false; struct xarray *xa; @@ -460,7 +462,7 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN) pcimsi = to_pci_dev(dev)->msi_enabled; - guard(msi_descs_lock)(dev); + msi_lock_descs(dev); xa = &dev->msi.data->__domains[domid].store; desc = xa_load(xa, pcimsi ? 0 : index); if (desc && desc->irq) { @@ -469,12 +471,16 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne * PCI-MSIX and platform MSI use a descriptor per * interrupt. 
*/ - if (!pcimsi) - return desc->irq; - if (index < desc->nvec_used) - return desc->irq + index; + if (pcimsi) { + if (index < desc->nvec_used) + ret = desc->irq + index; + } else { + ret = desc->irq; + } } - return 0; + + msi_unlock_descs(dev); + return ret; } EXPORT_SYMBOL_GPL(msi_domain_get_virq); @@ -992,8 +998,9 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, void *chip_data) { struct irq_domain *domain, *parent = dev->msi.domain; + struct fwnode_handle *fwnode, *fwnalloced = NULL; + struct msi_domain_template *bundle; const struct msi_parent_ops *pops; - struct fwnode_handle *fwnode; if (!irq_domain_is_msi_parent(parent)) return false; @@ -1001,8 +1008,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, if (domid >= MSI_MAX_DEVICE_IRQDOMAINS) return false; - struct msi_domain_template *bundle __free(kfree) = - bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL); + bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL); if (!bundle) return false; @@ -1025,36 +1031,41 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, * node as they are not guaranteed to have a fwnode. They are never * looked up and always handled in the context of the device. 
*/ - struct fwnode_handle *fwnode_alloced __free(irq_domain_free_fwnode) = NULL; - - if (!(bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE)) - fwnode = fwnode_alloced = irq_domain_alloc_named_fwnode(bundle->name); - else + if (bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE) fwnode = dev->fwnode; + else + fwnode = fwnalloced = irq_domain_alloc_named_fwnode(bundle->name); if (!fwnode) - return false; + goto free_bundle; if (msi_setup_device_data(dev)) - return false; + goto free_fwnode; + + msi_lock_descs(dev); - guard(msi_descs_lock)(dev); if (WARN_ON_ONCE(msi_get_device_domain(dev, domid))) - return false; + goto fail; if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info)) - return false; + goto fail; domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent); if (!domain) - return false; + goto fail; - /* @bundle and @fwnode_alloced are now in use. Prevent cleanup */ - retain_ptr(bundle); - retain_ptr(fwnode_alloced); domain->dev = dev; dev->msi.data->__domains[domid].domain = domain; + msi_unlock_descs(dev); return true; + +fail: + msi_unlock_descs(dev); +free_fwnode: + irq_domain_free_fwnode(fwnalloced); +free_bundle: + kfree(bundle); + return false; } /** @@ -1068,10 +1079,12 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) struct msi_domain_info *info; struct irq_domain *domain; - guard(msi_descs_lock)(dev); + msi_lock_descs(dev); + domain = msi_get_device_domain(dev, domid); + if (!domain || !irq_domain_is_msi_device(domain)) - return; + goto unlock; dev->msi.data->__domains[domid].domain = NULL; info = domain->host_data; @@ -1080,6 +1093,9 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) irq_domain_remove(domain); irq_domain_free_fwnode(fwnode); kfree(container_of(info, struct msi_domain_template, info)); + +unlock: + msi_unlock_descs(dev); } /** @@ -1095,14 +1111,16 @@ bool msi_match_device_irq_domain(struct device *dev, unsigned int domid, { struct 
msi_domain_info *info; struct irq_domain *domain; + bool ret = false; - guard(msi_descs_lock)(dev); + msi_lock_descs(dev); domain = msi_get_device_domain(dev, domid); if (domain && irq_domain_is_msi_device(domain)) { info = domain->host_data; - return info->bus_token == bus_token; + ret = info->bus_token == bus_token; } - return false; + msi_unlock_descs(dev); + return ret; } static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, @@ -1334,17 +1352,21 @@ static int msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl) } /** - * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain + * msi_domain_alloc_irqs_range_locked - Allocate interrupts from a MSI interrupt domain * @dev: Pointer to device struct of the device for which the interrupts * are allocated * @domid: Id of the interrupt domain to operate on * @first: First index to allocate (inclusive) * @last: Last index to allocate (inclusive) * + * Must be invoked from within a msi_lock_descs() / msi_unlock_descs() + * pair. Use this for MSI irqdomains which implement their own descriptor + * allocation/free. + * * Return: %0 on success or an error code. 
*/ -int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, - unsigned int first, unsigned int last) +int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) { struct msi_ctrl ctrl = { .domid = domid, @@ -1353,9 +1375,29 @@ int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, .nirqs = last + 1 - first, }; - guard(msi_descs_lock)(dev); return msi_domain_alloc_locked(dev, &ctrl); } + +/** + * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain + * @dev: Pointer to device struct of the device for which the interrupts + * are allocated + * @domid: Id of the interrupt domain to operate on + * @first: First index to allocate (inclusive) + * @last: Last index to allocate (inclusive) + * + * Return: %0 on success or an error code. + */ +int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) +{ + int ret; + + msi_lock_descs(dev); + ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last); + msi_unlock_descs(dev); + return ret; +} EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs_range); /** @@ -1458,8 +1500,12 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u const struct irq_affinity_desc *affdesc, union msi_instance_cookie *icookie) { - guard(msi_descs_lock)(dev); - return __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie); + struct msi_map map; + + msi_lock_descs(dev); + map = __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie); + msi_unlock_descs(dev); + return map; } /** @@ -1496,11 +1542,13 @@ int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq, icookie.value = ((u64)type << 32) | hwirq; - guard(msi_descs_lock)(dev); + msi_lock_descs(dev); if (WARN_ON_ONCE(msi_get_device_domain(dev, domid) != domain)) map.index = -EINVAL; else map = __msi_domain_alloc_irq_at(dev, domid, MSI_ANY_INDEX, NULL, &icookie); + 
msi_unlock_descs(dev); + return map.index >= 0 ? map.virq : map.index; } @@ -1570,8 +1618,8 @@ static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl) * @first: First index to free (inclusive) * @last: Last index to free (inclusive) */ -static void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, - unsigned int first, unsigned int last) +void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) { struct msi_ctrl ctrl = { .domid = domid, @@ -1593,8 +1641,9 @@ static void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int d void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last) { - guard(msi_descs_lock)(dev); + msi_lock_descs(dev); msi_domain_free_irqs_range_locked(dev, domid, first, last); + msi_unlock_descs(dev); } EXPORT_SYMBOL_GPL(msi_domain_free_irqs_all); @@ -1624,8 +1673,9 @@ void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid) */ void msi_domain_free_irqs_all(struct device *dev, unsigned int domid) { - guard(msi_descs_lock)(dev); + msi_lock_descs(dev); msi_domain_free_irqs_all_locked(dev, domid); + msi_unlock_descs(dev); } /** @@ -1644,11 +1694,12 @@ void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq) if (WARN_ON_ONCE(!dev || !desc || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI)) return; - guard(msi_descs_lock)(dev); - if (WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) - return; - msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index, - desc->msi_index); + msi_lock_descs(dev); + if (!WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) { + msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index, + desc->msi_index); + } + msi_unlock_descs(dev); } /** -- 2.51.0 From 1e7857b28020ba57ca7fdafae7ac855ba326c697 Mon Sep 17 00:00:00 2001 From: Linus 
Torvalds Date: Mon, 31 Mar 2025 14:19:55 -0700 Subject: [PATCH 12/16] x86: don't re-generate cpufeaturemasks.h so eagerly It turns out the code to generate the x86 cpufeaturemasks.h header was way too aggressive, and would re-generate it whenever the timestamp on the kernel config file changed. Now, the regular 'make *config' tools are fairly careful to not rewrite the kernel config file unless the contents change, but other use cases aren't that careful. Michael Kelley reports that 'make-kpkg' ends up doing "make syncconfig" multiple times in prepping to build, and will modify the config file in the process (and then modify it back, but by then the timestamps have changed). Jakub Kicinski reports that the netdev CI does something similar in how it generates the config file in multiple steps. In both cases, the config file timestamp updates then cause the cpufeaturemasks.h file to be regenerated, and that in turn then causes lots of unnecessary rebuilds due to all the normal dependencies. Fix it by using our 'filechk' infrastructure in the Makefile to generate the header file. That will only write a new version of the file if the contents of the file have actually changed. 
Fixes: 841326332bcb ("x86/cpufeatures: Generate the header based on build config") Reported-by: Michael Kelley Reported-by: Jakub Kicinski Link: https://lore.kernel.org/all/SN6PR02MB415756D1829740F6E8AC11D1D4D82@SN6PR02MB4157.namprd02.prod.outlook.com/ Link: https://lore.kernel.org/all/20250328162311.08134fa6@kernel.org/ Cc: Peter Anvin Signed-off-by: Linus Torvalds --- arch/x86/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 0fc7e8fd1a2e..27efe2dc2aa8 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -277,12 +277,11 @@ cpufeaturemasks.hdr := arch/x86/include/generated/asm/cpufeaturemasks.h cpufeaturemasks.awk := $(srctree)/arch/x86/tools/cpufeaturemasks.awk cpufeatures_hdr := $(srctree)/arch/x86/include/asm/cpufeatures.h targets += $(cpufeaturemasks.hdr) -quiet_cmd_gen_featuremasks = GEN $@ - cmd_gen_featuremasks = $(AWK) -f $(cpufeaturemasks.awk) $(cpufeatures_hdr) $(KCONFIG_CONFIG) > $@ + filechk_gen_featuremasks = $(AWK) -f $(cpufeaturemasks.awk) $(cpufeatures_hdr) $(KCONFIG_CONFIG) $(cpufeaturemasks.hdr): $(cpufeaturemasks.awk) $(cpufeatures_hdr) $(KCONFIG_CONFIG) FORCE $(shell mkdir -p $(dir $@)) - $(call if_changed,gen_featuremasks) + $(call filechk,gen_featuremasks) archprepare: $(cpufeaturemasks.hdr) ### -- 2.51.0 From 2510859475d7f46ed7940db0853f3342bf1b65ee Mon Sep 17 00:00:00 2001 From: Roman Smirnov Date: Mon, 31 Mar 2025 11:22:49 +0300 Subject: [PATCH 13/16] cifs: fix integer overflow in match_server() The echo_interval is not limited in any way during mounting, which makes it possible to write a large number to it. This can cause an overflow when multiplying ctx->echo_interval by HZ in match_server(). Add constraints for echo_interval to smb3_fs_context_parse_param(). Found by Linux Verification Center (linuxtesting.org) with Svace. 
Fixes: adfeb3e00e8e1 ("cifs: Make echo interval tunable") Cc: stable@vger.kernel.org Signed-off-by: Roman Smirnov Signed-off-by: Steve French --- fs/smb/client/fs_context.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index bdb762d398af..9c3ded0cf006 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -1383,6 +1383,11 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->closetimeo = HZ * result.uint_32; break; case Opt_echo_interval: + if (result.uint_32 < SMB_ECHO_INTERVAL_MIN || + result.uint_32 > SMB_ECHO_INTERVAL_MAX) { + cifs_errorf(fc, "echo interval is out of bounds\n"); + goto cifs_parse_mount_err; + } ctx->echo_interval = result.uint_32; break; case Opt_snapshot: -- 2.51.0 From be5d361e3083a469385eff34b46ad58eb97b1e38 Mon Sep 17 00:00:00 2001 From: Roman Smirnov Date: Mon, 31 Mar 2025 11:22:50 +0300 Subject: [PATCH 14/16] cifs: remove unreachable code in cifs_get_tcp_session() echo_interval is now checked at mount time, so this code has become unreachable.
Signed-off-by: Roman Smirnov Signed-off-by: Steve French --- fs/smb/client/connect.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index d7bad2c3af37..0721e557f2e0 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -1731,12 +1731,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, */ tcp_ses->tcpStatus = CifsNew; ++tcp_ses->srv_count; + tcp_ses->echo_interval = ctx->echo_interval * HZ; - if (ctx->echo_interval >= SMB_ECHO_INTERVAL_MIN && - ctx->echo_interval <= SMB_ECHO_INTERVAL_MAX) - tcp_ses->echo_interval = ctx->echo_interval * HZ; - else - tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ; if (tcp_ses->rdma) { #ifndef CONFIG_CIFS_SMB_DIRECT cifs_dbg(VFS, "CONFIG_CIFS_SMB_DIRECT is not enabled\n"); -- 2.51.0 From a091d9711bdee46a76fa14fad31cb261a6dad74a Mon Sep 17 00:00:00 2001 From: Wang Zhaolong Date: Mon, 31 Mar 2025 21:33:13 +0800 Subject: [PATCH 15/16] smb: client: Add reverse mapping from tcon to superblocks Currently, when an SMB connection is reset and renegotiated with the server, there's no way to update all related mount points with new negotiated sizes. This is because while superblocks (cifs_sb_info) maintain references to tree connections (tcon) through tcon_link structures, there is no reverse mapping from a tcon back to all the superblocks using it. This patch adds a bidirectional relationship between tcon and cifs_sb_info structures by: 1. Adding a cifs_sb_list to the tcon structure with appropriate locking 2. Adding tcon_sb_link to cifs_sb_info to join the list 3.
Managing the list entries during mount and umount operations The bidirectional relationship enables future functionality to locate and update all superblocks connected to a specific tree connection, such as: - Updating negotiated parameters after reconnection - Efficiently notifying all affected mounts of capability changes This is the first part of a series to improve connection resilience by keeping all mount parameters in sync with server capabilities after reconnection. Signed-off-by: Wang Zhaolong Signed-off-by: Steve French --- fs/smb/client/cifs_fs_sb.h | 1 + fs/smb/client/cifsglob.h | 3 ++- fs/smb/client/connect.c | 15 +++++++++++++++ fs/smb/client/misc.c | 2 ++ 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/cifs_fs_sb.h b/fs/smb/client/cifs_fs_sb.h index 651759192280..5e8d163cb5f8 100644 --- a/fs/smb/client/cifs_fs_sb.h +++ b/fs/smb/client/cifs_fs_sb.h @@ -49,6 +49,7 @@ struct cifs_sb_info { struct rb_root tlink_tree; + struct list_head tcon_sb_link; spinlock_t tlink_tree_lock; struct tcon_link *master_tlink; struct nls_table *local_nls; diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 6ae170a2a042..2cb352c16c1a 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1321,7 +1321,8 @@ struct cifs_tcon { #endif struct list_head pending_opens; /* list of incomplete opens */ struct cached_fids *cfids; - /* BB add field for back pointer to sb struct(s)? 
*/ + struct list_head cifs_sb_list; + spinlock_t sb_list_lock; #ifdef CONFIG_CIFS_DFS_UPCALL struct delayed_work dfs_cache_work; struct list_head dfs_ses_list; diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 0721e557f2e0..2349597d5bfc 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -3477,6 +3477,7 @@ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb) struct smb3_fs_context *ctx = cifs_sb->ctx; INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks); + INIT_LIST_HEAD(&cifs_sb->tcon_sb_link); spin_lock_init(&cifs_sb->tlink_tree_lock); cifs_sb->tlink_tree = RB_ROOT; @@ -3709,6 +3710,10 @@ static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, tlink_rb_insert(&cifs_sb->tlink_tree, tlink); spin_unlock(&cifs_sb->tlink_tree_lock); + spin_lock(&tcon->sb_list_lock); + list_add(&cifs_sb->tcon_sb_link, &tcon->cifs_sb_list); + spin_unlock(&tcon->sb_list_lock); + queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks, TLINK_IDLE_EXPIRE); return 0; @@ -4050,9 +4055,19 @@ cifs_umount(struct cifs_sb_info *cifs_sb) struct rb_root *root = &cifs_sb->tlink_tree; struct rb_node *node; struct tcon_link *tlink; + struct cifs_tcon *tcon = NULL; cancel_delayed_work_sync(&cifs_sb->prune_tlinks); + if (cifs_sb->master_tlink) { + tcon = cifs_sb->master_tlink->tl_tcon; + if (tcon) { + spin_lock(&tcon->sb_list_lock); + list_del_init(&cifs_sb->tcon_sb_link); + spin_unlock(&tcon->sb_list_lock); + } + } + spin_lock(&cifs_sb->tlink_tree_lock); while ((node = rb_first(root))) { tlink = rb_entry(node, struct tcon_link, tl_rbnode); diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index b328dc5c7988..7b6ed9b23e71 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -137,8 +137,10 @@ tcon_info_alloc(bool dir_leases_enabled, enum smb3_tcon_ref_trace trace) spin_lock_init(&ret_buf->tc_lock); INIT_LIST_HEAD(&ret_buf->openFileList); INIT_LIST_HEAD(&ret_buf->tcon_list); + INIT_LIST_HEAD(&ret_buf->cifs_sb_list); 
spin_lock_init(&ret_buf->open_file_lock); spin_lock_init(&ret_buf->stat_lock); + spin_lock_init(&ret_buf->sb_list_lock); atomic_set(&ret_buf->num_local_opens, 0); atomic_set(&ret_buf->num_remote_opens, 0); ret_buf->stats_from_time = ktime_get_real_seconds(); -- 2.51.0 From 287906b20035a04a234d1a3c64f760a5678387be Mon Sep 17 00:00:00 2001 From: Wang Zhaolong Date: Mon, 31 Mar 2025 21:33:14 +0800 Subject: [PATCH 16/16] smb: client: Store original IO parameters and prevent zero IO sizes During mount option processing and negotiation with the server, the original user-specified rsize/wsize values were being modified directly. This makes it impossible to recover these values after a connection reset, leading to potentially degraded performance after reconnection. The other problem is that, when negotiating read and write sizes, there are cases where the negotiated values might calculate to zero, especially during reconnection when server->max_read or server->max_write might be reset. In general, these values come from the negotiation response. According to the MS-SMB2 specification, these values should be at least 65536 bytes. This patch improves IO parameter handling: 1. Adds vol_rsize and vol_wsize fields to store the original user-specified values separately from the negotiated values 2. Uses got_rsize/got_wsize flags to determine if values were user-specified rather than checking for non-zero values, which is more reliable 3. Adds a prevent_zero_iosize() helper function to ensure IO sizes are never negotiated down to zero, which could happen in edge cases like when server->max_read/write is zero The changes make the CIFS client more resilient to unusual server responses and reconnection scenarios, preventing potential failures when IO sizes are calculated to be zero.
Signed-off-by: Wang Zhaolong Signed-off-by: Steve French --- fs/smb/client/fs_context.c | 2 ++ fs/smb/client/fs_context.h | 3 +++ fs/smb/client/smb1ops.c | 6 +++--- fs/smb/client/smb2ops.c | 27 +++++++++++++++++++-------- fs/smb/common/smb2pdu.h | 3 +++ 5 files changed, 30 insertions(+), 11 deletions(-) diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 9c3ded0cf006..ed543325c518 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -1333,6 +1333,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_rsize: ctx->rsize = result.uint_32; ctx->got_rsize = true; + ctx->vol_rsize = ctx->rsize; break; case Opt_wsize: ctx->wsize = result.uint_32; @@ -1348,6 +1349,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->wsize, PAGE_SIZE); } } + ctx->vol_wsize = ctx->wsize; break; case Opt_acregmax: if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) { diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 42c6b66c2c1a..23491401dac5 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -280,6 +280,9 @@ struct smb3_fs_context { bool use_client_guid:1; /* reuse existing guid for multichannel */ u8 client_guid[SMB2_CLIENT_GUID_SIZE]; + /* User-specified original r/wsize value */ + unsigned int vol_rsize; + unsigned int vol_wsize; unsigned int bsize; unsigned int rasize; unsigned int rsize; diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 8701484805cd..06b28da60a2d 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -444,8 +444,8 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int wsize; /* start with specified wsize, or default */ - if (ctx->wsize) - wsize = ctx->wsize; + if (ctx->got_wsize) + wsize = ctx->vol_wsize; else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) wsize = CIFS_DEFAULT_IOSIZE; else @@ -497,7 +497,7 @@ cifs_negotiate_rsize(struct cifs_tcon *tcon, struct 
smb3_fs_context *ctx) else defsize = server->maxBuf - sizeof(READ_RSP); - rsize = ctx->rsize ? ctx->rsize : defsize; + rsize = ctx->got_rsize ? ctx->vol_rsize : defsize; /* * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index a700e5921961..98643a546c68 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -470,6 +470,17 @@ smb2_negotiate(const unsigned int xid, return rc; } +static inline unsigned int +prevent_zero_iosize(unsigned int size, const char *type) +{ + if (size == 0) { + cifs_dbg(VFS, "SMB: Zero %ssize calculated, using minimum value %u\n", + type, CIFS_MIN_DEFAULT_IOSIZE); + return CIFS_MIN_DEFAULT_IOSIZE; + } + return size; +} + static unsigned int smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) { @@ -477,12 +488,12 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int wsize; /* start with specified wsize, or default */ - wsize = ctx->wsize ? ctx->wsize : CIFS_DEFAULT_IOSIZE; + wsize = ctx->got_wsize ? ctx->vol_wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); - return wsize; + return prevent_zero_iosize(wsize, "w"); } static unsigned int @@ -492,7 +503,7 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int wsize; /* start with specified wsize, or default */ - wsize = ctx->wsize ? ctx->wsize : SMB3_DEFAULT_IOSIZE; + wsize = ctx->got_wsize ? 
ctx->vol_wsize : SMB3_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { @@ -514,7 +525,7 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); - return wsize; + return prevent_zero_iosize(wsize, "w"); } static unsigned int @@ -524,13 +535,13 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int rsize; /* start with specified rsize, or default */ - rsize = ctx->rsize ? ctx->rsize : CIFS_DEFAULT_IOSIZE; + rsize = ctx->got_rsize ? ctx->vol_rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); - return rsize; + return prevent_zero_iosize(rsize, "r"); } static unsigned int @@ -540,7 +551,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int rsize; /* start with specified rsize, or default */ - rsize = ctx->rsize ? ctx->rsize : SMB3_DEFAULT_IOSIZE; + rsize = ctx->got_rsize ? 
ctx->vol_rsize : SMB3_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { @@ -563,7 +574,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); - return rsize; + return prevent_zero_iosize(rsize, "r"); } /* diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index c7a0efda4403..764dca80c15c 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -95,6 +95,9 @@ */ #define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024) +/* According to the MS-SMB2 specification, the minimum recommended value is 65536. */ +#define CIFS_MIN_DEFAULT_IOSIZE (65536) + /* * SMB2 Header Definition * -- 2.51.0