From 66611c0475709607f398e2a5d691b1fc72fe9dfc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 21 Jan 2025 19:44:36 -0500 Subject: [PATCH 01/16] fgraph: Remove calltime and rettime from generic operations The function graph infrastructure is now generic so that kretprobes, fprobes and BPF can use it. But there is still some leftover logic that only the function graph tracer itself uses. This is the calculation of the calltime and return time of the functions. The calculation of the calltime has been moved into the function graph tracer and those users that need it so that it doesn't cause overhead to the other users. But the return function timestamp was still called. Instead of just moving the taking of the timestamp into the function graph trace remove the calltime and rettime completely from the ftrace_graph_ret structure. Instead, move it into the function graph return entry event structure and this also moves all the calltime and rettime logic out of the generic fgraph.c code and into the tracing code that uses it. This has been reported to decrease the overhead by ~27%. Link: https://lore.kernel.org/all/Z3aSuql3fnXMVMoM@krava/ Link: https://lore.kernel.org/all/173665959558.1629214.16724136597211810729.stgit@devnote2/ Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250121194436.15bdf71a@gandalf.local.home Reported-by: Jiri Olsa Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 2 -- kernel/trace/fgraph.c | 1 - kernel/trace/trace.h | 4 +++- kernel/trace/trace_entries.h | 8 +++---- kernel/trace/trace_functions_graph.c | 33 ++++++++++++++++------------ kernel/trace/trace_irqsoff.c | 5 +++-- kernel/trace/trace_sched_wakeup.c | 6 +++-- 7 files changed, 33 insertions(+), 26 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 07092dfb21a4..fbabc3d848b3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1151,8 +1151,6 @@ struct ftrace_graph_ret { int depth; /* Number of functions that overran the depth limit for current task */ unsigned int overrun; - unsigned long long calltime; - unsigned long long rettime; } __packed; struct fgraph_ops; diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 9e6b5a71555b..5dddfc2149f6 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -826,7 +826,6 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe return (unsigned long)panic; } - trace.rettime = trace_clock_local(); if (fregs) ftrace_regs_set_instruction_pointer(fregs, ret); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 04058a9889b7..2742d14df383 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -912,7 +912,9 @@ extern int __trace_graph_retaddr_entry(struct trace_array *tr, unsigned long retaddr); extern void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, - unsigned int trace_ctx); + unsigned int trace_ctx, + u64 calltime, u64 rettime); + extern void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops); extern int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops); extern void free_fgraph_ops(struct trace_array *tr); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 82fd174ebbe0..fbfb396905a6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -124,8 +124,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __field_packed( unsigned long, ret, retval ) __field_packed( int, ret, depth ) __field_packed( unsigned int, ret, overrun ) - __field_packed( unsigned long long, ret, calltime) - __field_packed( unsigned long long, ret, rettime ) + __field(unsigned long long, calltime ) + __field(unsigned long long, rettime ) ), F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx", @@ -146,8 +146,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __field_packed( unsigned long, ret, func ) __field_packed( int, ret, depth ) __field_packed( unsigned int, ret, overrun ) - __field_packed( unsigned long long, ret, calltime) - __field_packed( unsigned long long, ret, rettime ) + __field(unsigned long long, calltime ) + __field(unsigned long long, rettime ) ), F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index dc62eb93837a..54d850997c0a 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -266,12 +266,10 @@ __trace_graph_function(struct trace_array *tr, struct ftrace_graph_ret ret = { .func = ip, .depth = 0, - .calltime = time, - .rettime = time, }; __trace_graph_entry(tr, &ent, trace_ctx); - __trace_graph_return(tr, &ret, trace_ctx); + __trace_graph_return(tr, &ret, trace_ctx, time, time); } void @@ -283,8 +281,9 @@ trace_graph_function(struct trace_array *tr, } void __trace_graph_return(struct trace_array *tr, - struct ftrace_graph_ret *trace, - unsigned int trace_ctx) + struct ftrace_graph_ret *trace, + unsigned int trace_ctx, + u64 calltime, u64 rettime) { struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -296,6 +295,8 @@ void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; + entry->calltime = calltime; + entry->rettime = rettime; trace_buffer_unlock_commit_nostack(buffer, event); } @@ -317,10 +318,13 @@ void trace_graph_return(struct ftrace_graph_ret *trace, struct trace_array_cpu *data; struct fgraph_times *ftimes; unsigned int trace_ctx; + u64 calltime, rettime; long disabled; int size; int cpu; + rettime = trace_clock_local(); + ftrace_graph_addr_finish(gops, trace); if (*task_var & TRACE_GRAPH_NOTRACE) { @@ -334,7 +338,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace, handle_nosleeptime(trace, ftimes, size); - trace->calltime = ftimes->calltime; + calltime = ftimes->calltime; preempt_disable_notrace(); cpu = raw_smp_processor_id(); @@ -342,7 +346,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace, disabled = atomic_read(&data->disabled); if (likely(!disabled)) { trace_ctx = tracing_gen_ctx(); - __trace_graph_return(tr, trace, trace_ctx); + __trace_graph_return(tr, trace, trace_ctx, calltime, rettime); } preempt_enable_notrace(); } @@ -367,10 +371,8 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, handle_nosleeptime(trace, ftimes, size); - trace->calltime = ftimes->calltime; - if (tracing_thresh && - (trace->rettime - ftimes->calltime < tracing_thresh)) + (trace_clock_local() - ftimes->calltime < tracing_thresh)) return; else trace_graph_return(trace, gops, fregs); @@ -856,7 +858,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, graph_ret = &ret_entry->ret; call = &entry->graph_ent; - duration = graph_ret->rettime - graph_ret->calltime; + duration = ret_entry->rettime - ret_entry->calltime; func = call->func + iter->tr->text_delta; @@ -1137,11 +1139,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, } static enum print_line_t -print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, +print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s, struct trace_entry *ent, struct trace_iterator *iter, u32 flags) { - unsigned long long duration = trace->rettime - trace->calltime; + struct ftrace_graph_ret *trace = &retentry->ret; + u64 calltime = retentry->calltime; + u64 rettime = retentry->rettime; + unsigned long long duration = rettime - calltime; struct fgraph_data *data = iter->private; struct trace_array *tr = iter->tr; unsigned long func; @@ -1342,7 +1347,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; trace_assign_type(field, entry); - return print_graph_return(&field->ret, s, entry, iter, flags); + return print_graph_return(field, s, entry, iter, flags); } case TRACE_STACK: case TRACE_FN: diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 08786c59d397..7294ad676379 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -223,6 +223,7 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace, unsigned long flags; unsigned int trace_ctx; u64 *calltime; + u64 rettime; int size; ftrace_graph_addr_finish(gops, trace); @@ -230,13 +231,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace, if (!func_prolog_dec(tr, &data, &flags)) return; + rettime = trace_clock_local(); calltime = fgraph_retrieve_data(gops->idx, &size); if (!calltime) return; - trace->calltime = *calltime; trace_ctx = tracing_gen_ctx_flags(flags); - __trace_graph_return(tr, trace, trace_ctx); + __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); atomic_dec(&data->disabled); } diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index f372252dc8bb..af30586f1aea 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -158,6 +158,7 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace, struct trace_array_cpu *data; unsigned int trace_ctx; u64 *calltime; + u64 rettime; int size; ftrace_graph_addr_finish(gops, trace); @@ -165,12 +166,13 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace, if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) return; + rettime = trace_clock_local(); + calltime = fgraph_retrieve_data(gops->idx, &size); if (!calltime) return; - trace->calltime = *calltime; - __trace_graph_return(tr, trace, trace_ctx); + __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); atomic_dec(&data->disabled); preempt_enable_notrace(); -- 2.51.0 From 5f537664e705b0bf8b7e329861f20128534f6a83 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 21 Jan 2025 09:27:22 -0800 Subject: [PATCH 02/16] cachestat: fix page cache statistics permission checking When the 'cachestat()' system call was added in commit cf264e1329fb ("cachestat: implement cachestat syscall"), it was meant to be a much more convenient (and performant) version of mincore() that didn't need mapping things into the user virtual address space in order to work. But it ended up missing the "check for writability or ownership" fix for mincore(), done in commit 134fca9063ad ("mm/mincore.c: make mincore() more conservative"). This just adds equivalent logic to 'cachestat()', modified for the file context (rather than vma). Reported-by: Sudheendra Raghav Neela Fixes: cf264e1329fb ("cachestat: implement cachestat syscall") Tested-by: Johannes Weiner Acked-by: Johannes Weiner Acked-by: Nhat Pham Signed-off-by: Linus Torvalds --- mm/filemap.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/filemap.c b/mm/filemap.c index 4f476411a9a2..440922a7d8f1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4375,6 +4375,20 @@ resched: rcu_read_unlock(); } +/* + * See mincore: reveal pagecache information only for files + * that the calling process has write access to, or could (if + * tried) open for writing. + */ +static inline bool can_do_cachestat(struct file *f) +{ + if (f->f_mode & FMODE_WRITE) + return true; + if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f))) + return true; + return file_permission(f, MAY_WRITE) == 0; +} + /* * The cachestat(2) system call. * @@ -4430,6 +4444,9 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd, if (is_file_hugepages(fd_file(f))) return -EOPNOTSUPP; + if (!can_do_cachestat(fd_file(f))) + return -EPERM; + if (flags != 0) return -EINVAL; -- 2.51.0 From be125a0b8946a69cd8d91340ae14ec72ef6558fc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 22 Jan 2025 10:18:06 +0300 Subject: [PATCH 03/16] ALSA: hda: tas2781-spi: Delete some dead code The scnprintf() function never returns negatives. And it won't return zero here either, plus if it did we'd need to fix the error code. Delete this dead code. Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/d57ded9e-9969-4922-8347-67b758499483@stanley.mountain Signed-off-by: Takashi Iwai --- sound/pci/hda/tas2781_hda_spi.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sound/pci/hda/tas2781_hda_spi.c b/sound/pci/hda/tas2781_hda_spi.c index 5be71b538ce0..02794fd6003d 100644 --- a/sound/pci/hda/tas2781_hda_spi.c +++ b/sound/pci/hda/tas2781_hda_spi.c @@ -274,13 +274,9 @@ static int tascodec_spi_init(struct tasdevice_priv *tas_priv, */ guard(mutex)(&tas_priv->codec_lock); - ret = scnprintf(tas_priv->rca_binaryname, + scnprintf(tas_priv->rca_binaryname, sizeof(tas_priv->rca_binaryname), "%sRCA%d.bin", tas_priv->dev_name, tas_priv->index); - if (ret <= 0) { - dev_err(tas_priv->dev, "rca name err:0x%08x\n", ret); - return ret; - } crc8_populate_msb(tas_priv->crc8_lkp_tbl, TASDEVICE_CRC8_POLYNOMIAL); tas_priv->codec = codec; ret = request_firmware_nowait(module, FW_ACTION_UEVENT, -- 2.51.0 From 807563cdc85dac2d151d7d93676d1551d067c72b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 22 Jan 2025 10:18:13 +0300 Subject: [PATCH 04/16] ALSA: hda: tas2781-spi: Fix error code in tas2781_read_acpi() Propagate the error code from devm_gpiod_get_index_optional(). The current code returns success. Fixes: bb5f86ea50ff ("ALSA: hda/tas2781: Add tas2781 hda SPI driver") Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/6103e81a-13bf-4eab-89af-f6830c14e14c@stanley.mountain Signed-off-by: Takashi Iwai --- sound/pci/hda/tas2781_hda_spi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/tas2781_hda_spi.c b/sound/pci/hda/tas2781_hda_spi.c index 02794fd6003d..eba9c3a3b944 100644 --- a/sound/pci/hda/tas2781_hda_spi.c +++ b/sound/pci/hda/tas2781_hda_spi.c @@ -447,6 +447,7 @@ static int tas2781_read_acpi(struct tas2781_hda *tas_hda, p->reset = devm_gpiod_get_index_optional(physdev, "reset", p->index, GPIOD_OUT_LOW); if (IS_ERR(p->reset)) { + ret = PTR_ERR(p->reset); dev_err_probe(p->dev, ret, "Failed on reset GPIO\n"); goto err; } -- 2.51.0 From 6aa96f780204bfdac225eb4c8f51f86c38cc1a26 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 22 Jan 2025 09:47:55 +0100 Subject: [PATCH 05/16] ALSA: hda: tas2781-spi: Fix bogus error handling in tas2781_hda_spi_probe() The error handling in tas2781_hda_spi_probe() has quite a few problems, as reported by Dan Carpenter. The code jumps to err label and calls tas2781_hda_remove(), but this call would rather crash. In some places, no error code is set properly, and the runtime PM setup is doubly done. This patch tries to address those bogus error handling. Basically we can return immediately at each error before adding the component. Also, the error code should be set properly for the unmatched SPI device name. And finally, component_add() should be added before enabling the runtime PM. Fixes: bb5f86ea50ff ("ALSA: hda/tas2781: Add tas2781 hda SPI driver") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/ae5fcd48-58ac-49a8-a434-5f779bad0fb7@stanley.mountain Link: https://patch.msgid.link/20250122084756.23876-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/tas2781_hda_spi.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/sound/pci/hda/tas2781_hda_spi.c b/sound/pci/hda/tas2781_hda_spi.c index eba9c3a3b944..a42fa990e7b9 100644 --- a/sound/pci/hda/tas2781_hda_spi.c +++ b/sound/pci/hda/tas2781_hda_spi.c @@ -1101,7 +1101,7 @@ static int tas2781_hda_spi_probe(struct spi_device *spi) tas_priv = devm_kzalloc(&spi->dev, sizeof(*tas_priv), GFP_KERNEL); if (!tas_priv) - goto err; + return -ENOMEM; tas_priv->dev = &spi->dev; tas_hda->priv = tas_priv; tas_priv->regmap = devm_regmap_init_spi(spi, &tasdevice_regmap); @@ -1109,14 +1109,16 @@ static int tas2781_hda_spi_probe(struct spi_device *spi) ret = PTR_ERR(tas_priv->regmap); dev_err(tas_priv->dev, "Failed to allocate regmap: %d\n", ret); - goto err; + return ret; } if (strstr(dev_name(&spi->dev), "TXNW2781")) { device_name = "TXNW2781"; tas_priv->save_calibration = tas2781_save_calibration; tas_priv->apply_calibration = tas2781_apply_calib; } else { - goto err; + dev_err(tas_priv->dev, "Unmatched spi dev %s\n", + dev_name(&spi->dev)); + return -ENODEV; } tas_priv->irq = spi->irq; @@ -1129,6 +1131,12 @@ static int tas2781_hda_spi_probe(struct spi_device *spi) tasdevice_spi_init(tas_priv); + ret = component_add(tas_priv->dev, &tas2781_hda_comp_ops); + if (ret) { + dev_err(tas_priv->dev, "Register component fail: %d\n", ret); + return ret; + } + pm_runtime_set_autosuspend_delay(tas_priv->dev, 3000); pm_runtime_use_autosuspend(tas_priv->dev); pm_runtime_mark_last_busy(tas_priv->dev); @@ -1138,17 +1146,7 @@ static int tas2781_hda_spi_probe(struct spi_device *spi) pm_runtime_put_autosuspend(tas_priv->dev); - ret = component_add(tas_priv->dev, &tas2781_hda_comp_ops); - if (ret) { - dev_err(tas_priv->dev, "Register component fail: %d\n", ret); - pm_runtime_disable(tas_priv->dev); - } - -err: - if (ret) - tas2781_hda_remove(&spi->dev); - - return ret; + return 0; } static void tas2781_hda_spi_remove(struct spi_device *spi) -- 2.51.0 From 6c8ad3ab45ad0e94bfb7a9c71f2fa9c6cacea4b2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 22 Jan 2025 14:43:11 -0500 Subject: [PATCH 06/16] atomic64: Use arch_spin_locks instead of raw_spin_locks raw_spin_locks can be traced by lockdep or tracing itself. Atomic64 operations can be used in the tracing infrastructure. When an architecture does not have true atomic64 operations it can use the generic version that disables interrupts and uses spin_locks. The tracing ring buffer code uses atomic64 operations for the time keeping. But because some architectures use the default operations, the locking inside the atomic operations can cause an infinite recursion. As atomic64 implementation is architecture specific, it should not be using raw_spin_locks() but instead arch_spin_locks as that is the purpose of arch_spin_locks. To be used in architecture specific implementations of generic infrastructure like atomic64 operations. Note, by switching from raw_spin_locks to arch_spin_locks, the locks taken to emulate the atomic64 operations will not have lockdep, mmio, or any kind of checks done on them. They will not even disable preemption, although the code will disable interrupts preventing the tasks that hold the locks from being preempted. As the locks held are done so for very short periods of time, and the logic is only done to emulate atomic64, not having them be instrumented should not be an issue. Cc: stable@vger.kernel.org Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Cc: Andreas Larsson Link: https://lore.kernel.org/20250122144311.64392baf@gandalf.local.home Fixes: c84897c0ff592 ("ring-buffer: Remove 32bit timestamp logic") Closes: https://lore.kernel.org/all/86fb4f86-a0e4-45a2-a2df-3154acc4f086@gaisler.com/ Reported-by: Ludwig Rydberg Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- lib/atomic64.c | 78 +++++++++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 30 deletions(-) diff --git a/lib/atomic64.c b/lib/atomic64.c index caf895789a1e..1a72bba36d24 100644 --- a/lib/atomic64.c +++ b/lib/atomic64.c @@ -25,15 +25,15 @@ * Ensure each lock is in a separate cacheline. */ static union { - raw_spinlock_t lock; + arch_spinlock_t lock; char pad[L1_CACHE_BYTES]; } atomic64_lock[NR_LOCKS] __cacheline_aligned_in_smp = { [0 ... (NR_LOCKS - 1)] = { - .lock = __RAW_SPIN_LOCK_UNLOCKED(atomic64_lock.lock), + .lock = __ARCH_SPIN_LOCK_UNLOCKED, }, }; -static inline raw_spinlock_t *lock_addr(const atomic64_t *v) +static inline arch_spinlock_t *lock_addr(const atomic64_t *v) { unsigned long addr = (unsigned long) v; @@ -45,12 +45,14 @@ static inline raw_spinlock_t *lock_addr(const atomic64_t *v) s64 generic_atomic64_read(const atomic64_t *v) { unsigned long flags; - raw_spinlock_t *lock = lock_addr(v); + arch_spinlock_t *lock = lock_addr(v); s64 val; - raw_spin_lock_irqsave(lock, flags); + local_irq_save(flags); + arch_spin_lock(lock); val = v->counter; - raw_spin_unlock_irqrestore(lock, flags); + arch_spin_unlock(lock); + local_irq_restore(flags); return val; } EXPORT_SYMBOL(generic_atomic64_read); @@ -58,11 +60,13 @@ EXPORT_SYMBOL(generic_atomic64_read); void generic_atomic64_set(atomic64_t *v, s64 i) { unsigned long flags; - raw_spinlock_t *lock = lock_addr(v); + arch_spinlock_t *lock = lock_addr(v); - raw_spin_lock_irqsave(lock, flags); + local_irq_save(flags); + arch_spin_lock(lock); v->counter = i; - raw_spin_unlock_irqrestore(lock, flags); + arch_spin_unlock(lock); + local_irq_restore(flags); } EXPORT_SYMBOL(generic_atomic64_set); @@ -70,11 +74,13 @@ EXPORT_SYMBOL(generic_atomic64_set); void generic_atomic64_##op(s64 a, atomic64_t *v) \ { \ unsigned long flags; \ - raw_spinlock_t *lock = lock_addr(v); \ + arch_spinlock_t *lock = lock_addr(v); \ \ - raw_spin_lock_irqsave(lock, flags); \ + local_irq_save(flags); \ + arch_spin_lock(lock); \ v->counter c_op a; \ - raw_spin_unlock_irqrestore(lock, flags); \ + arch_spin_unlock(lock); \ + local_irq_restore(flags); \ } \ EXPORT_SYMBOL(generic_atomic64_##op); @@ -82,12 +88,14 @@ EXPORT_SYMBOL(generic_atomic64_##op); s64 generic_atomic64_##op##_return(s64 a, atomic64_t *v) \ { \ unsigned long flags; \ - raw_spinlock_t *lock = lock_addr(v); \ + arch_spinlock_t *lock = lock_addr(v); \ s64 val; \ \ - raw_spin_lock_irqsave(lock, flags); \ + local_irq_save(flags); \ + arch_spin_lock(lock); \ val = (v->counter c_op a); \ - raw_spin_unlock_irqrestore(lock, flags); \ + arch_spin_unlock(lock); \ + local_irq_restore(flags); \ return val; \ } \ EXPORT_SYMBOL(generic_atomic64_##op##_return); @@ -96,13 +104,15 @@ EXPORT_SYMBOL(generic_atomic64_##op##_return); s64 generic_atomic64_fetch_##op(s64 a, atomic64_t *v) \ { \ unsigned long flags; \ - raw_spinlock_t *lock = lock_addr(v); \ + arch_spinlock_t *lock = lock_addr(v); \ s64 val; \ \ - raw_spin_lock_irqsave(lock, flags); \ + local_irq_save(flags); \ + arch_spin_lock(lock); \ val = v->counter; \ v->counter c_op a; \ - raw_spin_unlock_irqrestore(lock, flags); \ + arch_spin_unlock(lock); \ + local_irq_restore(flags); \ return val; \ } \ EXPORT_SYMBOL(generic_atomic64_fetch_##op); @@ -131,14 +141,16 @@ ATOMIC64_OPS(xor, ^=) s64 generic_atomic64_dec_if_positive(atomic64_t *v) { unsigned long flags; - raw_spinlock_t *lock = lock_addr(v); + arch_spinlock_t *lock = lock_addr(v); s64 val; - raw_spin_lock_irqsave(lock, flags); + local_irq_save(flags); + arch_spin_lock(lock); val = v->counter - 1; if (val >= 0) v->counter = val; - raw_spin_unlock_irqrestore(lock, flags); + arch_spin_unlock(lock); + local_irq_restore(flags); return val; } EXPORT_SYMBOL(generic_atomic64_dec_if_positive); @@ -146,14 +158,16 @@ EXPORT_SYMBOL(generic_atomic64_dec_if_positive); s64 generic_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n) { unsigned long flags; - raw_spinlock_t *lock = lock_addr(v); + arch_spinlock_t *lock = lock_addr(v); s64 val; - raw_spin_lock_irqsave(lock, flags); + local_irq_save(flags); + arch_spin_lock(lock); val = v->counter; if (val == o) v->counter = n; - raw_spin_unlock_irqrestore(lock, flags); + arch_spin_unlock(lock); + local_irq_restore(flags); return val; } EXPORT_SYMBOL(generic_atomic64_cmpxchg); @@ -161,13 +175,15 @@ EXPORT_SYMBOL(generic_atomic64_cmpxchg); s64 generic_atomic64_xchg(atomic64_t *v, s64 new) { unsigned long flags; - raw_spinlock_t *lock = lock_addr(v); + arch_spinlock_t *lock = lock_addr(v); s64 val; - raw_spin_lock_irqsave(lock, flags); + local_irq_save(flags); + arch_spin_lock(lock); val = v->counter; v->counter = new; - raw_spin_unlock_irqrestore(lock, flags); + arch_spin_unlock(lock); + local_irq_restore(flags); return val; } EXPORT_SYMBOL(generic_atomic64_xchg); @@ -175,14 +191,16 @@ EXPORT_SYMBOL(generic_atomic64_xchg); s64 generic_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { unsigned long flags; - raw_spinlock_t *lock = lock_addr(v); + arch_spinlock_t *lock = lock_addr(v); s64 val; - raw_spin_lock_irqsave(lock, flags); + local_irq_save(flags); + arch_spin_lock(lock); val = v->counter; if (val != u) v->counter += a; - raw_spin_unlock_irqrestore(lock, flags); + arch_spin_unlock(lock); + local_irq_restore(flags); return val; } -- 2.51.0 From 0a9b00e5e5c5fc3c77cbfd01e6ffbe77fc7fe74a Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 17 Jan 2025 17:38:56 -0300 Subject: [PATCH 07/16] smb: client: get rid of TCP_Server_Info::refpath_lock TCP_Server_Info::leaf_fullpath is allocated in cifs_get_tcp_session() and never changed afterwards, so there is no need to serialize its access. Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 13 ++----------- fs/smb/client/connect.c | 20 ++++---------------- fs/smb/client/dfs_cache.c | 4 +--- 3 files changed, 7 insertions(+), 30 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index c747b6f9baca..49ffc040f736 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -811,18 +811,9 @@ struct TCP_Server_Info { bool use_swn_dstaddr; struct sockaddr_storage swn_dstaddr; #endif - struct mutex refpath_lock; /* protects leaf_fullpath */ /* - * leaf_fullpath: Canonical DFS referral path related to this - * connection. - * It is used in DFS cache refresher, reconnect and may - * change due to nested DFS links. - * - * Protected by @refpath_lock and @srv_lock. The @refpath_lock is - * mostly used for not requiring a copy of @leaf_fullpath when getting - * cached or new DFS referrals (which might also sleep during I/O). - * While @srv_lock is held for making string and NULL comparisons against - * both fields as in mount(2) and cache refresh. + * Canonical DFS referral path used in cifs_reconnect() for failover as + * well as in DFS cache refresher. * * format: \\HOST\SHARE[\OPTIONAL PATH] */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 73d07d95d435..1053ef8915fb 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -492,7 +492,7 @@ static int reconnect_target_locked(struct TCP_Server_Info *server, static int reconnect_dfs_server(struct TCP_Server_Info *server) { struct dfs_cache_tgt_iterator *target_hint = NULL; - + const char *ref_path = server->leaf_fullpath + 1; DFS_CACHE_TGT_LIST(tl); int num_targets = 0; int rc = 0; @@ -505,10 +505,8 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) * through /proc/fs/cifs/dfscache or the target list is empty due to server settings after * refreshing the referral, so, in this case, default it to 1. */ - mutex_lock(&server->refpath_lock); - if (!dfs_cache_noreq_find(server->leaf_fullpath + 1, NULL, &tl)) + if (!dfs_cache_noreq_find(ref_path, NULL, &tl)) num_targets = dfs_cache_get_nr_tgts(&tl); - mutex_unlock(&server->refpath_lock); if (!num_targets) num_targets = 1; @@ -552,9 +550,7 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) mod_delayed_work(cifsiod_wq, &server->reconnect, 0); } while (server->tcpStatus == CifsNeedReconnect); - mutex_lock(&server->refpath_lock); - dfs_cache_noreq_update_tgthint(server->leaf_fullpath + 1, target_hint); - mutex_unlock(&server->refpath_lock); + dfs_cache_noreq_update_tgthint(ref_path, target_hint); dfs_cache_free_tgts(&tl); /* Need to set up echo worker again once connection has been established */ @@ -569,13 +565,8 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) { - mutex_lock(&server->refpath_lock); - if (!server->leaf_fullpath) { - mutex_unlock(&server->refpath_lock); + if (!server->leaf_fullpath) return __cifs_reconnect(server, mark_smb_session); - } - mutex_unlock(&server->refpath_lock); - return reconnect_dfs_server(server); } #else @@ -1754,9 +1745,6 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); INIT_DELAYED_WORK(&tcp_ses->reconnect, smb2_reconnect_server); mutex_init(&tcp_ses->reconnect_mutex); -#ifdef CONFIG_CIFS_DFS_UPCALL - mutex_init(&tcp_ses->refpath_lock); -#endif memcpy(&tcp_ses->srcaddr, &ctx->srcaddr, sizeof(tcp_ses->srcaddr)); memcpy(&tcp_ses->dstaddr, &ctx->dstaddr, diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 72527623f433..5022bb1f122a 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -1141,13 +1141,11 @@ static char *get_ses_refpath(struct cifs_ses *ses) struct TCP_Server_Info *server = ses->server; char *path = ERR_PTR(-ENOENT); - mutex_lock(&server->refpath_lock); if (server->leaf_fullpath) { - path = kstrdup(server->leaf_fullpath + 1, GFP_ATOMIC); + path = kstrdup(server->leaf_fullpath + 1, GFP_KERNEL); if (!path) path = ERR_PTR(-ENOMEM); } - mutex_unlock(&server->refpath_lock); return path; } -- 2.51.0 From 056e91cbc9804f15704b5bc2f02f91c23b1abea1 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 17 Jan 2025 17:52:15 -0300 Subject: [PATCH 08/16] smb: client: don't check for @leaf_fullpath in match_server() The matching of DFS connections is already handled by @dfs_conn, so remove @leaf_fullpath matching altogether. Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 38 +++----------------------------------- 1 file changed, 3 insertions(+), 35 deletions(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 1053ef8915fb..880d7cf8b730 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -1526,42 +1526,10 @@ static int match_server(struct TCP_Server_Info *server, if (!cifs_match_ipaddr((struct sockaddr *)&ctx->srcaddr, (struct sockaddr *)&server->srcaddr)) return 0; - /* - * When matching cifs.ko superblocks (@match_super == true), we can't - * really match either @server->leaf_fullpath or @server->dstaddr - * directly since this @server might belong to a completely different - * server -- in case of domain-based DFS referrals or DFS links -- as - * provided earlier by mount(2) through 'source' and 'ip' options. - * - * Otherwise, match the DFS referral in @server->leaf_fullpath or the - * destination address in @server->dstaddr. - * - * When using 'nodfs' mount option, we avoid sharing it with DFS - * connections as they might failover. - */ - if (!match_super) { - if (!ctx->nodfs) { - if (server->leaf_fullpath) { - if (!ctx->leaf_fullpath || - strcasecmp(server->leaf_fullpath, - ctx->leaf_fullpath)) - return 0; - } else if (ctx->leaf_fullpath) { - return 0; - } - } else if (server->leaf_fullpath) { - return 0; - } - } - /* - * Match for a regular connection (address/hostname/port) which has no - * DFS referrals set. - */ - if (!server->leaf_fullpath && - (strcasecmp(server->hostname, ctx->server_hostname) || - !match_server_address(server, addr) || - !match_port(server, addr))) + if (strcasecmp(server->hostname, ctx->server_hostname) || + !match_server_address(server, addr) || + !match_port(server, addr)) return 0; if (!match_security(server, ctx)) -- 2.51.0 From 3681c74d342db75b0d641ba60de27bf73e16e66b Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 21 Jan 2025 15:25:36 -0300 Subject: [PATCH 09/16] smb: client: handle lack of EA support in smb2_query_path_info() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit If the server doesn't support both EAs and reparse point in a file, the SMB2_QUERY_INFO request will fail with either STATUS_NO_EAS_ON_FILE or STATUS_EAS_NOT_SUPPORT in the compound chain, so ignore it as long as reparse point isn't IO_REPARSE_TAG_LX_(CHR|BLK), which would require the EAs to know about major/minor numbers. Reported-by: Pali Rohár Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/smb2inode.c | 92 +++++++++++++++++++++++++++++---------- 1 file changed, 69 insertions(+), 23 deletions(-) diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 7d3685dd655a..c97f14757c27 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -176,27 +176,27 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct kvec *out_iov, int *out_buftype, struct dentry *dentry) { - struct reparse_data_buffer *rbuf; + struct smb2_query_info_rsp *qi_rsp = NULL; struct smb2_compound_vars *vars = NULL; - struct kvec *rsp_iov, *iov; - struct smb_rqst *rqst; - int rc; - __le16 *utf16_path = NULL; __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; - struct cifs_fid fid; + struct cifs_open_info_data *idata; struct cifs_ses *ses = tcon->ses; + struct reparse_data_buffer *rbuf; struct TCP_Server_Info *server; - int num_rqst = 0, i; int resp_buftype[MAX_COMPOUND]; - struct smb2_query_info_rsp *qi_rsp = NULL; - struct cifs_open_info_data *idata; + int retries = 0, cur_sleep = 1; + __u8 delete_pending[8] = {1,}; + struct kvec *rsp_iov, *iov; struct inode *inode = NULL; - int flags = 0; - __u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0}; + __le16 *utf16_path = NULL; + struct smb_rqst *rqst; unsigned int size[2]; - void *data[2]; + struct cifs_fid fid; + int num_rqst = 0, i; unsigned int len; - int retries = 0, cur_sleep = 1; + int tmp_rc, rc; + int flags = 0; + void *data[2]; replay_again: /* reinitialize for possible replay */ @@ -639,7 +639,14 @@ finished: tcon->need_reconnect = true; } + tmp_rc = rc; for (i = 0; i < num_cmds; i++) { + char *buf = rsp_iov[i + i].iov_base; + + if (buf && resp_buftype[i + 1] != CIFS_NO_BUFFER) + rc = server->ops->map_error(buf, false); + else + rc = tmp_rc; switch (cmds[i]) { case SMB2_OP_QUERY_INFO: idata = in_iov[i].iov_base; @@ -805,6 +812,7 @@ finished: } } SMB2_close_free(&rqst[num_rqst]); + rc = tmp_rc; num_cmds += 2; if (out_iov && out_buftype) { @@ -860,22 +868,52 @@ static int parse_create_response(struct cifs_open_info_data *data, return rc; } +/* Check only if SMB2_OP_QUERY_WSL_EA command failed in the compound chain */ +static bool ea_unsupported(int *cmds, int num_cmds, + struct kvec *out_iov, int *out_buftype) +{ + int i; + + if (cmds[num_cmds - 1] != SMB2_OP_QUERY_WSL_EA) + return false; + + for (i = 1; i < num_cmds - 1; i++) { + struct smb2_hdr *hdr = out_iov[i].iov_base; + + if (out_buftype[i] == CIFS_NO_BUFFER || !hdr || + hdr->Status != STATUS_SUCCESS) + return false; + } + return true; +} + +static inline void free_rsp_iov(struct kvec *iovs, int *buftype, int count) +{ + int i; + + for (i = 0; i < count; i++) { + free_rsp_buf(buftype[i], iovs[i].iov_base); + memset(&iovs[i], 0, sizeof(*iovs)); + buftype[i] = CIFS_NO_BUFFER; + } +} + int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, struct cifs_open_info_data *data) { + struct kvec in_iov[3], out_iov[5] = {}; + struct cached_fid *cfid = NULL; struct cifs_open_parms oparms; - __u32 create_options = 0; struct cifsFileInfo *cfile; - struct cached_fid *cfid = NULL; + __u32 create_options = 0; + int out_buftype[5] = {}; struct smb2_hdr *hdr; - struct kvec in_iov[3], out_iov[3] = {}; - int out_buftype[3] = {}; + int num_cmds = 0; int cmds[3]; bool islink; - int i, num_cmds = 0; int rc, rc2; data->adjust_tz = false; @@ -945,14 +983,14 @@ int smb2_query_path_info(const unsigned int xid, if (rc || !data->reparse_point) goto out; - if (!tcon->posix_extensions) - cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA; /* * Skip SMB2_OP_GET_REPARSE if symlink already parsed in create * response. */ if (data->reparse.tag != IO_REPARSE_TAG_SYMLINK) cmds[num_cmds++] = SMB2_OP_GET_REPARSE; + if (!tcon->posix_extensions) + cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA; oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_READ_ATTRIBUTES | @@ -960,9 +998,18 @@ int smb2_query_path_info(const unsigned int xid, FILE_OPEN, create_options | OPEN_REPARSE_POINT, ACL_NO_MODE); cifs_get_readable_path(tcon, full_path, &cfile); + free_rsp_iov(out_iov, out_buftype, ARRAY_SIZE(out_iov)); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, in_iov, cmds, num_cmds, - cfile, NULL, NULL, NULL); + cfile, out_iov, out_buftype, NULL); + if (rc && ea_unsupported(cmds, num_cmds, + out_iov, out_buftype)) { + if (data->reparse.tag != IO_REPARSE_TAG_LX_BLK && + data->reparse.tag != IO_REPARSE_TAG_LX_CHR) + rc = 0; + else + rc = -EOPNOTSUPP; + } break; case -EREMOTE: break; @@ -980,8 +1027,7 @@ int smb2_query_path_info(const unsigned int xid, } out: - for (i = 0; i < ARRAY_SIZE(out_buftype); i++) - free_rsp_buf(out_buftype[i], out_iov[i].iov_base); + free_rsp_iov(out_iov, out_buftype, ARRAY_SIZE(out_iov)); return rc; } -- 2.51.0 From a3a860bc0fd6c07332e4911cf9a238d20de90173 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Fri, 27 Dec 2024 17:39:09 +0200 Subject: [PATCH 10/16] tpm: Change to kvalloc() in eventlog/acpi.c The following failure was reported on HPE ProLiant D320: [ 10.693310][ T1] tpm_tis STM0925:00: 2.0 TPM (device-id 0x3, rev-id 0) [ 10.848132][ T1] ------------[ cut here ]------------ [ 10.853559][ T1] WARNING: CPU: 59 PID: 1 at mm/page_alloc.c:4727 __alloc_pages_noprof+0x2ca/0x330 [ 10.862827][ T1] Modules linked in: [ 10.866671][ T1] CPU: 59 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.12.0-lp155.2.g52785e2-default #1 openSUSE Tumbleweed (unreleased) 588cd98293a7c9eba9013378d807364c088c9375 [ 10.882741][ T1] Hardware name: HPE ProLiant DL320 Gen12/ProLiant DL320 Gen12, BIOS 1.20 10/28/2024 [ 10.892170][ T1] RIP: 0010:__alloc_pages_noprof+0x2ca/0x330 [ 10.898103][ T1] Code: 24 08 e9 4a fe ff ff e8 34 36 fa ff e9 88 fe ff ff 83 fe 0a 0f 86 b3 fd ff ff 80 3d 01 e7 ce 01 00 75 09 c6 05 f8 e6 ce 01 01 <0f> 0b 45 31 ff e9 e5 fe ff ff f7 c2 00 00 08 00 75 42 89 d9 80 e1 [ 10.917750][ T1] RSP: 0000:ffffb7cf40077980 EFLAGS: 00010246 [ 10.923777][ T1] RAX: 0000000000000000 RBX: 0000000000040cc0 RCX: 0000000000000000 [ 10.931727][ T1] RDX: 0000000000000000 RSI: 000000000000000c RDI: 0000000000040cc0 The above transcript shows that ACPI pointed a 16 MiB buffer for the log events because RSI maps to the 'order' parameter of __alloc_pages_noprof(). Address the bug by moving from devm_kmalloc() to devm_add_action() and kvmalloc() and devm_add_action(). Suggested-by: Ard Biesheuvel Cc: stable@vger.kernel.org # v2.6.16+ Fixes: 55a82ab3181b ("[PATCH] tpm: add bios measurement log") Reported-by: Andy Liang Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219495 Reviewed-by: Ard Biesheuvel Reviewed-by: Stefan Berger Reviewed-by: Takashi Iwai Tested-by: Andy Liang Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/eventlog/acpi.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/char/tpm/eventlog/acpi.c b/drivers/char/tpm/eventlog/acpi.c index 69533d0bfb51..cf02ec646f46 100644 --- a/drivers/char/tpm/eventlog/acpi.c +++ b/drivers/char/tpm/eventlog/acpi.c @@ -63,6 +63,11 @@ static bool tpm_is_tpm2_log(void *bios_event_log, u64 len) return n == 0; } +static void tpm_bios_log_free(void *data) +{ + kvfree(data); +} + /* read binary bios log */ int tpm_read_log_acpi(struct tpm_chip *chip) { @@ -136,7 +141,7 @@ int tpm_read_log_acpi(struct tpm_chip *chip) } /* malloc EventLog space */ - log->bios_event_log = devm_kmalloc(&chip->dev, len, GFP_KERNEL); + log->bios_event_log = kvmalloc(len, GFP_KERNEL); if (!log->bios_event_log) return -ENOMEM; @@ -161,10 +166,16 @@ int tpm_read_log_acpi(struct tpm_chip *chip) goto err; } + ret = devm_add_action(&chip->dev, tpm_bios_log_free, log->bios_event_log); + if (ret) { + log->bios_event_log = NULL; + goto err; + } + return format; err: - devm_kfree(&chip->dev, log->bios_event_log); + tpm_bios_log_free(log->bios_event_log); log->bios_event_log = NULL; return ret; } -- 2.51.0 From be8ee18152b0523752f3a44900363838bd1573bb Mon Sep 17 00:00:00 2001 From: Atul Kumar Pant Date: Sat, 18 Jan 2025 14:09:27 +0530 Subject: [PATCH 11/16] sched_ext: Fixes typos in comments Fixes some spelling errors in the comments. Signed-off-by: Atul Kumar Pant Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8857c0709bdd..283d7f1addc5 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -416,7 +416,7 @@ struct sched_ext_ops { /** * @update_idle: Update the idle state of a CPU - * @cpu: CPU to udpate the idle state for + * @cpu: CPU to update the idle state for * @idle: whether entering or exiting the idle state * * This operation is called when @rq's CPU goes or leaves the idle @@ -1214,7 +1214,7 @@ static bool scx_kf_allowed_if_unlocked(void) /** * nldsq_next_task - Iterate to the next task in a non-local DSQ - * @dsq: user dsq being interated + * @dsq: user dsq being iterated * @cur: current position, %NULL to start iteration * @rev: walk backwards * @@ -2078,7 +2078,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p) /* * list_add_tail() must be used. scx_ops_bypass() depends on tasks being - * appened to the runnable_list. + * appended to the runnable_list. */ list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); } @@ -2480,7 +2480,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, /* * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly * banging on the same DSQ on a large NUMA system to the point where switching - * to the bypass mode can take a long time. Inject artifical delays while the + * to the bypass mode can take a long time. Inject artificial delays while the * bypass mode is switching to guarantee timely completion. */ static void scx_ops_breather(struct rq *rq) @@ -3144,7 +3144,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) * * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used * to implement the default task ordering. The older the timestamp, the higher - * prority the task - the global FIFO ordering matching the default scheduling + * priority the task - the global FIFO ordering matching the default scheduling * behavior. * * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to @@ -4590,7 +4590,7 @@ static int scx_cgroup_init(void) cgroup_warned_missing_idle = false; /* - * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk * cgroups and init, all online cgroups are initialized. */ rcu_read_lock(); -- 2.51.0 From 2279563e3a8cac367b267b09c15cf1e39c06c5cc Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 22 Jan 2025 10:05:25 +0100 Subject: [PATCH 12/16] sched_ext: Include task weight in the error state dump Report the task weight when dumping the task state during an error exit. Moreover, adjust the output format to display dsq_vtime, slice, and weight on the same line. This can help identify whether certain tasks were excessively prioritized or de-prioritized due to large niceness gaps. Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 283d7f1addc5..7081c7be5f62 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5277,9 +5277,10 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, ops_state >> SCX_OPSS_QSEQ_SHIFT); - dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu slice=%llu", - p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf, - p->scx.dsq_vtime, p->scx.slice); + dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", + p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); + dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", + p->scx.dsq_vtime, p->scx.slice, p->scx.weight); dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); if (SCX_HAS_OP(dump_task)) { -- 2.51.0 From 74ca334338a4489173d9e50775b13fa20cbd5958 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Thu, 23 Jan 2025 13:46:06 +0100 Subject: [PATCH 13/16] selftests/sched_ext: Fix enum resolution All scx enums are now automatically generated from vmlinux.h and they must be initialized using the SCX_ENUM_INIT() macro. Fix the scx selftests to use this macro to properly initialize these values. Fixes: 8da7bf2cee27 ("tools/sched_ext: Receive updates from SCX repo") Reported-by: Ihor Solodrai Closes: https://lore.kernel.org/all/Z2tNK2oFDX1OPp8C@slm.duckdns.org/ Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- .../testing/selftests/sched_ext/create_dsq.c | 10 ++++---- .../selftests/sched_ext/ddsp_bogus_dsq_fail.c | 7 ++++-- .../sched_ext/ddsp_vtimelocal_fail.c | 7 ++++-- .../selftests/sched_ext/dsp_local_on.c | 1 + .../sched_ext/enq_last_no_enq_fails.c | 10 ++++---- .../sched_ext/enq_select_cpu_fails.c | 10 ++++---- tools/testing/selftests/sched_ext/exit.c | 1 + tools/testing/selftests/sched_ext/hotplug.c | 6 +++-- .../selftests/sched_ext/init_enable_count.c | 25 ++++++------------- tools/testing/selftests/sched_ext/maximal.c | 7 ++++-- tools/testing/selftests/sched_ext/minimal.c | 10 ++++---- tools/testing/selftests/sched_ext/prog_run.c | 10 ++++---- .../testing/selftests/sched_ext/reload_loop.c | 9 +++---- .../selftests/sched_ext/select_cpu_dfl.c | 7 ++++-- .../sched_ext/select_cpu_dfl_nodispatch.c | 7 ++++-- .../selftests/sched_ext/select_cpu_dispatch.c | 7 ++++-- .../sched_ext/select_cpu_dispatch_bad_dsq.c | 7 ++++-- .../sched_ext/select_cpu_dispatch_dbl_dsp.c | 7 ++++-- .../selftests/sched_ext/select_cpu_vtime.c | 7 ++++-- 19 files changed, 88 insertions(+), 67 deletions(-) diff --git a/tools/testing/selftests/sched_ext/create_dsq.c b/tools/testing/selftests/sched_ext/create_dsq.c index fa946d9146d4..d67431f57ac6 100644 --- a/tools/testing/selftests/sched_ext/create_dsq.c +++ b/tools/testing/selftests/sched_ext/create_dsq.c @@ -14,11 +14,11 @@ static enum scx_test_status setup(void **ctx) { struct create_dsq *skel; - skel = create_dsq__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = create_dsq__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(create_dsq__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c index e65d22f23f3b..b6d13496b24e 100644 --- a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c +++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c @@ -15,8 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct ddsp_bogus_dsq_fail *skel; - skel = ddsp_bogus_dsq_fail__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = ddsp_bogus_dsq_fail__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(ddsp_bogus_dsq_fail__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c index abafee587cd6..af9ce4ee8baa 100644 --- a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c +++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c @@ -14,8 +14,11 @@ static enum scx_test_status setup(void **ctx) { struct ddsp_vtimelocal_fail *skel; - skel = ddsp_vtimelocal_fail__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = ddsp_vtimelocal_fail__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(ddsp_vtimelocal_fail__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.c b/tools/testing/selftests/sched_ext/dsp_local_on.c index 0ff27e57fe43..e1f2ce4abfe6 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.c @@ -15,6 +15,7 @@ static enum scx_test_status setup(void **ctx) skel = dsp_local_on__open(); SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); skel->rodata->nr_cpus = libbpf_num_possible_cpus(); SCX_FAIL_IF(dsp_local_on__load(skel), "Failed to load skel"); diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c index 73e679953e27..d3387ae03679 100644 --- a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c +++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c @@ -15,11 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct enq_last_no_enq_fails *skel; - skel = enq_last_no_enq_fails__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = enq_last_no_enq_fails__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(enq_last_no_enq_fails__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c index dd1350e5f002..a80e3a3b3698 100644 --- a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c +++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c @@ -15,11 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct enq_select_cpu_fails *skel; - skel = enq_select_cpu_fails__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = enq_select_cpu_fails__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(enq_select_cpu_fails__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/exit.c b/tools/testing/selftests/sched_ext/exit.c index 31bcd06e21cd..9451782689de 100644 --- a/tools/testing/selftests/sched_ext/exit.c +++ b/tools/testing/selftests/sched_ext/exit.c @@ -23,6 +23,7 @@ static enum scx_test_status run(void *ctx) char buf[16]; skel = exit__open(); + SCX_ENUM_INIT(skel); skel->rodata->exit_point = tc; exit__load(skel); link = bpf_map__attach_struct_ops(skel->maps.exit_ops); diff --git a/tools/testing/selftests/sched_ext/hotplug.c b/tools/testing/selftests/sched_ext/hotplug.c index 87bf220b1bce..1c9ceb661c43 100644 --- a/tools/testing/selftests/sched_ext/hotplug.c +++ b/tools/testing/selftests/sched_ext/hotplug.c @@ -49,8 +49,10 @@ static enum scx_test_status test_hotplug(bool onlining, bool cbs_defined) SCX_ASSERT(is_cpu_online()); - skel = hotplug__open_and_load(); - SCX_ASSERT(skel); + skel = hotplug__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(hotplug__load(skel), "Failed to load skel"); /* Testing the offline -> online path, so go offline before starting */ if (onlining) diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c index 97d45f1e5597..0f3eddc7a17a 100644 --- a/tools/testing/selftests/sched_ext/init_enable_count.c +++ b/tools/testing/selftests/sched_ext/init_enable_count.c @@ -15,22 +15,6 @@ #define SCHED_EXT 7 -static struct init_enable_count * -open_load_prog(bool global) -{ - struct init_enable_count *skel; - - skel = init_enable_count__open(); - SCX_BUG_ON(!skel, "Failed to open skel"); - - if (!global) - skel->struct_ops.init_enable_count_ops->flags |= SCX_OPS_SWITCH_PARTIAL; - - SCX_BUG_ON(init_enable_count__load(skel), "Failed to load skel"); - - return skel; -} - static enum scx_test_status run_test(bool global) { struct init_enable_count *skel; @@ -40,7 +24,14 @@ static enum scx_test_status run_test(bool global) struct sched_param param = {}; pid_t pids[num_pre_forks]; - skel = open_load_prog(global); + skel = init_enable_count__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + + if (!global) + skel->struct_ops.init_enable_count_ops->flags |= SCX_OPS_SWITCH_PARTIAL; + + SCX_FAIL_IF(init_enable_count__load(skel), "Failed to load skel"); /* * Fork a bunch of children before we attach the scheduler so that we diff --git a/tools/testing/selftests/sched_ext/maximal.c b/tools/testing/selftests/sched_ext/maximal.c index f38fc973c380..c6be50a9941d 100644 --- a/tools/testing/selftests/sched_ext/maximal.c +++ b/tools/testing/selftests/sched_ext/maximal.c @@ -14,8 +14,11 @@ static enum scx_test_status setup(void **ctx) { struct maximal *skel; - skel = maximal__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = maximal__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(maximal__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/minimal.c b/tools/testing/selftests/sched_ext/minimal.c index 6c5db8ebbf8a..89f7261757ff 100644 --- a/tools/testing/selftests/sched_ext/minimal.c +++ b/tools/testing/selftests/sched_ext/minimal.c @@ -15,11 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct minimal *skel; - skel = minimal__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = minimal__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(minimal__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/prog_run.c b/tools/testing/selftests/sched_ext/prog_run.c index 3cd57ef8daaa..05974820ca69 100644 --- a/tools/testing/selftests/sched_ext/prog_run.c +++ b/tools/testing/selftests/sched_ext/prog_run.c @@ -15,11 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct prog_run *skel; - skel = prog_run__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = prog_run__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(prog_run__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/reload_loop.c b/tools/testing/selftests/sched_ext/reload_loop.c index 5cfba2d6e056..308211d80436 100644 --- a/tools/testing/selftests/sched_ext/reload_loop.c +++ b/tools/testing/selftests/sched_ext/reload_loop.c @@ -18,11 +18,10 @@ bool force_exit = false; static enum scx_test_status setup(void **ctx) { - skel = maximal__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = maximal__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(maximal__load(skel), "Failed to load skel"); return SCX_TEST_PASS; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c index a53a40c2d2f0..5b6e045e1109 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dfl.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c @@ -17,8 +17,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dfl *skel; - skel = select_cpu_dfl__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dfl__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dfl__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c index 1d85bf4bf3a3..9b5d232efb7f 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c @@ -17,8 +17,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dfl_nodispatch *skel; - skel = select_cpu_dfl_nodispatch__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dfl_nodispatch__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dfl_nodispatch__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c index 0309ca8785b3..80283dbc41b7 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c @@ -17,8 +17,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dispatch *skel; - skel = select_cpu_dispatch__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dispatch__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dispatch__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c index 47eb6ed7627d..5e72ebbc90a5 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c @@ -15,8 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dispatch_bad_dsq *skel; - skel = select_cpu_dispatch_bad_dsq__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dispatch_bad_dsq__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dispatch_bad_dsq__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c index 48ff028a3c46..aa85949478bc 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c @@ -15,8 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dispatch_dbl_dsp *skel; - skel = select_cpu_dispatch_dbl_dsp__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dispatch_dbl_dsp__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dispatch_dbl_dsp__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.c index b4629c2364f5..1e9b5c9bfff1 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_vtime.c +++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.c @@ -15,8 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_vtime *skel; - skel = select_cpu_vtime__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_vtime__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_vtime__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; -- 2.51.0 From e9fe182772dcb2630964724fd93e9c90b68ea0fd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Jan 2025 10:48:25 -1000 Subject: [PATCH 14/16] sched_ext: selftests/dsp_local_on: Fix sporadic failures dsp_local_on has several incorrect assumptions, one of which is that p->nr_cpus_allowed always tracks p->cpus_ptr. This is not true when a task is scheduled out while migration is disabled - p->cpus_ptr is temporarily overridden to the previous CPU while p->nr_cpus_allowed remains unchanged. This led to sporadic test faliures when dsp_local_on_dispatch() tries to put a migration disabled task to a different CPU. Fix it by keeping the previous CPU when migration is disabled. There are SCX schedulers that make use of p->nr_cpus_allowed. They should also implement explicit handling for p->migration_disabled. Signed-off-by: Tejun Heo Reported-by: Ihor Solodrai Cc: Andrea Righi Cc: Changwoo Min --- tools/testing/selftests/sched_ext/dsp_local_on.bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c index fbda6bf54671..758b479bd1ee 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c @@ -43,7 +43,7 @@ void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) if (!p) return; - if (p->nr_cpus_allowed == nr_cpus) + if (p->nr_cpus_allowed == nr_cpus && !p->migration_disabled) target = bpf_get_prandom_u32() % nr_cpus; else target = scx_bpf_task_cpu(p); -- 2.51.0 From d6f3e7d564b2309e1f17e709a70eca78d7ca2bb8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Jan 2025 12:22:12 -1000 Subject: [PATCH 15/16] sched_ext: Fix incorrect autogroup migration detection scx_move_task() is called from sched_move_task() and tells the BPF scheduler that cgroup migration is being committed. sched_move_task() is used by both cgroup and autogroup migrations and scx_move_task() tried to filter out autogroup migrations by testing the destination cgroup and PF_EXITING but this is not enough. In fact, without explicitly tagging the thread which is doing the cgroup migration, there is no good way to tell apart scx_move_task() invocations for racing migration to the root cgroup and an autogroup migration. This led to scx_move_task() incorrectly ignoring a migration from non-root cgroup to an autogroup of the root cgroup triggering the following warning: WARNING: CPU: 7 PID: 1 at kernel/sched/ext.c:3725 scx_cgroup_can_attach+0x196/0x340 ... Call Trace: cgroup_migrate_execute+0x5b1/0x700 cgroup_attach_task+0x296/0x400 __cgroup_procs_write+0x128/0x140 cgroup_procs_write+0x17/0x30 kernfs_fop_write_iter+0x141/0x1f0 vfs_write+0x31d/0x4a0 __x64_sys_write+0x72/0xf0 do_syscall_64+0x82/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fix it by adding an argument to sched_move_task() that indicates whether the moving is for a cgroup or autogroup migration. After the change, scx_move_task() is called only for cgroup migrations and renamed to scx_cgroup_move_task(). Link: https://github.com/sched-ext/scx/issues/370 Fixes: 819513666966 ("sched_ext: Add cgroup support") Cc: stable@vger.kernel.org # v6.12+ Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/sched/autogroup.c | 4 ++-- kernel/sched/core.c | 7 ++++--- kernel/sched/ext.c | 15 +-------------- kernel/sched/ext.h | 4 ++-- kernel/sched/sched.h | 2 +- 5 files changed, 10 insertions(+), 22 deletions(-) diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index db68a964e34e..c4a3ccf6a8ac 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -150,7 +150,7 @@ void sched_autogroup_exit_task(struct task_struct *p) * see this thread after that: we can no longer use signal->autogroup. * See the PF_EXITING check in task_wants_autogroup(). */ - sched_move_task(p); + sched_move_task(p, true); } static void @@ -182,7 +182,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * sched_autogroup_exit_task(). */ for_each_thread(p, t) - sched_move_task(t); + sched_move_task(t, true); unlock_task_sighand(p, &flags); autogroup_kref_put(prev); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 901170708e2a..e77897a62442 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9042,7 +9042,7 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect * its new group. */ -void sched_move_task(struct task_struct *tsk) +void sched_move_task(struct task_struct *tsk, bool for_autogroup) { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; @@ -9071,7 +9071,8 @@ void sched_move_task(struct task_struct *tsk) put_prev_task(rq, tsk); sched_change_group(tsk, group); - scx_move_task(tsk); + if (!for_autogroup) + scx_cgroup_move_task(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -9172,7 +9173,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) - sched_move_task(task); + sched_move_task(task, false); scx_cgroup_finish_attach(); } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7081c7be5f62..c7b159f48834 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4323,24 +4323,11 @@ err: return ops_sanitize_err("cgroup_prep_move", ret); } -void scx_move_task(struct task_struct *p) +void scx_cgroup_move_task(struct task_struct *p) { if (!scx_cgroup_enabled) return; - /* - * We're called from sched_move_task() which handles both cgroup and - * autogroup moves. Ignore the latter. - * - * Also ignore exiting tasks, because in the exit path tasks transition - * from the autogroup to the root group, so task_group_is_autogroup() - * alone isn't able to catch exiting autogroup tasks. This is safe for - * cgroup_move(), because cgroup migrations never happen for PF_EXITING - * tasks. - */ - if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING)) - return; - /* * @p must have ops.cgroup_prep_move() called on it and thus * cgrp_moving_from set. diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 4d022d17ac7d..1079b56b0f7a 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -73,7 +73,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {} int scx_tg_online(struct task_group *tg); void scx_tg_offline(struct task_group *tg); int scx_cgroup_can_attach(struct cgroup_taskset *tset); -void scx_move_task(struct task_struct *p); +void scx_cgroup_move_task(struct task_struct *p); void scx_cgroup_finish_attach(void); void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); @@ -82,7 +82,7 @@ void scx_group_set_idle(struct task_group *tg, bool idle); static inline int scx_tg_online(struct task_group *tg) { return 0; } static inline void scx_tg_offline(struct task_group *tg) {} static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -static inline void scx_move_task(struct task_struct *p) {} +static inline void scx_cgroup_move_task(struct task_struct *p) {} static inline void scx_cgroup_finish_attach(void) {} static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 38e0e323dda2..b93c8c3dc05a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -572,7 +572,7 @@ extern void sched_online_group(struct task_group *tg, extern void sched_destroy_group(struct task_group *tg); extern void sched_release_group(struct task_group *tg); -extern void sched_move_task(struct task_struct *tsk); +extern void sched_move_task(struct task_struct *tsk, bool for_autogroup); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); -- 2.51.0 From 5f52bbf2f6e0997394cf9c449d44e1c80ff4282c Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 25 Jan 2025 18:14:12 +0100 Subject: [PATCH 16/16] tools/sched_ext: Add helper to check task migration state Introduce a new helper for BPF schedulers to determine whether a task can migrate or not (supporting both SMP and UP systems). Fixes: e9fe182772dc ("sched_ext: selftests/dsp_local_on: Fix sporadic failures") Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/sched_ext/include/scx/common.bpf.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index f3e15e9efa76..f254a39b86a5 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -404,6 +404,17 @@ static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask) return (const struct cpumask *)mask; } +/* + * Return true if task @p cannot migrate to a different CPU, false + * otherwise. + */ +static inline bool is_migration_disabled(const struct task_struct *p) +{ + if (bpf_core_field_exists(p->migration_disabled)) + return p->migration_disabled; + return false; +} + /* rcu */ void bpf_rcu_read_lock(void) __ksym; void bpf_rcu_read_unlock(void) __ksym; -- 2.51.0