From 5d0b204654de25615cf712be86c3192eca68ed80 Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Thu, 27 Feb 2025 16:10:52 +0800 Subject: [PATCH 01/16] xsk: Fix __xsk_generic_xmit() error code when cq is full When the cq reservation fails, the error code, which is initialized to zero in __xsk_generic_xmit(), is not set. That means the packet is not sent successfully but sendto() returns ok. Considering the impact on uapi, returning -EAGAIN is a good idea. The cq is usually full because it is not released in time, so trying to send the msg again is appropriate. The bug has been present since the very early implementation of xsk, so the Fixes tag targets the commit that introduced the changes in xsk_cq_reserve_addr_locked on which this fix depends. Fixes: e6c4047f5122 ("xsk: Use xsk_buff_pool directly for cq functions") Suggested-by: Magnus Karlsson Signed-off-by: Wang Liang Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250227081052.4096337-1-wangliang74@huawei.com Signed-off-by: Alexei Starovoitov --- net/xdp/xsk.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index e5d104ce7b82..5696af45bcf7 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -806,8 +806,11 @@ static int __xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ - if (xsk_cq_reserve_addr_locked(xs->pool, desc.addr)) + err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr); + if (err) { + err = -EAGAIN; goto out; + } skb = xsk_build_skb(xs, &desc); if (IS_ERR(skb)) { -- 2.51.0 From 00387808d36e23c7d56e9e04a31de0be1443b0e9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 27 Mar 2025 11:55:28 -0700 Subject: [PATCH 02/16] selftests/bpf: Fix tests after fields reorder in struct file The change in struct file [1] moved f_ref to the 3rd cache line. 
It made *(u64 *)file dereference invalid from the verifier point of view, because btf_struct_walk() walks into f_lock field, which is 4 bytes long. Fix the selftests to dereference the file pointer as a 4-byte access. [1] commit e249056c91a2 ("fs: place f_ref to 3rd cache line in struct file to resolve false sharing") Reported-by: Jakub Kicinski Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20250327185528.1740787-1-song@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_module_attach.c | 2 +- tools/testing/selftests/bpf/progs/test_subprogs_extable.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_module_attach.c b/tools/testing/selftests/bpf/progs/test_module_attach.c index fb07f5773888..7f3c233943b3 100644 --- a/tools/testing/selftests/bpf/progs/test_module_attach.c +++ b/tools/testing/selftests/bpf/progs/test_module_attach.c @@ -117,7 +117,7 @@ int BPF_PROG(handle_fexit_ret, int arg, struct file *ret) bpf_probe_read_kernel(&buf, 8, ret); bpf_probe_read_kernel(&buf, 8, (char *)ret + 256); - *(volatile long long *)ret; + *(volatile int *)ret; *(volatile int *)&ret->f_mode; return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_subprogs_extable.c b/tools/testing/selftests/bpf/progs/test_subprogs_extable.c index e2a21fbd4e44..dcac69f5928a 100644 --- a/tools/testing/selftests/bpf/progs/test_subprogs_extable.c +++ b/tools/testing/selftests/bpf/progs/test_subprogs_extable.c @@ -21,7 +21,7 @@ static __u64 test_cb(struct bpf_map *map, __u32 *key, __u64 *val, void *data) SEC("fexit/bpf_testmod_return_ptr") int BPF_PROG(handle_fexit_ret_subprogs, int arg, struct file *ret) { - *(volatile long *)ret; + *(volatile int *)ret; *(volatile int *)&ret->f_mode; bpf_for_each_map_elem(&test_array, test_cb, NULL, 0); triggered++; @@ -31,7 +31,7 @@ int BPF_PROG(handle_fexit_ret_subprogs, int arg, struct file *ret) SEC("fexit/bpf_testmod_return_ptr") int 
BPF_PROG(handle_fexit_ret_subprogs2, int arg, struct file *ret) { - *(volatile long *)ret; + *(volatile int *)ret; *(volatile int *)&ret->f_mode; bpf_for_each_map_elem(&test_array, test_cb, NULL, 0); triggered++; @@ -41,7 +41,7 @@ int BPF_PROG(handle_fexit_ret_subprogs2, int arg, struct file *ret) SEC("fexit/bpf_testmod_return_ptr") int BPF_PROG(handle_fexit_ret_subprogs3, int arg, struct file *ret) { - *(volatile long *)ret; + *(volatile int *)ret; *(volatile int *)&ret->f_mode; bpf_for_each_map_elem(&test_array, test_cb, NULL, 0); triggered++; -- 2.51.0 From 14d84357a0af7783a440d4a40794752ed20d2607 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 28 Mar 2025 12:31:24 -0700 Subject: [PATCH 03/16] selftests/bpf: Fix verifier_bpf_fastcall test Commit [1] moves percpu data on x86 from address 0x000... to address 0xfff... Before [1]: 159020: 0000000000030700 0 OBJECT GLOBAL DEFAULT 23 pcpu_hot After [1]: 152602: ffffffff83a3e034 4 OBJECT GLOBAL DEFAULT 35 pcpu_hot As a result, verifier_bpf_fastcall tests should now expect a negative value for pcpu_hot, IOW, the disassemble should show "r=" instead of "w=". Fix this in the test. Note that, a later change created a new variable "cpu_number" for bpf_get_smp_processor_id() [2]. The inlining logic is updated properly as part of this change, so there is no need to fix anything on the kernel side. 
[1] commit 9d7de2aa8b41 ("x86/percpu/64: Use relative percpu offsets") [2] commit 01c7bc5198e9 ("x86/smp: Move cpu number to percpu hot section") Reported-by: Jakub Kicinski Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20250328193124.808784-1-song@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c index a9be6ae49454..c258b0722e04 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c +++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c @@ -12,7 +12,7 @@ SEC("raw_tp") __arch_x86_64 __log_level(4) __msg("stack depth 8") __xlated("4: r5 = 5") -__xlated("5: w0 = ") +__xlated("5: r0 = ") __xlated("6: r0 = &(void __percpu *)(r0)") __xlated("7: r0 = *(u32 *)(r0 +0)") __xlated("8: exit") @@ -704,7 +704,7 @@ SEC("raw_tp") __arch_x86_64 __log_level(4) __msg("stack depth 32+0") __xlated("2: r1 = 1") -__xlated("3: w0 =") +__xlated("3: r0 =") __xlated("4: r0 = &(void __percpu *)(r0)") __xlated("5: r0 = *(u32 *)(r0 +0)") /* bpf_loop params setup */ @@ -753,7 +753,7 @@ __arch_x86_64 __log_level(4) __msg("stack depth 40+0") /* call bpf_get_smp_processor_id */ __xlated("2: r1 = 42") -__xlated("3: w0 =") +__xlated("3: r0 =") __xlated("4: r0 = &(void __percpu *)(r0)") __xlated("5: r0 = *(u32 *)(r0 +0)") /* call bpf_get_prandom_u32 */ -- 2.51.0 From 3f8ad18f81841a9ce70f603c45d5a278528c67e6 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 30 Mar 2025 20:38:28 -0700 Subject: [PATCH 04/16] selftests/bpf: Fix verifier_private_stack test failure Several verifier_private_stack tests failed with latest bpf-next. 
For example, for 'Private stack, single prog' subtest, the jitted code: func #0: 0: f3 0f 1e fa endbr64 4: 0f 1f 44 00 00 nopl (%rax,%rax) 9: 0f 1f 00 nopl (%rax) c: 55 pushq %rbp d: 48 89 e5 movq %rsp, %rbp 10: f3 0f 1e fa endbr64 14: 49 b9 58 74 8a 8f 7d 60 00 00 movabsq $0x607d8f8a7458, %r9 1e: 65 4c 03 0c 25 28 c0 48 87 addq %gs:-0x78b73fd8, %r9 27: bf 2a 00 00 00 movl $0x2a, %edi 2c: 49 89 b9 00 ff ff ff movq %rdi, -0x100(%r9) 33: 31 c0 xorl %eax, %eax 35: c9 leave 36: e9 20 5d 0f e1 jmp 0xffffffffe10f5d5b The insn 'addq %gs:-0x78b73fd8, %r9' does not match the expected regex 'addq %gs:0x{{.*}}, %r9' and this caused test failure. Fix it by changing '%gs:0x{{.*}}' to '%gs:{{.*}}' to accommodate the possible negative offset. A few other subtests are fixed in a similar way. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20250331033828.365077-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_private_stack.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_private_stack.c b/tools/testing/selftests/bpf/progs/verifier_private_stack.c index b1fbdf119553..fc91b414364e 100644 --- a/tools/testing/selftests/bpf/progs/verifier_private_stack.c +++ b/tools/testing/selftests/bpf/progs/verifier_private_stack.c @@ -27,7 +27,7 @@ __description("Private stack, single prog") __success __arch_x86_64 __jited(" movabsq $0x{{.*}}, %r9") -__jited(" addq %gs:0x{{.*}}, %r9") +__jited(" addq %gs:{{.*}}, %r9") __jited(" movl $0x2a, %edi") __jited(" movq %rdi, -0x100(%r9)") __naked void private_stack_single_prog(void) @@ -74,7 +74,7 @@ __success __arch_x86_64 /* private stack fp for the main prog */ __jited(" movabsq $0x{{.*}}, %r9") -__jited(" addq %gs:0x{{.*}}, %r9") +__jited(" addq %gs:{{.*}}, %r9") __jited(" movl $0x2a, %edi") __jited(" movq %rdi, -0x200(%r9)") __jited(" pushq %r9") @@ -122,7 +122,7 @@ __jited(" pushq %rbp") __jited(" movq %rsp, %rbp") 
__jited(" endbr64") __jited(" movabsq $0x{{.*}}, %r9") -__jited(" addq %gs:0x{{.*}}, %r9") +__jited(" addq %gs:{{.*}}, %r9") __jited(" pushq %r9") __jited(" callq") __jited(" popq %r9") -- 2.51.0 From 390513642ee6763c7ada07f0a1470474986e6c1c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 3 Apr 2025 12:29:30 +0100 Subject: [PATCH 05/16] io_uring: always do atomic put from iowq io_uring always switches requests to atomic refcounting for iowq execution before there is any parallelism by setting REQ_F_REFCOUNT, and the flag is not cleared until the request completes. That should be fine as long as the compiler doesn't make up a non-existent value for the flags, however KCSAN still complains when the request owner changes other flag bits: BUG: KCSAN: data-race in io_req_task_cancel / io_wq_free_work ... read to 0xffff888117207448 of 8 bytes by task 3871 on cpu 0: req_ref_put_and_test io_uring/refs.h:22 [inline] Skip REQ_F_REFCOUNT checks for iowq, we know it's set. Reported-by: syzbot+903a2ad71fb3f1e47cf5@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d880bc27fb8c3209b54641be4ff6ac02b0e5789a.1743679736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/refs.h | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a4065e3d13d0..c6209fe44cb1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1796,7 +1796,7 @@ struct io_wq_work *io_wq_free_work(struct io_wq_work *work) struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_kiocb *nxt = NULL; - if (req_ref_put_and_test(req)) { + if (req_ref_put_and_test_atomic(req)) { if (req->flags & IO_REQ_LINK_FLAGS) nxt = io_req_find_next(req); io_free_req(req); diff --git a/io_uring/refs.h b/io_uring/refs.h index 63982ead9f7d..0d928d87c4ed 100644 --- a/io_uring/refs.h +++ b/io_uring/refs.h @@ -17,6 +17,13 @@ static inline bool 
req_ref_inc_not_zero(struct io_kiocb *req) return atomic_inc_not_zero(&req->refs); } +static inline bool req_ref_put_and_test_atomic(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(data_race(req->flags) & REQ_F_REFCOUNT)); + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->refs); +} + static inline bool req_ref_put_and_test(struct io_kiocb *req) { if (likely(!(req->flags & REQ_F_REFCOUNT))) -- 2.51.0 From 01b91bf14f6d4893e03e357006e7af3a20c03fee Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 3 Apr 2025 18:54:02 +0800 Subject: [PATCH 06/16] block: don't grab elevator lock during queue initialization ->elevator_lock depends on queue freeze lock, see block/blk-sysfs.c. queue freeze lock depends on fs_reclaim. So don't grab elevator lock during queue initialization which needs to call kmalloc(GFP_KERNEL), and we can cut the dependency between ->elevator_lock and fs_reclaim, then the lockdep warning can be killed. This way is safe because elevator setting isn't ready to run during queue initialization. There isn't such issue in __blk_mq_update_nr_hw_queues() because memalloc_noio_save() is called before acquiring elevator lock. 
Fixes the following lockdep warning: https://lore.kernel.org/linux-block/67e6b425.050a0220.2f068f.007b.GAE@google.com/ Reported-by: syzbot+4c7e0f9b94ad65811efb@syzkaller.appspotmail.com Cc: Nilay Shroff Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250403105402.1334206-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 0cfd1a149f64..c2697db59109 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4464,14 +4464,12 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( return NULL; } -static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, - struct request_queue *q) +static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, + struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i, j; - /* protect against switching io scheduler */ - mutex_lock(&q->elevator_lock); for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); @@ -4504,7 +4502,19 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, xa_for_each_start(&q->hctx_table, j, hctx, j) blk_mq_exit_hctx(q, set, hctx, j); - mutex_unlock(&q->elevator_lock); +} + +static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, + struct request_queue *q, bool lock) +{ + if (lock) { + /* protect against switching io scheduler */ + mutex_lock(&q->elevator_lock); + __blk_mq_realloc_hw_ctxs(set, q); + mutex_unlock(&q->elevator_lock); + } else { + __blk_mq_realloc_hw_ctxs(set, q); + } /* unregister cpuhp callbacks for exited hctxs */ blk_mq_remove_hw_queues_cpuhp(q); @@ -4536,7 +4546,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, xa_init(&q->hctx_table); - blk_mq_realloc_hw_ctxs(set, q); + blk_mq_realloc_hw_ctxs(set, q, false); if (!q->nr_hw_queues) goto err_hctxs; @@ -5032,7 +5042,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, fallback: 
blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { - blk_mq_realloc_hw_ctxs(set, q); + blk_mq_realloc_hw_ctxs(set, q, true); if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; -- 2.51.0 From 9364f17ba40422d2661da295bb0da68ca87cc57e Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Wed, 2 Apr 2025 21:45:44 +0800 Subject: [PATCH 07/16] bcachefs: Add error handling for zlib_deflateInit2() In attempt_compress(), the return value of zlib_deflateInit2() needs to be checked. A proper implementation can be found in pstore_compress(). Add an error check and return 0 immediately if the initialization fails. Fixes: 986e9842fb68 ("bcachefs: Compression levels") Signed-off-by: Wentao Liang Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 85fc90342492..28ed32449913 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -371,13 +371,14 @@ static int attempt_compress(struct bch_fs *c, }; zlib_set_workspace(&strm, workspace); - zlib_deflateInit2(&strm, + if (zlib_deflateInit2(&strm, compression.level ? clamp_t(unsigned, compression.level, Z_BEST_SPEED, Z_BEST_COMPRESSION) : Z_DEFAULT_COMPRESSION, Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, - Z_DEFAULT_STRATEGY); + Z_DEFAULT_STRATEGY) != Z_OK) + return 0; if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) return 0; -- 2.51.0 From b2ffadcc7f8fd2059e389d640f9c81febd606daf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Apr 2025 13:01:29 -0400 Subject: [PATCH 08/16] bcachefs: Fix scheduling while atomic from logging changes Two fixes from the recent logging changes: bch2_inconsistent(), bch2_fs_inconsistent() can be called from interrupt context, or with rcu_read_lock() held. 
The one syzbot found is in bch2_bkey_pick_read_device bch2_dev_rcu bch2_fs_inconsistent We're starting to switch to lift the printbufs up to higher levels so we can emit better log messages and print them all in one go (avoid garbling), so that conversion will help with spotting these in the future; when we declare a printbuf it must be flagged if we're in an atomic context. Secondly, in btree_node_write_endio: 00085 BUG: sleeping function called from invalid context at include/linux/sched/mm.h:321 00085 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 618, name: bch-reclaim/fa6 00085 preempt_count: 10001, expected: 0 00085 RCU nest depth: 0, expected: 0 00085 4 locks held by bch-reclaim/fa6/618: 00085 #0: ffffff80d7ccad68 (&j->reclaim_lock){+.+.}-{4:4}, at: bch2_journal_reclaim_thread+0x84/0x198 00085 #1: ffffff80d7c84218 (&c->btree_trans_barrier){.+.+}-{0:0}, at: __bch2_trans_get+0x1c0/0x440 00085 #2: ffffff80cd3f8140 (bcachefs_btree){+.+.}-{0:0}, at: __bch2_trans_get+0x22c/0x440 00085 #3: ffffff80c3823c20 (&vblk->vqs[i].lock){-.-.}-{3:3}, at: virtblk_done+0x58/0x130 00085 irq event stamp: 328 00085 hardirqs last enabled at (327): [] finish_task_switch.isra.0+0xbc/0x2a0 00085 hardirqs last disabled at (328): [] el1_interrupt+0x20/0x60 00085 softirqs last enabled at (0): [] copy_process+0x7c8/0x2118 00085 softirqs last disabled at (0): [<0000000000000000>] 0x0 00085 Preemption disabled at: 00085 [] irq_enter_rcu+0x18/0x90 00085 CPU: 8 UID: 0 PID: 618 Comm: bch-reclaim/fa6 Not tainted 6.14.0-rc6-ktest-g04630bde23e8 #18798 00085 Hardware name: linux,dummy-virt (DT) 00085 Call trace: 00085 show_stack+0x1c/0x30 (C) 00085 dump_stack_lvl+0x84/0xc0 00085 dump_stack+0x14/0x20 00085 __might_resched+0x180/0x288 00085 __might_sleep+0x4c/0x88 00085 __kmalloc_node_track_caller_noprof+0x34c/0x3e0 00085 krealloc_noprof+0x1a0/0x2d8 00085 bch2_printbuf_make_room+0x9c/0x120 00085 bch2_prt_printf+0x60/0x1b8 00085 btree_node_write_endio+0x1b0/0x2d8 00085 bio_endio+0x138/0x1f0 
00085 btree_node_write_endio+0xe8/0x2d8 00085 bio_endio+0x138/0x1f0 00085 blk_update_request+0x220/0x4c0 00085 blk_mq_end_request+0x28/0x148 00085 virtblk_request_done+0x64/0xe8 00085 blk_mq_complete_request+0x34/0x40 00085 virtblk_done+0x78/0x130 00085 vring_interrupt+0x6c/0xb0 00085 __handle_irq_event_percpu+0x8c/0x2e0 00085 handle_irq_event+0x50/0xb0 00085 handle_fasteoi_irq+0xc4/0x250 00085 handle_irq_desc+0x44/0x60 00085 generic_handle_domain_irq+0x20/0x30 00085 gic_handle_irq+0x54/0xc8 00085 call_on_irq_stack+0x24/0x40 Reported-by: syzbot+c82cd2906e2f192410bb@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 1 + fs/bcachefs/error.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index ac1f029a7eb2..5fd4a58d2ad2 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -2146,6 +2146,7 @@ static void btree_node_write_endio(struct bio *bio) if (ca && bio->bi_status) { struct printbuf buf = PRINTBUF; + buf.atomic++; prt_printf(&buf, "btree write error: %s\n ", bch2_blk_status_to_str(bio->bi_status)); bch2_btree_pos_to_text(&buf, c, b); diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index d4dfd13a8076..b885bd92834c 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -45,6 +45,8 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) bool bch2_inconsistent_error(struct bch_fs *c) { struct printbuf buf = PRINTBUF; + buf.atomic++; + printbuf_indent_add_nextline(&buf, 2); bool ret = __bch2_inconsistent_error(c, &buf); @@ -59,6 +61,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra const char *fmt, va_list args) { struct printbuf buf = PRINTBUF; + buf.atomic++; bch2_log_msg_start(c, &buf); -- 2.51.0 From 570f5050bb0739f24aeb94034d8ec134c450b4aa Mon Sep 17 00:00:00 2001 From: Bharadwaj Raju Date: Wed, 2 Apr 2025 23:45:53 +0530 Subject: [PATCH 09/16] bcachefs: use nonblocking variant of 
print_string_as_lines in error path The inconsistency error path calls print_string_as_lines, which calls console_lock, which is a potentially-sleeping function and so can't be called in an atomic context. Replace calls to it with the nonblocking variant which is safe to call. Signed-off-by: Bharadwaj Raju Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index b885bd92834c..baf5dfb32298 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -34,7 +34,7 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) journal_cur_seq(&c->journal)); return true; case BCH_ON_ERROR_panic: - bch2_print_string_as_lines(KERN_ERR, out->buf); + bch2_print_string_as_lines_nonblocking(KERN_ERR, out->buf); panic(bch2_fmt(c, "panic after error")); return true; default: @@ -71,7 +71,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra if (trans) bch2_trans_updates_to_text(&buf, trans); bool ret = __bch2_inconsistent_error(c, &buf); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf); printbuf_exit(&buf); return ret; -- 2.51.0 From 83d539b1b04705f972b53b4669fb587c54def0db Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Apr 2025 14:31:12 -0400 Subject: [PATCH 10/16] bcachefs: Fix check_snapshot_exists() restart handling Codepaths that create entries in the snapshots btree currently call bch2_mark_snapshot(), which updates the in-memory snapshot table, before transaction commit. This is because bch2_mark_snapshot() is an atomic trigger, run with btree write locks held, and isn't allowed to fail - but it might need to reallocate the table, hence we call it early when we're still allowed to fail. 
This is generally harmless - if we fail, we'll have left an entry in the snapshots table around, but nothing will reference it and it'll get overwritten if reused by another transaction. But check_snapshot_exists(), which reconstructs snapshots when the snapshots btree has been corrupted or lost, was erroneously rechecking if the snapshot exists inside the transaction commit loop - so on transaction restart (in this case mem_realloced), the second iteration would return without repairing. This code needs some cleanup: splitting out a "maybe realloc snapshots table" helper would have avoided this; that will be in the next patch. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index c8536eb36060..b7de29aed839 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -843,9 +843,6 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) { struct bch_fs *c = trans->c; - if (bch2_snapshot_exists(c, id)) - return 0; - /* Do we need to reconstruct the snapshot_tree entry as well? */ struct btree_iter iter; struct bkey_s_c k; -- 2.51.0 From 39ebd74864f5c4f7d44f1fe026c71a270631186b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Apr 2025 12:48:23 -0400 Subject: [PATCH 11/16] bcachefs: Fix null ptr deref in invalidate_one_bucket() bch2_backpointer_get_key() returns bkey_s_c_null when the target isn't found. backpointer_get_key() flags the error, so there's nothing else to do here - just skip it and move on. 
Link: https://github.com/koverstreet/bcachefs/issues/847 Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 6b6c2521c11f..94ea9e49aec4 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2084,6 +2084,9 @@ static int invalidate_one_bp(struct btree_trans *trans, if (ret) return ret; + if (!extent_k.k) + return 0; + struct bkey_i *n = bch2_bkey_make_mut(trans, &extent_iter, &extent_k, BTREE_UPDATE_internal_snapshot_node); -- 2.51.0 From 2581f89ac8d7174fda975523ae6c2bdc8ad62144 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Apr 2025 12:54:25 -0400 Subject: [PATCH 12/16] bcachefs: backpointer_get_key: check for null from peek_slot() peek_slot() doesn't normally return bkey_s_c_null - except when we ask for a key at a btree level that doesn't exist, which can happen here. We might want to revisit this, but we'll have to look over all the places where we use peek_slot() on interior nodes. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index dc1cd8de18ac..ff26bb515150 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -258,6 +258,18 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, return k; } + /* + * peek_slot() doesn't normally return NULL - except when we ask for a + * key at a btree level that doesn't exist. 
+ * + * We may want to revisit this and change peek_slot(): + */ + if (!k.k) { + bkey_init(&iter->k); + iter->k.p = bp.v->pos; + k.k = &iter->k; + } + if (k.k && extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) return k; -- 2.51.0 From 77ad1df82b9e8d169e3ec9ee8b7caabfa45872ce Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Apr 2025 12:23:25 -0400 Subject: [PATCH 13/16] bcachefs: Fix "journal stuck" during recovery If we crash when the journal pin fifo is completely full - i.e. we're at the maximum number of dirty journal entries - that may put us in a sticky situation in recovery, as journal replay will need to be able to open new journal entries in order to get going. bch2_fs_journal_start() already had provisions for resizing the journal pin fifo if needed, but it needs a fudge factor to ensure there's room for journal replay. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 11f104f436e3..d8f74b6d0a75 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1404,6 +1404,14 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) nr = cur_seq - last_seq; + /* + * Extra fudge factor, in case we crashed when the journal pin fifo was + * nearly or completely full. We'll need to be able to open additional + * journal entries (at least a few) in order for journal replay to get + * going: + */ + nr += nr / 4; + if (nr + 1 > j->pin.size) { free_fifo(&j->pin); init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); -- 2.51.0 From c0dbd11ada2c94edc337a5f6665cbaa6079ff785 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 3 Apr 2025 16:43:50 +0200 Subject: [PATCH 14/16] fs: actually hold the namespace semaphore Don't use a scoped guard that only protects the next statement. Use a regular guard to make sure that the namespace semaphore is held across the whole function. 
Signed-off-by: Christian Brauner Reported-by: Leon Romanovsky Link: https://lore.kernel.org/all/20250401170715.GA112019@unreal/ Fixes: db04662e2f4f ("fs: allow detached mounts in clone_private_mount()") Signed-off-by: Linus Torvalds --- fs/namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 16292ff760c9..14935a0500a2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2478,7 +2478,8 @@ struct vfsmount *clone_private_mount(const struct path *path) struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; - scoped_guard(rwsem_read, &namespace_sem) + guard(rwsem_read)(&namespace_sem); + if (IS_MNT_UNBINDABLE(old_mnt)) return ERR_PTR(-EINVAL); -- 2.51.0 From 72070e57b0a518ec8e562a2b68fdfc796ef5c040 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 4 Apr 2025 08:18:49 +0800 Subject: [PATCH 15/16] selftests: ublk: fix test_stripe_04 Commit 57ed58c13256 ("selftests: ublk: enable zero copy for stripe target") added test entry of test_stripe_04, but forgot to add the test script. So fix the test by adding the script file. Reported-by: Uday Shankar Signed-off-by: Ming Lei Reviewed-by: Uday Shankar Link: https://lore.kernel.org/r/20250404001849.1443064-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- .../testing/selftests/ublk/test_stripe_04.sh | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100755 tools/testing/selftests/ublk/test_stripe_04.sh diff --git a/tools/testing/selftests/ublk/test_stripe_04.sh b/tools/testing/selftests/ublk/test_stripe_04.sh new file mode 100755 index 000000000000..1f2b642381d1 --- /dev/null +++ b/tools/testing/selftests/ublk/test_stripe_04.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. 
"$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="stripe_04" +ERR_CODE=0 + +_prep_test "stripe" "mkfs & mount & umount on zero copy" + +backfile_0=$(_create_backfile 256M) +backfile_1=$(_create_backfile 256M) +dev_id=$(_add_ublk_dev -t stripe -z -q 2 "$backfile_0" "$backfile_1") +_check_add_dev $TID $? "$backfile_0" "$backfile_1" + +_mkfs_mount_test /dev/ublkb"${dev_id}" +ERR_CODE=$? + +_cleanup_test "stripe" + +_remove_backfile "$backfile_0" +_remove_backfile "$backfile_1" + +_show_result $TID $ERR_CODE -- 2.51.0 From d05af90d6218e9c8f1c2026990c3f53c1b41bfb0 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 25 Mar 2025 09:57:46 +0800 Subject: [PATCH 16/16] md/raid10: fix missing discard IO accounting md_account_bio() is not called from raid10_handle_discard(), now that we handle bitmap inside md_account_bio(), also fix missing bitmap_startwrite for discard. Test whole disk discard for 20G raid10: Before: Device d/s dMB/s drqm/s %drqm d_await dareq-sz md0 48.00 16.00 0.00 0.00 5.42 341.33 After: Device d/s dMB/s drqm/s %drqm d_await dareq-sz md0 68.00 20462.00 0.00 0.00 2.65 308133.65 Link: https://lore.kernel.org/linux-raid/20250325015746.3195035-1-yukuai1@huaweicloud.com Fixes: 528bc2cf2fcc ("md/raid10: enable io accounting") Signed-off-by: Yu Kuai Acked-by: Coly Li --- drivers/md/raid10.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 846c5f29486e..ba32bac975b8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1735,6 +1735,7 @@ retry_discard: * The discard bio returns only first r10bio finishes */ if (first_copy) { + md_account_bio(mddev, &bio); r10_bio->master_bio = bio; set_bit(R10BIO_Discard, &r10_bio->state); first_copy = false; -- 2.51.0