From 28580052e634fe8fa327e6f25c35590374be754b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 May 2025 12:38:00 -0400 Subject: [PATCH 01/16] bcachefs: add missing sched_annotate_sleep() 00594 ------------[ cut here ]------------ 00594 do not call blocking ops when !TASK_RUNNING; state=2 set at [<000000003e51ef4a>] prepare_to_wait_event+0x5c/0x1c0 00594 WARNING: CPU: 12 PID: 1117 at kernel/sched/core.c:8741 __might_sleep+0x74/0x88 00594 Modules linked in: 00594 CPU: 12 UID: 0 PID: 1117 Comm: umount Not tainted 6.15.0-rc4-ktest-g3a72e369412d #21845 PREEMPT 00594 Hardware name: linux,dummy-virt (DT) 00594 pstate: 60001005 (nZCv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--) 00594 pc : __might_sleep+0x74/0x88 00594 lr : __might_sleep+0x74/0x88 00594 sp : ffffff80c8d67a90 00594 x29: ffffff80c8d67a90 x28: ffffff80f5903500 x27: 0000000000000000 00594 x26: 0000000000000000 x25: ffffff80cf5002a0 x24: ffffffc087dad000 00594 x23: ffffff80c8d67b40 x22: 0000000000000000 x21: 0000000000000000 00594 x20: 0000000000000242 x19: ffffffc080b92020 x18: 00000000ffffffff 00594 x17: 30303c5b20746120 x16: 74657320323d6574 x15: 617473203b474e49 00594 x14: 0000000000000001 x13: 00000000000c0000 x12: ffffff80facc0000 00594 x11: 0000000000000001 x10: 0000000000000001 x9 : ffffffc0800b0774 00594 x8 : c0000000fffbffff x7 : ffffffc087dac670 x6 : 00000000015fffa8 00594 x5 : ffffff80facbffa8 x4 : ffffff80fbd30b90 x3 : 0000000000000000 00594 x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffffff80f5903500 00594 Call trace: 00594 __might_sleep+0x74/0x88 (P) 00594 __mutex_lock+0x64/0x8d8 00594 mutex_lock_nested+0x28/0x38 00594 bch2_fs_ec_flush+0xf8/0x128 00594 __bch2_fs_read_only+0x54/0x1d8 00594 bch2_fs_read_only+0x3e0/0x438 00594 __bch2_fs_stop+0x5c/0x250 00594 bch2_put_super+0x18/0x28 00594 generic_shutdown_super+0x6c/0x140 00594 bch2_kill_sb+0x1c/0x38 00594 deactivate_locked_super+0x54/0xd0 00594 deactivate_super+0x70/0x90 00594 cleanup_mnt+0xec/0x188 00594 __cleanup_mnt+0x18/0x28 00594 task_work_run+0x90/0xd8 00594 do_notify_resume+0x138/0x148 00594 el0_svc+0x9c/0xa0 00594 el0t_64_sync_handler+0x104/0x130 00594 el0t_64_sync+0x154/0x158 Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index a396865e8b17..fff58b78327c 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -2204,10 +2204,10 @@ void bch2_fs_ec_stop(struct bch_fs *c) static bool bch2_fs_ec_flush_done(struct bch_fs *c) { - bool ret; + sched_annotate_sleep(); mutex_lock(&c->ec_stripe_new_lock); - ret = list_empty(&c->ec_stripe_new_list); + bool ret = list_empty(&c->ec_stripe_new_list); mutex_unlock(&c->ec_stripe_new_lock); return ret; -- 2.51.0 From 6846100b00d97d3d6f05766ae86a0d821d849e78 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Fri, 2 May 2025 04:01:31 +0800 Subject: [PATCH 02/16] bcachefs: Remove incorrect __counted_by annotation This actually reverts 86e92eeeb237 ("bcachefs: Annotate struct bch_xattr with __counted_by()"). After the name stored in x_name, there is a value. According to the discussion[1], __counted_by assumes that the flexible array member contains exactly the number of elements that are specified. Now users have come across a false positive detection of an out-of-bounds write caused by the __counted_by here[2], so revert that.
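To illustrate (a sketch - x, src and src_len are hypothetical, the fields are those of the struct in the diff below): the value lives in the same flexible array, immediately after the name, so writing it stays within the on-disk allocation but runs past the x_name_len elements the annotation promised:

	__u8 *name = x->x_name;				/* x_name_len bytes */
	__u8 *val  = x->x_name + x->x_name_len;		/* x_val_len bytes follow */

	memcpy(val, src, src_len);	/* fine on disk, but out of bounds
					 * according to __counted_by(x_name_len) */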
[1] https://lore.kernel.org/lkml/Zv8VDKWN1GzLRT-_@archlinux/T/#m0ce9541c5070146320efd4f928cc1ff8de69e9b2 [2] https://privatebin.net/?a0d4e97d590d71e1#9bLmp2Kb5NU6X6cZEucchDcu88HzUQwHUah8okKPReEt Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr_format.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h index c7916011ef34..67426e33d04e 100644 --- a/fs/bcachefs/xattr_format.h +++ b/fs/bcachefs/xattr_format.h @@ -13,7 +13,13 @@ struct bch_xattr { __u8 x_type; __u8 x_name_len; __le16 x_val_len; - __u8 x_name[] __counted_by(x_name_len); + /* + * x_name contains the name and value counted by + * x_name_len + x_val_len. The introduction of + * __counted_by(x_name_len) caused a false positive + * detection of an out of bounds write. + */ + __u8 x_name[]; } __packed __aligned(8); #endif /* _BCACHEFS_XATTR_FORMAT_H */ -- 2.51.0 From df2e19a883fdea698cdbed4987987db999b19d58 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 May 2025 13:50:09 -0400 Subject: [PATCH 03/16] bcachefs: thread_with_stdio: fix spinning instead of exiting bch2_stdio_redirect_vprintf() was missing a check for stdio->done, i.e. exiting. This caused the thread attempting to print to spin, and since it was being called from the kthread run by thread_with_stdio, the userspace side hung as well. Change it to return -EPIPE - i.e. writing to a pipe that's been closed. Reported-by: Jan Solanti Signed-off-by: Kent Overstreet --- fs/bcachefs/thread_with_file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index dea73bc1cb51..314a24d15d4e 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -455,8 +455,10 @@ ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocki struct stdio_buf *buf = &stdio->output; unsigned long flags; ssize_t ret; - again: + if (stdio->done) + return -EPIPE; + spin_lock_irqsave(&buf->lock, flags); ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args); spin_unlock_irqrestore(&buf->lock, flags); -- 2.51.0 From 844f766e02d0aba973d78f3ade38ad8bae399347 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 May 2025 15:01:34 -0400 Subject: [PATCH 04/16] bcachefs: Improve want_cached_ptr() If promote_target isn't set, rebalance should still leave a cached copy on the faster device. Fall back to foreground_target if it's set, or allow a cached copy on any device if neither is set.
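In outline, the check in want_cached_ptr() becomes (mirroring the diff below):

	unsigned target = opts->promote_target ?: opts->foreground_target;

	if (target && !bch2_dev_in_target(c, ptr->dev, target))
		return false;	/* device isn't in the chosen faster target */
	/* neither target set: a cached copy on any device is fine */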
Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index dca2b8425cc0..e597fb9c9823 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1056,8 +1056,9 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, struct bch_extent_ptr *ptr) { - if (!opts->promote_target || - !bch2_dev_in_target(c, ptr->dev, opts->promote_target)) + unsigned target = opts->promote_target ?: opts->foreground_target; + + if (target && !bch2_dev_in_target(c, ptr->dev, target)) return false; struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); -- 2.51.0 From 50a7b899a0d806f961edadfc3357cb6679826795 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 May 2025 16:31:40 -0400 Subject: [PATCH 05/16] bcachefs: Ensure proper write alignment There was a buggy version of bcachefs-tools which picked misaligned bucket sizes when formatting, and we're also about to do dynamic block sizes - which will allow picking logical block size or physical block size of the device per-write, allowing for better compression ratios at the cost of slightly worse write performance (i.e. forcing the device to do RMW or extra buffering). To account for this, tweak bch2_alloc_sectors_start() to properly align open_buckets to the blocksize of the write we're about to do. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index effafc3e0ced..7ec022e9361a 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1422,11 +1422,31 @@ alloc_done: wp->sectors_free = UINT_MAX; - open_bucket_for_each(c, &wp->ptrs, ob, i) + open_bucket_for_each(c, &wp->ptrs, ob, i) { + /* + * Ensure proper write alignment - either due to misaligned + * bucket sizes (from buggy bcachefs-tools), or writes that mix + * logical/physical alignment: + */ + struct bch_dev *ca = ob_dev(c, ob); + u64 offset = bucket_to_sector(ca, ob->bucket) + + ca->mi.bucket_size - + ob->sectors_free; + unsigned align = round_up(offset, block_sectors(c)) - offset; + + ob->sectors_free = max_t(int, 0, ob->sectors_free - align); + wp->sectors_free = min(wp->sectors_free, ob->sectors_free); + } wp->sectors_free = rounddown(wp->sectors_free, block_sectors(c)); + /* Did alignment use up space in an open_bucket? */ + if (unlikely(!wp->sectors_free)) { + bch2_alloc_sectors_done(c, wp); + goto retry; + } + BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); return 0; -- 2.51.0 From 7a69fa65718a7258aa89fca06b975f4357daeac3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 May 2025 14:13:21 -0400 Subject: [PATCH 06/16] bcachefs: Add missing barriers before wake_up_bit() wake_up() doesn't require a barrier - but wake_up_bit() does. This only affected non-x86, and primarily led to lost wakeups after btree node reads.
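For reference, the pattern every site now follows - bucket_unlock() in the diff below is one instance:

	clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock);
	smp_mb__after_atomic();	/* order the clear before the waiter check */
	wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR);

wake_up_bit() checks for waiters with an unlocked test; without the full barrier that test can be reordered against the bit clear, and a waiter that samples the bit in the window sleeps forever.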
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 9 ++++++++- fs/bcachefs/buckets.h | 1 + fs/bcachefs/ec.h | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 5fd4a58d2ad2..60782f3e5aec 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -41,6 +41,7 @@ void bch2_btree_node_io_unlock(struct btree *b) clear_btree_node_write_in_flight_inner(b); clear_btree_node_write_in_flight(b); + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); } @@ -1400,6 +1401,7 @@ start: printbuf_exit(&buf); clear_btree_node_read_in_flight(b); + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); } @@ -1595,6 +1597,7 @@ fsck_err: printbuf_exit(&buf); clear_btree_node_read_in_flight(b); + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); } @@ -1721,6 +1724,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, set_btree_node_read_error(b); bch2_btree_lost_data(c, b->c.btree_id); clear_btree_node_read_in_flight(b); + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); printbuf_exit(&buf); return; @@ -2061,8 +2065,10 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start if (new & (1U << BTREE_NODE_write_in_flight)) __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); - else + else { + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); + } } static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) @@ -2175,6 +2181,7 @@ static void btree_node_write_endio(struct bio *bio) } clear_btree_node_write_in_flight_inner(b); + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); INIT_WORK(&wb->work, btree_node_write_work); queue_work(c->btree_io_complete_wq, &wb->work); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 8d75b27a1418..af1532de4a37 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -44,6 +44,7 @@ static inline void bucket_unlock(struct bucket *b) BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock); + smp_mb__after_atomic(); wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR); } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 62d27e04d763..51893e1ee874 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -160,6 +160,7 @@ static inline void gc_stripe_unlock(struct gc_stripe *s) BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock); + smp_mb__after_atomic(); wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR); } -- 2.51.0 From aed4ccbf4595e08f3fa80c7faaf1d2188d61bc70 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 May 2025 18:46:16 -0400 Subject: [PATCH 07/16] bcachefs: fix hung task timeout in journal read Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 63cdf885c9e2..ded18a94ed02 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -19,6 +19,7 @@ #include #include +#include <linux/sched/sysctl.h> void bch2_journal_pos_from_member_info_set(struct bch_fs *c) { @@ -1262,7 +1263,8 @@ int bch2_journal_read(struct bch_fs *c, degraded = true; } - closure_sync(&jlist.cl); + while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * 
HZ / 2)) + ; if (jlist.ret) return jlist.ret; -- 2.51.0 From 9c618560994794d6f477e81f3b695fe799feec8a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 May 2025 15:52:57 -0400 Subject: [PATCH 08/16] bcachefs: Call bch2_fs_start before getting vfs superblock This reverts 1fdbe0b184c8 ("bcachefs: Make sure c->vfs_sb is set before starting fs"), which switched up bch2_fs_get_tree() to get a superblock before calling bch2_fs_start, so that c->vfs_sb would always be initialized while the filesystem was active. This turned out not to be necessary, because blk_holder_ops were implemented using our own locking, not vfs locking. And this had the side effect of creating a super_block and doing our full recovery (including potentially fsck) before setting SB_BORN, which causes things like sync calls to hang until our recovery is finished. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 113db85b6ef9..b6801861c66f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2502,10 +2502,9 @@ static int bch2_fs_get_tree(struct fs_context *fc) bch2_opts_apply(&c->opts, opts); - /* - * need to initialise sb and set c->vfs_sb _before_ starting fs, - * for blk_holder_ops - */ + ret = bch2_fs_start(c); + if (ret) + goto err_stop_fs; sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); ret = PTR_ERR_OR_ZERO(sb); @@ -2567,10 +2566,6 @@ got_sb: sb->s_shrink->seeks = 0; - ret = bch2_fs_start(c); - if (ret) - goto err_put_super; - #ifdef CONFIG_UNICODE sb->s_encoding = c->cf_encoding; #endif -- 2.51.0 From 473f09f362e5979efbff40c9bbce58892ad39d66 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 May 2025 00:22:26 -0400 Subject: [PATCH 09/16] bcachefs: journal_shutdown is EROFS, not EIO We often filter out EROFS errors to avoid log spew after an emergency shutdown - journal_shutdown is just another emergency shutdown error. Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index a615e4852ded..d9ebffa5b3a2 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -269,7 +269,7 @@ x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ - x(EIO, journal_shutdown) \ + x(EROFS, journal_shutdown) \ x(EIO, journal_flush_err) \ x(EIO, journal_write_err) \ x(EIO, btree_node_read_err) \ -- 2.51.0 From 2fea3aa76e352cd071bee71e5c43da339639b310 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 May 2025 13:50:00 -0400 Subject: [PATCH 10/16] bcachefs: Filter out harmless EROFS error messages These just indicate that we're shutting down. 
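This composes with the earlier journal_shutdown change: bch2_err_matches() matches a private error code against its parent class, so one check covers the whole shutdown family (sketch of the change below):

	if (!bch2_err_matches(ret, EROFS))	/* now also true for -BCH_ERR_journal_shutdown */
		bch_err_msg(c, ret, "flushing btree write buffer");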
Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index fc396b9fa754..dfdbb9259985 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -784,7 +784,8 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, goto err; ret = bch2_btree_write_buffer_tryflush(trans); - bch_err_msg(c, ret, "flushing btree write buffer"); + if (!bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "flushing btree write buffer"); if (ret) goto err; -- 2.51.0 From da18dabc3784663a088943e613c36cd17aeb52d3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 May 2025 16:54:25 -0400 Subject: [PATCH 11/16] bcachefs: Ensure superblock gets written when we go ERO When we go emergency read-only, make sure we do a final write_super() to persist counters and error counts - this can be critical for piecing together what fsck was doing. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 27943082c093..84a37d971ffd 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -377,6 +377,11 @@ void bch2_fs_read_only(struct bch_fs *c) bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); } else { + /* Make sure error counts/counters are persisted */ + mutex_lock(&c->sb_lock); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + bch_verbose(c, "done going read-only, filesystem not clean"); } } -- 2.51.0 From 8e4d28036c293241b312b1fceafb32b994f80fcc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 May 2025 13:32:15 -0400 Subject: [PATCH 12/16] bcachefs: Don't aggressively discard the journal We frequently use 'bcachefs list_journal -a' for debugging, as it provides a record of all btree transactions, and a history of what happened. But it's not so useful if we immediately discard journal buckets right after they're no longer dirty. This tweaks journal reclaim to only discard when we're low on space, keeping the journal mostly un-discarded. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ea670c3c43d8..976464d8a695 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -266,10 +266,11 @@ out: static bool should_discard_bucket(struct journal *j, struct journal_device *ja) { - bool ret; - spin_lock(&j->lock); - ret = ja->discard_idx != ja->dirty_idx_ondisk; + unsigned min_free = max(4, ja->nr / 8); + + bool ret = bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < min_free && + ja->discard_idx != ja->dirty_idx_ondisk; spin_unlock(&j->lock); return ret; -- 2.51.0 From cd52cc3544e400e53e6d4b7bfc5263e7a867b5ab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 11:30:21 -0400 Subject: [PATCH 13/16] bcachefs: Don't strip rebalance_opts from indirect extents Fix bch2_bkey_clear_needs_rebalance(): indirect extents are never supposed to have bch_extent_rebalance stripped off, because that's how we get the IO path options when we don't have the original inode it belonged to. 
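Since indirect extents are KEY_TYPE_reflink_v keys, the fix is a single early return (mirroring the change below):

	if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
		return 0;	/* indirect extents keep bch_extent_rebalance */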
Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 4ccdfc1f34aa..623273556aa9 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -309,7 +309,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { - if (!bch2_bkey_rebalance_opts(k)) + if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k)) return 0; struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); -- 2.51.0 From b1c71cb492bb26dc883b11b5bd085d2467508f84 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 14:42:37 -0400 Subject: [PATCH 14/16] bcachefs: Fix broken btree_path lock invariants in next_node() This fixes btree locking assert pops users were seeing during evacuate: https://github.com/koverstreet/bcachefs/issues/878 May 09 22:45:02 sharon kernel: bcachefs (68116e25-fa2d-4c6f-86c7-e8b431d792ae): bch2_btree_insert_node(): node not locked at level 1 May 09 22:45:02 sharon kernel: bch2_btree_node_rewrite [bcachefs]: watermark=btree no_check_rw alloc l=0-1 mode=none nodes_written=0 cl.remaining=2 journal_seq=0 May 09 22:45:02 sharon kernel: path: idx 1 ref 1:0 S B btree=alloc level=0 pos 0:3699637:0 0:3698012:1-0:3699637:0 bch2_move_btree.isra.0+0x1db/0x490 [bcachefs] uptodate 0 locks_want 2 May 09 22:45:02 sharon kernel: l=0 locks intent seq 4 node ffff8bd700c93600 May 09 22:45:02 sharon kernel: l=1 locks unlocked seq 1712 node ffff8bd6fd5e7a00 May 09 22:45:02 sharon kernel: l=2 locks unlocked seq 2295 node ffff8bd6cc725400 May 09 22:45:02 sharon kernel: l=3 locks unlocked seq 0 node 0000000000000000 Evacuate walks btree nodes with bch2_btree_iter_next_node() and rewrites them; bch2_btree_update_start() upgrades the path to take intent locks as far as it needs to. But next_node() does low-level unlock/relock calls on individual nodes, and didn't handle the case where a path is supposed to be holding multiple intent locks. If a path has locks_want > 1, it needs to be holding locks on all the btree nodes (at each level) requested, or none of them. Fix this with a bch2_btree_path_downgrade(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 59fa527ac685..9c9bc7b6c712 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1971,6 +1971,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_ return NULL; } + /* + * We don't correctly handle nodes with extra intent locks here: + * downgrade so we don't violate locking invariants + */ + bch2_btree_path_downgrade(trans, path); + if (!bch2_btree_node_relock(trans, path, path->level + 1)) { __bch2_btree_path_unlock(trans, path); path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); -- 2.51.0 From 7b6759b1991d427cf9a562b2891b8c3e87a19c76 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 May 2025 12:55:44 -0400 Subject: [PATCH 15/16] bcachefs: Fix livelock in journal_entry_open() When the journal is low on space, we might do discards from journal_res_get() -> journal_entry_open(). Make sure we set j->can_discard correctly, so that we don't livelock when we're low on space for some reason other than discards not keeping up. 
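The fix factors the condition into __should_discard_bucket(), so j->can_discard and the discard path evaluate the same thing (sketch of the change below; e.g. with ja->nr = 64 journal buckets, discards start once fewer than max(4, 64 / 8) = 8 discarded buckets are available):

	static bool __should_discard_bucket(struct journal *j, struct journal_device *ja)
	{
		unsigned min_free = max(4, ja->nr / 8);

		return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < min_free &&
			ja->discard_idx != ja->dirty_idx_ondisk;
	}

	/* in bch2_journal_space_available(): */
	can_discard |= __should_discard_bucket(j, ja);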
Fixes: 8e4d28036c29 ("bcachefs: Don't aggressively discard the journal") Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 976464d8a695..cc00b0fc40d8 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -17,6 +17,8 @@ #include #include +static bool __should_discard_bucket(struct journal *, struct journal_device *); + /* Free space calculations: */ static unsigned journal_space_from(struct journal_device *ja, @@ -203,8 +205,7 @@ void bch2_journal_space_available(struct journal *j) ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; - if (ja->discard_idx != ja->dirty_idx_ondisk) - can_discard = true; + can_discard |= __should_discard_bucket(j, ja); max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); nr_online++; @@ -264,13 +265,19 @@ out: /* Discards - last part of journal reclaim: */ -static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +static bool __should_discard_bucket(struct journal *j, struct journal_device *ja) { - spin_lock(&j->lock); unsigned min_free = max(4, ja->nr / 8); - bool ret = bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < min_free && + return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < + min_free && ja->discard_idx != ja->dirty_idx_ondisk; +} + +static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +{ + spin_lock(&j->lock); + bool ret = __should_discard_bucket(j, ja); spin_unlock(&j->lock); return ret; -- 2.51.0 From 19b22d04cd44ded1ea5af7849aec9cbf4021c852 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 May 2025 14:27:01 -0400 Subject: [PATCH 16/16] bcachefs: Don't set btree nodes as accessed on fill Prevent jobs that do lots of scanning (e.g. evacuate, scrub) from causing OOMs. The shrinker code seems to be having issues when it doesn't do any freeing because it's just flipping off the accessed bit - and the accessed bit shouldn't be set on first use anyway. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 9b80201c7982..899891295797 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -852,7 +852,6 @@ out: b->sib_u64s[1] = 0; b->whiteout_u64s = 0; bch2_btree_keys_init(b); - set_btree_node_accessed(b); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); @@ -1286,6 +1285,10 @@ lock_node: six_unlock_read(&b->c.lock); goto retry; } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); } /* XXX: waiting on IO with btree locks held: */ @@ -1301,10 +1304,6 @@ lock_node: prefetch(p + L1_CACHE_BYTES * 2); } - /* avoid atomic set bit if it's not needed: */ - if (!btree_node_accessed(b)) - set_btree_node_accessed(b); - if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); -- 2.51.0