From 28580052e634fe8fa327e6f25c35590374be754b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 May 2025 12:38:00 -0400 Subject: [PATCH 01/16] bcachefs: add missing sched_annotate_sleep() 00594 ------------[ cut here ]------------ 00594 do not call blocking ops when !TASK_RUNNING; state=2 set at [<000000003e51ef4a>] prepare_to_wait_event+0x5c/0x1c0 00594 WARNING: CPU: 12 PID: 1117 at kernel/sched/core.c:8741 __might_sleep+0x74/0x88 00594 Modules linked in: 00594 CPU: 12 UID: 0 PID: 1117 Comm: umount Not tainted 6.15.0-rc4-ktest-g3a72e369412d #21845 PREEMPT 00594 Hardware name: linux,dummy-virt (DT) 00594 pstate: 60001005 (nZCv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--) 00594 pc : __might_sleep+0x74/0x88 00594 lr : __might_sleep+0x74/0x88 00594 sp : ffffff80c8d67a90 00594 x29: ffffff80c8d67a90 x28: ffffff80f5903500 x27: 0000000000000000 00594 x26: 0000000000000000 x25: ffffff80cf5002a0 x24: ffffffc087dad000 00594 x23: ffffff80c8d67b40 x22: 0000000000000000 x21: 0000000000000000 00594 x20: 0000000000000242 x19: ffffffc080b92020 x18: 00000000ffffffff 00594 x17: 30303c5b20746120 x16: 74657320323d6574 x15: 617473203b474e49 00594 x14: 0000000000000001 x13: 00000000000c0000 x12: ffffff80facc0000 00594 x11: 0000000000000001 x10: 0000000000000001 x9 : ffffffc0800b0774 00594 x8 : c0000000fffbffff x7 : ffffffc087dac670 x6 : 00000000015fffa8 00594 x5 : ffffff80facbffa8 x4 : ffffff80fbd30b90 x3 : 0000000000000000 00594 x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffffff80f5903500 00594 Call trace: 00594 __might_sleep+0x74/0x88 (P) 00594 __mutex_lock+0x64/0x8d8 00594 mutex_lock_nested+0x28/0x38 00594 bch2_fs_ec_flush+0xf8/0x128 00594 __bch2_fs_read_only+0x54/0x1d8 00594 bch2_fs_read_only+0x3e0/0x438 00594 __bch2_fs_stop+0x5c/0x250 00594 bch2_put_super+0x18/0x28 00594 generic_shutdown_super+0x6c/0x140 00594 bch2_kill_sb+0x1c/0x38 00594 deactivate_locked_super+0x54/0xd0 00594 deactivate_super+0x70/0x90 00594 cleanup_mnt+0xec/0x188 00594 __cleanup_mnt+0x18/0x28 00594 task_work_run+0x90/0xd8 00594 do_notify_resume+0x138/0x148 00594 el0_svc+0x9c/0xa0 00594 el0t_64_sync_handler+0x104/0x130 00594 el0t_64_sync+0x154/0x158 Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index a396865e8b17..fff58b78327c 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -2204,10 +2204,10 @@ void bch2_fs_ec_stop(struct bch_fs *c) static bool bch2_fs_ec_flush_done(struct bch_fs *c) { - bool ret; + sched_annotate_sleep(); mutex_lock(&c->ec_stripe_new_lock); - ret = list_empty(&c->ec_stripe_new_list); + bool ret = list_empty(&c->ec_stripe_new_list); mutex_unlock(&c->ec_stripe_new_lock); return ret; -- 2.51.0 From 6846100b00d97d3d6f05766ae86a0d821d849e78 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Fri, 2 May 2025 04:01:31 +0800 Subject: [PATCH 02/16] bcachefs: Remove incorrect __counted_by annotation This actually reverts 86e92eeeb237 ("bcachefs: Annotate struct bch_xattr with __counted_by()"). After the name stored in x_name, there is a value. According to the discussion[1], __counted_by assumes that the flexible array member contains exactly the number of elements that are specified. Now users have come across a false positive detection of an out-of-bounds write caused by the __counted_by here[2], so revert that.
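To illustrate (a sketch - x, src and src_len are hypothetical, the fields are those of the struct in the diff below): the value lives in the same flexible array, immediately after the name, so writing it stays within the on-disk allocation but runs past the x_name_len elements the annotation promised:

	__u8 *name = x->x_name;				/* x_name_len bytes */
	__u8 *val  = x->x_name + x->x_name_len;		/* x_val_len bytes follow */

	memcpy(val, src, src_len);	/* fine on disk, but out of bounds
					 * according to __counted_by(x_name_len) */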
[1] https://lore.kernel.org/lkml/Zv8VDKWN1GzLRT-_@archlinux/T/#m0ce9541c5070146320efd4f928cc1ff8de69e9b2 [2] https://privatebin.net/?a0d4e97d590d71e1#9bLmp2Kb5NU6X6cZEucchDcu88HzUQwHUah8okKPReEt Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr_format.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h index c7916011ef34..67426e33d04e 100644 --- a/fs/bcachefs/xattr_format.h +++ b/fs/bcachefs/xattr_format.h @@ -13,7 +13,13 @@ struct bch_xattr { __u8 x_type; __u8 x_name_len; __le16 x_val_len; - __u8 x_name[] __counted_by(x_name_len); + /* + * x_name contains the name and value counted by + * x_name_len + x_val_len. The introduction of + * __counted_by(x_name_len) caused a false positive + * detection of an out of bounds write. + */ + __u8 x_name[]; } __packed __aligned(8); #endif /* _BCACHEFS_XATTR_FORMAT_H */ -- 2.51.0 From df2e19a883fdea698cdbed4987987db999b19d58 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 May 2025 13:50:09 -0400 Subject: [PATCH 03/16] bcachefs: thread_with_stdio: fix spinning instead of exiting bch2_stdio_redirect_vprintf() was missing a check for stdio->done, i.e. exiting. This caused the thread attempting to print to spin, and since it was being called from the kthread run by thread_with_stdio, the userspace side hung as well. Change it to return -EPIPE - i.e. writing to a pipe that's been closed. Reported-by: Jan Solanti Signed-off-by: Kent Overstreet --- fs/bcachefs/thread_with_file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index dea73bc1cb51..314a24d15d4e 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -455,8 +455,10 @@ ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocki struct stdio_buf *buf = &stdio->output; unsigned long flags; ssize_t ret; - again: + if (stdio->done) + return -EPIPE; + spin_lock_irqsave(&buf->lock, flags); ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args); spin_unlock_irqrestore(&buf->lock, flags); -- 2.51.0 From 844f766e02d0aba973d78f3ade38ad8bae399347 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 May 2025 15:01:34 -0400 Subject: [PATCH 04/16] bcachefs: Improve want_cached_ptr() If promote_target isn't set, rebalance should still leave a cached copy on the faster device. Fall back to foreground_target if it's set, or allow a cached copy on any device if neither is set.
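In outline, the check in want_cached_ptr() becomes (mirroring the diff below):

	unsigned target = opts->promote_target ?: opts->foreground_target;

	if (target && !bch2_dev_in_target(c, ptr->dev, target))
		return false;	/* device isn't in the chosen faster target */
	/* neither target set: a cached copy on any device is fine */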
Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index dca2b8425cc0..e597fb9c9823 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1056,8 +1056,9 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, struct bch_extent_ptr *ptr) { - if (!opts->promote_target || - !bch2_dev_in_target(c, ptr->dev, opts->promote_target)) + unsigned target = opts->promote_target ?: opts->foreground_target; + + if (target && !bch2_dev_in_target(c, ptr->dev, target)) return false; struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); -- 2.51.0 From 50a7b899a0d806f961edadfc3357cb6679826795 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 May 2025 16:31:40 -0400 Subject: [PATCH 05/16] bcachefs: Ensure proper write alignment There was a buggy version of bcachefs-tools which picked misaligned bucket sizes when formatting, and we're also about to do dynamic block sizes - which will allow picking logical block size or physical block size of the device per-write, allowing for better compression ratios at the cost of slightly worse write performance (i.e. forcing the device to do RMW or extra buffering). To account for this, tweak bch2_alloc_sectors_start() to properly align open_buckets to the blocksize of the write we're about to do. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index effafc3e0ced..7ec022e9361a 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1422,11 +1422,31 @@ alloc_done: wp->sectors_free = UINT_MAX; - open_bucket_for_each(c, &wp->ptrs, ob, i) + open_bucket_for_each(c, &wp->ptrs, ob, i) { + /* + * Ensure proper write alignment - either due to misaligned + * bucket sizes (from buggy bcachefs-tools), or writes that mix + * logical/physical alignment: + */ + struct bch_dev *ca = ob_dev(c, ob); + u64 offset = bucket_to_sector(ca, ob->bucket) + + ca->mi.bucket_size - + ob->sectors_free; + unsigned align = round_up(offset, block_sectors(c)) - offset; + + ob->sectors_free = max_t(int, 0, ob->sectors_free - align); + wp->sectors_free = min(wp->sectors_free, ob->sectors_free); + } wp->sectors_free = rounddown(wp->sectors_free, block_sectors(c)); + /* Did alignment use up space in an open_bucket? */ + if (unlikely(!wp->sectors_free)) { + bch2_alloc_sectors_done(c, wp); + goto retry; + } + BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); return 0; -- 2.51.0 From 7a69fa65718a7258aa89fca06b975f4357daeac3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 May 2025 14:13:21 -0400 Subject: [PATCH 06/16] bcachefs: Add missing barriers before wake_up_bit() wake_up() doesn't require a barrier - but wake_up_bit() does. This only affected non-x86, and primarily led to lost wakeups after btree node reads.
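For reference, the pattern every site now follows - bucket_unlock() in the diff below is one instance:

	clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock);
	smp_mb__after_atomic();	/* order the clear before the waiter check */
	wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR);

wake_up_bit() checks for waiters with an unlocked test; without the full barrier that test can be reordered against the bit clear, and a waiter that samples the bit in the window sleeps forever.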
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 9 ++++++++- fs/bcachefs/buckets.h | 1 + fs/bcachefs/ec.h | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 5fd4a58d2ad2..60782f3e5aec 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -41,6 +41,7 @@ void bch2_btree_node_io_unlock(struct btree *b) clear_btree_node_write_in_flight_inner(b); clear_btree_node_write_in_flight(b); + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); } @@ -1400,6 +1401,7 @@ start: printbuf_exit(&buf); clear_btree_node_read_in_flight(b); + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); } @@ -1595,6 +1597,7 @@ fsck_err: printbuf_exit(&buf); clear_btree_node_read_in_flight(b); + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); } @@ -1721,6 +1724,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, set_btree_node_read_error(b); bch2_btree_lost_data(c, b->c.btree_id); clear_btree_node_read_in_flight(b); + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); printbuf_exit(&buf); return; @@ -2061,8 +2065,10 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start if (new & (1U << BTREE_NODE_write_in_flight)) __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); - else + else { + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); + } } static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) @@ -2175,6 +2181,7 @@ static void btree_node_write_endio(struct bio *bio) } clear_btree_node_write_in_flight_inner(b); + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); INIT_WORK(&wb->work, btree_node_write_work); queue_work(c->btree_io_complete_wq, &wb->work); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 8d75b27a1418..af1532de4a37 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -44,6 +44,7 @@ static inline void bucket_unlock(struct bucket *b) BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock); + smp_mb__after_atomic(); wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR); } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 62d27e04d763..51893e1ee874 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -160,6 +160,7 @@ static inline void gc_stripe_unlock(struct gc_stripe *s) BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock); + smp_mb__after_atomic(); wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR); } -- 2.51.0 From aed4ccbf4595e08f3fa80c7faaf1d2188d61bc70 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 May 2025 18:46:16 -0400 Subject: [PATCH 07/16] bcachefs: fix hung task timeout in journal read Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 63cdf885c9e2..ded18a94ed02 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -19,6 +19,7 @@ #include #include +#include <linux/sched/sysctl.h> void bch2_journal_pos_from_member_info_set(struct bch_fs *c) { @@ -1262,7 +1263,8 @@ int bch2_journal_read(struct bch_fs *c, degraded = true; } - closure_sync(&jlist.cl); + while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * 
HZ / 2)) + ; if (jlist.ret) return jlist.ret; -- 2.51.0 From 9c618560994794d6f477e81f3b695fe799feec8a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 May 2025 15:52:57 -0400 Subject: [PATCH 08/16] bcachefs: Call bch2_fs_start before getting vfs superblock This reverts 1fdbe0b184c8 ("bcachefs: Make sure c->vfs_sb is set before starting fs"), which switched up bch2_fs_get_tree() to get a superblock before calling bch2_fs_start, so that c->vfs_sb would always be initialized while the filesystem was active. This turned out not to be necessary, because blk_holder_ops were implemented using our own locking, not vfs locking. And this had the side effect of creating a super_block and doing our full recovery (including potentially fsck) before setting SB_BORN, which causes things like sync calls to hang until our recovery is finished. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 113db85b6ef9..b6801861c66f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2502,10 +2502,9 @@ static int bch2_fs_get_tree(struct fs_context *fc) bch2_opts_apply(&c->opts, opts); - /* - * need to initialise sb and set c->vfs_sb _before_ starting fs, - * for blk_holder_ops - */ + ret = bch2_fs_start(c); + if (ret) + goto err_stop_fs; sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); ret = PTR_ERR_OR_ZERO(sb); @@ -2567,10 +2566,6 @@ got_sb: sb->s_shrink->seeks = 0; - ret = bch2_fs_start(c); - if (ret) - goto err_put_super; - #ifdef CONFIG_UNICODE sb->s_encoding = c->cf_encoding; #endif -- 2.51.0 From 473f09f362e5979efbff40c9bbce58892ad39d66 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 May 2025 00:22:26 -0400 Subject: [PATCH 09/16] bcachefs: journal_shutdown is EROFS, not EIO We often filter out EROFS errors to avoid log spew after an emergency shutdown - journal_shutdown is just another emergency shutdown error. Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index a615e4852ded..d9ebffa5b3a2 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -269,7 +269,7 @@ x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ - x(EIO, journal_shutdown) \ + x(EROFS, journal_shutdown) \ x(EIO, journal_flush_err) \ x(EIO, journal_write_err) \ x(EIO, btree_node_read_err) \ -- 2.51.0 From 2fea3aa76e352cd071bee71e5c43da339639b310 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 May 2025 13:50:00 -0400 Subject: [PATCH 10/16] bcachefs: Filter out harmless EROFS error messages These just indicate that we're shutting down. 
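This composes with the earlier journal_shutdown change: bch2_err_matches() matches a private error code against its parent class, so one check covers the whole shutdown family (sketch of the change below):

	if (!bch2_err_matches(ret, EROFS))	/* now also true for -BCH_ERR_journal_shutdown */
		bch_err_msg(c, ret, "flushing btree write buffer");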
Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index fc396b9fa754..dfdbb9259985 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -784,7 +784,8 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, goto err; ret = bch2_btree_write_buffer_tryflush(trans); - bch_err_msg(c, ret, "flushing btree write buffer"); + if (!bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "flushing btree write buffer"); if (ret) goto err; -- 2.51.0 From da18dabc3784663a088943e613c36cd17aeb52d3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 May 2025 16:54:25 -0400 Subject: [PATCH 11/16] bcachefs: Ensure superblock gets written when we go ERO When we go emergency read-only, make sure we do a final write_super() to persist counters and error counts - this can be critical for piecing together what fsck was doing. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 27943082c093..84a37d971ffd 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -377,6 +377,11 @@ void bch2_fs_read_only(struct bch_fs *c) bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); } else { + /* Make sure error counts/counters are persisted */ + mutex_lock(&c->sb_lock); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + bch_verbose(c, "done going read-only, filesystem not clean"); } } -- 2.51.0 From 8e4d28036c293241b312b1fceafb32b994f80fcc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 May 2025 13:32:15 -0400 Subject: [PATCH 12/16] bcachefs: Don't aggressively discard the journal We frequently use 'bcachefs list_journal -a' for debugging, as it provides a record of all btree transactions, and a history of what happened. But it's not so useful if we immediately discard journal buckets right after they're no longer dirty. This tweaks journal reclaim to only discard when we're low on space, keeping the journal mostly un-discarded. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ea670c3c43d8..976464d8a695 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -266,10 +266,11 @@ out: static bool should_discard_bucket(struct journal *j, struct journal_device *ja) { - bool ret; - spin_lock(&j->lock); - ret = ja->discard_idx != ja->dirty_idx_ondisk; + unsigned min_free = max(4, ja->nr / 8); + + bool ret = bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < min_free && + ja->discard_idx != ja->dirty_idx_ondisk; spin_unlock(&j->lock); return ret; -- 2.51.0 From cd52cc3544e400e53e6d4b7bfc5263e7a867b5ab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 11:30:21 -0400 Subject: [PATCH 13/16] bcachefs: Don't strip rebalance_opts from indirect extents Fix bch2_bkey_clear_needs_rebalance(): indirect extents are never supposed to have bch_extent_rebalance stripped off, because that's how we get the IO path options when we don't have the original inode it belonged to. 
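Since indirect extents are KEY_TYPE_reflink_v keys, the fix is a single early return (mirroring the change below):

	if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
		return 0;	/* indirect extents keep bch_extent_rebalance */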
Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 4ccdfc1f34aa..623273556aa9 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -309,7 +309,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { - if (!bch2_bkey_rebalance_opts(k)) + if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k)) return 0; struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); -- 2.51.0 From b1c71cb492bb26dc883b11b5bd085d2467508f84 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 14:42:37 -0400 Subject: [PATCH 14/16] bcachefs: Fix broken btree_path lock invariants in next_node() This fixes btree locking assert pops users were seeing during evacuate: https://github.com/koverstreet/bcachefs/issues/878 May 09 22:45:02 sharon kernel: bcachefs (68116e25-fa2d-4c6f-86c7-e8b431d792ae): bch2_btree_insert_node(): node not locked at level 1 May 09 22:45:02 sharon kernel: bch2_btree_node_rewrite [bcachefs]: watermark=btree no_check_rw alloc l=0-1 mode=none nodes_written=0 cl.remaining=2 journal_seq=0 May 09 22:45:02 sharon kernel: path: idx 1 ref 1:0 S B btree=alloc level=0 pos 0:3699637:0 0:3698012:1-0:3699637:0 bch2_move_btree.isra.0+0x1db/0x490 [bcachefs] uptodate 0 locks_want 2 May 09 22:45:02 sharon kernel: l=0 locks intent seq 4 node ffff8bd700c93600 May 09 22:45:02 sharon kernel: l=1 locks unlocked seq 1712 node ffff8bd6fd5e7a00 May 09 22:45:02 sharon kernel: l=2 locks unlocked seq 2295 node ffff8bd6cc725400 May 09 22:45:02 sharon kernel: l=3 locks unlocked seq 0 node 0000000000000000 Evacuate walks btree nodes with bch2_btree_iter_next_node() and rewrites them; bch2_btree_update_start() upgrades the path to take intent locks as far as it needs to. But next_node() does low-level unlock/relock calls on individual nodes, and didn't handle the case where a path is supposed to be holding multiple intent locks. If a path has locks_want > 1, it needs to be holding locks on all the btree nodes (at each level) requested, or none of them. Fix this with a bch2_btree_path_downgrade(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 59fa527ac685..9c9bc7b6c712 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1971,6 +1971,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_ return NULL; } + /* + * We don't correctly handle nodes with extra intent locks here: + * downgrade so we don't violate locking invariants + */ + bch2_btree_path_downgrade(trans, path); + if (!bch2_btree_node_relock(trans, path, path->level + 1)) { __bch2_btree_path_unlock(trans, path); path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); -- 2.51.0 From 7b6759b1991d427cf9a562b2891b8c3e87a19c76 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 May 2025 12:55:44 -0400 Subject: [PATCH 15/16] bcachefs: Fix livelock in journal_entry_open() When the journal is low on space, we might do discards from journal_res_get() -> journal_entry_open(). Make sure we set j->can_discard correctly, so that we don't livelock when we're low on space for some reason other than discards not keeping up. 
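The fix factors the condition into __should_discard_bucket(), so j->can_discard and the discard path evaluate the same thing (sketch of the change below; e.g. with ja->nr = 64 journal buckets, discards start once fewer than max(4, 64 / 8) = 8 discarded buckets are available):

	static bool __should_discard_bucket(struct journal *j, struct journal_device *ja)
	{
		unsigned min_free = max(4, ja->nr / 8);

		return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < min_free &&
			ja->discard_idx != ja->dirty_idx_ondisk;
	}

	/* in bch2_journal_space_available(): */
	can_discard |= __should_discard_bucket(j, ja);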
Fixes: 8e4d28036c29 ("bcachefs: Don't aggressively discard the journal") Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 976464d8a695..cc00b0fc40d8 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -17,6 +17,8 @@ #include #include +static bool __should_discard_bucket(struct journal *, struct journal_device *); + /* Free space calculations: */ static unsigned journal_space_from(struct journal_device *ja, @@ -203,8 +205,7 @@ void bch2_journal_space_available(struct journal *j) ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; - if (ja->discard_idx != ja->dirty_idx_ondisk) - can_discard = true; + can_discard |= __should_discard_bucket(j, ja); max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); nr_online++; @@ -264,13 +265,19 @@ out: /* Discards - last part of journal reclaim: */ -static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +static bool __should_discard_bucket(struct journal *j, struct journal_device *ja) { - spin_lock(&j->lock); unsigned min_free = max(4, ja->nr / 8); - bool ret = bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < min_free && + return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < + min_free && ja->discard_idx != ja->dirty_idx_ondisk; +} + +static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +{ + spin_lock(&j->lock); + bool ret = __should_discard_bucket(j, ja); spin_unlock(&j->lock); return ret; -- 2.51.0 From 19b22d04cd44ded1ea5af7849aec9cbf4021c852 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 May 2025 14:27:01 -0400 Subject: [PATCH 16/16] bcachefs: Don't set btree nodes as accessed on fill Prevent jobs that do lots of scanning (e.g. evacuate, scrub) from causing OOMs. The shrinker code seems to be having issues when it doesn't do any freeing because it's just flipping off the accessed bit - and the accessed bit shouldn't be set on first use anyway. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 9b80201c7982..899891295797 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -852,7 +852,6 @@ out: b->sib_u64s[1] = 0; b->whiteout_u64s = 0; bch2_btree_keys_init(b); - set_btree_node_accessed(b); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); @@ -1286,6 +1285,10 @@ lock_node: six_unlock_read(&b->c.lock); goto retry; } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); } /* XXX: waiting on IO with btree locks held: */ @@ -1301,10 +1304,6 @@ lock_node: prefetch(p + L1_CACHE_BYTES * 2); } - /* avoid atomic set bit if it's not needed: */ - if (!btree_node_accessed(b)) - set_btree_node_accessed(b); - if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); -- 2.51.0