From 03ba05058365973a955e1641da908696545703dd Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 6 Sep 2024 16:00:43 +0100 Subject: [PATCH 01/16] btrfs: add and use helper to remove extent map from its inode's tree Move the common code to remove an extent map from its inode's tree into a helper function and use it, reducing duplicated code. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index a8b86f12b00d..ed2a5ccbb189 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -77,10 +77,13 @@ static u64 range_end(u64 start, u64 len) return start + len; } -static void dec_evictable_extent_maps(struct btrfs_inode *inode) +static void remove_em(struct btrfs_inode *inode, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; + rb_erase(&em->rb_node, &inode->extent_tree.root); + RB_CLEAR_NODE(&em->rb_node); + if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root))) percpu_counter_dec(&fs_info->evictable_extent_maps); } @@ -339,7 +342,6 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_map_tree *tree = &inode->extent_tree; struct extent_map *merge = NULL; struct rb_node *rb; @@ -371,10 +373,8 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) em->flags |= EXTENT_FLAG_MERGED; validate_extent_map(fs_info, em); - rb_erase(&merge->rb_node, &tree->root); - RB_CLEAR_NODE(&merge->rb_node); + remove_em(inode, merge); free_extent_map(merge); - dec_evictable_extent_maps(inode); } } @@ -386,12 +386,10 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) merge_ondisk_extents(em, merge, em); validate_extent_map(fs_info, em); - rb_erase(&merge->rb_node, &tree->root); - RB_CLEAR_NODE(&merge->rb_node); em->generation = max(em->generation, merge->generation); em->flags |= EXTENT_FLAG_MERGED; + remove_em(inode, merge); free_extent_map(merge); - dec_evictable_extent_maps(inode); } } @@ -588,12 +586,10 @@ void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) lockdep_assert_held_write(&tree->lock); WARN_ON(em->flags & EXTENT_FLAG_PINNED); - rb_erase(&em->rb_node, &tree->root); if (!(em->flags & EXTENT_FLAG_LOGGING)) list_del_init(&em->list); - RB_CLEAR_NODE(&em->rb_node); - dec_evictable_extent_maps(inode); + remove_em(inode, em); } static void replace_extent_mapping(struct btrfs_inode *inode, -- 2.51.0 From 1020443840569535f6025a855958f07ea3eebf71 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 29 Aug 2024 15:23:32 +0100 Subject: [PATCH 02/16] btrfs: make the extent map shrinker run asynchronously as a work queue job Currently the extent map shrinker is run synchronously for kswapd tasks that end up calling the fs shrinker (fs/super.c:super_cache_scan()). This has some disadvantages and for some heavy workloads with memory pressure it can cause some delays and stalls that make a machine unresponsive for some periods. 
This happens because:

1) We can have several kswapd tasks on machines with multiple NUMA zones,
   and running the extent map shrinker concurrently can cause high
   contention on some spin locks, namely the spin locks that protect the
   radix tree that tracks roots, the per root xarray that tracks open
   inodes and the list of delayed iputs. This not only delays the shrinker
   but also causes high CPU consumption and makes the task running the
   shrinker monopolize a core, resulting in the symptoms of an unresponsive
   system. This was noted in previous commits such as commit ae1e766f623f
   ("btrfs: only run the extent map shrinker from kswapd tasks");

2) The extent map shrinker's iteration over inodes can often be slow, even
   after changing the data structure that tracks open inodes for a root
   from a red black tree (up to kernel 6.10) to an xarray (kernel 6.10+).
   While the transition to the xarray made things a bit faster, it's still
   somewhat slow - for example in a test scenario with 10000 inodes that
   have no extent maps loaded, the extent map shrinker took between 5ms
   and 8ms, using a release, non-debug kernel. Iterating over the extent
   maps of an inode can also be slow if we have an inode with many
   thousands of extent maps, since we use a red black tree to track and
   search extent maps.

So having the extent map shrinker run synchronously adds extra delay for
other things a kswapd task does.

So make the extent map shrinker run asynchronously as a job for the
system unbound workqueue, just like what we do for data and metadata
space reclaim jobs.

Reviewed-by: Josef Bacik
Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c    |  2 ++
 fs/btrfs/extent_map.c | 51 ++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/extent_map.h |  3 ++-
 fs/btrfs/fs.h         |  2 ++
 fs/btrfs/super.c      | 13 +++--------
 5 files changed, 52 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 02aaee0bc4e3..8ad625a8d9f9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2785,6 +2785,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	btrfs_init_scrub(fs_info);
 	btrfs_init_balance(fs_info);
 	btrfs_init_async_reclaim_work(fs_info);
+	btrfs_init_extent_map_shrinker_work(fs_info);
 
 	rwlock_init(&fs_info->block_group_cache_lock);
 	fs_info->block_group_cache_tree = RB_ROOT_CACHED;
@@ -4292,6 +4293,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 	cancel_work_sync(&fs_info->async_reclaim_work);
 	cancel_work_sync(&fs_info->async_data_reclaim_work);
 	cancel_work_sync(&fs_info->preempt_reclaim_work);
+	cancel_work_sync(&fs_info->extent_map_shrinker_work);
 
 	/* Cancel or finish ongoing discard work */
 	btrfs_discard_cleanup(fs_info);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index ed2a5ccbb189..4b9ab3f9c731 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1124,7 +1124,8 @@ struct btrfs_em_shrink_ctx {
 
 static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
 {
-	const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
 	struct extent_map_tree *tree = &inode->extent_tree;
 	long nr_dropped = 0;
 	struct rb_node *node;
@@ -1197,7 +1198,8 @@ next:
 	 * lock. This is to avoid slowing other tasks trying to take the
 	 * lock.
*/ - if (need_resched() || rwlock_needbreak(&tree->lock)) + if (need_resched() || rwlock_needbreak(&tree->lock) || + btrfs_fs_closing(fs_info)) break; node = next; } @@ -1221,7 +1223,8 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx ctx->last_ino = btrfs_ino(inode); btrfs_add_delayed_iput(inode); - if (ctx->scanned >= ctx->nr_to_scan) + if (ctx->scanned >= ctx->nr_to_scan || + btrfs_fs_closing(inode->root->fs_info)) break; cond_resched(); @@ -1250,16 +1253,19 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx return nr_dropped; } -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +static void btrfs_extent_map_shrinker_worker(struct work_struct *work) { + struct btrfs_fs_info *fs_info; struct btrfs_em_shrink_ctx ctx; u64 start_root_id; u64 next_root_id; bool cycled = false; long nr_dropped = 0; + fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work); + ctx.scanned = 0; - ctx.nr_to_scan = nr_to_scan; + ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan); /* * In case we have multiple tasks running this shrinker, make the next @@ -1277,12 +1283,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, + trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan, nr, ctx.last_root, ctx.last_ino); } - while (ctx.scanned < ctx.nr_to_scan) { + while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) { struct btrfs_root *root; unsigned long count; @@ -1340,5 +1346,34 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) ctx.last_ino); } - return nr_dropped; + atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); +} + +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +{ + /* + * Do nothing if the shrinker is already running. In case of high memory + * pressure we can have a lot of tasks calling us and all passing the + * same nr_to_scan value, but in reality we may need only to free + * nr_to_scan extent maps (or less). In case we need to free more than + * that, we will be called again by the fs shrinker, so no worries about + * not doing enough work to reclaim memory from extent maps. + * We can also be repeatedly called with the same nr_to_scan value + * simply because the shrinker runs asynchronously and multiple calls + * to this function are made before the shrinker does enough progress. + * + * That's why we set the atomic counter to nr_to_scan only if its + * current value is zero, instead of incrementing the counter by + * nr_to_scan. 
+ */ + if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0) + return; + + queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work); +} + +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) +{ + atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); + INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker); } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 5154a8f1d26c..cd123b266b64 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, int btrfs_replace_extent_map_range(struct btrfs_inode *inode, struct extent_map *new_em, bool modified); -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 85e6a644aff2..678ce5259332 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -638,6 +638,8 @@ struct btrfs_fs_info { spinlock_t extent_map_shrinker_lock; u64 extent_map_shrinker_last_root; u64 extent_map_shrinker_last_ino; + atomic64_t extent_map_shrinker_nr_to_scan; + struct work_struct extent_map_shrinker_work; /* Protected by 'trans_lock'. */ struct list_head dirty_cowonly_roots; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e8aeeca0795f..7dc7c4fee996 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -28,7 +28,6 @@ #include #include #include -#include #include "messages.h" #include "delayed-inode.h" #include "ctree.h" @@ -2408,16 +2407,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); struct btrfs_fs_info *fs_info = btrfs_sb(sb); - /* - * We may be called from any task trying to allocate memory and we don't - * want to slow it down with scanning and dropping extent maps. It would - * also cause heavy lock contention if many tasks concurrently enter - * here. Therefore only allow kswapd tasks to scan and drop extent maps. - */ - if (!current_is_kswapd()) - return 0; + btrfs_free_extent_maps(fs_info, nr_to_scan); - return btrfs_free_extent_maps(fs_info, nr_to_scan); + /* The extent map shrinker runs asynchronously, so always return 0. */ + return 0; } static const struct super_operations btrfs_super_ops = { -- 2.51.0 From 70a5f9e266cf1f544f33ce56f1eee634d79f3030 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 4 Sep 2024 17:03:43 +0100 Subject: [PATCH 03/16] btrfs: simplify tracking progress for the extent map shrinker Now that the extent map shrinker can only be run by a single task (as a work queue item) there is no need to keep the progress of the shrinker protected by a spinlock and passing the progress to trace events as parameters. So remove the lock and simplify the arguments for the trace events. 
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 -- fs/btrfs/extent_map.c | 55 ++++++++---------------------------- fs/btrfs/fs.h | 1 - include/trace/events/btrfs.h | 21 +++++++------- 4 files changed, 22 insertions(+), 57 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8ad625a8d9f9..0cbc31908264 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2852,8 +2852,6 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; - spin_lock_init(&fs_info->extent_map_shrinker_lock); - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) return ret; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 4b9ab3f9c731..f6e338dd28ec 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1118,8 +1118,6 @@ out_free_pre: struct btrfs_em_shrink_ctx { long nr_to_scan; long scanned; - u64 last_ino; - u64 last_root; }; static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) @@ -1211,16 +1209,17 @@ next: static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_inode *inode; long nr_dropped = 0; - u64 min_ino = ctx->last_ino + 1; + u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1; inode = btrfs_find_first_inode(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); min_ino = btrfs_ino(inode) + 1; - ctx->last_ino = btrfs_ino(inode); + fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode); btrfs_add_delayed_iput(inode); if (ctx->scanned >= ctx->nr_to_scan || @@ -1240,14 +1239,14 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx * inode if there is one or we will find out this was the last * one and move to the next root. */ - ctx->last_root = btrfs_root_id(root); + fs_info->extent_map_shrinker_last_root = btrfs_root_id(root); } else { /* * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so * that when processing the next root we start from its first inode. */ - ctx->last_ino = 0; - ctx->last_root = btrfs_root_id(root) + 1; + fs_info->extent_map_shrinker_last_ino = 0; + fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1; } return nr_dropped; @@ -1267,25 +1266,13 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) ctx.scanned = 0; ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan); - /* - * In case we have multiple tasks running this shrinker, make the next - * one start from the next inode in case it starts before we finish. 
- */ - spin_lock(&fs_info->extent_map_shrinker_lock); - ctx.last_ino = fs_info->extent_map_shrinker_last_ino; - fs_info->extent_map_shrinker_last_ino++; - ctx.last_root = fs_info->extent_map_shrinker_last_root; - spin_unlock(&fs_info->extent_map_shrinker_lock); - - start_root_id = ctx.last_root; - next_root_id = ctx.last_root; + start_root_id = fs_info->extent_map_shrinker_last_root; + next_root_id = fs_info->extent_map_shrinker_last_root; if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan, - nr, ctx.last_root, - ctx.last_ino); + trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr); } while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) { @@ -1302,8 +1289,8 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) spin_unlock(&fs_info->fs_roots_radix_lock); if (start_root_id > 0 && !cycled) { next_root_id = 0; - ctx.last_root = 0; - ctx.last_ino = 0; + fs_info->extent_map_shrinker_last_root = 0; + fs_info->extent_map_shrinker_last_ino = 0; cycled = true; continue; } @@ -1322,28 +1309,10 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) btrfs_put_root(root); } - /* - * In case of multiple tasks running this extent map shrinking code this - * isn't perfect but it's simple and silences things like KCSAN. It's - * not possible to know which task made more progress because we can - * cycle back to the first root and first inode if it's not the first - * time the shrinker ran, see the above logic. Also a task that started - * later may finish earlier than another task and made less progress. So - * make this simple and update to the progress of the last task that - * finished, with the occasional possibility of having two consecutive - * runs of the shrinker process the same inodes. 
- */ - spin_lock(&fs_info->extent_map_shrinker_lock); - fs_info->extent_map_shrinker_last_ino = ctx.last_ino; - fs_info->extent_map_shrinker_last_root = ctx.last_root; - spin_unlock(&fs_info->extent_map_shrinker_lock); - if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, - nr, ctx.last_root, - ctx.last_ino); + trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr); } atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 678ce5259332..827af2b027df 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -635,7 +635,6 @@ struct btrfs_fs_info { s32 delalloc_batch; struct percpu_counter evictable_extent_maps; - spinlock_t extent_map_shrinker_lock; u64 extent_map_shrinker_last_root; u64 extent_map_shrinker_last_ino; atomic64_t extent_map_shrinker_nr_to_scan; diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index bd515415ea8b..879016b4e391 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2555,10 +2555,9 @@ TRACE_EVENT(btrfs_extent_map_shrinker_count, TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, - TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr, - u64 last_root_id, u64 last_ino), + TP_PROTO(const struct btrfs_fs_info *fs_info, long nr), - TP_ARGS(fs_info, nr_to_scan, nr, last_root_id, last_ino), + TP_ARGS(fs_info, nr), TP_STRUCT__entry_btrfs( __field( long, nr_to_scan ) @@ -2568,10 +2567,11 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, ), TP_fast_assign_btrfs(fs_info, - __entry->nr_to_scan = nr_to_scan; + __entry->nr_to_scan = \ + atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan); __entry->nr = nr; - __entry->last_root_id = last_root_id; - __entry->last_ino = last_ino; + __entry->last_root_id = fs_info->extent_map_shrinker_last_root; + __entry->last_ino = fs_info->extent_map_shrinker_last_ino; ), TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", @@ -2581,10 +2581,9 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, - TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr, - u64 last_root_id, u64 last_ino), + TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr), - TP_ARGS(fs_info, nr_dropped, nr, last_root_id, last_ino), + TP_ARGS(fs_info, nr_dropped, nr), TP_STRUCT__entry_btrfs( __field( long, nr_dropped ) @@ -2596,8 +2595,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, TP_fast_assign_btrfs(fs_info, __entry->nr_dropped = nr_dropped; __entry->nr = nr; - __entry->last_root_id = last_root_id; - __entry->last_ino = last_ino; + __entry->last_root_id = fs_info->extent_map_shrinker_last_root; + __entry->last_ino = fs_info->extent_map_shrinker_last_ino; ), TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", -- 2.51.0 From e7fa845010f12bb98251a2bf9dcf39fd601f9424 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 5 Sep 2024 11:31:49 +0100 Subject: [PATCH 04/16] btrfs: rename extent map shrinker members from struct btrfs_fs_info The names for the members of struct btrfs_fs_info related to the extent map shrinker are a bit too long, so rename them to be shorter by replacing the "extent_map_" prefix with the "em_" prefix. 
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_map.c | 32 ++++++++++++++++---------------- fs/btrfs/fs.h | 8 ++++---- include/trace/events/btrfs.h | 10 +++++----- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0cbc31908264..bcf6e53bc2a7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4291,7 +4291,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); - cancel_work_sync(&fs_info->extent_map_shrinker_work); + cancel_work_sync(&fs_info->em_shrinker_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index f6e338dd28ec..67ce85ff0ae2 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1212,14 +1212,14 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_inode *inode; long nr_dropped = 0; - u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1; + u64 min_ino = fs_info->em_shrinker_last_ino + 1; inode = btrfs_find_first_inode(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); min_ino = btrfs_ino(inode) + 1; - fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode); + fs_info->em_shrinker_last_ino = btrfs_ino(inode); btrfs_add_delayed_iput(inode); if (ctx->scanned >= ctx->nr_to_scan || @@ -1239,14 +1239,14 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx * inode if there is one or we will find out this was the last * one and move to the next root. */ - fs_info->extent_map_shrinker_last_root = btrfs_root_id(root); + fs_info->em_shrinker_last_root = btrfs_root_id(root); } else { /* * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so * that when processing the next root we start from its first inode. 
*/ - fs_info->extent_map_shrinker_last_ino = 0; - fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1; + fs_info->em_shrinker_last_ino = 0; + fs_info->em_shrinker_last_root = btrfs_root_id(root) + 1; } return nr_dropped; @@ -1261,13 +1261,13 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) bool cycled = false; long nr_dropped = 0; - fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work); + fs_info = container_of(work, struct btrfs_fs_info, em_shrinker_work); ctx.scanned = 0; - ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan); + ctx.nr_to_scan = atomic64_read(&fs_info->em_shrinker_nr_to_scan); - start_root_id = fs_info->extent_map_shrinker_last_root; - next_root_id = fs_info->extent_map_shrinker_last_root; + start_root_id = fs_info->em_shrinker_last_root; + next_root_id = fs_info->em_shrinker_last_root; if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); @@ -1289,8 +1289,8 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) spin_unlock(&fs_info->fs_roots_radix_lock); if (start_root_id > 0 && !cycled) { next_root_id = 0; - fs_info->extent_map_shrinker_last_root = 0; - fs_info->extent_map_shrinker_last_ino = 0; + fs_info->em_shrinker_last_root = 0; + fs_info->em_shrinker_last_ino = 0; cycled = true; continue; } @@ -1315,7 +1315,7 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr); } - atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); + atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); } void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) @@ -1335,14 +1335,14 @@ void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) * current value is zero, instead of incrementing the counter by * nr_to_scan. */ - if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0) + if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0) return; - queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work); + queue_work(system_unbound_wq, &fs_info->em_shrinker_work); } void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) { - atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0); - INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker); + atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); + INIT_WORK(&fs_info->em_shrinker_work, btrfs_extent_map_shrinker_worker); } diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 827af2b027df..79a1a3d6f04d 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -635,10 +635,10 @@ struct btrfs_fs_info { s32 delalloc_batch; struct percpu_counter evictable_extent_maps; - u64 extent_map_shrinker_last_root; - u64 extent_map_shrinker_last_ino; - atomic64_t extent_map_shrinker_nr_to_scan; - struct work_struct extent_map_shrinker_work; + u64 em_shrinker_last_root; + u64 em_shrinker_last_ino; + atomic64_t em_shrinker_nr_to_scan; + struct work_struct em_shrinker_work; /* Protected by 'trans_lock'. 
*/ struct list_head dirty_cowonly_roots; diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 879016b4e391..4df93ca9b7a8 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2568,10 +2568,10 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, TP_fast_assign_btrfs(fs_info, __entry->nr_to_scan = \ - atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan); + atomic64_read(&fs_info->em_shrinker_nr_to_scan); __entry->nr = nr; - __entry->last_root_id = fs_info->extent_map_shrinker_last_root; - __entry->last_ino = fs_info->extent_map_shrinker_last_ino; + __entry->last_root_id = fs_info->em_shrinker_last_root; + __entry->last_ino = fs_info->em_shrinker_last_ino; ), TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", @@ -2595,8 +2595,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, TP_fast_assign_btrfs(fs_info, __entry->nr_dropped = nr_dropped; __entry->nr = nr; - __entry->last_root_id = fs_info->extent_map_shrinker_last_root; - __entry->last_ino = fs_info->extent_map_shrinker_last_ino; + __entry->last_root_id = fs_info->em_shrinker_last_root; + __entry->last_ino = fs_info->em_shrinker_last_ino; ), TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", -- 2.51.0 From a8371fccf016c83e1c3d9fb832ebb60d783d1853 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 17 Sep 2024 12:03:11 +0100 Subject: [PATCH 05/16] btrfs: re-enable the extent map shrinker Now that the extent map shrinker can only be run by a single task and runs asynchronously as a work queue job, enable it as it can no longer cause stalls on tasks allocating memory and entering the extent map shrinker through the fs shrinker (implemented by btrfs_free_cached_objects()). This is crucial to prevent exhaustion of memory due to unbounded extent map creation, primarily with direct IO but also for buffered IO on files with holes. This problem, for the direct IO case, was first reported in the Link tag below. That report was added to a Link tag of the first patch that introduced the extent map shrinker, commit 956a17d9d050 ("btrfs: add a shrinker for extent maps"), however the Link tag disappeared somehow from the committed patch (but was included in the submitted patch to the mailing list), so adding it below for future reference. Link: https://lore.kernel.org/linux-btrfs/13f94633dcf04d29aaf1f0a43d42c55e@amazon.com/ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/super.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7dc7c4fee996..7cd62307cd41 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2393,13 +2393,7 @@ static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_contro trace_btrfs_extent_map_shrinker_count(fs_info, nr); - /* - * Only report the real number for EXPERIMENTAL builds, as there are - * reports of serious performance degradation caused by too frequent shrinks. 
- */ - if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) - return nr; - return 0; + return nr; } static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc) -- 2.51.0 From 2b1ef80d68100a064f94fb2e658881b9f260cf08 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 16 Oct 2024 10:45:47 +0100 Subject: [PATCH 06/16] btrfs: remove redundant level argument from read_block_for_search() The level parameter passed to read_block_for_search() always matches the level of the extent buffer passed in the "eb_ret" parameter, which we are also extracting into the "parent_level" local variable. So remove the level parameter and instead use the "parent_level" variable which in fact has a better name (it's the level of the parent node from which we are reading a child node/leaf). Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4f34208126f7..428c5650559a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1508,7 +1508,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level, */ static int read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, - struct extent_buffer **eb_ret, int level, int slot, + struct extent_buffer **eb_ret, int slot, const struct btrfs_key *key) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1542,7 +1542,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { if (p->reada == READA_FORWARD_ALWAYS) - reada_for_search(fs_info, p, level, slot, key->objectid); + reada_for_search(fs_info, p, parent_level, slot, key->objectid); /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { @@ -1568,7 +1568,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, } if (!p->skip_locking) { - btrfs_unlock_up_safe(p, level + 1); + btrfs_unlock_up_safe(p, parent_level + 1); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); @@ -1595,12 +1595,12 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, } if (!p->skip_locking) { - btrfs_unlock_up_safe(p, level + 1); + btrfs_unlock_up_safe(p, parent_level + 1); ret = -EAGAIN; } if (p->reada != READA_NONE) - reada_for_search(fs_info, p, level, slot, key->objectid); + reada_for_search(fs_info, p, parent_level, slot, key->objectid); tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level); if (IS_ERR(tmp)) { @@ -2236,7 +2236,7 @@ cow_done: goto done; } - err = read_block_for_search(root, p, &b, level, slot, key); + err = read_block_for_search(root, p, &b, slot, key); if (err == -EAGAIN && !p->nowait) goto again; if (err) { @@ -2363,7 +2363,7 @@ again: goto done; } - err = read_block_for_search(root, p, &b, level, slot, key); + err = read_block_for_search(root, p, &b, slot, key); if (err == -EAGAIN && !p->nowait) goto again; if (err) { @@ -4969,8 +4969,7 @@ again: } next = c; - ret = read_block_for_search(root, path, &next, level, - slot, &key); + ret = read_block_for_search(root, path, &next, slot, &key); if (ret == -EAGAIN && !path->nowait) goto again; @@ -5013,8 +5012,7 @@ again: if (!level) break; - ret = read_block_for_search(root, path, &next, level, - 0, &key); + ret = read_block_for_search(root, path, &next, 0, &key); if (ret == -EAGAIN && !path->nowait) goto again; -- 2.51.0 From c88ebf1db588fb1852e6a9ea93a35d770c3f45da Mon Sep 
17 00:00:00 2001 From: Filipe Manana Date: Wed, 16 Oct 2024 10:57:48 +0100 Subject: [PATCH 07/16] btrfs: simplify arguments for btrfs_verify_level_key() The only caller of btrfs_verify_level_key() is read_block_for_search() and it's passing 3 arguments to it that can be extracted from its on stack variable of type struct btrfs_tree_parent_check. So change btrfs_verify_level_key() to accept an argument of type struct btrfs_tree_parent_check instead of level, first key and parent transid arguments. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 3 +-- fs/btrfs/tree-checker.c | 16 ++++++++-------- fs/btrfs/tree-checker.h | 4 ++-- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 428c5650559a..f68a9b586079 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1551,8 +1551,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, * being cached, read from scrub, or have multiple * parents (shared tree blocks). */ - if (btrfs_verify_level_key(tmp, - parent_level - 1, &check.first_key, gen)) { + if (btrfs_verify_level_key(tmp, &check)) { ret = -EUCLEAN; goto out; } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 7b50263723bc..148d8cefa40e 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -2183,8 +2183,8 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) return 0; } -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid) +int btrfs_verify_level_key(struct extent_buffer *eb, + const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; int found_level; @@ -2192,16 +2192,16 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, int ret; found_level = btrfs_header_level(eb); - if (found_level != level) { + if (found_level != check->level) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: tree level check failed\n"); btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", - eb->start, level, found_level); + eb->start, check->level, found_level); return -EIO; } - if (!first_key) + if (!check->has_first_key) return 0; /* @@ -2226,15 +2226,15 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, btrfs_node_key_to_cpu(eb, &found_key, 0); else btrfs_item_key_to_cpu(eb, &found_key, 0); - ret = btrfs_comp_cpu_keys(first_key, &found_key); + ret = btrfs_comp_cpu_keys(&check->first_key, &found_key); if (ret) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: tree first key check failed\n"); btrfs_err(fs_info, "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", - eb->start, parent_transid, first_key->objectid, - first_key->type, first_key->offset, + eb->start, check->transid, check->first_key.objectid, + check->first_key.type, check->first_key.offset, found_key.objectid, found_key.type, found_key.offset); } diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 01669cfa6578..db67f96cbe4b 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -69,7 +69,7 @@ int btrfs_check_node(struct extent_buffer *node); int btrfs_check_chunk_valid(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 logical); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, 
u64 parent_transid);
+int btrfs_verify_level_key(struct extent_buffer *eb,
+			   const struct btrfs_tree_parent_check *check);
 
 #endif
-- 
2.51.0

From b8e63ea4058da6719c2c89cdf383167aa120e904 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Wed, 16 Oct 2024 11:09:26 +0100
Subject: [PATCH 08/16] btrfs: remove redundant initializations for struct
 btrfs_tree_parent_check

It's pointless to initialize the has_first_key field of the stack local
btrfs_tree_parent_check structure at btrfs_readahead_tree_block() and at
btrfs_qgroup_trace_subtree() since all fields not explicitly initialized
are zeroed out. In the case of the first function it's a bit odd because
we are assigning 0 to a field of type bool, which is however not
incorrect since a 0 is converted to false.

Just remove the explicit initializations due to their redundancy.

Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/extent_io.c | 1 -
 fs/btrfs/qgroup.c    | 1 -
 2 files changed, 2 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 999d5cdad3c0..4cba6d78e631 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4286,7 +4286,6 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
 				u64 bytenr, u64 owner_root, u64 gen, int level)
 {
 	struct btrfs_tree_parent_check check = {
-		.has_first_key = 0,
 		.level = level,
 		.transid = gen
 	};
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index df76bad8a067..a6f92836c9b1 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2648,7 +2648,6 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
 
 	if (!extent_buffer_uptodate(root_eb)) {
 		struct btrfs_tree_parent_check check = {
-			.has_first_key = false,
 			.transid = root_gen,
 			.level = root_level
 		};
-- 
2.51.0

From 4b5c1200f78ed40e7f1d6f05a6b00a04cd39cc7a Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Wed, 16 Oct 2024 11:13:24 +0100
Subject: [PATCH 09/16] btrfs: remove local generation variable from
 read_block_for_search()

It's redundant to have the 'gen' variable since we already have the same
value in the local btrfs_tree_parent_check structure. So remove it and
instead use the structure's field.
Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/ctree.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f68a9b586079..148648ea1c8b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1514,7 +1514,6 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_tree_parent_check check = { 0 };
 	u64 blocknr;
-	u64 gen;
 	struct extent_buffer *tmp = NULL;
 	int ret = 0;
 	int parent_level;
@@ -1524,12 +1523,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 	bool path_released = false;
 
 	blocknr = btrfs_node_blockptr(*eb_ret, slot);
-	gen = btrfs_node_ptr_generation(*eb_ret, slot);
 	parent_level = btrfs_header_level(*eb_ret);
 	btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
 	check.has_first_key = true;
 	check.level = parent_level - 1;
-	check.transid = gen;
+	check.transid = btrfs_node_ptr_generation(*eb_ret, slot);
 	check.owner_root = btrfs_root_id(root);
 
 	/*
@@ -1545,7 +1543,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 			reada_for_search(fs_info, p, parent_level, slot, key->objectid);
 
 		/* first we do an atomic uptodate check */
-		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+		if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) {
 			/*
 			 * Do extra check for first_key, eb can be stale due to
 			 * being cached, read from scrub, or have multiple
-- 
2.51.0

From 70958a949d852cbecc3d46127bf0b24786df0130 Mon Sep 17 00:00:00 2001
From: Boris Burkov
Date: Tue, 15 Oct 2024 14:27:32 -0700
Subject: [PATCH 10/16] btrfs: do not clear read-only when adding sprout device

If you follow the seed/sprout wiki, it suggests the following workflow:

btrfstune -S 1 seed_dev
mount seed_dev mnt
btrfs device add sprout_dev
mount -o remount,rw mnt

The first mount mounts the FS readonly, which results in not setting
BTRFS_FS_OPEN, and setting the readonly bit on the sb. The device add
somewhat surprisingly clears the readonly bit on the sb (though the
mount is still practically readonly, from the user's perspective...).
Finally, the remount checks the readonly bit on the sb against the flag
and sees no change, so it does not run the code intended to run on
ro->rw transitions, leaving BTRFS_FS_OPEN unset. As a result, when the
cleaner_kthread runs, it sees no BTRFS_FS_OPEN and does no work. This
results in leaking deleted snapshots until we run out of space.

I propose fixing it at the first departure from what feels reasonable:
when we clear the readonly bit on the sb during device add.

A new fstest I have written reproduces the bug and confirms the fix.
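[Illustration, not part of the patch: a self-contained toy model of the
state machine described above, with hypothetical names, showing why the
skipped ro->rw transition leaves the cleaner idle:

	#include <stdbool.h>
	#include <stdio.h>

	/* The remount path only performs the ro->rw transition work (the
	 * work that would set BTRFS_FS_OPEN) when the readonly state changes. */
	struct toy_sb {
		bool rdonly;
		bool fs_open;
	};

	static void toy_remount(struct toy_sb *sb, bool want_rdonly)
	{
		if (sb->rdonly == want_rdonly)
			return;			/* "no change": transition work skipped */
		sb->rdonly = want_rdonly;
		if (!want_rdonly)
			sb->fs_open = true;	/* ro->rw work: cleaner can now run */
	}

	int main(void)
	{
		struct toy_sb sb = { .rdonly = true, .fs_open = false };

		sb.rdonly = false;	/* buggy device add clears the bit directly */
		toy_remount(&sb, false);	/* remount,rw sees no change */
		printf("fs_open=%d\n", sb.fs_open);	/* prints 0: cleaner stays idle */
		return 0;
	}

With the fix, device add leaves the readonly bit alone, so the later
remount observes a real ro->rw transition and runs the work that sets
BTRFS_FS_OPEN.]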
Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f5bfb4c8adb2..1cccaf9c2b0d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2842,8 +2842,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { - btrfs_clear_sb_rdonly(sb); - /* GFP_KERNEL allocation must not be under device_list_mutex */ seed_devices = btrfs_init_sprout(fs_info); if (IS_ERR(seed_devices)) { @@ -2986,8 +2984,6 @@ error_sysfs: mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); error_trans: - if (seeding_dev) - btrfs_set_sb_rdonly(sb); if (trans) btrfs_end_transaction(trans); error_free_zone: -- 2.51.0 From 8511074c42b6255e03eceb09396338572572f1c7 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 9 Oct 2024 09:37:04 +1030 Subject: [PATCH 11/16] btrfs: remove unused btrfs_folio_start_writer_lock() This function is not really suitable to lock a folio, as it lacks the proper mapping checks, thus the locked folio may not even belong to btrfs. And due to the above reason, the last user inside lock_delalloc_folios() is already removed, and we can remove this function. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 47 ---------------------------------------------- fs/btrfs/subpage.h | 2 -- 2 files changed, 49 deletions(-) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 17bc53a8df01..26d4d05dd165 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -295,26 +295,6 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) orig_start + orig_len) - *start; } -static void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = (len >> fs_info->sectorsize_bits); - unsigned long flags; - int ret; - - btrfs_subpage_assert(fs_info, folio, start, len); - - spin_lock_irqsave(&subpage->lock, flags); - ASSERT(atomic_read(&subpage->readers) == 0); - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); - bitmap_set(subpage->bitmaps, start_bit, nbits); - ret = atomic_add_return(nbits, &subpage->writers); - ASSERT(ret == nbits); - spin_unlock_irqrestore(&subpage->lock, flags); -} - static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { @@ -351,33 +331,6 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf return last; } -/* - * Lock a folio for delalloc page writeback. - * - * Return -EAGAIN if the page is not properly initialized. - * Return 0 with the page locked, and writer counter updated. - * - * Even with 0 returned, the page still need extra check to make sure - * it's really the correct page, as the caller is using - * filemap_get_folios_contig(), which can race with page invalidating. 
- */ -int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { - folio_lock(folio); - return 0; - } - folio_lock(folio); - if (!folio_test_private(folio) || !folio_get_private(folio)) { - folio_unlock(folio); - return -EAGAIN; - } - btrfs_subpage_clamp_range(folio, &start, &len); - btrfs_subpage_start_writer(fs_info, folio, start, len); - return 0; -} - /* * Handle different locked folios: * diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index d16fbddeda68..e4076c5b06bc 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -100,8 +100,6 @@ void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); -int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, -- 2.51.0 From 336e69f3025fb70db9d0dfb7f36ac79887bf5341 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 9 Oct 2024 16:21:06 +1030 Subject: [PATCH 12/16] btrfs: unify to use writer locks for subpage locking Since commit d7172f52e993 ("btrfs: use per-buffer locking for extent_buffer reading"), metadata read no longer relies on the subpage reader locking. This means we do not need to maintain a different metadata/data split for locking, so we can convert the existing reader lock users by: - add_ra_bio_pages() Convert to btrfs_folio_set_writer_lock() - end_folio_read() Convert to btrfs_folio_end_writer_lock() - begin_folio_read() Convert to btrfs_folio_set_writer_lock() - folio_range_has_eb() Remove the subpage->readers checks, since it is always 0. - Remove btrfs_subpage_start_reader() and btrfs_subpage_end_reader() Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 3 +- fs/btrfs/extent_io.c | 10 ++----- fs/btrfs/subpage.c | 62 ++---------------------------------------- fs/btrfs/subpage.h | 13 --------- 4 files changed, 5 insertions(+), 83 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index abcf8ed06afc..e51d55bd1a29 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -545,8 +545,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * subpage::readers and to unlock the page. 
*/ if (fs_info->sectorsize < PAGE_SIZE) - btrfs_subpage_start_reader(fs_info, folio, cur, - add_size); + btrfs_folio_set_writer_lock(fs_info, folio, cur, add_size); folio_put(folio); cur += add_size; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4cba6d78e631..3eb9ed5d55f2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -438,7 +438,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le if (!btrfs_is_subpage(fs_info, folio->mapping)) folio_unlock(folio); else - btrfs_subpage_end_reader(fs_info, folio, start, len); + btrfs_folio_end_writer_lock(fs_info, folio, start, len); } /* @@ -495,7 +495,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) return; ASSERT(folio_test_private(folio)); - btrfs_subpage_start_reader(fs_info, folio, folio_pos(folio), PAGE_SIZE); + btrfs_folio_set_writer_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); } /* @@ -2517,12 +2517,6 @@ static bool folio_range_has_eb(struct folio *folio) subpage = folio_get_private(folio); if (atomic_read(&subpage->eb_refs)) return true; - /* - * Even there is no eb refs here, we may still have - * end_folio_read() call relying on page::private. - */ - if (atomic_read(&subpage->readers)) - return true; } return false; } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 26d4d05dd165..6c3d54a45956 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -140,12 +140,10 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, return ERR_PTR(-ENOMEM); spin_lock_init(&ret->lock); - if (type == BTRFS_SUBPAGE_METADATA) { + if (type == BTRFS_SUBPAGE_METADATA) atomic_set(&ret->eb_refs, 0); - } else { - atomic_set(&ret->readers, 0); + else atomic_set(&ret->writers, 0); - } return ret; } @@ -221,62 +219,6 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, __start_bit; \ }) -void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = len >> fs_info->sectorsize_bits; - unsigned long flags; - - - btrfs_subpage_assert(fs_info, folio, start, len); - - spin_lock_irqsave(&subpage->lock, flags); - /* - * Even though it's just for reading the page, no one should have - * locked the subpage range. - */ - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); - bitmap_set(subpage->bitmaps, start_bit, nbits); - atomic_add(nbits, &subpage->readers); - spin_unlock_irqrestore(&subpage->lock, flags); -} - -void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = len >> fs_info->sectorsize_bits; - unsigned long flags; - bool is_data; - bool last; - - btrfs_subpage_assert(fs_info, folio, start, len); - is_data = is_data_inode(BTRFS_I(folio->mapping->host)); - - spin_lock_irqsave(&subpage->lock, flags); - - /* The range should have already been locked. */ - ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits)); - ASSERT(atomic_read(&subpage->readers) >= nbits); - - bitmap_clear(subpage->bitmaps, start_bit, nbits); - last = atomic_sub_and_test(nbits, &subpage->readers); - - /* - * For data we need to unlock the page if the last read has finished. 
- *
- * And please don't replace @last with atomic_sub_and_test() call
- * inside if () condition.
- * As we want the atomic_sub_and_test() to be always executed.
- */
-	if (is_data && last)
-		folio_unlock(folio);
-	spin_unlock_irqrestore(&subpage->lock, flags);
-}
-
 static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
 {
 	u64 orig_start = *start;
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index e4076c5b06bc..c150aba9318e 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -45,14 +45,6 @@ enum {
 struct btrfs_subpage {
 	/* Common members for both data and metadata pages */
 	spinlock_t lock;
-	/*
-	 * Both data and metadata needs to track how many readers are for the
-	 * page.
-	 * Data relies on @readers to unlock the page when last reader finished.
-	 * While metadata doesn't need page unlock, it needs to prevent
-	 * page::private get cleared before the last end_page_read().
-	 */
-	atomic_t readers;
 	union {
 		/*
 		 * Structures only used by metadata
@@ -95,11 +87,6 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage);
 void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
 void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
 
-void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
-				struct folio *folio, u64 start, u32 len);
-void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
-			      struct folio *folio, u64 start, u32 len);
-
 void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
 				 struct folio *folio, u64 start, u32 len);
 void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
 				 struct folio *folio, u64 start, u32 len);
-- 
2.51.0

From 0f7120266584490616f031873e7148495d77dd68 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Wed, 9 Oct 2024 16:21:07 +1030
Subject: [PATCH 13/16] btrfs: rename btrfs_folio_(set|start|end)_writer_lock()

Since there is no user of reader locks, rename the writer locks into a
more generic name, by removing the "_writer" part from the name. And
also rename btrfs_subpage::writers to btrfs_subpage::nr_locked.

Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/compression.c |  2 +-
 fs/btrfs/extent_io.c   | 14 ++++++------
 fs/btrfs/subpage.c     | 46 +++++++++++++++++++++---------------------
 fs/btrfs/subpage.h     | 20 ++++++++++--------
 4 files changed, 43 insertions(+), 39 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e51d55bd1a29..21d73ea594a9 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -545,7 +545,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		 * subpage::readers and to unlock the page.
*/ if (fs_info->sectorsize < PAGE_SIZE) - btrfs_folio_set_writer_lock(fs_info, folio, cur, add_size); + btrfs_folio_set_lock(fs_info, folio, cur, add_size); folio_put(folio); cur += add_size; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3eb9ed5d55f2..1beaba232532 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -190,7 +190,7 @@ static void process_one_folio(struct btrfs_fs_info *fs_info, btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) - btrfs_folio_end_writer_lock(fs_info, folio, start, len); + btrfs_folio_end_lock(fs_info, folio, start, len); } static void __process_folios_contig(struct address_space *mapping, @@ -276,7 +276,7 @@ static noinline int lock_delalloc_folios(struct inode *inode, range_start = max_t(u64, folio_pos(folio), start); range_len = min_t(u64, folio_pos(folio) + folio_size(folio), end + 1) - range_start; - btrfs_folio_set_writer_lock(fs_info, folio, range_start, range_len); + btrfs_folio_set_lock(fs_info, folio, range_start, range_len); processed_end = range_start + range_len - 1; } @@ -438,7 +438,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le if (!btrfs_is_subpage(fs_info, folio->mapping)) folio_unlock(folio); else - btrfs_folio_end_writer_lock(fs_info, folio, start, len); + btrfs_folio_end_lock(fs_info, folio, start, len); } /* @@ -495,7 +495,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) return; ASSERT(folio_test_private(folio)); - btrfs_folio_set_writer_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); + btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); } /* @@ -1184,7 +1184,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { u64 start = page_start + (bit << fs_info->sectorsize_bits); - btrfs_folio_set_writer_lock(fs_info, folio, start, fs_info->sectorsize); + btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); } /* Lock all (subpage) delalloc ranges inside the folio first. */ @@ -1520,7 +1520,7 @@ done: * Only unlock ranges that are submitted. As there can be some async * submitted ranges inside the folio. 
*/ - btrfs_folio_end_writer_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); + btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); ASSERT(ret <= 0); return ret; } @@ -2298,7 +2298,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f cur, cur_len, !ret); mapping_set_error(mapping, ret); } - btrfs_folio_end_writer_lock(fs_info, folio, cur, cur_len); + btrfs_folio_end_lock(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; next_page: diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 6c3d54a45956..d4cab3c55742 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -143,7 +143,7 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, if (type == BTRFS_SUBPAGE_METADATA) atomic_set(&ret->eb_refs, 0); else - atomic_set(&ret->writers, 0); + atomic_set(&ret->nr_locked, 0); return ret; } @@ -237,8 +237,8 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) orig_start + orig_len) - *start; } -static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); @@ -256,9 +256,9 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf * extent_clear_unlock_delalloc() for compression path. * * This @locked_page is locked by plain lock_page(), thus its - * subpage::writers is 0. Handle them in a special way. + * subpage::locked is 0. Handle them in a special way. */ - if (atomic_read(&subpage->writers) == 0) { + if (atomic_read(&subpage->nr_locked) == 0) { spin_unlock_irqrestore(&subpage->lock, flags); return true; } @@ -267,8 +267,8 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf clear_bit(bit, subpage->bitmaps); cleared++; } - ASSERT(atomic_read(&subpage->writers) >= cleared); - last = atomic_sub_and_test(cleared, &subpage->writers); + ASSERT(atomic_read(&subpage->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->nr_locked); spin_unlock_irqrestore(&subpage->lock, flags); return last; } @@ -289,8 +289,8 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf * bitmap, reduce the writer lock number, and unlock the page if that's * the last locked range. */ -void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); @@ -303,24 +303,24 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, /* * For subpage case, there are two types of locked page. With or - * without writers number. + * without locked number. * - * Since we own the page lock, no one else could touch subpage::writers + * Since we own the page lock, no one else could touch subpage::locked * and we are safe to do several atomic operations without spinlock. */ - if (atomic_read(&subpage->writers) == 0) { - /* No writers, locked by plain lock_page(). */ + if (atomic_read(&subpage->nr_locked) == 0) { + /* No subpage lock, locked by plain lock_page(). 
*/ folio_unlock(folio); return; } btrfs_subpage_clamp_range(folio, &start, &len); - if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len)) + if (btrfs_subpage_end_and_test_lock(fs_info, folio, start, len)) folio_unlock(folio); } -void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, - struct folio *folio, unsigned long bitmap) +void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap) { struct btrfs_subpage *subpage = folio_get_private(folio); const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked; @@ -334,8 +334,8 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, return; } - if (atomic_read(&subpage->writers) == 0) { - /* No writers, locked by plain lock_page(). */ + if (atomic_read(&subpage->nr_locked) == 0) { + /* No subpage lock, locked by plain lock_page(). */ folio_unlock(folio); return; } @@ -345,8 +345,8 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) cleared++; } - ASSERT(atomic_read(&subpage->writers) >= cleared); - last = atomic_sub_and_test(cleared, &subpage->writers); + ASSERT(atomic_read(&subpage->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->nr_locked); spin_unlock_irqrestore(&subpage->lock, flags); if (last) folio_unlock(folio); @@ -671,8 +671,8 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, * This populates the involved subpage ranges so that subpage helpers can * properly unlock them. */ -void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage; unsigned long flags; @@ -691,7 +691,7 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, /* Target range should not yet be locked. */ ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); bitmap_set(subpage->bitmaps, start_bit, nbits); - ret = atomic_add_return(nbits, &subpage->writers); + ret = atomic_add_return(nbits, &subpage->nr_locked); ASSERT(ret <= fs_info->sectors_per_page); spin_unlock_irqrestore(&subpage->lock, flags); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index c150aba9318e..428fa9389fd4 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -54,8 +54,12 @@ struct btrfs_subpage { */ atomic_t eb_refs; - /* Structures only used by data */ - atomic_t writers; + /* + * Structures only used by data. + * + * How many sectors inside the page are locked. 
+ */ + atomic_t nr_locked; }; unsigned long bitmaps[]; }; @@ -87,12 +91,12 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage); void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); -void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, - struct folio *folio, unsigned long bitmap); +void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); +void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); +void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap); /* * Template for subpage related operations. * -- 2.51.0 From 4f285a7752071a925f3b3062b24220d01b8c0f8a Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 21 Oct 2024 23:40:22 +0200 Subject: [PATCH 14/16] btrfs: use str_yes_no() helper function in btrfs_dump_free_space() Remove hard-coded strings by using the str_yes_no() and str_no_yes() helper functions. Signed-off-by: Thorsten Blum Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0d2db205b9f6..b50b621c6afc 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -11,6 +11,7 @@ #include <linux/ratelimit.h> #include <linux/error-injection.h> #include <linux/sched/mm.h> +#include <linux/string_choices.h> #include "ctree.h" #include "fs.h" #include "messages.h" @@ -2935,12 +2936,11 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, if (info->bytes >= bytes && !block_group->ro) count++; btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", - info->offset, info->bytes, - (info->bitmap) ? "yes" : "no"); + info->offset, info->bytes, str_yes_no(info->bitmap)); } spin_unlock(&ctl->tree_lock); btrfs_info(fs_info, "block group has cluster?: %s", - list_empty(&block_group->cluster_list) ? "no" : "yes"); + str_no_yes(list_empty(&block_group->cluster_list))); btrfs_info(fs_info, "%d free space entries at or bigger than %llu bytes", count, bytes); -- 2.51.0 From b1c5f6eda2d024c187d4d47f0310bc5a1cc8b1d3 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Tue, 15 Oct 2024 12:37:29 +0100 Subject: [PATCH 15/16] btrfs: fix wrong sizeof in btrfs_do_encoded_write() btrfs_do_encoded_write() was converted to use folios in commit 400b172b8cdc, but we're still allocating based on sizeof(struct page *) rather than sizeof(struct folio *). There's no functional change. 
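As an aside on why there is no functional change: C guarantees that all pointers to structure types share the same representation and size, so the kvcalloc() allocation is identical either way. A minimal standalone check (illustrative only, not part of the patch; compile with -std=c11):

#include <assert.h>

/* Both types may stay incomplete; only the pointer sizes are compared. */
struct page;
struct folio;

/* C11 6.2.5: pointers to structure types all have the same representation
 * and alignment requirements, hence the same size. */
static_assert(sizeof(struct page *) == sizeof(struct folio *),
	      "struct pointer sizes should match");

int main(void)
{
	return 0;
}
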
Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6abcfdfe7b30..42e87a0f0946 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9438,7 +9438,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, */ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); - folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT); + folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT); if (!folios) return -ENOMEM; for (i = 0; i < nr_folios; i++) { -- 2.51.0 From c87c299776e4d75bcc5559203ae2c37dc0396d80 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 10 Oct 2024 15:16:12 +1030 Subject: [PATCH 16/16] btrfs: make buffered write to copy one page a time Currently btrfs_buffered_write() prepares multiple pages at a time, which allows better performance. But the current trend is to support larger folios as an optimization, instead of implementing our own multi-page optimization. This is inspired by generic_perform_write(), which copies one folio at a time. Such a change prepares us to migrate to the write_begin() and write_end() callbacks, and makes every involved function a little simpler. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/file.c | 237 ++++++++++++------------------ fs/btrfs/file.h | 5 +- fs/btrfs/free-space-cache.c | 15 ++- 3 files changed, 94 insertions(+), 163 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 033f85ea8c9d..0c5fc0e73c55 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -37,22 +37,19 @@ #include "file.h" #include "super.h" -/* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. +/* + * Helper to fault in the page and copy. This should go away and be replaced + * with calls into generic code. */ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, - struct page **prepared_pages, - struct iov_iter *i) + struct page *page, struct iov_iter *i) { size_t copied = 0; size_t total_copied = 0; - int pg = 0; int offset = offset_in_page(pos); while (write_bytes > 0) { - size_t count = min_t(size_t, - PAGE_SIZE - offset, write_bytes); - struct page *page = prepared_pages[pg]; + size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes); /* * Copy data from userspace to the current page */ @@ -63,7 +60,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, /* * if we get a partial write, we can end up with - * partially up to date pages. These add + * partially up to date page. These add * a lot of complexity, so make sure they don't * happen by forcing this copy to be retried. 
* @@ -82,10 +79,6 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, write_bytes -= copied; total_copied += copied; offset += copied; - if (offset == PAGE_SIZE) { - pg++; - offset = 0; - } } return total_copied; } @@ -93,27 +86,24 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, /* * unlocks pages after btrfs_file_write is done with them */ -static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, - struct page **pages, size_t num_pages, - u64 pos, u64 copied) +static void btrfs_drop_page(struct btrfs_fs_info *fs_info, struct page *page, + u64 pos, u64 copied) { - size_t i; u64 block_start = round_down(pos, fs_info->sectorsize); u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start; ASSERT(block_len <= U32_MAX); - for (i = 0; i < num_pages; i++) { - /* page checked is some magic around finding pages that - * have been modified without going through btrfs_set_page_dirty - * clear it here. There should be no need to mark the pages - * accessed as prepare_pages should have marked them accessed - * in prepare_pages via find_or_create_page() - */ - btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]), - block_start, block_len); - unlock_page(pages[i]); - put_page(pages[i]); - } + /* + * Page checked is some magic around finding pages that have been + * modified without going through btrfs_set_page_dirty; clear it here. + * There should be no need to mark the page accessed, as + * prepare_one_page() should have marked it accessed via + * find_or_create_page(). + */ + btrfs_folio_clamp_clear_checked(fs_info, page_folio(page), block_start, + block_len); + unlock_page(page); + put_page(page); } /* @@ -123,19 +113,16 @@ static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, * - Mark modified pages as Uptodate/Dirty and not needing COW fixup * - Update inode size for past EOF write */ -int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - loff_t pos, size_t write_bytes, - struct extent_state **cached, bool noreserve) +int btrfs_dirty_page(struct btrfs_inode *inode, struct page *page, loff_t pos, + size_t write_bytes, struct extent_state **cached, bool noreserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; - int i; - const int num_pages = (round_up(pos + write_bytes, PAGE_SIZE) - - round_down(pos, PAGE_SIZE)) >> PAGE_SHIFT; u64 num_bytes; u64 start_pos; u64 end_of_last_block; u64 end_pos = pos + write_bytes; + struct folio *folio = page_folio(page); loff_t isize = i_size_read(&inode->vfs_inode); unsigned int extra_bits = 0; @@ -149,6 +136,8 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); ASSERT(num_bytes <= U32_MAX); + ASSERT(folio_pos(folio) <= pos && + folio_pos(folio) + folio_size(folio) >= pos + write_bytes); end_of_last_block = start_pos + num_bytes - 1; @@ -165,16 +154,9 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, if (ret) return ret; - for (i = 0; i < num_pages; i++) { - struct page *p = pages[i]; - - btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p), - start_pos, num_bytes); - btrfs_folio_clamp_clear_checked(fs_info, page_folio(p), - start_pos, num_bytes); - btrfs_folio_clamp_set_dirty(fs_info, page_folio(p), - start_pos, num_bytes); - } + btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes); + btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes); + 
btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes); /* * we've only changed i_size in ram, and we haven't updated @@ -922,62 +904,47 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) } /* - * this just gets pages into the page cache and locks them down. + * this just gets the page into the page cache and locks it down. */ -static noinline int prepare_pages(struct inode *inode, struct page **pages, - size_t num_pages, loff_t pos, - size_t write_bytes, bool force_uptodate, - bool nowait) +static noinline int prepare_one_page(struct inode *inode, struct page **page_ret, + loff_t pos, size_t write_bytes, + bool force_uptodate, bool nowait) { - int i; unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); fgf_t fgp_flags = get_prepare_fgp_flags(nowait); + struct page *page; int ret = 0; - int faili; - for (i = 0; i < num_pages; i++) { again: - pages[i] = pagecache_get_page(inode->i_mapping, index + i, - fgp_flags, mask | __GFP_WRITE); - if (!pages[i]) { - faili = i - 1; - if (nowait) - ret = -EAGAIN; - else - ret = -ENOMEM; - goto fail; - } - - ret = set_page_extent_mapped(pages[i]); - if (ret < 0) { - faili = i; - goto fail; - } - - ret = prepare_uptodate_page(inode, pages[i], pos, write_bytes, - force_uptodate); - if (ret) { - put_page(pages[i]); - if (!nowait && ret == -EAGAIN) { - ret = 0; - goto again; - } - faili = i - 1; - goto fail; + page = pagecache_get_page(inode->i_mapping, index, fgp_flags, + mask | __GFP_WRITE); + if (!page) { + if (nowait) + ret = -EAGAIN; + else + ret = -ENOMEM; + return ret; + } + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + return ret; + } + ret = prepare_uptodate_page(inode, page, pos, write_bytes, force_uptodate); + if (ret) { + /* The page is already unlocked. 
*/ + put_page(page); + if (!nowait && ret == -EAGAIN) { + ret = 0; + goto again; } - wait_on_page_writeback(pages[i]); + return ret; } - + wait_on_page_writeback(page); + *page_ret = page; return 0; -fail: - while (faili >= 0) { - unlock_page(pages[faili]); - put_page(pages[faili]); - faili--; - } - return ret; - } /* @@ -988,19 +955,16 @@ fail: * 1 - the extent is locked * 0 - the extent is not locked, and everything is OK * -EAGAIN - need re-prepare the pages - * the other < 0 number - Something wrong happens */ static noinline int -lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, - size_t write_bytes, +lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page *page, + loff_t pos, size_t write_bytes, u64 *lockstart, u64 *lockend, bool nowait, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 start_pos; u64 last_pos; - int i; int ret = 0; start_pos = round_down(pos, fs_info->sectorsize); @@ -1012,12 +976,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, if (nowait) { if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, cached_state)) { - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - put_page(pages[i]); - pages[i] = NULL; - } - + unlock_page(page); + put_page(page); return -EAGAIN; } } else { @@ -1031,10 +991,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, ordered->file_offset <= last_pos) { unlock_extent(&inode->io_tree, start_pos, last_pos, cached_state); - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - put_page(pages[i]); - } + unlock_page(page); + put_page(page); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); return -EAGAIN; @@ -1048,11 +1006,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } /* - * We should be called after prepare_pages() which should have locked + * We should be called after prepare_one_page() which should have locked * all pages in the range. */ - for (i = 0; i < num_pages; i++) - WARN_ON(!PageLocked(pages[i])); + WARN_ON(!PageLocked(page)); return ret; } @@ -1196,20 +1153,17 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) loff_t pos; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct page **pages = NULL; struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; u64 lockstart; u64 lockend; size_t num_written = 0; - int nrptrs; ssize_t ret; - bool only_release_metadata = false; - bool force_page_uptodate = false; loff_t old_isize = i_size_read(inode); unsigned int ilock_flags = 0; const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); unsigned int bdp_flags = (nowait ? 
BDP_ASYNC : 0); + bool only_release_metadata = false; if (nowait) ilock_flags |= BTRFS_ILOCK_TRY; @@ -1227,32 +1181,21 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) goto out; pos = iocb->ki_pos; - nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), - PAGE_SIZE / (sizeof(struct page *))); - nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); - nrptrs = max(nrptrs, 8); - pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - goto out; - } - while (iov_iter_count(i) > 0) { struct extent_state *cached_state = NULL; size_t offset = offset_in_page(pos); size_t sector_offset; - size_t write_bytes = min(iov_iter_count(i), - nrptrs * (size_t)PAGE_SIZE - - offset); - size_t num_pages; + size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset); size_t reserve_bytes; size_t copied; size_t dirty_sectors; size_t num_sectors; + struct page *page = NULL; int extents_locked; + bool force_page_uptodate = false; /* - * Fault pages before locking them in prepare_pages + * Fault pages before locking them in prepare_one_page() * to avoid recursive lock */ if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { @@ -1291,8 +1234,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) only_release_metadata = true; } - num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE); - WARN_ON(num_pages > nrptrs); reserve_bytes = round_up(write_bytes + sector_offset, fs_info->sectorsize); WARN_ON(reserve_bytes == 0); @@ -1320,23 +1261,17 @@ again: break; } - /* - * This is going to setup the pages array with the number of - * pages we want, so we don't really need to worry about the - * contents of pages from loop to loop - */ - ret = prepare_pages(inode, pages, num_pages, - pos, write_bytes, force_page_uptodate, false); + ret = prepare_one_page(inode, &page, pos, write_bytes, + force_page_uptodate, false); if (ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); break; } - extents_locked = lock_and_cleanup_extent_if_need( - BTRFS_I(inode), pages, - num_pages, pos, write_bytes, &lockstart, - &lockend, nowait, &cached_state); + extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode), + page, pos, write_bytes, &lockstart, + &lockend, nowait, &cached_state); if (extents_locked < 0) { if (!nowait && extents_locked == -EAGAIN) goto again; @@ -1347,20 +1282,13 @@ again: break; } - copied = btrfs_copy_from_user(pos, write_bytes, pages, i); + copied = btrfs_copy_from_user(pos, write_bytes, page, i); num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); dirty_sectors = round_up(copied + sector_offset, fs_info->sectorsize); dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors); - /* - * if we have trouble faulting in the pages, fall - * back to one page at a time - */ - if (copied < write_bytes) - nrptrs = 1; - if (copied == 0) { force_page_uptodate = true; dirty_sectors = 0; @@ -1386,15 +1314,14 @@ again: release_bytes = round_up(copied + sector_offset, fs_info->sectorsize); - ret = btrfs_dirty_pages(BTRFS_I(inode), pages, - pos, copied, - &cached_state, only_release_metadata); + ret = btrfs_dirty_page(BTRFS_I(inode), page, pos, copied, + &cached_state, only_release_metadata); /* * If we have not locked the extent range, because the range's * start offset is >= i_size, we might still have a non-NULL * cached extent state, acquired while marking the extent range - * as delalloc through btrfs_dirty_pages(). Therefore free any + * as delalloc through btrfs_dirty_page(). 
Therefore free any * possible cached extent state to avoid a memory leak. */ if (extents_locked) @@ -1405,7 +1332,7 @@ again: btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { - btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); + btrfs_drop_page(fs_info, page, pos, copied); break; } @@ -1413,7 +1340,7 @@ again: if (only_release_metadata) btrfs_check_nocow_unlock(BTRFS_I(inode)); - btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); + btrfs_drop_page(fs_info, page, pos, copied); cond_resched(); @@ -1421,8 +1348,6 @@ again: num_written += copied; } - kfree(pages); - if (release_bytes) { if (only_release_metadata) { btrfs_check_nocow_unlock(BTRFS_I(inode)); diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index c2ce0ae94a9c..69a7b78d99bb 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -34,9 +34,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); int btrfs_release_file(struct inode *inode, struct file *file); -int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - loff_t pos, size_t write_bytes, - struct extent_state **cached, bool noreserve); +int btrfs_dirty_page(struct btrfs_inode *inode, struct page *page, loff_t pos, + size_t write_bytes, struct extent_state **cached, bool noreserve); int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end); int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes, bool nowait); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index b50b621c6afc..207aff5f1957 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1388,6 +1388,7 @@ static int __btrfs_write_out_cache(struct inode *inode, int bitmaps = 0; int ret; int must_iput = 0; + int i_size; if (!i_size_read(inode)) return -EIO; @@ -1458,10 +1459,16 @@ static int __btrfs_write_out_cache(struct inode *inode, io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. */ - ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, 0, i_size_read(inode), - &cached_state, false); - if (ret) - goto out_nospc; + i_size = i_size_read(inode); + for (int i = 0; i < round_up(i_size, PAGE_SIZE) / PAGE_SIZE; i++) { + u64 dirty_start = i * PAGE_SIZE; + u64 dirty_len = min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start; + + ret = btrfs_dirty_page(BTRFS_I(inode), io_ctl->pages[i], + dirty_start, dirty_len, &cached_state, false); + if (ret < 0) + goto out_nospc; + } if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); -- 2.51.0
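A closing note on the free-space-cache.c hunk above: the new loop walks the cache file one page at a time and clamps the final range to i_size, which is what min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start computes. A small userspace sketch of that arithmetic (the PAGE_SIZE value and the sample i_size are assumptions for illustration; this is not kernel code):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL

/* Round up to the next page boundary, like the kernel's round_up(). */
static uint64_t round_up_page(uint64_t x)
{
	return (x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

int main(void)
{
	/* Hypothetical cache file size: two full pages plus a 100-byte tail. */
	uint64_t i_size = 2 * PAGE_SIZE + 100;

	for (uint64_t i = 0; i < round_up_page(i_size) / PAGE_SIZE; i++) {
		uint64_t dirty_start = i * PAGE_SIZE;
		uint64_t end = dirty_start + PAGE_SIZE;
		/* Clamp the last chunk so it never runs past i_size. */
		uint64_t dirty_len = (end < i_size ? end : i_size) - dirty_start;

		printf("page %llu: dirty_start=%llu dirty_len=%llu\n",
		       (unsigned long long)i,
		       (unsigned long long)dirty_start,
		       (unsigned long long)dirty_len);
	}
	return 0;
}

Each (dirty_start, dirty_len) pair is what the patch now feeds to btrfs_dirty_page(), one page per call, where a single btrfs_dirty_pages() call used to cover the whole range.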