From e820dbeb6ad1d66906663643302f2157347e1d8b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 10 Oct 2024 15:16:13 +1030 Subject: [PATCH 01/16] btrfs: convert btrfs_buffered_write() to use folios The buffered write path is still heavily utilizing the page interface. Since we have converted it to do a page-by-page copying, it's much easier to convert all involved functions to folio interface, this involves: - btrfs_copy_from_user() - btrfs_drop_folio() - prepare_uptodate_page() - prepare_one_page() - lock_and_cleanup_extent_if_need() - btrfs_dirty_page() All function are changed to accept a folio parameter, and if the word "page" is in the function name, change that to "folio" too. The function btrfs_dirty_page() is exported for v1 space cache, convert v1 cache call site to convert its page to folio for the new interface. And there is a small enhancement for prepare_one_folio(), instead of manually waiting for the page writeback, let __filemap_get_folio() to handle that by using FGP_WRITEBEGIN, which implies (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/file.c | 138 ++++++++++++++++-------------------- fs/btrfs/file.h | 4 +- fs/btrfs/free-space-cache.c | 4 +- 3 files changed, 66 insertions(+), 80 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0c5fc0e73c55..5e0a1805e897 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -42,7 +42,7 @@ * calls into generic code. */ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, - struct page *page, struct iov_iter *i) + struct folio *folio, struct iov_iter *i) { size_t copied = 0; size_t total_copied = 0; @@ -53,10 +53,10 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, /* * Copy data from userspace to the current page */ - copied = copy_page_from_iter_atomic(page, offset, count, i); + copied = copy_folio_from_iter_atomic(folio, offset, count, i); /* Flush processor's dcache for this page */ - flush_dcache_page(page); + flush_dcache_folio(folio); /* * if we get a partial write, we can end up with @@ -68,7 +68,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, * back to page at a time copies after we return 0. */ if (unlikely(copied < count)) { - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { iov_iter_revert(i, copied); copied = 0; } @@ -84,37 +84,36 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, } /* - * unlocks pages after btrfs_file_write is done with them + * Unlock folio after btrfs_file_write() is done with it. */ -static void btrfs_drop_page(struct btrfs_fs_info *fs_info, struct page *page, - u64 pos, u64 copied) +static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio, + u64 pos, u64 copied) { u64 block_start = round_down(pos, fs_info->sectorsize); u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start; ASSERT(block_len <= U32_MAX); /* - * Page checked is some magic around finding pages that have been - * modified without going through btrfs_set_page_dirty clear it here. + * Folio checked is some magic around finding folios that have been + * modified without going through btrfs_dirty_folio(). Clear it here. 
* There should be no need to mark the pages accessed as - * prepare_one_page() should have marked them accessed in - * prepare_one_page() via find_or_create_page() + * prepare_one_folio() should have marked them accessed in + * prepare_one_folio() via find_or_create_page() */ - btrfs_folio_clamp_clear_checked(fs_info, page_folio(page), block_start, - block_len); - unlock_page(page); - put_page(page); + btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len); + folio_unlock(folio); + folio_put(folio); } /* * After btrfs_copy_from_user(), update the following things for delalloc: - * - Mark newly dirtied pages as DELALLOC in the io tree. + * - Mark newly dirtied folio as DELALLOC in the io tree. * Used to advise which range is to be written back. - * - Mark modified pages as Uptodate/Dirty and not needing COW fixup + * - Mark modified folio as Uptodate/Dirty and not needing COW fixup * - Update inode size for past EOF write */ -int btrfs_dirty_page(struct btrfs_inode *inode, struct page *page, loff_t pos, - size_t write_bytes, struct extent_state **cached, bool noreserve) +int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos, + size_t write_bytes, struct extent_state **cached, bool noreserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; @@ -122,7 +121,6 @@ int btrfs_dirty_page(struct btrfs_inode *inode, struct page *page, loff_t pos, u64 start_pos; u64 end_of_last_block; u64 end_pos = pos + write_bytes; - struct folio *folio = page_folio(page); loff_t isize = i_size_read(&inode->vfs_inode); unsigned int extra_bits = 0; @@ -835,14 +833,12 @@ out: } /* - * on error we return an unlocked page and the error value - * on success we return a locked page and 0 + * On error return an unlocked folio and the error value + * On success return a locked folio and 0 */ -static int prepare_uptodate_page(struct inode *inode, - struct page *page, u64 pos, - u64 len, bool force_uptodate) +static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos, + u64 len, bool force_uptodate) { - struct folio *folio = page_folio(page); u64 clamp_start = max_t(u64, pos, folio_pos(folio)); u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); int ret = 0; @@ -873,23 +869,13 @@ static int prepare_uptodate_page(struct inode *inode, * The private flag check is essential for subpage as we need to store * extra bitmap using folio private. */ - if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { + if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) { folio_unlock(folio); return -EAGAIN; } return 0; } -static fgf_t get_prepare_fgp_flags(bool nowait) -{ - fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; - - if (nowait) - fgp_flags |= FGP_NOWAIT; - - return fgp_flags; -} - static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) { gfp_t gfp; @@ -904,60 +890,60 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) } /* - * this just gets page into the page cache and locks them down. + * Get folio into the page cache and lock it. 
*/ -static noinline int prepare_one_page(struct inode *inode, struct page **page_ret, - loff_t pos, size_t write_bytes, - bool force_uptodate, bool nowait) +static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret, + loff_t pos, size_t write_bytes, + bool force_uptodate, bool nowait) { unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); - fgf_t fgp_flags = get_prepare_fgp_flags(nowait); - struct page *page; + fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN); + struct folio *folio; int ret = 0; again: - page = pagecache_get_page(inode->i_mapping, index, fgp_flags, - mask | __GFP_WRITE); - if (!page) { + folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask); + if (IS_ERR(folio)) { if (nowait) ret = -EAGAIN; else - ret = -ENOMEM; + ret = PTR_ERR(folio); return ret; } - ret = set_page_extent_mapped(page); + /* Only support page sized folio yet. */ + ASSERT(folio_order(folio) == 0); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return ret; } - ret = prepare_uptodate_page(inode, page, pos, write_bytes, force_uptodate); + ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate); if (ret) { - /* The page is already unlocked. */ - put_page(page); + /* The folio is already unlocked. */ + folio_put(folio); if (!nowait && ret == -EAGAIN) { ret = 0; goto again; } return ret; } - wait_on_page_writeback(page); - *page_ret = page; + *folio_ret = folio; return 0; } /* - * This function locks the extent and properly waits for data=ordered extents - * to finish before allowing the pages to be modified if need. + * Locks the extent and properly waits for data=ordered extents to finish + * before allowing the folios to be modified if need. * - * The return value: + * Return: * 1 - the extent is locked * 0 - the extent is not locked, and everything is OK - * -EAGAIN - need re-prepare the pages + * -EAGAIN - need to prepare the folios again */ static noinline int -lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page *page, +lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, loff_t pos, size_t write_bytes, u64 *lockstart, u64 *lockend, bool nowait, struct extent_state **cached_state) @@ -976,8 +962,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page *page, if (nowait) { if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, cached_state)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return -EAGAIN; } } else { @@ -991,8 +977,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page *page, ordered->file_offset <= last_pos) { unlock_extent(&inode->io_tree, start_pos, last_pos, cached_state); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); return -EAGAIN; @@ -1006,10 +992,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page *page, } /* - * We should be called after prepare_one_page() which should have locked + * We should be called after prepare_one_folio() which should have locked * all pages in the range. 
*/ - WARN_ON(!PageLocked(page)); + WARN_ON(!folio_test_locked(folio)); return ret; } @@ -1190,12 +1176,12 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) size_t copied; size_t dirty_sectors; size_t num_sectors; - struct page *page = NULL; + struct folio *folio = NULL; int extents_locked; bool force_page_uptodate = false; /* - * Fault pages before locking them in prepare_one_page() + * Fault pages before locking them in prepare_one_folio() * to avoid recursive lock */ if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { @@ -1261,8 +1247,8 @@ again: break; } - ret = prepare_one_page(inode, &page, pos, write_bytes, - force_page_uptodate, false); + ret = prepare_one_folio(inode, &folio, pos, write_bytes, + force_page_uptodate, false); if (ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); @@ -1270,7 +1256,7 @@ again: } extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode), - page, pos, write_bytes, &lockstart, + folio, pos, write_bytes, &lockstart, &lockend, nowait, &cached_state); if (extents_locked < 0) { if (!nowait && extents_locked == -EAGAIN) @@ -1282,7 +1268,7 @@ again: break; } - copied = btrfs_copy_from_user(pos, write_bytes, page, i); + copied = btrfs_copy_from_user(pos, write_bytes, folio, i); num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); dirty_sectors = round_up(copied + sector_offset, @@ -1314,8 +1300,8 @@ again: release_bytes = round_up(copied + sector_offset, fs_info->sectorsize); - ret = btrfs_dirty_page(BTRFS_I(inode), page, pos, copied, - &cached_state, only_release_metadata); + ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied, + &cached_state, only_release_metadata); /* * If we have not locked the extent range, because the range's @@ -1332,7 +1318,7 @@ again: btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { - btrfs_drop_page(fs_info, page, pos, copied); + btrfs_drop_folio(fs_info, folio, pos, copied); break; } @@ -1340,7 +1326,7 @@ again: if (only_release_metadata) btrfs_check_nocow_unlock(BTRFS_I(inode)); - btrfs_drop_page(fs_info, page, pos, copied); + btrfs_drop_folio(fs_info, folio, pos, copied); cond_resched(); diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index 69a7b78d99bb..de89e644be29 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -34,8 +34,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); int btrfs_release_file(struct inode *inode, struct file *file); -int btrfs_dirty_page(struct btrfs_inode *inode, struct page *page, loff_t pos, - size_t write_bytes, struct extent_state **cached, bool noreserve); +int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos, + size_t write_bytes, struct extent_state **cached, bool noreserve); int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end); int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes, bool nowait); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 207aff5f1957..cfa52ef40b06 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1464,8 +1464,8 @@ static int __btrfs_write_out_cache(struct inode *inode, u64 dirty_start = i * PAGE_SIZE; u64 dirty_len = min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start; - ret = btrfs_dirty_page(BTRFS_I(inode), io_ctl->pages[i], - dirty_start, dirty_len, &cached_state, false); + ret = 
btrfs_dirty_folio(BTRFS_I(inode), page_folio(io_ctl->pages[i]), + dirty_start, dirty_len, &cached_state, false); if (ret < 0) goto out_nospc; } -- 2.51.0 From d07eaa9995fc81eb18e390d860442e598427547d Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 14 Oct 2024 23:11:38 +0800 Subject: [PATCH 02/16] btrfs: use filemap_get_folio() helper MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit When fgp_flags and gfp_flags are zero, use filemap_get_folio(A, B) instead of __filemap_get_folio(A, B, 0, 0)—no need for the extra arguments 0, 0. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 7 +++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 21d73ea594a9..0c4d486c3048 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -453,7 +453,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (pg_index > end_index) break; - folio = __filemap_get_folio(mapping, pg_index, 0, 0); + folio = filemap_get_folio(mapping, pg_index); if (!IS_ERR(folio)) { u64 folio_sz = folio_size(folio); u64 offset = offset_in_folio(folio, cur); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1beaba232532..e629d2ee152a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2265,7 +2265,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f u32 cur_len = cur_end + 1 - cur; struct folio *folio; - folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0); + folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); /* * This shouldn't happen, the pages are pinned and locked, this diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 42e87a0f0946..87a30d9cb6e5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -421,7 +421,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, index++; continue; } - folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); + folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); index++; if (IS_ERR(folio)) continue; @@ -556,8 +556,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, } else { struct folio *folio; - folio = __filemap_get_folio(inode->vfs_inode.i_mapping, - 0, 0, 0); + folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0); ASSERT(!IS_ERR(folio)); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_local_folio(folio, 0); @@ -880,7 +879,7 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e for (unsigned long index = start >> PAGE_SHIFT; index <= end_index; index++) { - folio = __filemap_get_folio(inode->i_mapping, index, 0, 0); + folio = filemap_get_folio(inode->i_mapping, index); if (IS_ERR(folio)) { if (!ret) ret = PTR_ERR(folio); -- 2.51.0 From 6aea95ee318890d9add03b9275bde31fec682e80 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 23 Oct 2024 15:25:17 +0200 Subject: [PATCH 03/16] btrfs: implement partial deletion of RAID stripe extents In our CI system, the RAID stripe tree configuration sometimes fails with the following ASSERT(): assertion failed: found_start >= start && found_end <= end, in fs/btrfs/raid-stripe-tree.c:64 This ASSERT()ion triggers, because for the initial design of RAID stripe-tree, I had the "one ordered-extent equals one bio" rule of zoned btrfs in mind. 
But for a RAID stripe-tree based system, that is not hosted on a zoned storage device, but on a regular device this rule doesn't apply. So in case the range we want to delete starts in the middle of the previous item, grab the item and "truncate" it's length. That is, clone the item, subtract the deleted portion from the key's offset, delete the old item and insert the new one. In case the range to delete ends in the middle of an item, we have to adjust both the item's key as well as the stripe extents and then re-insert the modified clone into the tree after deleting the old stripe extent. Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 81 +++++++++++++++++++++++++++++++++---- 1 file changed, 74 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 41970bbdb05f..9ffc79f250fb 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -13,6 +13,39 @@ #include "volumes.h" #include "print-tree.h" +static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + const struct btrfs_key *oldkey, + u64 newlen, u64 frontpad) +{ + struct btrfs_stripe_extent *extent; + struct extent_buffer *leaf; + int slot; + size_t item_size; + struct btrfs_key newkey = { + .objectid = oldkey->objectid + frontpad, + .type = BTRFS_RAID_STRIPE_KEY, + .offset = newlen, + }; + + ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY); + + leaf = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size(leaf, slot); + extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); + + for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { + struct btrfs_raid_stride *stride = &extent->strides[i]; + u64 phys; + + phys = btrfs_raid_stride_physical(leaf, stride); + btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad); + } + + btrfs_set_item_key_safe(trans, path, &newkey); +} + int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -36,23 +69,24 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le while (1) { key.objectid = start; key.type = BTRFS_RAID_STRIPE_KEY; - key.offset = length; + key.offset = 0; ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); if (ret < 0) break; - if (ret > 0) { - ret = 0; - if (path->slots[0] == 0) - break; + + if (path->slots[0] == btrfs_header_nritems(path->nodes[0])) path->slots[0]--; - } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); found_start = key.objectid; found_end = found_start + key.offset; + ret = 0; + + if (key.type != BTRFS_RAID_STRIPE_KEY) + break; /* That stripe ends before we start, we're done. */ if (found_end <= start) @@ -61,7 +95,40 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le trace_btrfs_raid_extent_delete(fs_info, start, end, found_start, found_end); - ASSERT(found_start >= start && found_end <= end); + /* + * The stripe extent starts before the range we want to delete: + * + * |--- RAID Stripe Extent ---| + * |--- keep ---|--- drop ---| + * + * This means we have to duplicate the tree item, truncate the + * length to the new size and then re-insert the item. 
+ */ + if (found_start < start) { + u64 diff = start - found_start; + + btrfs_partially_delete_raid_extent(trans, path, &key, + diff, 0); + break; + } + + /* + * The stripe extent ends after the range we want to delete: + * + * |--- RAID Stripe Extent ---| + * |--- drop ---|--- keep ---| + * + * This means we have to duplicate the tree item, truncate the + * length to the new size and then re-insert the item. + */ + if (found_end > end) { + u64 diff = found_end - end; + + btrfs_partially_delete_raid_extent(trans, path, &key, + diff, diff); + break; + } + ret = btrfs_del_item(trans, stripe_root, path); if (ret) break; -- 2.51.0 From 6e6ecdec22645db64d14c25915444f238d98da0d Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 23 Oct 2024 15:25:18 +0200 Subject: [PATCH 04/16] btrfs: tests: implement case for partial RAID stripe-tree delete Implement self-tests for partial deletion of RAID stripe-tree entries. These two new tests cover both the deletion of the front of a RAID stripe-tree stripe extent as well as truncation of an item to make it smaller. Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/tests/raid-stripe-tree-tests.c | 221 ++++++++++++++++++++++++ 1 file changed, 221 insertions(+) diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index b8013ab13c43..30f17eb7b6a8 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -29,6 +29,225 @@ static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_de return NULL; } +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * delete the 1st 32K, making the new start address 1M+32K. + */ +static int test_front_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, SZ_32K); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + SZ_32K); + goto out; + } + + len = SZ_32K; + ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_32K, 
&len, + map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical + SZ_32K, logical + SZ_32K + len); + goto out; + } + + if (io_stripe.physical != logical + SZ_32K) { + test_err("invalid physical address, expected %llu, got %llu", + logical + SZ_32K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_32K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_32K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (!ret) { + ret = -EINVAL; + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical, logical + SZ_32K); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * truncate the stripe extent down to 32K. + */ +static int test_tail_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical + SZ_32K, logical + SZ_64K); + goto out; + } + + len = SZ_32K; + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu, got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_32K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_32K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} 
+ /* * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then * overwrite the whole range giving it new physical address at an offset of 1G. @@ -235,6 +454,8 @@ out: static const test_func_t tests[] = { test_simple_create_delete, test_create_update_delete, + test_tail_delete, + test_front_delete, }; static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) -- 2.51.0 From 1d16c2761bcc412ae4d9fb9fd9934cd426814191 Mon Sep 17 00:00:00 2001 From: Robbie Ko Date: Thu, 24 Oct 2024 10:31:42 +0800 Subject: [PATCH 05/16] btrfs: reduce extent tree lock contention when searching for inline backref When inserting extent backref, in order to check whether refs other than inline refs are used, we always use path keep locks for tree search, which will increase the lock contention of extent tree. We do not need the parent node every time to determine whether normal refs are used. It is only needed when the extent item is the last item in a leaf. Therefore, we change it to first use keep_locks=0 for search. If the extent item happens to be the last item in the leaf, we then change to keep_locks=1 for the second search to reduce lock contention. Reviewed-by: Filipe Manana Signed-off-by: Robbie Ko Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e29024c66edb..594d18ed908c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -795,7 +795,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, if (insert) { extra_size = btrfs_extent_inline_ref_size(want); path->search_for_extension = 1; - path->keep_locks = 1; } else extra_size = -1; @@ -946,6 +945,25 @@ again: ret = -EAGAIN; goto out; } + + if (path->slots[0] + 1 < btrfs_header_nritems(path->nodes[0])) { + struct btrfs_key tmp_key; + + btrfs_item_key_to_cpu(path->nodes[0], &tmp_key, path->slots[0] + 1); + if (tmp_key.objectid == bytenr && + tmp_key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { + ret = -EAGAIN; + goto out; + } + goto out_no_entry; + } + + if (!path->keep_locks) { + btrfs_release_path(path); + path->keep_locks = 1; + goto again; + } + /* * To add new inline back ref, we have to make sure * there is no corresponding back ref item. @@ -959,13 +977,15 @@ again: goto out; } } +out_no_entry: *ref_ret = (struct btrfs_extent_inline_ref *)ptr; out: - if (insert) { + if (path->keep_locks) { path->keep_locks = 0; - path->search_for_extension = 0; btrfs_unlock_up_safe(path, 1); } + if (insert) + path->search_for_extension = 0; return ret; } -- 2.51.0 From 00f529661baaae79dc9de79f9273324b9e1f3542 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 17 Oct 2024 15:07:45 +0100 Subject: [PATCH 06/16] btrfs: remove BUG_ON() at btrfs_destroy_delayed_refs() At btrfs_destroy_delayed_refs() it's unexpected to not find the block group to which a delayed reference's extent belongs to, so we have this BUG_ON(), not just because it's highly unexpected but also because we don't know what to do there. Since we are in the transaction abort path, there's nothing we can do other than proceed and cleanup all used resources we can. So remove the BUG_ON() and deal with a missing block group by logging an error message and continuing to cleanup all we can related to the current delayed ref head and moving to other delayed refs. 
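A condensed view of the new handling, pulled from the hunk below so the pattern is visible at a glance (this is not a separate change, and the accounting done in the else branch is elided here):

	cache = btrfs_lookup_block_group(fs_info, head->bytenr);
	if (WARN_ON_ONCE(cache == NULL)) {
		/* Transaction abort path: report the inconsistency and keep cleaning up. */
		btrfs_err(fs_info,
"block group for delayed ref at %llu was not found while destroying ref head",
			  head->bytenr);
	} else {
		/* ... update the pinned/reserved accounting for the block group ... */
		btrfs_put_block_group(cache);
	}
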
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bcf6e53bc2a7..47598e525ea5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4571,19 +4571,30 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(fs_info, head->bytenr); - BUG_ON(!cache); - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned += head->num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, - cache->space_info, head->num_bytes); - cache->reserved -= head->num_bytes; - cache->space_info->bytes_reserved -= head->num_bytes; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - btrfs_put_block_group(cache); + if (WARN_ON_ONCE(cache == NULL)) { + /* + * Unexpected and there's nothing we can do here + * because we are in a transaction abort path, + * so any errors can only be ignored or reported + * while attempting to cleanup all resources. + */ + btrfs_err(fs_info, +"block group for delayed ref at %llu was not found while destroying ref head", + head->bytenr); + } else { + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += head->num_bytes; + btrfs_space_info_update_bytes_pinned(fs_info, + cache->space_info, + head->num_bytes); + cache->reserved -= head->num_bytes; + cache->space_info->bytes_reserved -= head->num_bytes; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + btrfs_put_block_group(cache); + } btrfs_error_unpin_extent_range(fs_info, head->bytenr, head->bytenr + head->num_bytes - 1); -- 2.51.0 From 22a0ae1889c6a5cd1f03de7e5f6c64646e782fd8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 17 Oct 2024 16:23:41 +0100 Subject: [PATCH 07/16] btrfs: move btrfs_destroy_delayed_refs() to delayed-ref.c It's better suited at delayed-ref.c since it's about delayed refs and contains logic to iterate over them (using the red black tree, doing all the locking, freeing, etc), so move it from disk-io.c, which is pretty big, into delayed-ref.c, hiding implementation details of how delayed refs are tracked and managed. This also facilitates the next patches in the series. This change moves the code between files but also does the following simple cleanups: 1) Rename the 'cache' variable to 'bg', since it's a block group (the 'cache' logic comes from old days where the block group structure was named 'btrfs_block_group_cache'); 2) Move the 'ref' variable declaration to the scope of the inner while loop, since it's not used outside that loop. 
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 81 ++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/delayed-ref.h | 2 ++ fs/btrfs/disk-io.c | 80 ----------------------------------------- 3 files changed, 83 insertions(+), 80 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 1684857554c6..9e661f9a71b0 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -9,6 +9,7 @@ #include "messages.h" #include "ctree.h" #include "delayed-ref.h" +#include "extent-tree.h" #include "transaction.h" #include "qgroup.h" #include "space-info.h" @@ -1238,6 +1239,86 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, return found; } +void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_fs_info *fs_info) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; + + spin_lock(&delayed_refs->lock); + while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { + struct btrfs_delayed_ref_head *head; + struct rb_node *n; + bool pin_bytes = false; + + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + if (btrfs_delayed_ref_lock(delayed_refs, head)) + continue; + + spin_lock(&head->lock); + while ((n = rb_first_cached(&head->ref_tree)) != NULL) { + struct btrfs_delayed_ref_node *ref; + + ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); + rb_erase_cached(&ref->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); + if (!list_empty(&ref->add_list)) + list_del(&ref->add_list); + atomic_dec(&delayed_refs->num_entries); + btrfs_put_delayed_ref(ref); + btrfs_delayed_refs_rsv_release(fs_info, 1, 0); + } + if (head->must_insert_reserved) + pin_bytes = true; + btrfs_free_delayed_extent_op(head->extent_op); + btrfs_delete_ref_head(delayed_refs, head); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + mutex_unlock(&head->mutex); + + if (pin_bytes) { + struct btrfs_block_group *bg; + + bg = btrfs_lookup_block_group(fs_info, head->bytenr); + if (WARN_ON_ONCE(bg == NULL)) { + /* + * Unexpected and there's nothing we can do here + * because we are in a transaction abort path, + * so any errors can only be ignored or reported + * while attempting to cleanup all resources. 
+ */ + btrfs_err(fs_info, +"block group for delayed ref at %llu was not found while destroying ref head", + head->bytenr); + } else { + spin_lock(&bg->space_info->lock); + spin_lock(&bg->lock); + bg->pinned += head->num_bytes; + btrfs_space_info_update_bytes_pinned(fs_info, + bg->space_info, + head->num_bytes); + bg->reserved -= head->num_bytes; + bg->space_info->bytes_reserved -= head->num_bytes; + spin_unlock(&bg->lock); + spin_unlock(&bg->space_info->lock); + + btrfs_put_block_group(bg); + } + + btrfs_error_unpin_extent_range(fs_info, head->bytenr, + head->bytenr + head->num_bytes - 1); + } + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); + btrfs_put_delayed_ref_head(head); + cond_resched(); + spin_lock(&delayed_refs->lock); + } + btrfs_qgroup_destroy_extent_records(trans); + + spin_unlock(&delayed_refs->lock); +} + void __cold btrfs_delayed_ref_exit(void) { kmem_cache_destroy(btrfs_delayed_ref_head_cachep); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 352921e76c74..ccc040f94264 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -399,6 +399,8 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, u64 root, u64 parent); +void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_fs_info *fs_info); static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 47598e525ea5..f5d30c04ba66 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4529,86 +4529,6 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); } -static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info) -{ - struct rb_node *node; - struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; - struct btrfs_delayed_ref_node *ref; - - spin_lock(&delayed_refs->lock); - while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { - struct btrfs_delayed_ref_head *head; - struct rb_node *n; - bool pin_bytes = false; - - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); - if (btrfs_delayed_ref_lock(delayed_refs, head)) - continue; - - spin_lock(&head->lock); - while ((n = rb_first_cached(&head->ref_tree)) != NULL) { - ref = rb_entry(n, struct btrfs_delayed_ref_node, - ref_node); - rb_erase_cached(&ref->ref_node, &head->ref_tree); - RB_CLEAR_NODE(&ref->ref_node); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); - atomic_dec(&delayed_refs->num_entries); - btrfs_put_delayed_ref(ref); - btrfs_delayed_refs_rsv_release(fs_info, 1, 0); - } - if (head->must_insert_reserved) - pin_bytes = true; - btrfs_free_delayed_extent_op(head->extent_op); - btrfs_delete_ref_head(delayed_refs, head); - spin_unlock(&head->lock); - spin_unlock(&delayed_refs->lock); - mutex_unlock(&head->mutex); - - if (pin_bytes) { - struct btrfs_block_group *cache; - - cache = btrfs_lookup_block_group(fs_info, head->bytenr); - if (WARN_ON_ONCE(cache == NULL)) { - /* - * Unexpected and there's nothing we can do here - * because we are in a transaction abort path, - * so any errors can only be ignored or reported - * while attempting to cleanup all resources. 
- */ - btrfs_err(fs_info, -"block group for delayed ref at %llu was not found while destroying ref head", - head->bytenr); - } else { - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned += head->num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, - cache->space_info, - head->num_bytes); - cache->reserved -= head->num_bytes; - cache->space_info->bytes_reserved -= head->num_bytes; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - btrfs_put_block_group(cache); - } - - btrfs_error_unpin_extent_range(fs_info, head->bytenr, - head->bytenr + head->num_bytes - 1); - } - btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); - btrfs_put_delayed_ref_head(head); - cond_resched(); - spin_lock(&delayed_refs->lock); - } - btrfs_qgroup_destroy_extent_records(trans); - - spin_unlock(&delayed_refs->lock); -} - static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; -- 2.51.0 From 2f6e05a5ccb81007a9ca75769fa54f5c57119a9f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Oct 2024 17:03:46 +0100 Subject: [PATCH 08/16] btrfs: remove fs_info parameter from btrfs_destroy_delayed_refs() The fs_info parameter is redundant because it can be extracted from the transaction given as another parameter. So remove it and use the fs_info accessible from the transaction. Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 4 ++-- fs/btrfs/delayed-ref.h | 3 +-- fs/btrfs/disk-io.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 9e661f9a71b0..4a4278dfe511 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -1239,11 +1239,11 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, return found; } -void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info) +void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; spin_lock(&delayed_refs->lock); while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index ccc040f94264..cc78395f2fcd 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -399,8 +399,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, u64 root, u64 parent); -void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info); +void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans); static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f5d30c04ba66..2e15afa9e04c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4748,7 +4748,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, list_del_init(&dev->post_commit_list); } - btrfs_destroy_delayed_refs(cur_trans, fs_info); + btrfs_destroy_delayed_refs(cur_trans); cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&fs_info->transaction_blocked_wait); -- 2.51.0 From c3a5888e0f4749c721b969e635303f397be9b44e Mon Sep 17 00:00:00 2001 From: Filipe Manana 
Date: Mon, 21 Oct 2024 17:12:22 +0100 Subject: [PATCH 09/16] btrfs: remove fs_info parameter from btrfs_cleanup_one_transaction() The fs_info parameter is redundant because it can be extracted from the transaction given as another parameter. So remove it and use the fs_info accessible from the transaction. Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/disk-io.h | 3 +-- fs/btrfs/transaction.c | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2e15afa9e04c..814320948645 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4183,7 +4183,7 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "transaction %llu (with %llu dirty metadata bytes) is not committed", trans->transid, dirty_bytes); - btrfs_cleanup_one_transaction(trans, fs_info); + btrfs_cleanup_one_transaction(trans); if (trans == fs_info->running_transaction) fs_info->running_transaction = NULL; @@ -4734,9 +4734,9 @@ static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->fs_roots_radix_lock); } -void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, - struct btrfs_fs_info *fs_info) +void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans) { + struct btrfs_fs_info *fs_info = cur_trans->fs_info; struct btrfs_device *dev, *tmp; btrfs_cleanup_dirty_bgs(cur_trans, fs_info); @@ -4794,7 +4794,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) } else { spin_unlock(&fs_info->trans_lock); } - btrfs_cleanup_one_transaction(t, fs_info); + btrfs_cleanup_one_transaction(t); spin_lock(&fs_info->trans_lock); if (t == fs_info->running_transaction) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 127e31e08347..a7051e2570c1 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -126,8 +126,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); -void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info); +void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans); struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0fc873af891f..e580c566f033 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2052,7 +2052,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) spin_unlock(&fs_info->trans_lock); - btrfs_cleanup_one_transaction(trans->transaction, fs_info); + btrfs_cleanup_one_transaction(trans->transaction); spin_lock(&fs_info->trans_lock); if (cur_trans == fs_info->running_transaction) -- 2.51.0 From 8d07a8f4c6415c71a9e0232aef8c2aee39620fca Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Oct 2024 12:10:26 +0100 Subject: [PATCH 10/16] btrfs: remove duplicated code to drop delayed ref during transaction abort When destroying delayed refs during a transaction abort, we have open coded the removal of a delayed ref, which is also done by the static helper function drop_delayed_ref(). So remove that duplicated code and use drop_delayed_ref() instead. 
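For reference, the work that drop_delayed_ref() takes over, reconstructed from the open-coded block removed below and from the helper fragments visible elsewhere in this series (an approximation of the helper, not its verbatim body):

	/* Detach the ref node from its head and release its resources. */
	rb_erase_cached(&ref->ref_node, &head->ref_tree);
	RB_CLEAR_NODE(&ref->ref_node);
	if (!list_empty(&ref->add_list))
		list_del(&ref->add_list);
	btrfs_put_delayed_ref(ref);
	atomic_dec(&delayed_refs->num_entries);	/* this counter is removed later in the series */
	btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
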
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 4a4278dfe511..e8291316c683 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -1261,13 +1261,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) struct btrfs_delayed_ref_node *ref; ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); - rb_erase_cached(&ref->ref_node, &head->ref_tree); - RB_CLEAR_NODE(&ref->ref_node); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); - atomic_dec(&delayed_refs->num_entries); - btrfs_put_delayed_ref(ref); - btrfs_delayed_refs_rsv_release(fs_info, 1, 0); + drop_delayed_ref(fs_info, delayed_refs, head, ref); } if (head->must_insert_reserved) pin_bytes = true; -- 2.51.0 From 055903c4e7fa2945b1b4014726e7268056047242 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 18 Oct 2024 11:29:37 +0100 Subject: [PATCH 11/16] btrfs: use helper to find first ref head at btrfs_destroy_delayed_refs() Instead of open coding it, use the find_first_ref_head() helper at btrfs_destroy_delayed_refs(). This avoids duplicating the logic, specially with the upcoming changes in subsequent patches. Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e8291316c683..dc3a29f3c357 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -1241,18 +1241,19 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) { - struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; spin_lock(&delayed_refs->lock); - while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { + while (true) { struct btrfs_delayed_ref_head *head; struct rb_node *n; bool pin_bytes = false; - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); + head = find_first_ref_head(delayed_refs); + if (!head) + break; + if (btrfs_delayed_ref_lock(delayed_refs, head)) continue; -- 2.51.0 From f7d4b4924d226a9007c3a3cb14551b28a47f8767 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 23 Oct 2024 14:14:11 +0100 Subject: [PATCH 12/16] btrfs: remove num_entries atomic counter from delayed ref root The atomic counter 'num_entries' is not used anymore, we increment it and decrement it but then we don't ever read it to use for any logic. Its last use was removed with commit 61a56a992fcf ("btrfs: delayed refs pre-flushing should only run the heads we have"). So remove it. 
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 4 ---- fs/btrfs/delayed-ref.h | 5 ----- fs/btrfs/extent-tree.c | 1 - fs/btrfs/transaction.c | 1 - 4 files changed, 11 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index dc3a29f3c357..f7c7d1249f04 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -463,7 +463,6 @@ static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, if (!list_empty(&ref->add_list)) list_del(&ref->add_list); btrfs_put_delayed_ref(ref); - atomic_dec(&delayed_refs->num_entries); btrfs_delayed_refs_rsv_release(fs_info, 1, 0); } @@ -604,7 +603,6 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, rb_erase_cached(&head->href_node, &delayed_refs->href_root); RB_CLEAR_NODE(&head->href_node); - atomic_dec(&delayed_refs->num_entries); delayed_refs->num_heads--; if (!head->processing) delayed_refs->num_heads_ready--; @@ -630,7 +628,6 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans, if (!exist) { if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&ref->add_list, &href->ref_add_list); - atomic_inc(&root->num_entries); spin_unlock(&href->lock); trans->delayed_ref_updates++; return false; @@ -901,7 +898,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; - atomic_inc(&delayed_refs->num_entries); } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index cc78395f2fcd..a97c9df19ea0 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -216,11 +216,6 @@ struct btrfs_delayed_ref_root { /* this spin lock protects the rbtree and the entries inside */ spinlock_t lock; - /* how many delayed ref updates we've queued, used by the - * throttling code - */ - atomic_t num_entries; - /* total number of head nodes in tree */ unsigned long num_heads; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 594d18ed908c..adff2b6fb629 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2029,7 +2029,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, default: WARN_ON(1); } - atomic_dec(&delayed_refs->num_entries); /* * Record the must_insert_reserved flag before we drop the diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e580c566f033..9ccf68ab53f9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -351,7 +351,6 @@ loop: cur_trans->delayed_refs.href_root = RB_ROOT_CACHED; xa_init(&cur_trans->delayed_refs.dirty_extents); - atomic_set(&cur_trans->delayed_refs.num_entries, 0); /* * although the tree mod log is per file system and not per transaction, -- 2.51.0 From 7ef360488600e8b7c131306b9f5ed7e42202b487 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Oct 2024 12:32:55 +0100 Subject: [PATCH 13/16] btrfs: change return type of btrfs_delayed_ref_lock() to boolean The function only returns 0, meaning it was able to lock the delayed ref head, or -EAGAIN in case it wasn't able to lock it. So simplify this and use a boolean return type instead, returning true if it was able to lock and false otherwise. 
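The caller-side effect, condensed from the hunks below (illustrative only):

	/* Before: 0 on success, -EAGAIN when the head could not be locked. */
	if (btrfs_delayed_ref_lock(delayed_refs, head))
		continue;

	/* After: a boolean, so the condition reads as intended. */
	if (!btrfs_delayed_ref_lock(delayed_refs, head))
		continue;
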
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 12 ++++++------ fs/btrfs/delayed-ref.h | 4 ++-- fs/btrfs/extent-tree.c | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index f7c7d1249f04..8392cb366700 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -431,12 +431,12 @@ static struct btrfs_delayed_ref_head *find_ref_head( return NULL; } -int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) +bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) { lockdep_assert_held(&delayed_refs->lock); if (mutex_trylock(&head->mutex)) - return 0; + return true; refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); @@ -446,10 +446,10 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, if (RB_EMPTY_NODE(&head->href_node)) { mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); - return -EAGAIN; + return false; } btrfs_put_delayed_ref_head(head); - return 0; + return true; } static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, @@ -1250,7 +1250,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) if (!head) break; - if (btrfs_delayed_ref_lock(delayed_refs, head)) + if (!btrfs_delayed_ref_lock(delayed_refs, head)) continue; spin_lock(&head->lock); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index a97c9df19ea0..04730c650212 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -369,8 +369,8 @@ void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr); -int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head); +bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) { mutex_unlock(&head->mutex); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index adff2b6fb629..b9f455ae8788 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1959,7 +1959,7 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; struct btrfs_delayed_ref_head *head = NULL; - int ret; + bool locked; spin_lock(&delayed_refs->lock); head = btrfs_select_ref_head(delayed_refs); @@ -1972,7 +1972,7 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( * Grab the lock that says we are going to process all the refs for * this head */ - ret = btrfs_delayed_ref_lock(delayed_refs, head); + locked = btrfs_delayed_ref_lock(delayed_refs, head); spin_unlock(&delayed_refs->lock); /* @@ -1980,7 +1980,7 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( * that might have given someone else time to free the head. If that's * true, it has been removed from our list and we can move on. 
*/ - if (ret == -EAGAIN) + if (!locked) head = ERR_PTR(-EAGAIN); return head; -- 2.51.0 From a98048e10d44569b8d5ba6edcb20992cf57ed816 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Oct 2024 12:40:08 +0100 Subject: [PATCH 14/16] btrfs: simplify obtaining a delayed ref head Instead of doing it in two steps outside of delayed-ref.c, leaking low level details such as locking, move the logic entirely to delayed-ref.c under btrfs_select_ref_head(), reducing code and making things simpler for the caller. Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 27 ++++++++++++++++++++++----- fs/btrfs/delayed-ref.h | 2 -- fs/btrfs/extent-tree.c | 35 +---------------------------------- 3 files changed, 23 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 8392cb366700..fdecdd02db94 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -431,8 +431,8 @@ static struct btrfs_delayed_ref_head *find_ref_head( return NULL; } -bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) +static bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) { lockdep_assert_held(&delayed_refs->lock); if (mutex_trylock(&head->mutex)) @@ -561,8 +561,9 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( struct btrfs_delayed_ref_root *delayed_refs) { struct btrfs_delayed_ref_head *head; + bool locked; - lockdep_assert_held(&delayed_refs->lock); + spin_lock(&delayed_refs->lock); again: head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, true); @@ -570,16 +571,20 @@ again: delayed_refs->run_delayed_start = 0; head = find_first_ref_head(delayed_refs); } - if (!head) + if (!head) { + spin_unlock(&delayed_refs->lock); return NULL; + } while (head->processing) { struct rb_node *node; node = rb_next(&head->href_node); if (!node) { - if (delayed_refs->run_delayed_start == 0) + if (delayed_refs->run_delayed_start == 0) { + spin_unlock(&delayed_refs->lock); return NULL; + } delayed_refs->run_delayed_start = 0; goto again; } @@ -592,6 +597,18 @@ again: delayed_refs->num_heads_ready--; delayed_refs->run_delayed_start = head->bytenr + head->num_bytes; + + locked = btrfs_delayed_ref_lock(delayed_refs, head); + spin_unlock(&delayed_refs->lock); + + /* + * We may have dropped the spin lock to get the head mutex lock, and + * that might have given someone else time to free the head. If that's + * true, it has been removed from our list and we can move on. 
+ */ + if (!locked) + return ERR_PTR(-EAGAIN); + return head; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 04730c650212..956fbe5d6984 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -369,8 +369,6 @@ void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr); -bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head); static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) { mutex_unlock(&head->mutex); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b9f455ae8788..887706e87524 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1953,39 +1953,6 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, return ret; } -static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( - struct btrfs_trans_handle *trans) -{ - struct btrfs_delayed_ref_root *delayed_refs = - &trans->transaction->delayed_refs; - struct btrfs_delayed_ref_head *head = NULL; - bool locked; - - spin_lock(&delayed_refs->lock); - head = btrfs_select_ref_head(delayed_refs); - if (!head) { - spin_unlock(&delayed_refs->lock); - return head; - } - - /* - * Grab the lock that says we are going to process all the refs for - * this head - */ - locked = btrfs_delayed_ref_lock(delayed_refs, head); - spin_unlock(&delayed_refs->lock); - - /* - * We may have dropped the spin lock to get the head mutex lock, and - * that might have given someone else time to free the head. If that's - * true, it has been removed from our list and we can move on. - */ - if (!locked) - head = ERR_PTR(-EAGAIN); - - return head; -} - static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *locked_ref, u64 *bytes_released) @@ -2092,7 +2059,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, do { if (!locked_ref) { - locked_ref = btrfs_obtain_ref_head(trans); + locked_ref = btrfs_select_ref_head(delayed_refs); if (IS_ERR_OR_NULL(locked_ref)) { if (PTR_ERR(locked_ref) == -EAGAIN) { continue; -- 2.51.0 From 58a4391810d4bc717d87fd446a0c3ad2cb7ea3cd Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Oct 2024 12:52:55 +0100 Subject: [PATCH 15/16] btrfs: move delayed ref head unselection to delayed-ref.c The unselect_delayed_ref_head() at extent-tree.c doesn't really belong in that file as it's a delayed refs specific detail and therefore should be at delayed-ref.c. Further its inverse, btrfs_select_ref_head(), is at delayed-ref.c, so it only makes sense to have it there too. So move unselect_delayed_ref_head() into delayed-ref.c and rename it to btrfs_unselect_ref_head() so that its name closely matches its inverse (btrfs_select_ref_head()). 
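For illustration, the pairing as seen from a caller then looks roughly like
the sketch below. This is a simplified sketch only: run_one_head() and
run_refs_for_head() are made-up placeholder names standing in for the work
done in extent-tree.c, not real functions.

    static void run_one_head(struct btrfs_trans_handle *trans,
                             struct btrfs_delayed_ref_root *delayed_refs)
    {
            struct btrfs_delayed_ref_head *head;

            head = btrfs_select_ref_head(delayed_refs);
            if (IS_ERR_OR_NULL(head))
                    return;  /* Nothing ready, or we lost a race (-EAGAIN). */

            if (run_refs_for_head(trans, head) != 0) {
                    /* On failure, give the head back so it can be retried. */
                    btrfs_unselect_ref_head(delayed_refs, head);
            }
    }
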
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 10 ++++++++++ fs/btrfs/delayed-ref.h | 2 ++ fs/btrfs/extent-tree.c | 16 +++------------- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index fdecdd02db94..9131dc53b0a1 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -612,6 +612,16 @@ again: return head; } +void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + spin_lock(&delayed_refs->lock); + head->processing = false; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(head); +} + void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 956fbe5d6984..d70de7ee63e5 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -378,6 +378,8 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *btrfs_select_ref_head( struct btrfs_delayed_ref_root *delayed_refs); +void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 887706e87524..7a9edbd2e83d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1827,16 +1827,6 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) return ref; } -static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) -{ - spin_lock(&delayed_refs->lock); - head->processing = false; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_delayed_ref_unlock(head); -} - static struct btrfs_delayed_extent_op *cleanup_extent_op( struct btrfs_delayed_ref_head *head) { @@ -1911,7 +1901,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, ret = run_and_cleanup_extent_op(trans, head); if (ret < 0) { - unselect_delayed_ref_head(delayed_refs, head); + btrfs_unselect_ref_head(delayed_refs, head); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); return ret; } else if (ret) { @@ -1973,7 +1963,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq)) { spin_unlock(&locked_ref->lock); - unselect_delayed_ref_head(delayed_refs, locked_ref); + btrfs_unselect_ref_head(delayed_refs, locked_ref); return -EAGAIN; } @@ -2021,7 +2011,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, btrfs_free_delayed_extent_op(extent_op); if (ret) { - unselect_delayed_ref_head(delayed_refs, locked_ref); + btrfs_unselect_ref_head(delayed_refs, locked_ref); btrfs_put_delayed_ref(ref); return ret; } -- 2.51.0 From 765f82890299e67efae4c22a378869c1d3d4e4be Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Oct 2024 16:08:18 +0100 Subject: [PATCH 16/16] btrfs: pass fs_info to functions that search for delayed ref heads One of the following patches in the series will need to access fs_info in the function find_ref_head(), so pass a fs_info argument to it as well as to the functions btrfs_select_ref_head() and btrfs_find_delayed_ref_head() which call find_ref_head(). 
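For reference, the resulting prototypes, as updated by the hunks below, are:

    struct btrfs_delayed_ref_head *
    btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info,
                                struct btrfs_delayed_ref_root *delayed_refs,
                                u64 bytenr);

    struct btrfs_delayed_ref_head *btrfs_select_ref_head(
                                const struct btrfs_fs_info *fs_info,
                                struct btrfs_delayed_ref_root *delayed_refs);

The fs_info argument is not used by find_ref_head() yet; it is only plumbed
through now so that the later patch which needs it stays small.
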
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 3 ++- fs/btrfs/delayed-ref.c | 12 ++++++++---- fs/btrfs/delayed-ref.h | 4 +++- fs/btrfs/extent-tree.c | 10 +++++----- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f8e1d5b2c512..04f53ca548e1 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1442,7 +1442,8 @@ again: */ delayed_refs = &ctx->trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr); + head = btrfs_find_delayed_ref_head(ctx->fs_info, delayed_refs, + ctx->bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 9131dc53b0a1..60a9e538d919 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -399,6 +399,7 @@ static struct btrfs_delayed_ref_head *find_first_ref_head( * is given, the next bigger entry is returned if no exact match is found. */ static struct btrfs_delayed_ref_head *find_ref_head( + const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *dr, u64 bytenr, bool return_bigger) { @@ -558,6 +559,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) } struct btrfs_delayed_ref_head *btrfs_select_ref_head( + const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs) { struct btrfs_delayed_ref_head *head; @@ -565,8 +567,8 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( spin_lock(&delayed_refs->lock); again: - head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, - true); + head = find_ref_head(fs_info, delayed_refs, + delayed_refs->run_delayed_start, true); if (!head && delayed_refs->run_delayed_start != 0) { delayed_refs->run_delayed_start = 0; head = find_first_ref_head(delayed_refs); @@ -1188,11 +1190,13 @@ void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) * head node if found, or NULL if not. 
*/ struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) +btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + u64 bytenr) { lockdep_assert_held(&delayed_refs->lock); - return find_ref_head(delayed_refs, bytenr, false); + return find_ref_head(fs_info, delayed_refs, bytenr, false); } static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent) diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d70de7ee63e5..b5894aa01d5e 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -367,7 +367,8 @@ void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, +btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr); static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) { @@ -377,6 +378,7 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head *btrfs_select_ref_head( + const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs); void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7a9edbd2e83d..50dc0f1dde24 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -182,7 +182,7 @@ search_again: delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); @@ -2049,7 +2049,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, do { if (!locked_ref) { - locked_ref = btrfs_select_ref_head(delayed_refs); + locked_ref = btrfs_select_ref_head(fs_info, delayed_refs); if (IS_ERR_OR_NULL(locked_ref)) { if (PTR_ERR(locked_ref) == -EAGAIN) { continue; @@ -2251,7 +2251,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, delayed_refs = &cur_trans->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); if (!head) { spin_unlock(&delayed_refs->lock); btrfs_put_transaction(cur_trans); @@ -3359,7 +3359,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(trans->fs_info, delayed_refs, bytenr); if (!head) goto out_delayed_unlock; @@ -5494,7 +5494,7 @@ again: */ delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); if (!head) goto out; if (!mutex_trylock(&head->mutex)) { -- 2.51.0
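Taken together, after this series the head selection in
__btrfs_run_delayed_refs() reduces to roughly the following sketch
(simplified: more_work is a placeholder for the real loop condition, and the
code that runs the refs attached to the locked head is elided):

    do {
            if (!locked_ref) {
                    locked_ref = btrfs_select_ref_head(fs_info, delayed_refs);
                    if (IS_ERR_OR_NULL(locked_ref)) {
                            if (PTR_ERR(locked_ref) == -EAGAIN)
                                    continue;  /* Lost a race, pick another head. */
                            break;             /* NULL: nothing left to run. */
                    }
            }
            /* ... run and clean up the delayed refs attached to locked_ref ... */
    } while (more_work);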