From 69673992b1aea5540199d9b8b658ede72f55a6cf Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Fri, 30 Aug 2024 13:24:54 -0700 Subject: [PATCH 01/16] btrfs: push cleanup into btrfs_read_locked_inode() Move btrfs_add_inode_to_root() so it can be called from btrfs_read_locked_inode(), no changes were made to the function. Move cleanup code from btrfs_iget_path() to btrfs_read_locked_inode. This improves readability and improves a leaky abstraction. Previously btrfs_iget_path() had to handle a positive error case as a result of a call to btrfs_search_slot(), but it makes more sense to handle this closer to the source of the call. Signed-off-by: Leo Martins Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 101 +++++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 48 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4bb393cac170..09ff00aaa430 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3759,8 +3759,41 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) return 0; } +static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) +{ + struct btrfs_root *root = inode->root; + struct btrfs_inode *existing; + const u64 ino = btrfs_ino(inode); + int ret; + + if (inode_unhashed(&inode->vfs_inode)) + return 0; + + if (prealloc) { + ret = xa_reserve(&root->inodes, ino, GFP_NOFS); + if (ret) + return ret; + } + + existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); + + if (xa_is_err(existing)) { + ret = xa_err(existing); + ASSERT(ret != -EINVAL); + ASSERT(ret != -ENOMEM); + return ret; + } else if (existing) { + WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); + } + + return 0; +} + /* - * read an inode from the btree into the in-memory inode + * Read a locked inode from the btree into the in-memory inode and add it to + * its root list/tree. + * + * On failure clean up the inode. */ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *in_path) @@ -3780,7 +3813,7 @@ static int btrfs_read_locked_inode(struct inode *inode, ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); if (ret) - return ret; + goto out; ret = btrfs_fill_inode(inode, &rdev); if (!ret) @@ -3789,7 +3822,7 @@ static int btrfs_read_locked_inode(struct inode *inode, if (!path) { path = btrfs_alloc_path(); if (!path) - return -ENOMEM; + goto out; } btrfs_get_inode_key(BTRFS_I(inode), &location); @@ -3798,7 +3831,13 @@ static int btrfs_read_locked_inode(struct inode *inode, if (ret) { if (path != in_path) btrfs_free_path(path); - return ret; + /* + * ret > 0 can come from btrfs_search_slot called by + * btrfs_lookup_inode(), this means the inode was not found. 
+ */ + if (ret > 0) + ret = -ENOENT; + goto out; } leaf = path->nodes[0]; @@ -3961,7 +4000,15 @@ cache_acl: } btrfs_sync_inode_flags_to_i_flags(inode); + + ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + if (ret) + goto out; + return 0; +out: + iget_failed(inode); + return ret; } /* @@ -5470,35 +5517,7 @@ out: return err; } -static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) -{ - struct btrfs_root *root = inode->root; - struct btrfs_inode *existing; - const u64 ino = btrfs_ino(inode); - int ret; - if (inode_unhashed(&inode->vfs_inode)) - return 0; - - if (prealloc) { - ret = xa_reserve(&root->inodes, ino, GFP_NOFS); - if (ret) - return ret; - } - - existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); - - if (xa_is_err(existing)) { - ret = xa_err(existing); - ASSERT(ret != -EINVAL); - ASSERT(ret != -ENOMEM); - return ret; - } else if (existing) { - WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); - } - - return 0; -} static void btrfs_del_inode_from_root(struct btrfs_inode *inode) { @@ -5579,25 +5598,11 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, return inode; ret = btrfs_read_locked_inode(inode, path); - /* - * ret > 0 can come from btrfs_search_slot called by - * btrfs_read_locked_inode(), this means the inode item was not found. - */ - if (ret > 0) - ret = -ENOENT; - if (ret < 0) - goto error; - - ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); - if (ret < 0) - goto error; + if (ret) + return ERR_PTR(ret); unlock_new_inode(inode); - return inode; -error: - iget_failed(inode); - return ERR_PTR(ret); } struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) -- 2.51.0 From 7c855e16ab72596d771355050ffe026e6b99f91c Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Fri, 30 Aug 2024 13:24:55 -0700 Subject: [PATCH 02/16] btrfs: remove conditional path allocation in btrfs_read_locked_inode() Remove conditional path allocation from btrfs_read_locked_inode(). Add an ASSERT(path) to indicate it should never be called with a NULL path. Call btrfs_read_locked_inode() directly from btrfs_iget(). This causes code duplication between btrfs_iget() and btrfs_iget_path(), but I think this is justifiable as it removes the need for conditionally allocating the path inside of btrfs_read_locked_inode(). This makes the code easier to reason about and makes it clear who has the responsibility of allocating and freeing the path. Signed-off-by: Leo Martins Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 09ff00aaa430..fa366360007b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3795,11 +3795,9 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) * * On failure clean up the inode. 
*/ -static int btrfs_read_locked_inode(struct inode *inode, - struct btrfs_path *in_path) +static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path = in_path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3819,18 +3817,12 @@ static int btrfs_read_locked_inode(struct inode *inode, if (!ret) filled = true; - if (!path) { - path = btrfs_alloc_path(); - if (!path) - goto out; - } + ASSERT(path); btrfs_get_inode_key(BTRFS_I(inode), &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { - if (path != in_path) - btrfs_free_path(path); /* * ret > 0 can come from btrfs_search_slot called by * btrfs_lookup_inode(), this means the inode was not found. @@ -3972,8 +3964,6 @@ cache_acl: btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret); } - if (path != in_path) - btrfs_free_path(path); if (!maybe_acls) cache_no_acl(inode); @@ -5579,10 +5569,8 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) } /* - * Get an inode object given its inode number and corresponding root. - * Path can be preallocated to prevent recursing back to iget through - * allocator. NULL is also valid but may require an additional allocation - * later. + * Get an inode object given its inode number and corresponding root. Path is + * preallocated to prevent recursing back to iget through allocator. */ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, struct btrfs_path *path) @@ -5605,9 +5593,33 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, return inode; } +/* + * Get an inode object given its inode number and corresponding root. + */ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - return btrfs_iget_path(ino, root, NULL); + struct inode *inode; + struct btrfs_path *path; + int ret; + + inode = btrfs_iget_locked(ino, root); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!(inode->i_state & I_NEW)) + return inode; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = btrfs_read_locked_inode(inode, path); + btrfs_free_path(path); + if (ret) + return ERR_PTR(ret); + + unlock_new_inode(inode); + return inode; } static struct inode *new_simple_dir(struct inode *dir, -- 2.51.0 From 5599f39356c6ec73d2015ae88286382d1034ef0d Mon Sep 17 00:00:00 2001 From: Haisu Wang Date: Fri, 25 Oct 2024 14:54:41 +0800 Subject: [PATCH 03/16] btrfs: simplify range tracking in cow_file_range() Simplify tracking of the range processed by using cur_alloc_size only to store the reserved part that may fail to the allocated extent. Remove the ram_size as well since it is always equal to cur_alloc_size in the context. Advance the start in normal path until extent allocation succeeds and keep the start unchanged in the error handling path. Passed the fstest generic/475 test for a hundred times with quota enabled. And a modified generic/475 test by removing the sleep time for a hundred times. About one tenth of the tests do enter the error handling path due to fail to reserve extent. 
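As an illustration of the bookkeeping after this change, here is a rough
sketch (not taken from the patch; cleanup() stands in for the
extent_clear_unlock_delalloc() plus btrfs_qgroup_free_data() calls in the
error path):

	/* Failure path: start was never advanced past the failed extent. */
	if (cur_alloc_size) {
		/*
		 * Range backed by the reserved-but-failed extent: clear the
		 * delalloc state and free the qgroup data, but do not pass
		 * EXTENT_CLEAR_DATA_RESV since bytes_may_use was already
		 * decremented when the extent was reserved.
		 */
		cleanup(start, start + cur_alloc_size - 1, clear_bits);
	}
	if (start + cur_alloc_size < end) {
		/* Remainder that never got an extent: also release the data
		 * space reservation made in btrfs_check_data_free_space(). */
		cleanup(start + cur_alloc_size, end,
			clear_bits | EXTENT_CLEAR_DATA_RESV);
	}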
Suggested-by: Qu Wenruo Signed-off-by: Haisu Wang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fa366360007b..a2128eccafb3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1332,7 +1332,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, u64 alloc_hint = 0; u64 orig_start = start; u64 num_bytes; - unsigned long ram_size; u64 cur_alloc_size = 0; u64 min_alloc_size; u64 blocksize = fs_info->sectorsize; @@ -1340,7 +1339,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, struct extent_map *em; unsigned clear_bits; unsigned long page_ops; - bool extent_reserved = false; int ret = 0; if (btrfs_is_free_space_inode(inode)) { @@ -1394,8 +1392,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, struct btrfs_ordered_extent *ordered; struct btrfs_file_extent file_extent; - cur_alloc_size = num_bytes; - ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, + ret = btrfs_reserve_extent(root, num_bytes, num_bytes, min_alloc_size, 0, alloc_hint, &ins, 1, 1); if (ret == -EAGAIN) { @@ -1426,9 +1423,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret < 0) goto out_unlock; cur_alloc_size = ins.offset; - extent_reserved = true; - ram_size = ins.offset; file_extent.disk_bytenr = ins.objectid; file_extent.disk_num_bytes = ins.offset; file_extent.num_bytes = ins.offset; @@ -1436,14 +1431,14 @@ static noinline int cow_file_range(struct btrfs_inode *inode, file_extent.offset = 0; file_extent.compression = BTRFS_COMPRESS_NONE; - lock_extent(&inode->io_tree, start, start + ram_size - 1, + lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, &cached); em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) { unlock_extent(&inode->io_tree, start, - start + ram_size - 1, &cached); + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(em); goto out_reserve; } @@ -1453,7 +1448,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, 1 << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { unlock_extent(&inode->io_tree, start, - start + ram_size - 1, &cached); + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(ordered); goto out_drop_extent_cache; } @@ -1474,7 +1469,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, */ if (ret) btrfs_drop_extent_map_range(inode, start, - start + ram_size - 1, + start + cur_alloc_size - 1, false); } btrfs_put_ordered_extent(ordered); @@ -1492,7 +1487,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, page_ops = (keep_locked ? 
0 : PAGE_UNLOCK); page_ops |= PAGE_SET_ORDERED; - extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, + extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); @@ -1502,7 +1497,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; - extent_reserved = false; + cur_alloc_size = 0; /* * btrfs_reloc_clone_csums() error, since start is increased @@ -1518,7 +1513,7 @@ done: return ret; out_drop_extent_cache: - btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); + btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); @@ -1572,13 +1567,12 @@ out_unlock: * to decrement again the data space_info's bytes_may_use counter, * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. */ - if (extent_reserved) { + if (cur_alloc_size) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, locked_folio, &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); - start += cur_alloc_size; } /* @@ -1587,11 +1581,13 @@ out_unlock: * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space(). */ - if (start < end) { + if (start + cur_alloc_size < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; - extent_clear_unlock_delalloc(inode, start, end, locked_folio, + extent_clear_unlock_delalloc(inode, start + cur_alloc_size, + end, locked_folio, &cached, clear_bits, page_ops); - btrfs_qgroup_free_data(inode, NULL, start, end - start + 1, NULL); + btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, + end - start - cur_alloc_size + 1, NULL); } return ret; } -- 2.51.0 From 6c83d153ed86eb17c46eafe4e78af4ce2071a052 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 21 Oct 2024 21:28:26 +0200 Subject: [PATCH 04/16] btrfs: add new ioctl to wait for cleaned subvolumes Add a new unprivileged ioctl that will let the command 'btrfs subvolume sync' work without the (privileged) SEARCH_TREE ioctl. There are several modes of operation, where the most common ones are to wait on a specific subvolume or all currently queued for cleaning. This is utilized e.g. in backup applications that delete subvolumes and wait until they're cleaned to check for remaining space. The other modes are for flexibility, e.g. for monitoring or checkpoints in the queue of deleted subvolumes, again without the need to use SEARCH_TREE. Notes: - waiting is interruptible, the timeout is set to 1 second and is not configurable - repeated calls to the ioctl see a different state, so this is inherently racy when using e.g. 
the count or peek next/last Use cases: - a subvolume A was deleted, wait for cleaning (WAIT_FOR_ONE) - a bunch of subvolumes were deleted, wait for all (WAIT_FOR_QUEUED or PEEK_LAST + WAIT_FOR_ONE) - count how many are queued (not blocking), for monitoring purposes - report progress (PEEK_NEXT), may miss some if cleaning is quick - own waiting in user space (PEEK_LAST until it's 0) Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 128 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/btrfs.h | 25 ++++++++ 2 files changed, 153 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 27a9342cd91c..2118c22625ca 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5027,6 +5027,132 @@ int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -EINVAL; } +static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *argp) +{ + struct btrfs_root *root; + struct btrfs_ioctl_subvol_wait args = { 0 }; + signed long sched_ret; + int refs; + u64 root_flags; + bool wait_for_deletion = false; + bool found = false; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + switch (args.mode) { + case BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED: + /* + * Wait for the first one deleted that waits until all previous + * are cleaned. + */ + spin_lock(&fs_info->trans_lock); + if (!list_empty(&fs_info->dead_roots)) { + root = list_last_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + found = true; + } + spin_unlock(&fs_info->trans_lock); + if (!found) + return -ENOENT; + + fallthrough; + case BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE: + if ((0 < args.subvolid && args.subvolid < BTRFS_FIRST_FREE_OBJECTID) || + BTRFS_LAST_FREE_OBJECTID < args.subvolid) + return -EINVAL; + break; + case BTRFS_SUBVOL_SYNC_COUNT: + spin_lock(&fs_info->trans_lock); + args.count = list_count_nodes(&fs_info->dead_roots); + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + case BTRFS_SUBVOL_SYNC_PEEK_FIRST: + spin_lock(&fs_info->trans_lock); + /* Last in the list was deleted first. */ + if (!list_empty(&fs_info->dead_roots)) { + root = list_last_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + } else { + args.subvolid = 0; + } + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + case BTRFS_SUBVOL_SYNC_PEEK_LAST: + spin_lock(&fs_info->trans_lock); + /* First in the list was deleted last. */ + if (!list_empty(&fs_info->dead_roots)) { + root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + } else { + args.subvolid = 0; + } + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + default: + return -EINVAL; + } + + /* 32bit limitation: fs_roots_radix key is not wide enough. */ + if (sizeof(unsigned long) != sizeof(u64) && args.subvolid > U32_MAX) + return -EOVERFLOW; + + while (1) { + /* Wait for the specific one. 
*/ + if (down_read_interruptible(&fs_info->subvol_sem) == -EINTR) + return -EINTR; + refs = -1; + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)args.subvolid); + if (root) { + spin_lock(&root->root_item_lock); + refs = btrfs_root_refs(&root->root_item); + root_flags = btrfs_root_flags(&root->root_item); + spin_unlock(&root->root_item_lock); + } + spin_unlock(&fs_info->fs_roots_radix_lock); + up_read(&fs_info->subvol_sem); + + /* Subvolume does not exist. */ + if (!root) + return -ENOENT; + + /* Subvolume not deleted at all. */ + if (refs > 0) + return -EEXIST; + /* We've waited and now the subvolume is gone. */ + if (wait_for_deletion && refs == -1) { + /* Return the one we waited for as the last one. */ + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + } + + /* Subvolume not found on the first try (deleted or never existed). */ + if (refs == -1) + return -ENOENT; + + wait_for_deletion = true; + ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD); + sched_ret = schedule_timeout_interruptible(HZ); + /* Early wake up or error. */ + if (sched_ret != 0) + return -EINTR; + } + + return 0; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -5178,6 +5304,8 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_ENCODED_WRITE_32: return btrfs_ioctl_encoded_write(file, argp, true); #endif + case BTRFS_IOC_SUBVOL_SYNC_WAIT: + return btrfs_ioctl_subvol_sync(fs_info, argp); } return -ENOTTY; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index cdf6ad872149..d3b222d7af24 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -1049,6 +1049,29 @@ struct btrfs_ioctl_encoded_io_args { #define BTRFS_ENCODED_IO_ENCRYPTION_NONE 0 #define BTRFS_ENCODED_IO_ENCRYPTION_TYPES 1 +/* + * Wait for subvolume cleaning process. This queries the kernel queue and it + * can change between the calls. + * + * - FOR_ONE - specify the subvolid + * - FOR_QUEUED - wait for all currently queued + * - COUNT - count number of queued + * - PEEK_FIRST - read which is the first in the queue (to be cleaned or being + * cleaned already), or 0 if the queue is empty + * - PEEK_LAST - read the last subvolid in the queue, or 0 if the queue is empty + */ +struct btrfs_ioctl_subvol_wait { + __u64 subvolid; + __u32 mode; + __u32 count; +}; + +#define BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE (0) +#define BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED (1) +#define BTRFS_SUBVOL_SYNC_COUNT (2) +#define BTRFS_SUBVOL_SYNC_PEEK_FIRST (3) +#define BTRFS_SUBVOL_SYNC_PEEK_LAST (4) + /* Error codes as returned by the kernel */ enum btrfs_err_code { BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1, @@ -1181,6 +1204,8 @@ enum btrfs_err_code { struct btrfs_ioctl_encoded_io_args) #define BTRFS_IOC_ENCODED_WRITE _IOW(BTRFS_IOCTL_MAGIC, 64, \ struct btrfs_ioctl_encoded_io_args) +#define BTRFS_IOC_SUBVOL_SYNC_WAIT _IOW(BTRFS_IOCTL_MAGIC, 65, \ + struct btrfs_ioctl_subvol_wait) #ifdef __cplusplus } -- 2.51.0 From dd0896e77d89686c0736485c5ed4d115e99eaa0c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Nov 2024 11:50:14 +0000 Subject: [PATCH 05/16] btrfs: update stale comment for struct btrfs_delayed_ref_node::add_list The comment refers to a list in the respective delayed ref head that no longer exists (ref_list), it was replaced with a rbtree (ref_tree) in commit 0e0adbcfdc90 ("btrfs: track refs in a rb_tree instead of a list"). So update the stale comment to refer to the rbtree instead of the old list. 
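For context, a minimal sketch of the relationship the updated comment
describes (field layout abbreviated here, see delayed-ref.h for the full
definitions):

	struct btrfs_delayed_ref_head {
		/* ... */
		struct rb_root_cached ref_tree;		/* all queued ref nodes */
		struct list_head ref_add_list;		/* only BTRFS_ADD_DELAYED_REF nodes */
	};

	struct btrfs_delayed_ref_node {
		struct rb_node ref_node;		/* linked into head->ref_tree */
		struct list_head add_list;		/* linked into head->ref_add_list */
		/* ... */
	};

This way an ADD ref can be found through ref_add_list without iterating the
whole ref_tree rbtree.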
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 4cb7d4475808..611fb3388f82 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -61,7 +61,8 @@ struct btrfs_delayed_ref_node { /* * If action is BTRFS_ADD_DELAYED_REF, also link this node to * ref_head->ref_add_list, then we do not need to iterate the - * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes. + * refs rbtree in the corresponding delayed ref head + * (struct btrfs_delayed_ref_head::ref_tree). */ struct list_head add_list; -- 2.51.0 From a20725e1e7014642fc8ba4c7dd2c4ef6d4ec56a9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 5 Nov 2024 11:56:45 +0000 Subject: [PATCH 06/16] btrfs: remove hole from struct btrfs_delayed_node On x86_64 and a release kernel, there's a 4 bytes hole in the structure after the ref count field: struct btrfs_delayed_node { u64 inode_id; /* 0 8 */ u64 bytes_reserved; /* 8 8 */ struct btrfs_root * root; /* 16 8 */ struct list_head n_list; /* 24 16 */ struct list_head p_list; /* 40 16 */ struct rb_root_cached ins_root; /* 56 16 */ /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */ struct rb_root_cached del_root; /* 72 16 */ struct mutex mutex; /* 88 32 */ struct btrfs_inode_item inode_item; /* 120 160 */ /* --- cacheline 4 boundary (256 bytes) was 24 bytes ago --- */ refcount_t refs; /* 280 4 */ /* XXX 4 bytes hole, try to pack */ u64 index_cnt; /* 288 8 */ long unsigned int flags; /* 296 8 */ int count; /* 304 4 */ u32 curr_index_batch_size; /* 308 4 */ u32 index_item_leaves; /* 312 4 */ /* size: 320, cachelines: 5, members: 15 */ /* sum members: 312, holes: 1, sum holes: 4 */ /* padding: 4 */ }; Move the 'count' field, which is 4 bytes long, to just below the ref count field, so we eliminate the hole and reduce the structure size from 320 bytes down to 312 bytes: struct btrfs_delayed_node { u64 inode_id; /* 0 8 */ u64 bytes_reserved; /* 8 8 */ struct btrfs_root * root; /* 16 8 */ struct list_head n_list; /* 24 16 */ struct list_head p_list; /* 40 16 */ struct rb_root_cached ins_root; /* 56 16 */ /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */ struct rb_root_cached del_root; /* 72 16 */ struct mutex mutex; /* 88 32 */ struct btrfs_inode_item inode_item; /* 120 160 */ /* --- cacheline 4 boundary (256 bytes) was 24 bytes ago --- */ refcount_t refs; /* 280 4 */ int count; /* 284 4 */ u64 index_cnt; /* 288 8 */ long unsigned int flags; /* 296 8 */ u32 curr_index_batch_size; /* 304 4 */ u32 index_item_leaves; /* 308 4 */ /* size: 312, cachelines: 5, members: 15 */ /* last cacheline: 56 bytes */ }; This now allows to have 13 delayed nodes per 4K page instead of 12. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 7cfefdfe54ea..f4d9feac0d0e 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -64,9 +64,9 @@ struct btrfs_delayed_node { struct mutex mutex; struct btrfs_inode_item inode_item; refcount_t refs; + int count; u64 index_cnt; unsigned long flags; - int count; /* * The size of the next batch of dir index items to insert (if this * node is from a directory inode). Protected by @mutex. 
-- 2.51.0 From e36d114990d2acf0a0fca135d50ac21a832daf11 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 5 Nov 2024 14:38:38 +0000 Subject: [PATCH 07/16] btrfs: simplify logic to decrement snapshot counter at btrfs_mksnapshot() There's no point in having a 'snapshot_force_cow' variable to track if we need to decrement the root->snapshot_force_cow counter, as we never jump to the 'out' label after incrementing the counter. Simplify this by removing the variable and always decrementing the counter before the 'out' label, right after the call to btrfs_mksubvol(). Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2118c22625ca..1fdeb216bf6c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1049,7 +1049,6 @@ static noinline int btrfs_mksnapshot(const struct path *parent, struct btrfs_qgroup_inherit *inherit) { int ret; - bool snapshot_force_cow = false; /* * Force new buffered writes to reserve space even when NOCOW is @@ -1068,15 +1067,13 @@ static noinline int btrfs_mksnapshot(const struct path *parent, * creation. */ atomic_inc(&root->snapshot_force_cow); - snapshot_force_cow = true; btrfs_wait_ordered_extents(root, U64_MAX, NULL); ret = btrfs_mksubvol(parent, idmap, name, namelen, root, readonly, inherit); + atomic_dec(&root->snapshot_force_cow); out: - if (snapshot_force_cow) - atomic_dec(&root->snapshot_force_cow); btrfs_drew_read_unlock(&root->snapshot_lock); return ret; } -- 2.51.0 From 08fdca9eee098563b55e7a717d2a4256a7375dc8 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Thu, 31 Oct 2024 11:52:05 +0000 Subject: [PATCH 08/16] btrfs: avoid superfluous calls to free_extent_map() in btrfs_encoded_read() Change the control flow of btrfs_encoded_read() so that it doesn't call free_extent_map() when we know that this has already been done. Reviewed-by: Anand Jain Signed-off-by: Mark Harmstone Suggested-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a2128eccafb3..22b8e2764619 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9324,7 +9324,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, cached_state, extent_start, count, encoded, &unlocked); - goto out_em; + goto out_unlock_extent; } /* @@ -9384,7 +9384,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, ret = -EFAULT; } else { ret = -EIOCBQUEUED; - goto out_em; + goto out_unlock_extent; } out_em: -- 2.51.0 From 80b3695538275e1b2c9a572615175643454b2cb3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 31 Oct 2024 15:03:17 +0100 Subject: [PATCH 09/16] btrfs: fix a typo in btrfs_use_zone_append REQ_OP_ZONE_APPNED -> REQ_OP_ZONE_APPEND. 
Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index dbcbf754d284..cb32966380f5 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1739,7 +1739,7 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio) return false; /* - * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the + * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the * extent layout the relocation code has. * Furthermore we have set aside own block-group from which only the * relocation "process" can allocate and make sure only one process at a -- 2.51.0 From 2342d6595b608eec94187a17dc112dd4c2a812fa Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 6 Nov 2024 12:14:09 +0000 Subject: [PATCH 10/16] btrfs: fix warning on PTR_ERR() against NULL device at btrfs_control_ioctl() Smatch complains about calling PTR_ERR() against a NULL pointer: fs/btrfs/super.c:2272 btrfs_control_ioctl() warn: passing zero to 'PTR_ERR' Fix this by calling PTR_ERR() against the device pointer only if it contains an error. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7cd62307cd41..3381389ab93a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2254,7 +2254,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); - ret = PTR_ERR(device); + if (IS_ERR(device)) + ret = PTR_ERR(device); + else + ret = 0; break; } ret = !(device->fs_devices->num_devices == -- 2.51.0 From 722d343f12a626c9aee8844f5bf19a107d9a1067 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 6 Nov 2024 12:21:13 +0000 Subject: [PATCH 11/16] btrfs: remove check for NULL fs_info at btrfs_folio_end_lock_bitmap() Smatch complains about possibly dereferencing a NULL fs_info at btrfs_folio_end_lock_bitmap(): fs/btrfs/subpage.c:332 btrfs_folio_end_lock_bitmap() warn: variable dereferenced before check 'fs_info' (see line 326) because we access fs_info to set the 'start_bit' variable before doing the check for a NULL fs_info. However fs_info is never NULL, since in the only caller of btrfs_folio_end_lock_bitmap() is extent_writepage(), where we have an inode which always as a non-NULL fs_info. So remove the check for a NULL fs_info at btrfs_folio_end_lock_bitmap(). 
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index d4cab3c55742..8c68059ac1b0 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -329,7 +329,7 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, int cleared = 0; int bit; - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + if (!btrfs_is_subpage(fs_info, folio->mapping)) { folio_unlock(folio); return; } -- 2.51.0 From dc058f5fda091abcdccc2487b48dbbc1cdde98d0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 6 Nov 2024 10:32:09 +0000 Subject: [PATCH 12/16] btrfs: send: check for dead send root under critical section We're checking if the send root is dead without the protection of the root's root_item_lock spinlock, which is what protects the root's flags. The inverse, setting the dead flag on a root, is done under the protection of that lock, at btrfs_delete_subvolume(). Also checking and updating the root's send_in_progress counter is supposed to be done in the same critical section as checking for or setting the root dead flag, so that these operations are done atomically as a single step (which is correctly done by btrfs_delete_subvolume()). So fix this by checking if the send root is dead in the same critical section that updates the send_in_progress counter, which is protected by the root's root_item_lock spinlock. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index cadb945bb345..3fcc8113641d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -8125,6 +8125,14 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a * making it RW. This also protects against deletion. */ spin_lock(&send_root->root_item_lock); + /* + * Unlikely but possible, if the subvolume is marked for deletion but + * is slow to remove the directory entry, send can still be started. + */ + if (btrfs_root_dead(send_root)) { + spin_unlock(&send_root->root_item_lock); + return -EPERM; + } if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) { dedupe_in_progress_warn(send_root); spin_unlock(&send_root->root_item_lock); @@ -8207,15 +8215,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a } sctx->send_root = send_root; - /* - * Unlikely but possible, if the subvolume is marked for deletion but - * is slow to remove the directory entry, send can still be started - */ - if (btrfs_root_dead(sctx->send_root)) { - ret = -EPERM; - goto out; - } - sctx->clone_roots_cnt = arg->clone_sources_count; if (sctx->proto >= 2) { -- 2.51.0 From e82c936293aafb4f33b153c684c37291b3eed377 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 6 Nov 2024 11:26:07 +0000 Subject: [PATCH 13/16] btrfs: send: check for read-only send root under critical section We're checking if the send root is read-only without being under the protection of the root's root_item_lock spinlock, which is what protects the root's flags when clearing the read-only flag, done at btrfs_ioctl_subvol_setflags(). 
Furthermore, it should be done in the same critical section that increments the root's send_in_progress counter, as btrfs_ioctl_subvol_setflags() clears the read-only flag in the same critical section that checks the counter's value. So fix this by moving the read-only check under the critical section delimited by the root's root_item_lock which also increments the root's send_in_progress counter. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3fcc8113641d..7254279c3cc9 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -8133,7 +8133,12 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a spin_unlock(&send_root->root_item_lock); return -EPERM; } - if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) { + /* Userspace tools do the checks and warn the user if it's not RO. */ + if (!btrfs_root_readonly(send_root)) { + spin_unlock(&send_root->root_item_lock); + return -EPERM; + } + if (send_root->dedupe_in_progress) { dedupe_in_progress_warn(send_root); spin_unlock(&send_root->root_item_lock); return -EAGAIN; @@ -8141,15 +8146,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a send_root->send_in_progress++; spin_unlock(&send_root->root_item_lock); - /* - * Userspace tools do the checks and warn the user if it's - * not RO. - */ - if (!btrfs_root_readonly(send_root)) { - ret = -EPERM; - goto out; - } - /* * Check that we don't overflow at later allocations, we request * clone_sources_count + 1 items, and compare to unsigned long inside -- 2.51.0 From 05b36b04d74a517d6675bf2f90829ff1ac7e28dc Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Nov 2024 18:16:48 +0100 Subject: [PATCH 14/16] btrfs: fix use-after-free in btrfs_encoded_read_endio() Shinichiro reported the following use-after free that sometimes is happening in our CI system when running fstests' btrfs/284 on a TCMU runner device: BUG: KASAN: slab-use-after-free in lock_release+0x708/0x780 Read of size 8 at addr ffff888106a83f18 by task kworker/u80:6/219 CPU: 8 UID: 0 PID: 219 Comm: kworker/u80:6 Not tainted 6.12.0-rc6-kts+ #15 Hardware name: Supermicro Super Server/X11SPi-TF, BIOS 3.3 02/21/2020 Workqueue: btrfs-endio btrfs_end_bio_work [btrfs] Call Trace: dump_stack_lvl+0x6e/0xa0 ? lock_release+0x708/0x780 print_report+0x174/0x505 ? lock_release+0x708/0x780 ? __virt_addr_valid+0x224/0x410 ? lock_release+0x708/0x780 kasan_report+0xda/0x1b0 ? lock_release+0x708/0x780 ? __wake_up+0x44/0x60 lock_release+0x708/0x780 ? __pfx_lock_release+0x10/0x10 ? __pfx_do_raw_spin_lock+0x10/0x10 ? lock_is_held_type+0x9a/0x110 _raw_spin_unlock_irqrestore+0x1f/0x60 __wake_up+0x44/0x60 btrfs_encoded_read_endio+0x14b/0x190 [btrfs] btrfs_check_read_bio+0x8d9/0x1360 [btrfs] ? lock_release+0x1b0/0x780 ? trace_lock_acquire+0x12f/0x1a0 ? __pfx_btrfs_check_read_bio+0x10/0x10 [btrfs] ? process_one_work+0x7e3/0x1460 ? lock_acquire+0x31/0xc0 ? process_one_work+0x7e3/0x1460 process_one_work+0x85c/0x1460 ? __pfx_process_one_work+0x10/0x10 ? assign_work+0x16c/0x240 worker_thread+0x5e6/0xfc0 ? __pfx_worker_thread+0x10/0x10 kthread+0x2c3/0x3a0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x31/0x70 ? 
__pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Allocated by task 3661: kasan_save_stack+0x30/0x50 kasan_save_track+0x14/0x30 __kasan_kmalloc+0xaa/0xb0 btrfs_encoded_read_regular_fill_pages+0x16c/0x6d0 [btrfs] send_extent_data+0xf0f/0x24a0 [btrfs] process_extent+0x48a/0x1830 [btrfs] changed_cb+0x178b/0x2ea0 [btrfs] btrfs_ioctl_send+0x3bf9/0x5c20 [btrfs] _btrfs_ioctl_send+0x117/0x330 [btrfs] btrfs_ioctl+0x184a/0x60a0 [btrfs] __x64_sys_ioctl+0x12e/0x1a0 do_syscall_64+0x95/0x180 entry_SYSCALL_64_after_hwframe+0x76/0x7e Freed by task 3661: kasan_save_stack+0x30/0x50 kasan_save_track+0x14/0x30 kasan_save_free_info+0x3b/0x70 __kasan_slab_free+0x4f/0x70 kfree+0x143/0x490 btrfs_encoded_read_regular_fill_pages+0x531/0x6d0 [btrfs] send_extent_data+0xf0f/0x24a0 [btrfs] process_extent+0x48a/0x1830 [btrfs] changed_cb+0x178b/0x2ea0 [btrfs] btrfs_ioctl_send+0x3bf9/0x5c20 [btrfs] _btrfs_ioctl_send+0x117/0x330 [btrfs] btrfs_ioctl+0x184a/0x60a0 [btrfs] __x64_sys_ioctl+0x12e/0x1a0 do_syscall_64+0x95/0x180 entry_SYSCALL_64_after_hwframe+0x76/0x7e The buggy address belongs to the object at ffff888106a83f00 which belongs to the cache kmalloc-rnd-07-96 of size 96 The buggy address is located 24 bytes inside of freed 96-byte region [ffff888106a83f00, ffff888106a83f60) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888106a83800 pfn:0x106a83 flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff) page_type: f5(slab) raw: 0017ffffc0000000 ffff888100053680 ffffea0004917200 0000000000000004 raw: ffff888106a83800 0000000080200019 00000001f5000000 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888106a83e00: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ffff888106a83e80: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc >ffff888106a83f00: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ^ ffff888106a83f80: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ffff888106a84000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ================================================================== Further analyzing the trace and the crash dump's vmcore file shows that the wake_up() call in btrfs_encoded_read_endio() is calling wake_up() on the wait_queue that is in the private data passed to the end_io handler. Commit 4ff47df40447 ("btrfs: move priv off stack in btrfs_encoded_read_regular_fill_pages()") moved 'struct btrfs_encoded_read_private' off the stack. 
Before that commit one can see a corruption of the private data when analyzing the vmcore after a crash: *(struct btrfs_encoded_read_private *)0xffff88815626eec8 = { .wait = (wait_queue_head_t){ .lock = (spinlock_t){ .rlock = (struct raw_spinlock){ .raw_lock = (arch_spinlock_t){ .val = (atomic_t){ .counter = (int)-2005885696, }, .locked = (u8)0, .pending = (u8)157, .locked_pending = (u16)40192, .tail = (u16)34928, }, .magic = (unsigned int)536325682, .owner_cpu = (unsigned int)29, .owner = (void *)__SCT__tp_func_btrfs_transaction_commit+0x0 = 0x0, .dep_map = (struct lockdep_map){ .key = (struct lock_class_key *)0xffff8881575a3b6c, .class_cache = (struct lock_class *[2]){ 0xffff8882a71985c0, 0xffffea00066f5d40 }, .name = (const char *)0xffff88815626f100 = "", .wait_type_outer = (u8)37, .wait_type_inner = (u8)178, .lock_type = (u8)154, }, }, .__padding = (u8 [24]){ 0, 157, 112, 136, 50, 174, 247, 31, 29 }, .dep_map = (struct lockdep_map){ .key = (struct lock_class_key *)0xffff8881575a3b6c, .class_cache = (struct lock_class *[2]){ 0xffff8882a71985c0, 0xffffea00066f5d40 }, .name = (const char *)0xffff88815626f100 = "", .wait_type_outer = (u8)37, .wait_type_inner = (u8)178, .lock_type = (u8)154, }, }, .head = (struct list_head){ .next = (struct list_head *)0x112cca, .prev = (struct list_head *)0x47, }, }, .pending = (atomic_t){ .counter = (int)-1491499288, }, .status = (blk_status_t)130, } Here we can see several indicators of in-memory data corruption, e.g. the large negative atomic values of ->pending or ->wait->lock->rlock->raw_lock->val, as well as the bogus spinlock magic 0x1ff7ae32 (decimal 536325682 above) instead of 0xdead4ead or the bogus pointer values for ->wait->head. To fix this, change atomic_dec_return() to atomic_dec_and_test() to fix the corruption, as atomic_dec_return() is defined as two instructions on x86_64, whereas atomic_dec_and_test() is defined as a single atomic operation. This can lead to a situation where counter value is already decremented but the if statement in btrfs_encoded_read_endio() is not completely processed, i.e. the 0 test has not completed. If another thread continues executing btrfs_encoded_read_regular_fill_pages() the atomic_dec_return() there can see an already updated ->pending counter and continues by freeing the private data. Continuing in the endio handler the test for 0 succeeds and the wait_queue is woken up, resulting in a use-after-free. 
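Restating the race described above as an explicit timeline (illustrative
only):

	1. endio, CPU A: atomic_dec_return(&priv->pending) executes, the counter
	   is decremented, but the "== 0" comparison has not run yet.
	2. btrfs_encoded_read_regular_fill_pages(), CPU B: its atomic_dec_return()
	   sees the already updated counter and continues by freeing the private
	   data.
	3. endio, CPU A: the comparison now succeeds and wake_up(&priv->wait) runs
	   against the already freed private data, hence the use-after-free.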
Reported-by: Shinichiro Kawasaki Suggested-by: Damien Le Moal Fixes: 1881fba89bd5 ("btrfs: add BTRFS_IOC_ENCODED_READ ioctl") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Filipe Manana Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 22b8e2764619..fdad1adee1a3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9089,7 +9089,7 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (atomic_dec_return(&priv->pending) == 0) { + if (atomic_dec_and_test(&priv->pending)) { int err = blk_status_to_errno(READ_ONCE(priv->status)); if (priv->uring_ctx) { -- 2.51.0 From 7d6872ccbd56c310d2b9658ecaeafeda258727b6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 14 Nov 2024 12:11:26 +0000 Subject: [PATCH 15/16] btrfs: fix deadlock between transaction commits and extent locks When running a workload with fsstress and duperemove (generic/561) we can hit a deadlock related to transaction commits and locking extent ranges, as described below. Task A hanging during a transaction commit, waiting for all other writers to complete: [178317.334817] INFO: task fsstress:555623 blocked for more than 120 seconds. [178317.335693] Not tainted 6.12.0-rc6-btrfs-next-179+ #1 [178317.336528] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [178317.337673] task:fsstress state:D stack:0 pid:555623 tgid:555623 ppid:555620 flags:0x00004002 [178317.337679] Call Trace: [178317.337681] [178317.337685] __schedule+0x364/0xbe0 [178317.337691] schedule+0x26/0xa0 [178317.337695] btrfs_commit_transaction+0x5c5/0x1050 [btrfs] [178317.337769] ? start_transaction+0xc4/0x800 [btrfs] [178317.337815] ? __pfx_autoremove_wake_function+0x10/0x10 [178317.337819] btrfs_mksubvol+0x381/0x640 [btrfs] [178317.337878] btrfs_mksnapshot+0x7a/0xb0 [btrfs] [178317.337935] __btrfs_ioctl_snap_create+0x1bb/0x1d0 [btrfs] [178317.337995] btrfs_ioctl_snap_create_v2+0x103/0x130 [btrfs] [178317.338053] btrfs_ioctl+0x29b/0x2a90 [btrfs] [178317.338118] ? kmem_cache_alloc_noprof+0x5f/0x2c0 [178317.338126] ? getname_flags+0x45/0x1f0 [178317.338133] ? _raw_spin_unlock+0x15/0x30 [178317.338145] ? __x64_sys_ioctl+0x88/0xc0 [178317.338149] __x64_sys_ioctl+0x88/0xc0 [178317.338152] do_syscall_64+0x4a/0x110 [178317.338160] entry_SYSCALL_64_after_hwframe+0x76/0x7e [178317.338190] RIP: 0033:0x7f13c28e271b Which corresponds to line 2361 of transaction.c: $ cat -n fs/btrfs/transaction.c (...) 2162 int btrfs_commit_transaction(struct btrfs_trans_handle *trans) 2163 { (...) 2349 spin_lock(&fs_info->trans_lock); 2350 add_pending_snapshot(trans); 2351 cur_trans->state = TRANS_STATE_COMMIT_DOING; 2352 spin_unlock(&fs_info->trans_lock); 2353 2354 /* 2355 * The thread has started/joined the transaction thus it holds the 2356 * lockdep map as a reader. It has to release it before acquiring the 2357 * lockdep map as a writer. 2358 */ 2359 btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); 2360 btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers); 2361 wait_event(cur_trans->writer_wait, 2362 atomic_read(&cur_trans->num_writers) == 1); (...) The transaction is in the TRANS_STATE_COMMIT_DOING state and so it's waiting for all other existing writers to complete and release their transaction handle. 
Task B is running ordered extent completion and blocked waiting to lock an extent range in an inode's io tree: [178317.327411] INFO: task kworker/u48:8:554545 blocked for more than 120 seconds. [178317.328630] Not tainted 6.12.0-rc6-btrfs-next-179+ #1 [178317.329635] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [178317.330872] task:kworker/u48:8 state:D stack:0 pid:554545 tgid:554545 ppid:2 flags:0x00004000 [178317.330878] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs] [178317.330944] Call Trace: [178317.330945] [178317.330947] __schedule+0x364/0xbe0 [178317.330952] schedule+0x26/0xa0 [178317.330955] __lock_extent+0x337/0x3a0 [btrfs] [178317.331014] ? __pfx_autoremove_wake_function+0x10/0x10 [178317.331017] btrfs_finish_one_ordered+0x47a/0xaa0 [btrfs] [178317.331074] ? psi_group_change+0x132/0x2d0 [178317.331078] btrfs_work_helper+0xbd/0x370 [btrfs] [178317.331140] process_scheduled_works+0xd3/0x460 [178317.331144] ? __pfx_worker_thread+0x10/0x10 [178317.331146] worker_thread+0x121/0x250 [178317.331149] ? __pfx_worker_thread+0x10/0x10 [178317.331151] kthread+0xe9/0x120 [178317.331154] ? __pfx_kthread+0x10/0x10 [178317.331157] ret_from_fork+0x2d/0x50 [178317.331159] ? __pfx_kthread+0x10/0x10 [178317.331162] ret_from_fork_asm+0x1a/0x30 This extent range locking happens after joining the current transaction, so task A is waiting for task B to release its transaction handle (decrementing the transaction's num_writers counter). Task C while doing a fiemap it tries to join the current transaction: [242682.812815] task:pool state:D stack:0 pid:560767 tgid:560724 ppid:555622 flags:0x00004006 [242682.812827] Call Trace: [242682.812856] [242682.812864] __schedule+0x364/0xbe0 [242682.812879] ? _raw_spin_unlock_irqrestore+0x23/0x40 [242682.812897] schedule+0x26/0xa0 [242682.812909] wait_current_trans+0xd6/0x130 [btrfs] [242682.813148] ? __pfx_autoremove_wake_function+0x10/0x10 [242682.813162] start_transaction+0x3d4/0x800 [btrfs] [242682.813399] btrfs_is_data_extent_shared+0xd2/0x440 [btrfs] [242682.813723] fiemap_process_hole+0x2a2/0x300 [btrfs] [242682.813995] extent_fiemap+0x9b8/0xb80 [btrfs] [242682.814249] btrfs_fiemap+0x78/0xc0 [btrfs] [242682.814501] do_vfs_ioctl+0x2db/0xa50 [242682.814519] __x64_sys_ioctl+0x6a/0xc0 [242682.814531] do_syscall_64+0x4a/0x110 [242682.814544] entry_SYSCALL_64_after_hwframe+0x76/0x7e [242682.814556] RIP: 0033:0x7efff595e71b It tries to join the current transaction, but it can't because the transaction is in the TRANS_STATE_COMMIT_DOING state, so join_transaction() returns -EBUSY to start_transaction() and makes it wait for the current transaction to complete. And while it's waiting for the transaction to complete, it's holding an extent range locked in the same inode that task B is operating, which causes a deadlock between these 3 tasks. The extent range for the inode was locked at the start of the fiemap operation, early at extent_fiemap(). In short these tasks deadlock because: 1) Task A is waiting for task B to release its transaction handle; 2) Task B is waiting to lock an extent range for an inode while holding a transaction handle open; 3) Task C is waiting for the current transaction to complete (for task A to finish the transaction commit) while holding the extent range for the inode locked, so task B can't progress and release its transaction handle. This results in an ABBA deadlock involving transaction commits and extent locks. 
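The ordering rule that avoids this, as a minimal sketch (using the same
helpers as the fix below, not copied from it):

	/* Correct: take the extent range lock, a higher level lock, before
	 * joining the transaction. */
	lock_extent(io_tree, start, end, &cached_state);
	trans = btrfs_join_transaction(root);

	/* Deadlock prone: the transaction handle is held while waiting for the
	 * extent lock, which can block a transaction commit indefinitely. */
	trans = btrfs_join_transaction(root);
	lock_extent(io_tree, start, end, &cached_state);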
Extent locks are higher level locks, like inode VFS locks, and should always be acquired before joining or starting a transaction, but recently commit 2206265f41e9 ("btrfs: remove code duplication in ordered extent finishing") accidentally changed btrfs_finish_one_ordered() to do the transaction join before locking the extent range. Fix this by making sure that btrfs_finish_one_ordered() always locks the extent before joining a transaction and add an explicit comment about the need for this order. Fixes: 2206265f41e9 ("btrfs: remove code duplication in ordered extent finishing") Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fdad1adee1a3..659a5940a5b2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3063,6 +3063,19 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } + /* + * If it's a COW write we need to lock the extent range as we will be + * inserting/replacing file extent items and unpinning an extent map. + * This must be taken before joining a transaction, as it's a higher + * level lock (like the inode's VFS lock), otherwise we can run into an + * ABBA deadlock with other tasks (transactions work like a lock, + * depending on their current state). + */ + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + clear_bits |= EXTENT_LOCKED; + lock_extent(io_tree, start, end, &cached_state); + } + if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else @@ -3099,9 +3112,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - clear_bits |= EXTENT_LOCKED; - lock_extent(io_tree, start, end, &cached_state); - if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { -- 2.51.0 From b188ad7791899da8afe937e439e3086ffddd84a8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 14 Nov 2024 15:38:27 +0000 Subject: [PATCH 16/16] btrfs: sysfs: advertise experimental features only if CONFIG_BTRFS_EXPERIMENTAL=y We are advertising experimental features through sysfs if CONFIG_BTRFS_DEBUG is set, without looking if CONFIG_BTRFS_EXPERIMENTAL is set. This is wrong as it will result in reporting experimental features as supported when CONFIG_BTRFS_EXPERIMENTAL is not set but CONFIG_BTRFS_DEBUG is set. Fix this by checking for CONFIG_BTRFS_EXPERIMENTAL instead of CONFIG_BTRFS_DEBUG. Fixes: 67cd3f221769 ("btrfs: split out CONFIG_BTRFS_EXPERIMENTAL from CONFIG_BTRFS_DEBUG") Reviewed-by: Neal Gompa Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index b843308e2bc6..fdcbf650ac31 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -295,7 +295,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA); #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); #endif -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* Remove once support for extent tree v2 is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); /* Remove once support for raid stripe tree is feature complete. 
*/ @@ -329,7 +329,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), #endif -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_FEAT_ATTR_PTR(extent_tree_v2), BTRFS_FEAT_ATTR_PTR(raid_stripe_tree), #endif -- 2.51.0