From 973a432637ed2ed8fe7c365a3ce7a9e4463af5a8 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Tue, 22 Oct 2024 15:50:18 +0100 Subject: [PATCH 01/16] btrfs: don't sleep in btrfs_encoded_read() if IOCB_NOWAIT is set Change btrfs_encoded_read() so that it returns -EAGAIN rather than sleeps if IOCB_NOWAIT is set in iocb->ki_flags. The conditions that require sleeping are: inode lock, writeback, extent lock, ordered range. Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 55 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98a20edaa112..358a282de7fe 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8983,12 +8983,16 @@ static ssize_t btrfs_encoded_read_inline( unsigned long ptr; void *tmp; ssize_t ret; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } + + path->nowait = nowait; + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), extent_start, 0); if (ret) { @@ -9198,11 +9202,15 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); u64 start, lockend; struct extent_map *em; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); bool unlocked = false; file_accessed(iocb->ki_filp); - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + ret = btrfs_inode_lock(inode, + BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0)); + if (ret) + return ret; if (iocb->ki_pos >= inode->vfs_inode.i_size) { btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -9215,21 +9223,46 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, */ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; - for (;;) { + if (nowait) { struct btrfs_ordered_extent *ordered; - ret = btrfs_wait_ordered_range(inode, start, - lockend - start + 1); - if (ret) + if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping, + start, lockend)) { + ret = -EAGAIN; goto out_unlock_inode; - lock_extent(io_tree, start, lockend, cached_state); + } + + if (!try_lock_extent(io_tree, start, lockend, cached_state)) { + ret = -EAGAIN; + goto out_unlock_inode; + } + ordered = btrfs_lookup_ordered_range(inode, start, lockend - start + 1); - if (!ordered) - break; - btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, cached_state); - cond_resched(); + if (ordered) { + btrfs_put_ordered_extent(ordered); + unlock_extent(io_tree, start, lockend, cached_state); + ret = -EAGAIN; + goto out_unlock_inode; + } + } else { + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(inode, start, + lockend - start + 1); + if (ret) + goto out_unlock_inode; + + lock_extent(io_tree, start, lockend, cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, + lockend - start + 1); + if (!ordered) + break; + btrfs_put_ordered_extent(ordered); + unlock_extent(io_tree, start, lockend, cached_state); + cond_resched(); + } } em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); -- 2.51.0 From 68d3b27e05c7ca5545e88465f5e2be6eda0e11df Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Tue, 22 Oct 2024 15:50:19 +0100 Subject: [PATCH 02/16] btrfs: move priv off stack in btrfs_encoded_read_regular_fill_pages() Change btrfs_encoded_read_regular_fill_pages() so that the priv struct is allocated rather than stored on the stack, in preparation for adding an asynchronous mode to the function. 
Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 358a282de7fe..ed66eafa6b52 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9085,16 +9085,21 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, struct page **pages) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_encoded_read_private priv = { - .pending = ATOMIC_INIT(1), - }; + struct btrfs_encoded_read_private *priv; unsigned long i = 0; struct btrfs_bio *bbio; + int ret; - init_waitqueue_head(&priv.wait); + priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); + if (!priv) + return -ENOMEM; + + init_waitqueue_head(&priv->wait); + atomic_set(&priv->pending, 1); + priv->status = 0; bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, &priv); + btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; @@ -9102,11 +9107,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv.pending); + atomic_inc(&priv->pending); btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, &priv); + btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; continue; @@ -9117,13 +9122,15 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, disk_io_size -= bytes; } while (disk_io_size); - atomic_inc(&priv.pending); + atomic_inc(&priv->pending); btrfs_submit_bbio(bbio, 0); - if (atomic_dec_return(&priv.pending)) - io_wait_event(priv.wait, !atomic_read(&priv.pending)); + if (atomic_dec_return(&priv->pending)) + io_wait_event(priv->wait, !atomic_read(&priv->pending)); /* See btrfs_encoded_read_endio() for ordering. */ - return blk_status_to_errno(READ_ONCE(priv.status)); + ret = blk_status_to_errno(READ_ONCE(priv->status)); + kfree(priv); + return ret; } ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, -- 2.51.0 From 34310c442e175f286b4c06ab5caa4e0b267ea31c Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Tue, 22 Oct 2024 15:50:20 +0100 Subject: [PATCH 03/16] btrfs: add io_uring command for encoded reads (ENCODED_READ ioctl) Add an io_uring command for encoded reads, using the same interface as the existing BTRFS_IOC_ENCODED_READ ioctl. btrfs_uring_encoded_read() is an io_uring version of btrfs_ioctl_encoded_read(), which validates the user input and calls btrfs_encoded_read() to read the appropriate metadata. If we determine that we need to read an extent from disk, we call btrfs_encoded_read_regular_fill_pages() through btrfs_uring_read_extent() to prepare the bio. The existing btrfs_encoded_read_regular_fill_pages() is changed so that if it is passed a valid uring_ctx, rather than waking up any waiting threads it calls btrfs_uring_read_extent_endio(). This in turn copies the read data back to userspace, and calls io_uring_cmd_done() to complete the io_uring command. Because we're potentially doing a non-blocking read, btrfs_uring_read_extent() doesn't clean up after itself if it returns -EIOCBQUEUED. 
Instead, it allocates a priv struct, populates the fields there that we will need to unlock the inode and free our allocations, and defers this to the btrfs_uring_read_finished() that gets called when the bio completes. Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/file.c | 1 + fs/btrfs/inode.c | 41 ++++-- fs/btrfs/ioctl.c | 302 +++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ioctl.h | 2 + fs/btrfs/send.c | 3 +- 6 files changed, 339 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 2817f0cc692b..aa1f55cd81b7 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -613,7 +613,7 @@ int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 disk_bytenr, u64 disk_io_size, - struct page **pages); + struct page **pages, void *uring_ctx); ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, struct btrfs_ioctl_encoded_io_args *encoded, struct extent_state **cached_state, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5e0a1805e897..fbb753300071 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3710,6 +3710,7 @@ const struct file_operations btrfs_file_operations = { .compat_ioctl = btrfs_compat_ioctl, #endif .remap_file_range = btrfs_remap_file_range, + .uring_cmd = btrfs_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC, }; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ed66eafa6b52..4bb393cac170 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9056,6 +9056,7 @@ out: struct btrfs_encoded_read_private { wait_queue_head_t wait; + void *uring_ctx; atomic_t pending; blk_status_t status; }; @@ -9075,14 +9076,22 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (!atomic_dec_return(&priv->pending)) - wake_up(&priv->wait); + if (atomic_dec_return(&priv->pending) == 0) { + int err = blk_status_to_errno(READ_ONCE(priv->status)); + + if (priv->uring_ctx) { + btrfs_uring_read_extent_endio(priv->uring_ctx, err); + kfree(priv); + } else { + wake_up(&priv->wait); + } + } bio_put(&bbio->bio); } int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 disk_bytenr, u64 disk_io_size, - struct page **pages) + struct page **pages, void *uring_ctx) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private *priv; @@ -9097,6 +9106,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, init_waitqueue_head(&priv->wait); atomic_set(&priv->pending, 1); priv->status = 0; + priv->uring_ctx = uring_ctx; bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, btrfs_encoded_read_endio, priv); @@ -9125,12 +9135,23 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, atomic_inc(&priv->pending); btrfs_submit_bbio(bbio, 0); - if (atomic_dec_return(&priv->pending)) - io_wait_event(priv->wait, !atomic_read(&priv->pending)); - /* See btrfs_encoded_read_endio() for ordering. 
*/ - ret = blk_status_to_errno(READ_ONCE(priv->status)); - kfree(priv); - return ret; + if (uring_ctx) { + if (atomic_dec_return(&priv->pending) == 0) { + ret = blk_status_to_errno(READ_ONCE(priv->status)); + btrfs_uring_read_extent_endio(uring_ctx, ret); + kfree(priv); + return ret; + } + + return -EIOCBQUEUED; + } else { + if (atomic_dec_return(&priv->pending) != 0) + io_wait_event(priv->wait, !atomic_read(&priv->pending)); + /* See btrfs_encoded_read_endio() for ordering. */ + ret = blk_status_to_errno(READ_ONCE(priv->status)); + kfree(priv); + return ret; + } } ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, @@ -9158,7 +9179,7 @@ ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, } ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, - disk_io_size, pages); + disk_io_size, pages, NULL); if (ret) goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f8ac8bb6c6c7..cf63264a3a60 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "export.h" @@ -4719,6 +4720,307 @@ out_acct: return ret; } +/* + * Context that's attached to an encoded read io_uring command, in cmd->pdu. It + * contains the fields in btrfs_uring_read_extent that are necessary to finish + * off and cleanup the I/O in btrfs_uring_read_finished. + */ +struct btrfs_uring_priv { + struct io_uring_cmd *cmd; + struct page **pages; + unsigned long nr_pages; + struct kiocb iocb; + struct iovec *iov; + struct iov_iter iter; + struct extent_state *cached_state; + u64 count; + u64 start; + u64 lockend; + int err; + bool compressed; +}; + +static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct btrfs_uring_priv *priv = *io_uring_cmd_to_pdu(cmd, struct btrfs_uring_priv *); + struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + unsigned long index; + u64 cur; + size_t page_offset; + ssize_t ret; + + if (priv->err) { + ret = priv->err; + goto out; + } + + if (priv->compressed) { + index = 0; + page_offset = 0; + } else { + index = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT; + page_offset = offset_in_page(priv->iocb.ki_pos - priv->start); + } + cur = 0; + while (cur < priv->count) { + size_t bytes = min_t(size_t, priv->count - cur, PAGE_SIZE - page_offset); + + if (copy_page_to_iter(priv->pages[index], page_offset, bytes, + &priv->iter) != bytes) { + ret = -EFAULT; + goto out; + } + + index++; + cur += bytes; + page_offset = 0; + } + ret = priv->count; + +out: + unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + + io_uring_cmd_done(cmd, ret, 0, issue_flags); + add_rchar(current, ret); + + for (index = 0; index < priv->nr_pages; index++) + __free_page(priv->pages[index]); + + kfree(priv->pages); + kfree(priv->iov); + kfree(priv); +} + +void btrfs_uring_read_extent_endio(void *ctx, int err) +{ + struct btrfs_uring_priv *priv = ctx; + + priv->err = err; + + *io_uring_cmd_to_pdu(priv->cmd, struct btrfs_uring_priv *) = priv; + io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished); +} + +static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state *cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, + struct iovec *iov, struct io_uring_cmd *cmd) +{ + struct btrfs_inode 
*inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + struct page **pages; + struct btrfs_uring_priv *priv = NULL; + unsigned long nr_pages; + int ret; + + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + ret = btrfs_alloc_page_array(nr_pages, pages, 0); + if (ret) { + ret = -ENOMEM; + goto out_fail; + } + + priv = kmalloc(sizeof(*priv), GFP_NOFS); + if (!priv) { + ret = -ENOMEM; + goto out_fail; + } + + priv->iocb = *iocb; + priv->iov = iov; + priv->iter = *iter; + priv->count = count; + priv->cmd = cmd; + priv->cached_state = cached_state; + priv->compressed = compressed; + priv->nr_pages = nr_pages; + priv->pages = pages; + priv->start = start; + priv->lockend = lockend; + priv->err = 0; + + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, + disk_io_size, pages, priv); + if (ret && ret != -EIOCBQUEUED) + goto out_fail; + + /* + * If we return -EIOCBQUEUED, we're deferring the cleanup to + * btrfs_uring_read_finished(), which will handle unlocking the extent + * and inode and freeing the allocations. + */ + + return -EIOCBQUEUED; + +out_fail: + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + kfree(priv); + return ret; +} + +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); + size_t copy_end; + struct btrfs_ioctl_encoded_io_args args = { 0 }; + int ret; + u64 disk_bytenr, disk_io_size; + struct file *file; + struct btrfs_inode *inode; + struct btrfs_fs_info *fs_info; + struct extent_io_tree *io_tree; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + loff_t pos; + struct kiocb kiocb; + struct extent_state *cached_state = NULL; + u64 start, lockend; + void __user *sqe_addr; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + file = cmd->file; + inode = BTRFS_I(file->f_inode); + fs_info = inode->root->fs_info; + io_tree = &inode->io_tree; + sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); + + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); + if (copy_from_user(&args32, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + args.iov = compat_ptr(args32.iov); + args.iovcnt = args32.iovcnt; + args.offset = args32.offset; + args.flags = args32.flags; +#else + return -ENOTTY; +#endif + } else { + copy_end = copy_end_kernel; + if (copy_from_user(&args, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + + if (args.flags != 0) + return -EINVAL; + + ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + &iov, &iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&iter) == 0) { + ret = 0; + goto out_free; + } + + pos = args.offset; + ret = rw_verify_area(READ, file, &pos, args.len); + if (ret < 0) + goto out_free; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = pos; + + if (issue_flags & IO_URING_F_NONBLOCK) + kiocb.ki_flags |= IOCB_NOWAIT; + + start = ALIGN_DOWN(pos, fs_info->sectorsize); + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, + &disk_bytenr, &disk_io_size); + if (ret < 0 && ret != -EIOCBQUEUED) + 
goto out_free; + + file_accessed(file); + + if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel, + sizeof(args) - copy_end_kernel)) { + if (ret == -EIOCBQUEUED) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + } + ret = -EFAULT; + goto out_free; + } + + if (ret == -EIOCBQUEUED) { + u64 count; + + /* + * If we've optimized things by storing the iovecs on the stack, + * undo this. + */ + if (!iov) { + iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS); + if (!iov) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + ret = -ENOMEM; + goto out_acct; + } + + memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt); + } + + count = min_t(u64, iov_iter_count(&iter), disk_io_size); + + /* Match ioctl by not returning past EOF if uncompressed. */ + if (!args.compression) + count = min_t(u64, count, args.len); + + ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend, + cached_state, disk_bytenr, + disk_io_size, count, + args.compression, iov, cmd); + + goto out_acct; + } + +out_free: + kfree(iov); + +out_acct: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + + return ret; +} + +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + switch (cmd->cmd_op) { + case BTRFS_IOC_ENCODED_READ: +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_READ_32: +#endif + return btrfs_uring_encoded_read(cmd, issue_flags); + } + + return -EINVAL; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 19cd26b0244a..2b760c8778f8 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -22,5 +22,7 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int __pure btrfs_is_empty_uuid(const u8 *uuid); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); +void btrfs_uring_read_extent_endio(void *ctx, int err); #endif diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 03b31b1c39be..cadb945bb345 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5669,7 +5669,8 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), disk_bytenr, disk_num_bytes, sctx->send_buf_pages + - (data_offset >> PAGE_SHIFT)); + (data_offset >> PAGE_SHIFT), + NULL); if (ret) goto out; -- 2.51.0 From 1cc86aeadafd667c381c588a84fdef2d5e8093a5 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Thu, 31 Oct 2024 16:03:56 +0000 Subject: [PATCH 04/16] btrfs: add struct io_btrfs_cmd as type for io_uring_cmd_to_pdu() Add struct io_btrfs_cmd as a wrapper type for io_uring_cmd_to_pdu(), rather than using a raw pointer. 
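
For context, the interface implemented by patch 3 (and refined here) is driven from user space as an IORING_OP_URING_CMD whose cmd_op is BTRFS_IOC_ENCODED_READ and whose sqe->addr points at a struct btrfs_ioctl_encoded_io_args, the same struct the existing ioctl takes. The sketch below is only an illustration of that flow, assuming liburing and a kernel with this series applied; the helper name and buffer handling are made up for the example and are not part of the patches. Note that, as with the ioctl, the command requires CAP_SYS_ADMIN.

#include <liburing.h>
#include <linux/btrfs.h>
#include <string.h>
#include <sys/types.h>
#include <sys/uio.h>

/* Illustrative helper: read up to @len encoded bytes at @off from @fd,
 * where @fd is a file on a btrfs filesystem. */
static int encoded_read_uring(int fd, void *buf, size_t len, off_t off)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        struct btrfs_ioctl_encoded_io_args args = {
                .iov = &iov,
                .iovcnt = 1,
                .offset = off,
                /* .flags stays 0; the kernel rejects anything else. */
        };
        int ret;

        ret = io_uring_queue_init(8, &ring, 0);
        if (ret < 0)
                return ret;

        sqe = io_uring_get_sqe(&ring);
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_URING_CMD;
        sqe->fd = fd;
        sqe->cmd_op = BTRFS_IOC_ENCODED_READ;
        /* btrfs_uring_encoded_read() reads the args struct from sqe->addr. */
        sqe->addr = (unsigned long)&args;

        io_uring_submit(&ring);
        ret = io_uring_wait_cqe(&ring, &cqe);
        if (!ret) {
                ret = cqe->res; /* bytes read, or a negative errno */
                io_uring_cqe_seen(&ring, cqe);
        }
        io_uring_queue_exit(&ring);
        return ret;
}

On completion the kernel also copies the output half of the args struct (len, unencoded_len, unencoded_offset, compression, encryption) back to sqe->addr, mirroring the ioctl behaviour.
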
Suggested-by: Pavel Begunkov Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cf63264a3a60..27a9342cd91c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4740,9 +4740,14 @@ struct btrfs_uring_priv { bool compressed; }; +struct io_btrfs_cmd { + struct btrfs_uring_priv *priv; +}; + static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags) { - struct btrfs_uring_priv *priv = *io_uring_cmd_to_pdu(cmd, struct btrfs_uring_priv *); + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); + struct btrfs_uring_priv *priv = bc->priv; struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); struct extent_io_tree *io_tree = &inode->io_tree; unsigned long index; @@ -4796,10 +4801,11 @@ out: void btrfs_uring_read_extent_endio(void *ctx, int err) { struct btrfs_uring_priv *priv = ctx; + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(priv->cmd, struct io_btrfs_cmd); priv->err = err; + bc->priv = priv; - *io_uring_cmd_to_pdu(priv->cmd, struct btrfs_uring_priv *) = priv; io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished); } -- 2.51.0 From df3b8ca604f224eb4cd51669416ad4d607682273 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 4 Nov 2024 16:12:04 +0000 Subject: [PATCH 05/16] io_uring/cmd: let cmds to know about dying task When the task that submitted a request is dying, a task work for that request might get run by a kernel thread or even worse by a half dismantled task. We can't just cancel the task work without running the callback as the cmd might need to do some clean up, so pass a flag instead. If set, it's not safe to access any task resources and the callback is expected to cancel the cmd ASAP.
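
The expected consumer-side behaviour is that a command whose task work sees this flag gives up immediately instead of finishing the I/O. A minimal sketch of such a callback follows; the function and its normal-path result are hypothetical, and only IO_URING_F_TASK_DEAD and io_uring_cmd_done() come from the API used in this series.

/* Hypothetical task-work callback for some io_uring command. */
static void my_cmd_task_work(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
        if (issue_flags & IO_URING_F_TASK_DEAD) {
                /*
                 * The submitting task is exiting and this may run from a
                 * kernel thread: don't touch the task's mm, creds or fd
                 * table, just tear down per-command state and cancel.
                 */
                io_uring_cmd_done(cmd, -ECANCELED, 0, issue_flags);
                return;
        }

        /* Normal completion path. */
        io_uring_cmd_done(cmd, 0, 0, issue_flags);
}
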
Reviewed-by: Jens Axboe Reviewed-by: Ming Lei Signed-off-by: Pavel Begunkov Signed-off-by: David Sterba --- include/linux/io_uring_types.h | 1 + io_uring/uring_cmd.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 4b9ba523978d..2ee5dc105b58 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -37,6 +37,7 @@ enum io_uring_cmd_flags { /* set when uring wants to cancel a previously issued command */ IO_URING_F_CANCEL = (1 << 11), IO_URING_F_COMPAT = (1 << 12), + IO_URING_F_TASK_DEAD = (1 << 13), }; struct io_wq_work_node { diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 39c3c816ec78..e2e8485932d6 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -119,9 +119,13 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + unsigned int flags = IO_URING_F_COMPLETE_DEFER; + + if (current->flags & (PF_EXITING | PF_KTHREAD)) + flags |= IO_URING_F_TASK_DEAD; /* task_work executor checks the deffered list completion */ - ioucmd->task_work_cb(ioucmd, IO_URING_F_COMPLETE_DEFER); + ioucmd->task_work_cb(ioucmd, flags); } void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, -- 2.51.0 From 69673992b1aea5540199d9b8b658ede72f55a6cf Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Fri, 30 Aug 2024 13:24:54 -0700 Subject: [PATCH 06/16] btrfs: push cleanup into btrfs_read_locked_inode() Move btrfs_add_inode_to_root() so it can be called from btrfs_read_locked_inode(), no changes were made to the function. Move cleanup code from btrfs_iget_path() to btrfs_read_locked_inode. This improves readability and improves a leaky abstraction. Previously btrfs_iget_path() had to handle a positive error case as a result of a call to btrfs_search_slot(), but it makes more sense to handle this closer to the source of the call. Signed-off-by: Leo Martins Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 101 +++++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 48 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4bb393cac170..09ff00aaa430 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3759,8 +3759,41 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) return 0; } +static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) +{ + struct btrfs_root *root = inode->root; + struct btrfs_inode *existing; + const u64 ino = btrfs_ino(inode); + int ret; + + if (inode_unhashed(&inode->vfs_inode)) + return 0; + + if (prealloc) { + ret = xa_reserve(&root->inodes, ino, GFP_NOFS); + if (ret) + return ret; + } + + existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); + + if (xa_is_err(existing)) { + ret = xa_err(existing); + ASSERT(ret != -EINVAL); + ASSERT(ret != -ENOMEM); + return ret; + } else if (existing) { + WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); + } + + return 0; +} + /* - * read an inode from the btree into the in-memory inode + * Read a locked inode from the btree into the in-memory inode and add it to + * its root list/tree. + * + * On failure clean up the inode. 
*/ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *in_path) @@ -3780,7 +3813,7 @@ static int btrfs_read_locked_inode(struct inode *inode, ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); if (ret) - return ret; + goto out; ret = btrfs_fill_inode(inode, &rdev); if (!ret) @@ -3789,7 +3822,7 @@ static int btrfs_read_locked_inode(struct inode *inode, if (!path) { path = btrfs_alloc_path(); if (!path) - return -ENOMEM; + goto out; } btrfs_get_inode_key(BTRFS_I(inode), &location); @@ -3798,7 +3831,13 @@ static int btrfs_read_locked_inode(struct inode *inode, if (ret) { if (path != in_path) btrfs_free_path(path); - return ret; + /* + * ret > 0 can come from btrfs_search_slot called by + * btrfs_lookup_inode(), this means the inode was not found. + */ + if (ret > 0) + ret = -ENOENT; + goto out; } leaf = path->nodes[0]; @@ -3961,7 +4000,15 @@ cache_acl: } btrfs_sync_inode_flags_to_i_flags(inode); + + ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + if (ret) + goto out; + return 0; +out: + iget_failed(inode); + return ret; } /* @@ -5470,35 +5517,7 @@ out: return err; } -static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) -{ - struct btrfs_root *root = inode->root; - struct btrfs_inode *existing; - const u64 ino = btrfs_ino(inode); - int ret; - if (inode_unhashed(&inode->vfs_inode)) - return 0; - - if (prealloc) { - ret = xa_reserve(&root->inodes, ino, GFP_NOFS); - if (ret) - return ret; - } - - existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); - - if (xa_is_err(existing)) { - ret = xa_err(existing); - ASSERT(ret != -EINVAL); - ASSERT(ret != -ENOMEM); - return ret; - } else if (existing) { - WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); - } - - return 0; -} static void btrfs_del_inode_from_root(struct btrfs_inode *inode) { @@ -5579,25 +5598,11 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, return inode; ret = btrfs_read_locked_inode(inode, path); - /* - * ret > 0 can come from btrfs_search_slot called by - * btrfs_read_locked_inode(), this means the inode item was not found. - */ - if (ret > 0) - ret = -ENOENT; - if (ret < 0) - goto error; - - ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); - if (ret < 0) - goto error; + if (ret) + return ERR_PTR(ret); unlock_new_inode(inode); - return inode; -error: - iget_failed(inode); - return ERR_PTR(ret); } struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) -- 2.51.0 From 7c855e16ab72596d771355050ffe026e6b99f91c Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Fri, 30 Aug 2024 13:24:55 -0700 Subject: [PATCH 07/16] btrfs: remove conditional path allocation in btrfs_read_locked_inode() Remove conditional path allocation from btrfs_read_locked_inode(). Add an ASSERT(path) to indicate it should never be called with a NULL path. Call btrfs_read_locked_inode() directly from btrfs_iget(). This causes code duplication between btrfs_iget() and btrfs_iget_path(), but I think this is justifiable as it removes the need for conditionally allocating the path inside of btrfs_read_locked_inode(). This makes the code easier to reason about and makes it clear who has the responsibility of allocating and freeing the path. 
Signed-off-by: Leo Martins Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 09ff00aaa430..fa366360007b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3795,11 +3795,9 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) * * On failure clean up the inode. */ -static int btrfs_read_locked_inode(struct inode *inode, - struct btrfs_path *in_path) +static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path = in_path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3819,18 +3817,12 @@ static int btrfs_read_locked_inode(struct inode *inode, if (!ret) filled = true; - if (!path) { - path = btrfs_alloc_path(); - if (!path) - goto out; - } + ASSERT(path); btrfs_get_inode_key(BTRFS_I(inode), &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { - if (path != in_path) - btrfs_free_path(path); /* * ret > 0 can come from btrfs_search_slot called by * btrfs_lookup_inode(), this means the inode was not found. @@ -3972,8 +3964,6 @@ cache_acl: btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret); } - if (path != in_path) - btrfs_free_path(path); if (!maybe_acls) cache_no_acl(inode); @@ -5579,10 +5569,8 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) } /* - * Get an inode object given its inode number and corresponding root. - * Path can be preallocated to prevent recursing back to iget through - * allocator. NULL is also valid but may require an additional allocation - * later. + * Get an inode object given its inode number and corresponding root. Path is + * preallocated to prevent recursing back to iget through allocator. */ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, struct btrfs_path *path) @@ -5605,9 +5593,33 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, return inode; } +/* + * Get an inode object given its inode number and corresponding root. + */ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - return btrfs_iget_path(ino, root, NULL); + struct inode *inode; + struct btrfs_path *path; + int ret; + + inode = btrfs_iget_locked(ino, root); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!(inode->i_state & I_NEW)) + return inode; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = btrfs_read_locked_inode(inode, path); + btrfs_free_path(path); + if (ret) + return ERR_PTR(ret); + + unlock_new_inode(inode); + return inode; } static struct inode *new_simple_dir(struct inode *dir, -- 2.51.0 From 5599f39356c6ec73d2015ae88286382d1034ef0d Mon Sep 17 00:00:00 2001 From: Haisu Wang Date: Fri, 25 Oct 2024 14:54:41 +0800 Subject: [PATCH 08/16] btrfs: simplify range tracking in cow_file_range() Simplify tracking of the range processed by using cur_alloc_size only to store the reserved part that may fail to the allocated extent. Remove the ram_size as well since it is always equal to cur_alloc_size in the context. Advance the start in normal path until extent allocation succeeds and keep the start unchanged in the error handling path. Passed the fstest generic/475 test for a hundred times with quota enabled. And a modified generic/475 test by removing the sleep time for a hundred times. 
About one tenth of the tests do enter the error handling path due to fail to reserve extent. Suggested-by: Qu Wenruo Signed-off-by: Haisu Wang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fa366360007b..a2128eccafb3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1332,7 +1332,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, u64 alloc_hint = 0; u64 orig_start = start; u64 num_bytes; - unsigned long ram_size; u64 cur_alloc_size = 0; u64 min_alloc_size; u64 blocksize = fs_info->sectorsize; @@ -1340,7 +1339,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, struct extent_map *em; unsigned clear_bits; unsigned long page_ops; - bool extent_reserved = false; int ret = 0; if (btrfs_is_free_space_inode(inode)) { @@ -1394,8 +1392,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, struct btrfs_ordered_extent *ordered; struct btrfs_file_extent file_extent; - cur_alloc_size = num_bytes; - ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, + ret = btrfs_reserve_extent(root, num_bytes, num_bytes, min_alloc_size, 0, alloc_hint, &ins, 1, 1); if (ret == -EAGAIN) { @@ -1426,9 +1423,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret < 0) goto out_unlock; cur_alloc_size = ins.offset; - extent_reserved = true; - ram_size = ins.offset; file_extent.disk_bytenr = ins.objectid; file_extent.disk_num_bytes = ins.offset; file_extent.num_bytes = ins.offset; @@ -1436,14 +1431,14 @@ static noinline int cow_file_range(struct btrfs_inode *inode, file_extent.offset = 0; file_extent.compression = BTRFS_COMPRESS_NONE; - lock_extent(&inode->io_tree, start, start + ram_size - 1, + lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, &cached); em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) { unlock_extent(&inode->io_tree, start, - start + ram_size - 1, &cached); + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(em); goto out_reserve; } @@ -1453,7 +1448,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, 1 << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { unlock_extent(&inode->io_tree, start, - start + ram_size - 1, &cached); + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(ordered); goto out_drop_extent_cache; } @@ -1474,7 +1469,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, */ if (ret) btrfs_drop_extent_map_range(inode, start, - start + ram_size - 1, + start + cur_alloc_size - 1, false); } btrfs_put_ordered_extent(ordered); @@ -1492,7 +1487,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, page_ops = (keep_locked ? 
0 : PAGE_UNLOCK); page_ops |= PAGE_SET_ORDERED; - extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, + extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); @@ -1502,7 +1497,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; - extent_reserved = false; + cur_alloc_size = 0; /* * btrfs_reloc_clone_csums() error, since start is increased @@ -1518,7 +1513,7 @@ done: return ret; out_drop_extent_cache: - btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); + btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); @@ -1572,13 +1567,12 @@ out_unlock: * to decrement again the data space_info's bytes_may_use counter, * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. */ - if (extent_reserved) { + if (cur_alloc_size) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, locked_folio, &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); - start += cur_alloc_size; } /* @@ -1587,11 +1581,13 @@ out_unlock: * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space(). */ - if (start < end) { + if (start + cur_alloc_size < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; - extent_clear_unlock_delalloc(inode, start, end, locked_folio, + extent_clear_unlock_delalloc(inode, start + cur_alloc_size, + end, locked_folio, &cached, clear_bits, page_ops); - btrfs_qgroup_free_data(inode, NULL, start, end - start + 1, NULL); + btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, + end - start - cur_alloc_size + 1, NULL); } return ret; } -- 2.51.0 From 6c83d153ed86eb17c46eafe4e78af4ce2071a052 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 21 Oct 2024 21:28:26 +0200 Subject: [PATCH 09/16] btrfs: add new ioctl to wait for cleaned subvolumes Add a new unprivileged ioctl that will let the command 'btrfs subvolume sync' work without the (privileged) SEARCH_TREE ioctl. There are several modes of operation, where the most common ones are to wait on a specific subvolume or all currently queued for cleaning. This is utilized e.g. in backup applications that delete subvolumes and wait until they're cleaned to check for remaining space. The other modes are for flexibility, e.g. for monitoring or checkpoints in the queue of deleted subvolumes, again without the need to use SEARCH_TREE. Notes: - waiting is interruptible, the timeout is set to 1 second and is not configurable - repeated calls to the ioctl see a different state, so this is inherently racy when using e.g. 
the count or peek next/last Use cases: - a subvolume A was deleted, wait for cleaning (WAIT_FOR_ONE) - a bunch of subvolumes were deleted, wait for all (WAIT_FOR_QUEUED or PEEK_LAST + WAIT_FOR_ONE) - count how many are queued (not blocking), for monitoring purposes - report progress (PEEK_NEXT), may miss some if cleaning is quick - own waiting in user space (PEEK_LAST until it's 0) Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 128 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/btrfs.h | 25 ++++++++ 2 files changed, 153 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 27a9342cd91c..2118c22625ca 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5027,6 +5027,132 @@ int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -EINVAL; } +static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *argp) +{ + struct btrfs_root *root; + struct btrfs_ioctl_subvol_wait args = { 0 }; + signed long sched_ret; + int refs; + u64 root_flags; + bool wait_for_deletion = false; + bool found = false; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + switch (args.mode) { + case BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED: + /* + * Wait for the first one deleted that waits until all previous + * are cleaned. + */ + spin_lock(&fs_info->trans_lock); + if (!list_empty(&fs_info->dead_roots)) { + root = list_last_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + found = true; + } + spin_unlock(&fs_info->trans_lock); + if (!found) + return -ENOENT; + + fallthrough; + case BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE: + if ((0 < args.subvolid && args.subvolid < BTRFS_FIRST_FREE_OBJECTID) || + BTRFS_LAST_FREE_OBJECTID < args.subvolid) + return -EINVAL; + break; + case BTRFS_SUBVOL_SYNC_COUNT: + spin_lock(&fs_info->trans_lock); + args.count = list_count_nodes(&fs_info->dead_roots); + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + case BTRFS_SUBVOL_SYNC_PEEK_FIRST: + spin_lock(&fs_info->trans_lock); + /* Last in the list was deleted first. */ + if (!list_empty(&fs_info->dead_roots)) { + root = list_last_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + } else { + args.subvolid = 0; + } + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + case BTRFS_SUBVOL_SYNC_PEEK_LAST: + spin_lock(&fs_info->trans_lock); + /* First in the list was deleted last. */ + if (!list_empty(&fs_info->dead_roots)) { + root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + } else { + args.subvolid = 0; + } + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + default: + return -EINVAL; + } + + /* 32bit limitation: fs_roots_radix key is not wide enough. */ + if (sizeof(unsigned long) != sizeof(u64) && args.subvolid > U32_MAX) + return -EOVERFLOW; + + while (1) { + /* Wait for the specific one. 
*/ + if (down_read_interruptible(&fs_info->subvol_sem) == -EINTR) + return -EINTR; + refs = -1; + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)args.subvolid); + if (root) { + spin_lock(&root->root_item_lock); + refs = btrfs_root_refs(&root->root_item); + root_flags = btrfs_root_flags(&root->root_item); + spin_unlock(&root->root_item_lock); + } + spin_unlock(&fs_info->fs_roots_radix_lock); + up_read(&fs_info->subvol_sem); + + /* Subvolume does not exist. */ + if (!root) + return -ENOENT; + + /* Subvolume not deleted at all. */ + if (refs > 0) + return -EEXIST; + /* We've waited and now the subvolume is gone. */ + if (wait_for_deletion && refs == -1) { + /* Return the one we waited for as the last one. */ + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + } + + /* Subvolume not found on the first try (deleted or never existed). */ + if (refs == -1) + return -ENOENT; + + wait_for_deletion = true; + ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD); + sched_ret = schedule_timeout_interruptible(HZ); + /* Early wake up or error. */ + if (sched_ret != 0) + return -EINTR; + } + + return 0; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -5178,6 +5304,8 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_ENCODED_WRITE_32: return btrfs_ioctl_encoded_write(file, argp, true); #endif + case BTRFS_IOC_SUBVOL_SYNC_WAIT: + return btrfs_ioctl_subvol_sync(fs_info, argp); } return -ENOTTY; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index cdf6ad872149..d3b222d7af24 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -1049,6 +1049,29 @@ struct btrfs_ioctl_encoded_io_args { #define BTRFS_ENCODED_IO_ENCRYPTION_NONE 0 #define BTRFS_ENCODED_IO_ENCRYPTION_TYPES 1 +/* + * Wait for subvolume cleaning process. This queries the kernel queue and it + * can change between the calls. + * + * - FOR_ONE - specify the subvolid + * - FOR_QUEUED - wait for all currently queued + * - COUNT - count number of queued + * - PEEK_FIRST - read which is the first in the queue (to be cleaned or being + * cleaned already), or 0 if the queue is empty + * - PEEK_LAST - read the last subvolid in the queue, or 0 if the queue is empty + */ +struct btrfs_ioctl_subvol_wait { + __u64 subvolid; + __u32 mode; + __u32 count; +}; + +#define BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE (0) +#define BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED (1) +#define BTRFS_SUBVOL_SYNC_COUNT (2) +#define BTRFS_SUBVOL_SYNC_PEEK_FIRST (3) +#define BTRFS_SUBVOL_SYNC_PEEK_LAST (4) + /* Error codes as returned by the kernel */ enum btrfs_err_code { BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1, @@ -1181,6 +1204,8 @@ enum btrfs_err_code { struct btrfs_ioctl_encoded_io_args) #define BTRFS_IOC_ENCODED_WRITE _IOW(BTRFS_IOCTL_MAGIC, 64, \ struct btrfs_ioctl_encoded_io_args) +#define BTRFS_IOC_SUBVOL_SYNC_WAIT _IOW(BTRFS_IOCTL_MAGIC, 65, \ + struct btrfs_ioctl_subvol_wait) #ifdef __cplusplus } -- 2.51.0 From dd0896e77d89686c0736485c5ed4d115e99eaa0c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Nov 2024 11:50:14 +0000 Subject: [PATCH 10/16] btrfs: update stale comment for struct btrfs_delayed_ref_node::add_list The comment refers to a list in the respective delayed ref head that no longer exists (ref_list), it was replaced with a rbtree (ref_tree) in commit 0e0adbcfdc90 ("btrfs: track refs in a rb_tree instead of a list"). So update the stale comment to refer to the rbtree instead of the old list. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 4cb7d4475808..611fb3388f82 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -61,7 +61,8 @@ struct btrfs_delayed_ref_node { /* * If action is BTRFS_ADD_DELAYED_REF, also link this node to * ref_head->ref_add_list, then we do not need to iterate the - * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes. + * refs rbtree in the corresponding delayed ref head + * (struct btrfs_delayed_ref_head::ref_tree). */ struct list_head add_list; -- 2.51.0 From a20725e1e7014642fc8ba4c7dd2c4ef6d4ec56a9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 5 Nov 2024 11:56:45 +0000 Subject: [PATCH 11/16] btrfs: remove hole from struct btrfs_delayed_node On x86_64 and a release kernel, there's a 4 bytes hole in the structure after the ref count field: struct btrfs_delayed_node { u64 inode_id; /* 0 8 */ u64 bytes_reserved; /* 8 8 */ struct btrfs_root * root; /* 16 8 */ struct list_head n_list; /* 24 16 */ struct list_head p_list; /* 40 16 */ struct rb_root_cached ins_root; /* 56 16 */ /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */ struct rb_root_cached del_root; /* 72 16 */ struct mutex mutex; /* 88 32 */ struct btrfs_inode_item inode_item; /* 120 160 */ /* --- cacheline 4 boundary (256 bytes) was 24 bytes ago --- */ refcount_t refs; /* 280 4 */ /* XXX 4 bytes hole, try to pack */ u64 index_cnt; /* 288 8 */ long unsigned int flags; /* 296 8 */ int count; /* 304 4 */ u32 curr_index_batch_size; /* 308 4 */ u32 index_item_leaves; /* 312 4 */ /* size: 320, cachelines: 5, members: 15 */ /* sum members: 312, holes: 1, sum holes: 4 */ /* padding: 4 */ }; Move the 'count' field, which is 4 bytes long, to just below the ref count field, so we eliminate the hole and reduce the structure size from 320 bytes down to 312 bytes: struct btrfs_delayed_node { u64 inode_id; /* 0 8 */ u64 bytes_reserved; /* 8 8 */ struct btrfs_root * root; /* 16 8 */ struct list_head n_list; /* 24 16 */ struct list_head p_list; /* 40 16 */ struct rb_root_cached ins_root; /* 56 16 */ /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */ struct rb_root_cached del_root; /* 72 16 */ struct mutex mutex; /* 88 32 */ struct btrfs_inode_item inode_item; /* 120 160 */ /* --- cacheline 4 boundary (256 bytes) was 24 bytes ago --- */ refcount_t refs; /* 280 4 */ int count; /* 284 4 */ u64 index_cnt; /* 288 8 */ long unsigned int flags; /* 296 8 */ u32 curr_index_batch_size; /* 304 4 */ u32 index_item_leaves; /* 308 4 */ /* size: 312, cachelines: 5, members: 15 */ /* last cacheline: 56 bytes */ }; This now allows to have 13 delayed nodes per 4K page instead of 12. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 7cfefdfe54ea..f4d9feac0d0e 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -64,9 +64,9 @@ struct btrfs_delayed_node { struct mutex mutex; struct btrfs_inode_item inode_item; refcount_t refs; + int count; u64 index_cnt; unsigned long flags; - int count; /* * The size of the next batch of dir index items to insert (if this * node is from a directory inode). Protected by @mutex. 
-- 2.51.0 From e36d114990d2acf0a0fca135d50ac21a832daf11 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 5 Nov 2024 14:38:38 +0000 Subject: [PATCH 12/16] btrfs: simplify logic to decrement snapshot counter at btrfs_mksnapshot() There's no point in having a 'snapshot_force_cow' variable to track if we need to decrement the root->snapshot_force_cow counter, as we never jump to the 'out' label after incrementing the counter. Simplify this by removing the variable and always decrementing the counter before the 'out' label, right after the call to btrfs_mksubvol(). Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2118c22625ca..1fdeb216bf6c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1049,7 +1049,6 @@ static noinline int btrfs_mksnapshot(const struct path *parent, struct btrfs_qgroup_inherit *inherit) { int ret; - bool snapshot_force_cow = false; /* * Force new buffered writes to reserve space even when NOCOW is @@ -1068,15 +1067,13 @@ static noinline int btrfs_mksnapshot(const struct path *parent, * creation. */ atomic_inc(&root->snapshot_force_cow); - snapshot_force_cow = true; btrfs_wait_ordered_extents(root, U64_MAX, NULL); ret = btrfs_mksubvol(parent, idmap, name, namelen, root, readonly, inherit); + atomic_dec(&root->snapshot_force_cow); out: - if (snapshot_force_cow) - atomic_dec(&root->snapshot_force_cow); btrfs_drew_read_unlock(&root->snapshot_lock); return ret; } -- 2.51.0 From 08fdca9eee098563b55e7a717d2a4256a7375dc8 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Thu, 31 Oct 2024 11:52:05 +0000 Subject: [PATCH 13/16] btrfs: avoid superfluous calls to free_extent_map() in btrfs_encoded_read() Change the control flow of btrfs_encoded_read() so that it doesn't call free_extent_map() when we know that this has already been done. Reviewed-by: Anand Jain Signed-off-by: Mark Harmstone Suggested-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a2128eccafb3..22b8e2764619 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9324,7 +9324,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, cached_state, extent_start, count, encoded, &unlocked); - goto out_em; + goto out_unlock_extent; } /* @@ -9384,7 +9384,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, ret = -EFAULT; } else { ret = -EIOCBQUEUED; - goto out_em; + goto out_unlock_extent; } out_em: -- 2.51.0 From 80b3695538275e1b2c9a572615175643454b2cb3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 31 Oct 2024 15:03:17 +0100 Subject: [PATCH 14/16] btrfs: fix a typo in btrfs_use_zone_append REQ_OP_ZONE_APPNED -> REQ_OP_ZONE_APPEND. 
Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index dbcbf754d284..cb32966380f5 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1739,7 +1739,7 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio) return false; /* - * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the + * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the * extent layout the relocation code has. * Furthermore we have set aside own block-group from which only the * relocation "process" can allocate and make sure only one process at a -- 2.51.0 From 2342d6595b608eec94187a17dc112dd4c2a812fa Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 6 Nov 2024 12:14:09 +0000 Subject: [PATCH 15/16] btrfs: fix warning on PTR_ERR() against NULL device at btrfs_control_ioctl() Smatch complains about calling PTR_ERR() against a NULL pointer: fs/btrfs/super.c:2272 btrfs_control_ioctl() warn: passing zero to 'PTR_ERR' Fix this by calling PTR_ERR() against the device pointer only if it contains an error. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7cd62307cd41..3381389ab93a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2254,7 +2254,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); - ret = PTR_ERR(device); + if (IS_ERR(device)) + ret = PTR_ERR(device); + else + ret = 0; break; } ret = !(device->fs_devices->num_devices == -- 2.51.0 From 722d343f12a626c9aee8844f5bf19a107d9a1067 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 6 Nov 2024 12:21:13 +0000 Subject: [PATCH 16/16] btrfs: remove check for NULL fs_info at btrfs_folio_end_lock_bitmap() Smatch complains about possibly dereferencing a NULL fs_info at btrfs_folio_end_lock_bitmap(): fs/btrfs/subpage.c:332 btrfs_folio_end_lock_bitmap() warn: variable dereferenced before check 'fs_info' (see line 326) because we access fs_info to set the 'start_bit' variable before doing the check for a NULL fs_info. However fs_info is never NULL, since in the only caller of btrfs_folio_end_lock_bitmap() is extent_writepage(), where we have an inode which always as a non-NULL fs_info. So remove the check for a NULL fs_info at btrfs_folio_end_lock_bitmap(). Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index d4cab3c55742..8c68059ac1b0 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -329,7 +329,7 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, int cleared = 0; int bit; - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + if (!btrfs_is_subpage(fs_info, folio->mapping)) { folio_unlock(folio); return; } -- 2.51.0
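
As a closing aside on patch 9, BTRFS_IOC_SUBVOL_SYNC_WAIT is meant to be usable unprivileged by tools such as 'btrfs subvolume sync'. Below is a rough user-space sketch of the most common mode, waiting for a single deleted subvolume to be cleaned; it assumes a <linux/btrfs.h> that already carries struct btrfs_ioctl_subvol_wait and the new mode constants, and the helper name is made up for the example.

#include <linux/btrfs.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>

/* Block until the (already deleted) subvolume @subvolid has been cleaned. */
static int wait_subvol_cleaned(int fs_fd, uint64_t subvolid)
{
        struct btrfs_ioctl_subvol_wait args = {
                .subvolid = subvolid,
                .mode = BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE,
        };

        return ioctl(fs_fd, BTRFS_IOC_SUBVOL_SYNC_WAIT, &args);
}

int main(int argc, char **argv)
{
        int fd, ret;

        if (argc != 3) {
                fprintf(stderr, "usage: %s <mount point> <subvolid>\n", argv[0]);
                return 1;
        }

        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        ret = wait_subvol_cleaned(fd, strtoull(argv[2], NULL, 0));
        if (ret < 0)
                perror("BTRFS_IOC_SUBVOL_SYNC_WAIT");

        close(fd);
        return ret < 0 ? 1 : 0;
}

The other modes (WAIT_FOR_QUEUED, COUNT, PEEK_FIRST, PEEK_LAST) use the same struct and differ only in which of the subvolid/count fields the kernel fills in.
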