uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
           block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
           subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
-          lru_cache.o raid-stripe-tree.o fiemap.o
+          lru_cache.o raid-stripe-tree.o fiemap.o direct-io.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
 
 ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
                               const struct btrfs_ioctl_encoded_io_args *encoded);
 
-ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
-                      size_t done_before);
-struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
-                                 size_t done_before);
 struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino);
 
 extern const struct dentry_operations btrfs_dentry_operations;
 void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes,
                              const u64 del_bytes);
 void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);
+u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
+                                    u64 num_bytes);
+struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
+                                     const struct btrfs_file_extent *file_extent,
+                                     int type);
 
 #endif
 
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/fsverity.h>
+#include <linux/iomap.h>
+#include "ctree.h"
+#include "delalloc-space.h"
+#include "direct-io.h"
+#include "extent-tree.h"
+#include "file.h"
+#include "fs.h"
+#include "transaction.h"
+#include "volumes.h"
+
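+/*
+ * State shared between the iomap callbacks for a single direct I/O request.
+ * A btrfs_dio_data lives on the stack of btrfs_dio_read()/btrfs_dio_write()
+ * and reaches the callbacks as the private pointer of the iomap_iter.
+ */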
+struct btrfs_dio_data {
+       ssize_t submitted;
+       struct extent_changeset *data_reserved;
+       struct btrfs_ordered_extent *ordered;
+       bool data_space_reserved;
+       bool nocow_done;
+};
+
+struct btrfs_dio_private {
+       /* Range of I/O */
+       u64 file_offset;
+       u32 bytes;
+
+       /* This must be last */
+       struct btrfs_bio bbio;
+};
+
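+/*
+ * All direct I/O bios are allocated from this bio_set. btrfs_init_dio() sets
+ * its front padding to offsetof(struct btrfs_dio_private, bbio.bio), so every
+ * bio allocated from it sits at the tail of a struct btrfs_dio_private and
+ * the end_io handler can get back to the private data with container_of().
+ */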
+static struct bio_set btrfs_dio_bioset;
+
+static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
+                             struct extent_state **cached_state,
+                             unsigned int iomap_flags)
+{
+       const bool writing = (iomap_flags & IOMAP_WRITE);
+       const bool nowait = (iomap_flags & IOMAP_NOWAIT);
+       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+       struct btrfs_ordered_extent *ordered;
+       int ret = 0;
+
+       while (1) {
+               if (nowait) {
+                       if (!try_lock_extent(io_tree, lockstart, lockend,
+                                            cached_state))
+                               return -EAGAIN;
+               } else {
+                       lock_extent(io_tree, lockstart, lockend, cached_state);
+               }
+               /*
+                * We're concerned with the entire range that we're going to be
+                * doing DIO to, so we need to make sure there are no ordered
+                * extents in this range.
+                */
+               ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
+                                                    lockend - lockstart + 1);
+
+               /*
+                * We need to make sure there are no buffered pages in this
+                * range either, we could have raced between the invalidate in
+                * generic_file_direct_write and locking the extent.  The
+                * invalidate needs to happen so that reads after a write do not
+                * get stale data.
+                */
+               if (!ordered &&
+                   (!writing || !filemap_range_has_page(inode->i_mapping,
+                                                        lockstart, lockend)))
+                       break;
+
+               unlock_extent(io_tree, lockstart, lockend, cached_state);
+
+               if (ordered) {
+                       if (nowait) {
+                               btrfs_put_ordered_extent(ordered);
+                               ret = -EAGAIN;
+                               break;
+                       }
+                       /*
+                        * If we are doing a DIO read and the ordered extent we
+                        * found is for a buffered write, we can not wait for it
+                        * to complete and retry, because if we do so we can
+                        * deadlock with concurrent buffered writes on page
+                        * locks. This happens only if our DIO read covers more
+                        * than one extent map, if at this point we have already
+                        * created an ordered extent for a previous extent map
+                        * and locked its range in the inode's io tree, and a
+                        * concurrent write against that previous extent map's
+                        * range and this range started (we unlock the ranges
+                        * in the io tree only when the bios complete and
+                        * buffered writes always lock pages before attempting
+                        * to lock range in the io tree).
+                        */
+                       if (writing ||
+                           test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
+                               btrfs_start_ordered_extent(ordered);
+                       else
+                               ret = nowait ? -EAGAIN : -ENOTBLK;
+                       btrfs_put_ordered_extent(ordered);
+               } else {
+                       /*
+                        * We could trigger writeback for this range (and wait
+                        * for it to complete) and then invalidate the pages for
+                        * this range (through invalidate_inode_pages2_range()),
+                        * but that can lead us to a deadlock with a concurrent
+                        * call to readahead (a buffered read or a defrag call
+                        * triggered a readahead) on a page lock due to an
+                        * ordered dio extent we created before but did not yet have
+                        * a corresponding bio submitted (hence it cannot
+                        * complete), which makes readahead wait for that
+                        * ordered extent to complete while holding a lock on
+                        * that page.
+                        */
+                       ret = nowait ? -EAGAIN : -ENOTBLK;
+               }
+
+               if (ret)
+                       break;
+
+               cond_resched();
+       }
+
+       return ret;
+}
+
+static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
+                                                 struct btrfs_dio_data *dio_data,
+                                                 const u64 start,
+                                                 const struct btrfs_file_extent *file_extent,
+                                                 const int type)
+{
+       struct extent_map *em = NULL;
+       struct btrfs_ordered_extent *ordered;
+
+       if (type != BTRFS_ORDERED_NOCOW) {
+               em = btrfs_create_io_em(inode, start, file_extent, type);
+               if (IS_ERR(em))
+                       goto out;
+       }
+
+       ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
+                                            (1 << type) |
+                                            (1 << BTRFS_ORDERED_DIRECT));
+       if (IS_ERR(ordered)) {
+               if (em) {
+                       free_extent_map(em);
+                       btrfs_drop_extent_map_range(inode, start,
+                                       start + file_extent->num_bytes - 1, false);
+               }
+               em = ERR_CAST(ordered);
+       } else {
+               ASSERT(!dio_data->ordered);
+               dio_data->ordered = ordered;
+       }
+ out:
+
+       return em;
+}
+
+static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
+                                                 struct btrfs_dio_data *dio_data,
+                                                 u64 start, u64 len)
+{
+       struct btrfs_root *root = inode->root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_file_extent file_extent;
+       struct extent_map *em;
+       struct btrfs_key ins;
+       u64 alloc_hint;
+       int ret;
+
+       alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
+again:
+       ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
+                                  0, alloc_hint, &ins, 1, 1);
+       if (ret == -EAGAIN) {
+               ASSERT(btrfs_is_zoned(fs_info));
+               wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
+                              TASK_UNINTERRUPTIBLE);
+               goto again;
+       }
+       if (ret)
+               return ERR_PTR(ret);
+
+       file_extent.disk_bytenr = ins.objectid;
+       file_extent.disk_num_bytes = ins.offset;
+       file_extent.num_bytes = ins.offset;
+       file_extent.ram_bytes = ins.offset;
+       file_extent.offset = 0;
+       file_extent.compression = BTRFS_COMPRESS_NONE;
+       em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
+                                    BTRFS_ORDERED_REGULAR);
+       btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+       if (IS_ERR(em))
+               btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
+                                          1);
+
+       return em;
+}
+
+static int btrfs_get_blocks_direct_write(struct extent_map **map,
+                                        struct inode *inode,
+                                        struct btrfs_dio_data *dio_data,
+                                        u64 start, u64 *lenp,
+                                        unsigned int iomap_flags)
+{
+       const bool nowait = (iomap_flags & IOMAP_NOWAIT);
+       struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+       struct btrfs_file_extent file_extent;
+       struct extent_map *em = *map;
+       int type;
+       u64 block_start;
+       struct btrfs_block_group *bg;
+       bool can_nocow = false;
+       bool space_reserved = false;
+       u64 len = *lenp;
+       u64 prev_len;
+       int ret = 0;
+
+       /*
+        * We don't allocate a new extent in the following cases
+        *
+        * 1) The inode is marked as NODATACOW. In this case we'll just use the
+        * existing extent.
+        * 2) The extent is marked as PREALLOC. We're good to go here and can
+        * just use the extent.
+        *
+        */
+       if ((em->flags & EXTENT_FLAG_PREALLOC) ||
+           ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+            em->disk_bytenr != EXTENT_MAP_HOLE)) {
+               if (em->flags & EXTENT_FLAG_PREALLOC)
+                       type = BTRFS_ORDERED_PREALLOC;
+               else
+                       type = BTRFS_ORDERED_NOCOW;
+               len = min(len, em->len - (start - em->start));
+               block_start = extent_map_block_start(em) + (start - em->start);
+
+               if (can_nocow_extent(inode, start, &len,
+                                    &file_extent, false, false) == 1) {
+                       bg = btrfs_inc_nocow_writers(fs_info, block_start);
+                       if (bg)
+                               can_nocow = true;
+               }
+       }
+
+       prev_len = len;
+       if (can_nocow) {
+               struct extent_map *em2;
+
+               /* We can NOCOW, so only need to reserve metadata space. */
+               ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+                                                     nowait);
+               if (ret < 0) {
+                       /* Our caller expects us to free the input extent map. */
+                       free_extent_map(em);
+                       *map = NULL;
+                       btrfs_dec_nocow_writers(bg);
+                       if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
+                               ret = -EAGAIN;
+                       goto out;
+               }
+               space_reserved = true;
+
+               em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
+                                             &file_extent, type);
+               btrfs_dec_nocow_writers(bg);
+               if (type == BTRFS_ORDERED_PREALLOC) {
+                       free_extent_map(em);
+                       *map = em2;
+                       em = em2;
+               }
+
+               if (IS_ERR(em2)) {
+                       ret = PTR_ERR(em2);
+                       goto out;
+               }
+
+               dio_data->nocow_done = true;
+       } else {
+               /* Our caller expects us to free the input extent map. */
+               free_extent_map(em);
+               *map = NULL;
+
+               if (nowait) {
+                       ret = -EAGAIN;
+                       goto out;
+               }
+
+               /*
+                * If we could not allocate data space before locking the file
+                * range and we can't do a NOCOW write, then we have to fail.
+                */
+               if (!dio_data->data_space_reserved) {
+                       ret = -ENOSPC;
+                       goto out;
+               }
+
+               /*
+                * We have to COW and we have already reserved data space before,
+                * so now we reserve only metadata.
+                */
+               ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+                                                     false);
+               if (ret < 0)
+                       goto out;
+               space_reserved = true;
+
+               em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
+               if (IS_ERR(em)) {
+                       ret = PTR_ERR(em);
+                       goto out;
+               }
+               *map = em;
+               len = min(len, em->len - (start - em->start));
+               if (len < prev_len)
+                       btrfs_delalloc_release_metadata(BTRFS_I(inode),
+                                                       prev_len - len, true);
+       }
+
+       /*
+        * We have created our ordered extent, so we can now release our reservation
+        * for an outstanding extent.
+        */
+       btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
+
+       /*
+        * Need to update the i_size under the extent lock so buffered
+        * readers will get the updated i_size when we unlock.
+        */
+       if (start + len > i_size_read(inode))
+               i_size_write(inode, start + len);
+out:
+       if (ret && space_reserved) {
+               btrfs_delalloc_release_extents(BTRFS_I(inode), len);
+               btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
+       }
+       *lenp = len;
+       return ret;
+}
+
+static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+               loff_t length, unsigned int flags, struct iomap *iomap,
+               struct iomap *srcmap)
+{
+       struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
+       struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+       struct extent_map *em;
+       struct extent_state *cached_state = NULL;
+       struct btrfs_dio_data *dio_data = iter->private;
+       u64 lockstart, lockend;
+       const bool write = !!(flags & IOMAP_WRITE);
+       int ret = 0;
+       u64 len = length;
+       const u64 data_alloc_len = length;
+       bool unlock_extents = false;
+
+       /*
+        * We could potentially fault if we have a buffer > PAGE_SIZE, and if
+        * we're NOWAIT we may submit a bio for a partial range and return
+        * EIOCBQUEUED, which would result in an errant short read.
+        *
+        * The best way to handle this would be to allow for partial completions
+        * of iocb's, so we could submit the partial bio, return and fault in
+        * the rest of the pages, and then submit the io for the rest of the
+        * range.  However we don't have that currently, so simply return
+        * -EAGAIN at this point so that the normal path is used.
+        */
+       if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
+               return -EAGAIN;
+
+       /*
+        * Cap the size of reads to that usually seen in buffered I/O as we need
+        * to allocate a contiguous array for the checksums.
+        */
+       if (!write)
+               len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
+
+       lockstart = start;
+       lockend = start + len - 1;
+
+       /*
+        * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
+        * enough if we've written compressed pages to this area, so we need to
+        * flush the dirty pages again to make absolutely sure that any
+        * outstanding dirty pages are on disk - the first flush only starts
+        * compression on the data, while keeping the pages locked, so by the
+        * time the second flush returns we know bios for the compressed pages
+        * were submitted and finished, and the pages are no longer under writeback.
+        *
+        * If we have a NOWAIT request and we have any pages in the range that
+        * are locked, likely due to compression still in progress, we don't want
+        * to block on page locks. We also don't want to block on pages marked as
+        * dirty or under writeback (same as for the non-compression case).
+        * iomap_dio_rw() did the same check, but after that and before we got
+        * here, mmap'ed writes may have happened or buffered reads started
+        * (readpage() and readahead(), which lock pages), as we haven't locked
+        * the file range yet.
+        */
+       if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+                    &BTRFS_I(inode)->runtime_flags)) {
+               if (flags & IOMAP_NOWAIT) {
+                       if (filemap_range_needs_writeback(inode->i_mapping,
+                                                         lockstart, lockend))
+                               return -EAGAIN;
+               } else {
+                       ret = filemap_fdatawrite_range(inode->i_mapping, start,
+                                                      start + length - 1);
+                       if (ret)
+                               return ret;
+               }
+       }
+
+       memset(dio_data, 0, sizeof(*dio_data));
+
+       /*
+        * We always try to allocate data space and must do it before locking
+        * the file range, to avoid deadlocks with concurrent writes to the same
+        * range if the range has several extents and the writes don't expand the
+        * current i_size (the inode lock is taken in shared mode). If we fail to
+        * allocate data space here we continue and later, after locking the
+        * file range, we fail with ENOSPC only if we figure out we can not do a
+        * NOCOW write.
+        */
+       if (write && !(flags & IOMAP_NOWAIT)) {
+               ret = btrfs_check_data_free_space(BTRFS_I(inode),
+                                                 &dio_data->data_reserved,
+                                                 start, data_alloc_len, false);
+               if (!ret)
+                       dio_data->data_space_reserved = true;
+               else if (ret && !(BTRFS_I(inode)->flags &
+                                 (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+                       goto err;
+       }
+
+       /*
+        * If this errors out it's because we couldn't invalidate pagecache for
+        * this range and we need to fallback to buffered IO, or we are doing a
+        * NOWAIT read/write and we need to block.
+        */
+       ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
+       if (ret < 0)
+               goto err;
+
+       em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
+       if (IS_ERR(em)) {
+               ret = PTR_ERR(em);
+               goto unlock_err;
+       }
+
+       /*
+        * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+        * io.  INLINE is special, and we could probably kludge it in here, but
+        * it's still buffered so for safety let's just fall back to the generic
+        * buffered path.
+        *
+        * For COMPRESSED we _have_ to read the entire extent in so we can
+        * decompress it, so there will be buffering required no matter what we
+        * do, so go ahead and fallback to buffered.
+        *
+        * We return -ENOTBLK because that's what makes DIO go ahead and go back
+        * to buffered IO.  Don't blame me, this is the price we pay for using
+        * the generic code.
+        */
+       if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
+               free_extent_map(em);
+               /*
+                * If we are in a NOWAIT context, return -EAGAIN in order to
+                * fallback to buffered IO. This is not only because we can
+                * block with buffered IO (no support for NOWAIT semantics at
+                * the moment) but also to avoid returning short reads to user
+                * space - this happens if we were able to read some data from
+                * previous non-compressed extents and then when we fallback to
+                * buffered IO, at btrfs_file_read_iter() by calling
+                * filemap_read(), we fail to fault in pages for the read buffer,
+                * in which case filemap_read() returns a short read (the number
+                * of bytes previously read is > 0, so it does not return -EFAULT).
+                */
+               ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
+               goto unlock_err;
+       }
+
+       len = min(len, em->len - (start - em->start));
+
+       /*
+        * If we have a NOWAIT request and the range contains multiple extents
+        * (or a mix of extents and holes), then we return -EAGAIN to make the
+        * caller fallback to a context where it can do a blocking (without
+        * NOWAIT) request. This way we avoid doing partial IO and returning
+        * success to the caller, which is not optimal for writes and for reads
+        * it can result in unexpected behaviour for an application.
+        *
+        * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
+        * iomap_dio_rw(), we can end up returning less data than what the caller
+        * asked for, resulting in an unexpected, and incorrect, short read.
+        * That is, the caller asked to read N bytes and we return less than that,
+        * which is wrong unless we are crossing EOF. This happens if we get a
+        * page fault error when trying to fault in pages for the buffer that is
+        * associated to the struct iov_iter passed to iomap_dio_rw(), and we
+        * have previously submitted bios for other extents in the range, in
+        * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
+        * those bios have completed by the time we get the page fault error,
+        * which we return back to our caller - we should only return EIOCBQUEUED
+        * after we have submitted bios for all the extents in the range.
+        */
+       if ((flags & IOMAP_NOWAIT) && len < length) {
+               free_extent_map(em);
+               ret = -EAGAIN;
+               goto unlock_err;
+       }
+
+       if (write) {
+               ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+                                                   start, &len, flags);
+               if (ret < 0)
+                       goto unlock_err;
+               unlock_extents = true;
+               /* Recalc len in case the new em is smaller than requested */
+               len = min(len, em->len - (start - em->start));
+               if (dio_data->data_space_reserved) {
+                       u64 release_offset;
+                       u64 release_len = 0;
+
+                       if (dio_data->nocow_done) {
+                               release_offset = start;
+                               release_len = data_alloc_len;
+                       } else if (len < data_alloc_len) {
+                               release_offset = start + len;
+                               release_len = data_alloc_len - len;
+                       }
+
+                       if (release_len > 0)
+                               btrfs_free_reserved_data_space(BTRFS_I(inode),
+                                                              dio_data->data_reserved,
+                                                              release_offset,
+                                                              release_len);
+               }
+       } else {
+               /*
+                * We need to unlock only the end area that we aren't using.
+                * The rest is going to be unlocked by the endio routine.
+                */
+               lockstart = start + len;
+               if (lockstart < lockend)
+                       unlock_extents = true;
+       }
+
+       if (unlock_extents)
+               unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                             &cached_state);
+       else
+               free_extent_state(cached_state);
+
+       /*
+        * Translate extent map information to iomap.
+        * We trim the extents (and move the addr) even though iomap code does
+        * that, since we have locked only the parts we are performing I/O in.
+        */
+       if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
+           ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
+               iomap->addr = IOMAP_NULL_ADDR;
+               iomap->type = IOMAP_HOLE;
+       } else {
+               iomap->addr = extent_map_block_start(em) + (start - em->start);
+               iomap->type = IOMAP_MAPPED;
+       }
+       iomap->offset = start;
+       iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
+       iomap->length = len;
+       free_extent_map(em);
+
+       return 0;
+
+unlock_err:
+       unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                     &cached_state);
+err:
+       if (dio_data->data_space_reserved) {
+               btrfs_free_reserved_data_space(BTRFS_I(inode),
+                                              dio_data->data_reserved,
+                                              start, data_alloc_len);
+               extent_changeset_free(dio_data->data_reserved);
+       }
+
+       return ret;
+}
+
+static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+               ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+       struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
+       struct btrfs_dio_data *dio_data = iter->private;
+       size_t submitted = dio_data->submitted;
+       const bool write = !!(flags & IOMAP_WRITE);
+       int ret = 0;
+
+       if (!write && (iomap->type == IOMAP_HOLE)) {
+               /* If reading from a hole, unlock and return */
+               unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
+                             NULL);
+               return 0;
+       }
+
+       if (submitted < length) {
+               pos += submitted;
+               length -= submitted;
+               if (write)
+                       btrfs_finish_ordered_extent(dio_data->ordered, NULL,
+                                                   pos, length, false);
+               else
+                       unlock_extent(&BTRFS_I(inode)->io_tree, pos,
+                                     pos + length - 1, NULL);
+               ret = -ENOTBLK;
+       }
+       if (write) {
+               btrfs_put_ordered_extent(dio_data->ordered);
+               dio_data->ordered = NULL;
+       }
+
+       if (write)
+               extent_changeset_free(dio_data->data_reserved);
+       return ret;
+}
+
+static void btrfs_dio_end_io(struct btrfs_bio *bbio)
+{
+       struct btrfs_dio_private *dip =
+               container_of(bbio, struct btrfs_dio_private, bbio);
+       struct btrfs_inode *inode = bbio->inode;
+       struct bio *bio = &bbio->bio;
+
+       if (bio->bi_status) {
+               btrfs_warn(inode->root->fs_info,
+               "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
+                          btrfs_ino(inode), bio->bi_opf,
+                          dip->file_offset, dip->bytes, bio->bi_status);
+       }
+
+       if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
+               btrfs_finish_ordered_extent(bbio->ordered, NULL,
+                                           dip->file_offset, dip->bytes,
+                                           !bio->bi_status);
+       } else {
+               unlock_extent(&inode->io_tree, dip->file_offset,
+                             dip->file_offset + dip->bytes - 1, NULL);
+       }
+
+       bbio->bio.bi_private = bbio->private;
+       iomap_dio_bio_end_io(bio);
+}
+
+static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
+                                       struct btrfs_ordered_extent *ordered)
+{
+       u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+       u64 len = bbio->bio.bi_iter.bi_size;
+       struct btrfs_ordered_extent *new;
+       int ret;
+
+       /* Must always be called for the beginning of an ordered extent. */
+       if (WARN_ON_ONCE(start != ordered->disk_bytenr))
+               return -EINVAL;
+
+       /* No need to split if the ordered extent covers the entire bio. */
+       if (ordered->disk_num_bytes == len) {
+               refcount_inc(&ordered->refs);
+               bbio->ordered = ordered;
+               return 0;
+       }
+
+       /*
+        * Don't split the extent_map for NOCOW extents, as we're writing into
+        * a pre-existing one.
+        */
+       if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+               ret = split_extent_map(bbio->inode, bbio->file_offset,
+                                      ordered->num_bytes, len,
+                                      ordered->disk_bytenr);
+               if (ret)
+                       return ret;
+       }
+
+       new = btrfs_split_ordered_extent(ordered, len);
+       if (IS_ERR(new))
+               return PTR_ERR(new);
+       bbio->ordered = new;
+       return 0;
+}
+
+static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
+                               loff_t file_offset)
+{
+       struct btrfs_bio *bbio = btrfs_bio(bio);
+       struct btrfs_dio_private *dip =
+               container_of(bbio, struct btrfs_dio_private, bbio);
+       struct btrfs_dio_data *dio_data = iter->private;
+
+       btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
+                      btrfs_dio_end_io, bio->bi_private);
+       bbio->inode = BTRFS_I(iter->inode);
+       bbio->file_offset = file_offset;
+
+       dip->file_offset = file_offset;
+       dip->bytes = bio->bi_iter.bi_size;
+
+       dio_data->submitted += bio->bi_iter.bi_size;
+
+       /*
+        * Check if we are doing a partial write.  If we are, we need to split
+        * the ordered extent to match the submitted bio.  Hang on to the
+        * remaining unfinishable ordered_extent in dio_data so that it can be
+        * cancelled in iomap_end to avoid a deadlock wherein faulting the
+        * remaining pages is blocked on the outstanding ordered extent.
+        */
+       if (iter->flags & IOMAP_WRITE) {
+               int ret;
+
+               ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
+               if (ret) {
+                       btrfs_finish_ordered_extent(dio_data->ordered, NULL,
+                                                   file_offset, dip->bytes,
+                                                   !ret);
+                       bio->bi_status = errno_to_blk_status(ret);
+                       iomap_dio_bio_end_io(bio);
+                       return;
+               }
+       }
+
+       btrfs_submit_bio(bbio, 0);
+}
+
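+/*
+ * The iomap core drives direct I/O through the two ops tables below:
+ * ->iomap_begin maps one extent at a time (leaving the range locked for
+ * reads), ->submit_io sends the bio for the mapped range, and ->iomap_end
+ * cleans up whatever part of the range was not submitted.
+ */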
+static const struct iomap_ops btrfs_dio_iomap_ops = {
+       .iomap_begin            = btrfs_dio_iomap_begin,
+       .iomap_end              = btrfs_dio_iomap_end,
+};
+
+static const struct iomap_dio_ops btrfs_dio_ops = {
+       .submit_io              = btrfs_dio_submit_io,
+       .bio_set                = &btrfs_dio_bioset,
+};
+
+static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
+                             size_t done_before)
+{
+       struct btrfs_dio_data data = { 0 };
+
+       return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+                           IOMAP_DIO_PARTIAL, &data, done_before);
+}
+
+static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
+                                        size_t done_before)
+{
+       struct btrfs_dio_data data = { 0 };
+
+       return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+                           IOMAP_DIO_PARTIAL, &data, done_before);
+}
+
+static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
+                              const struct iov_iter *iter, loff_t offset)
+{
+       const u32 blocksize_mask = fs_info->sectorsize - 1;
+
+       if (offset & blocksize_mask)
+               return -EINVAL;
+
+       if (iov_iter_alignment(iter) & blocksize_mask)
+               return -EINVAL;
+
+       return 0;
+}
+
+ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file_inode(file);
+       struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+       loff_t pos;
+       ssize_t written = 0;
+       ssize_t written_buffered;
+       size_t prev_left = 0;
+       loff_t endbyte;
+       ssize_t ret;
+       unsigned int ilock_flags = 0;
+       struct iomap_dio *dio;
+
+       if (iocb->ki_flags & IOCB_NOWAIT)
+               ilock_flags |= BTRFS_ILOCK_TRY;
+
+       /*
+        * If the write DIO is within EOF, use a shared lock and also only if
+        * security bits will likely not be dropped by file_remove_privs() called
+        * from btrfs_write_check(). Either will need to be rechecked after the
+        * lock was acquired.
+        */
+       if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
+               ilock_flags |= BTRFS_ILOCK_SHARED;
+
+relock:
+       ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
+       if (ret < 0)
+               return ret;
+
+       /* Shared lock cannot be used with security bits set. */
+       if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
+               btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+               ilock_flags &= ~BTRFS_ILOCK_SHARED;
+               goto relock;
+       }
+
+       ret = generic_write_checks(iocb, from);
+       if (ret <= 0) {
+               btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+               return ret;
+       }
+
+       ret = btrfs_write_check(iocb, from, ret);
+       if (ret < 0) {
+               btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+               goto out;
+       }
+
+       pos = iocb->ki_pos;
+       /*
+        * Re-check since file size may have changed just before taking the
+        * lock, or pos may have changed because of O_APPEND in generic_write_checks().
+        */
+       if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
+           pos + iov_iter_count(from) > i_size_read(inode)) {
+               btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+               ilock_flags &= ~BTRFS_ILOCK_SHARED;
+               goto relock;
+       }
+
+       if (check_direct_IO(fs_info, from, pos)) {
+               btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+               goto buffered;
+       }
+
+       /*
+        * The iov_iter can be mapped to the same file range we are writing to.
+        * If that's the case, then we will deadlock in the iomap code, because
+        * it first calls our callback btrfs_dio_iomap_begin(), which will create
+        * an ordered extent, and after that it will fault in the pages that the
+        * iov_iter refers to. During the fault in we end up in the readahead
+        * pages code (starting at btrfs_readahead()), which will lock the range,
+        * find that ordered extent and then wait for it to complete (at
+        * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
+        * obviously the ordered extent can never complete, as we have not yet
+        * submitted the respective bio(s). This always happens when the buffer is
+        * memory mapped to the same file range, since the iomap DIO code always
+        * invalidates pages in the target file range (after starting and waiting
+        * for any writeback).
+        *
+        * So here we disable page faults in the iov_iter and then retry if we
+        * got -EFAULT, faulting in the pages before the retry.
+        */
+       from->nofault = true;
+       dio = btrfs_dio_write(iocb, from, written);
+       from->nofault = false;
+
+       /*
+        * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
+        * iocb, and that needs to lock the inode. So unlock it before calling
+        * iomap_dio_complete() to avoid a deadlock.
+        */
+       btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+
+       if (IS_ERR_OR_NULL(dio))
+               ret = PTR_ERR_OR_ZERO(dio);
+       else
+               ret = iomap_dio_complete(dio);
+
+       /* No increment (+=) because iomap returns a cumulative value. */
+       if (ret > 0)
+               written = ret;
+
+       if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
+               const size_t left = iov_iter_count(from);
+               /*
+                * We have more data left to write. Try to fault in as many of the
+                * remaining pages as possible and retry. We do this without
+                * releasing and locking again the inode, to prevent races with
+                * truncate.
+                *
+                * Also, in case the iov refers to pages in the file range of the
+                * file we want to write to (due to a mmap), we could enter an
+                * infinite loop if we retry after faulting the pages in, since
+                * iomap will invalidate any pages in the range early on, before
+                * it tries to fault in the pages of the iov. So we keep track of
+                * how much was left of iov in the previous EFAULT and fallback
+                * to buffered IO in case we haven't made any progress.
+                */
+               if (left == prev_left) {
+                       ret = -ENOTBLK;
+               } else {
+                       fault_in_iov_iter_readable(from, left);
+                       prev_left = left;
+                       goto relock;
+               }
+       }
+
+       /*
+        * If 'ret' is -ENOTBLK or we have not written all data, then it means
+        * we must fallback to buffered IO.
+        */
+       if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
+               goto out;
+
+buffered:
+       /*
+        * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
+        * it must retry the operation in a context where blocking is acceptable,
+        * because even if we end up not blocking during the buffered IO attempt
+        * below, we will block when flushing and waiting for the IO.
+        */
+       if (iocb->ki_flags & IOCB_NOWAIT) {
+               ret = -EAGAIN;
+               goto out;
+       }
+
+       pos = iocb->ki_pos;
+       written_buffered = btrfs_buffered_write(iocb, from);
+       if (written_buffered < 0) {
+               ret = written_buffered;
+               goto out;
+       }
+       /*
+        * Ensure all data is persisted. We want the next direct IO read to be
+        * able to read what was just written.
+        */
+       endbyte = pos + written_buffered - 1;
+       ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
+       if (ret)
+               goto out;
+       ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
+       if (ret)
+               goto out;
+       written += written_buffered;
+       iocb->ki_pos = pos + written_buffered;
+       invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
+                                endbyte >> PAGE_SHIFT);
+out:
+       return ret < 0 ? ret : written;
+}
+
+static int check_direct_read(struct btrfs_fs_info *fs_info,
+                            const struct iov_iter *iter, loff_t offset)
+{
+       int ret;
+       int i, seg;
+
+       ret = check_direct_IO(fs_info, iter, offset);
+       if (ret < 0)
+               return ret;
+
+       if (!iter_is_iovec(iter))
+               return 0;
+
+       for (seg = 0; seg < iter->nr_segs; seg++) {
+               for (i = seg + 1; i < iter->nr_segs; i++) {
+                       const struct iovec *iov1 = iter_iov(iter) + seg;
+                       const struct iovec *iov2 = iter_iov(iter) + i;
+
+                       if (iov1->iov_base == iov2->iov_base)
+                               return -EINVAL;
+               }
+       }
+       return 0;
+}
+
+ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
+{
+       struct inode *inode = file_inode(iocb->ki_filp);
+       size_t prev_left = 0;
+       ssize_t read = 0;
+       ssize_t ret;
+
+       if (fsverity_active(inode))
+               return 0;
+
+       if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
+               return 0;
+
+       btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
+again:
+       /*
+        * This is similar to what we do for direct IO writes, see the comment
+        * at btrfs_direct_write(), but here we also disable page faults at the
+        * task level, not only at the iov_iter level. This is because when
+        * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
+        * which can still trigger page faults despite having set ->nofault
+        * to true on our 'to' iov_iter.
+        *
+        * The difference to direct IO writes is that we deadlock when trying
+        * to lock the extent range in the inode's tree during the page reads
+        * triggered by the fault in (while for writes it is due to waiting for
+        * our own ordered extent). This is because for direct IO reads,
+        * btrfs_dio_iomap_begin() returns with the extent range locked, which
+        * is only unlocked in the endio callback (end_bio_extent_readpage()).
+        */
+       pagefault_disable();
+       to->nofault = true;
+       ret = btrfs_dio_read(iocb, to, read);
+       to->nofault = false;
+       pagefault_enable();
+
+       /* No increment (+=) because iomap returns a cumulative value. */
+       if (ret > 0)
+               read = ret;
+
+       if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
+               const size_t left = iov_iter_count(to);
+
+               if (left == prev_left) {
+                       /*
+                        * We didn't make any progress since the last attempt,
+                        * fallback to a buffered read for the remainder of the
+                        * range. This is just to avoid any possibility of looping
+                        * for too long.
+                        */
+                       ret = read;
+               } else {
+                       /*
+                        * We made some progress since the last retry or this is
+                        * the first time we are retrying. Fault in as many pages
+                        * as possible and retry.
+                        */
+                       fault_in_iov_iter_writeable(to, left);
+                       prev_left = left;
+                       goto again;
+               }
+       }
+       btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
+       return ret < 0 ? ret : read;
+}
+
+int __init btrfs_init_dio(void)
+{
+       if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
+                       offsetof(struct btrfs_dio_private, bbio.bio),
+                       BIOSET_NEED_BVECS))
+               return -ENOMEM;
+
+       return 0;
+}
+
+void __cold btrfs_destroy_dio(void)
+{
+       bioset_exit(&btrfs_dio_bioset);
+}
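
For reference, check_direct_IO() above only lets a request take the direct
path when both the file offset and the memory buffer are aligned to the
filesystem sector size; otherwise btrfs quietly falls back to buffered I/O.
A minimal userspace sketch of a read that satisfies those constraints is
below (the 4096-byte alignment is an assumption, the real requirement is
whatever sector size the filesystem reports):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		const size_t align = 4096;	/* assumed sector size */
		const size_t len = 16 * align;	/* length also sector aligned */
		void *buf = NULL;
		ssize_t ret;
		int fd;

		if (argc < 2) {
			fprintf(stderr, "usage: %s <file>\n", argv[0]);
			return 1;
		}

		/* O_DIRECT sends reads through btrfs_direct_read()/iomap. */
		fd = open(argv[1], O_RDONLY | O_DIRECT);
		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* The buffer must be aligned or the direct path is skipped. */
		if (posix_memalign(&buf, align, len) != 0) {
			fprintf(stderr, "posix_memalign failed\n");
			close(fd);
			return 1;
		}

		ret = pread(fd, buf, len, 0);	/* offset 0 is trivially aligned */
		if (ret < 0)
			perror("pread");
		else
			printf("read %zd bytes via direct I/O\n", ret);

		free(buf);
		close(fd);
		return ret < 0 ? 1 : 0;
	}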
 
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_DIRECT_IO_H
+#define BTRFS_DIRECT_IO_H
+
+#include <linux/types.h>
+
+int __init btrfs_init_dio(void);
+void __cold btrfs_destroy_dio(void);
+
+ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from);
+ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to);
+
+#endif /* BTRFS_DIRECT_IO_H */
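
The two init/exit hooks declared above still need to be called when the btrfs
module is loaded and unloaded. A minimal sketch of the expected hookup is
below; it assumes the init_sequence table used by fs/btrfs/super.c in
mainline, and the surrounding names are illustrative rather than part of this
hunk:

	/* fs/btrfs/super.c (sketch): add the new pair to the init sequence. */
	static const struct init_sequence mod_init_seq[] = {
		/* ... existing subsystem init/exit pairs ... */
		{
			.init_func = btrfs_init_dio,
			.exit_func = btrfs_destroy_dio,
		},
	};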
 
 #include <linux/uio.h>
 #include <linux/iversion.h>
 #include <linux/fsverity.h>
-#include <linux/iomap.h>
 #include "ctree.h"
+#include "direct-io.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
                inode_inc_iversion(inode);
 }
 
-static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
-                            size_t count)
+int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count)
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        return 0;
 }
 
-static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
-                                              struct iov_iter *i)
+ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
 {
        struct file *file = iocb->ki_filp;
        loff_t pos;
        return num_written ? num_written : ret;
 }
 
-static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
-                              const struct iov_iter *iter, loff_t offset)
-{
-       const u32 blocksize_mask = fs_info->sectorsize - 1;
-
-       if (offset & blocksize_mask)
-               return -EINVAL;
-
-       if (iov_iter_alignment(iter) & blocksize_mask)
-               return -EINVAL;
-
-       return 0;
-}
-
-static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
-{
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file_inode(file);
-       struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
-       loff_t pos;
-       ssize_t written = 0;
-       ssize_t written_buffered;
-       size_t prev_left = 0;
-       loff_t endbyte;
-       ssize_t ret;
-       unsigned int ilock_flags = 0;
-       struct iomap_dio *dio;
-
-       if (iocb->ki_flags & IOCB_NOWAIT)
-               ilock_flags |= BTRFS_ILOCK_TRY;
-
-       /*
-        * If the write DIO is within EOF, use a shared lock and also only if
-        * security bits will likely not be dropped by file_remove_privs() called
-        * from btrfs_write_check(). Either will need to be rechecked after the
-        * lock was acquired.
-        */
-       if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
-               ilock_flags |= BTRFS_ILOCK_SHARED;
-
-relock:
-       ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
-       if (ret < 0)
-               return ret;
-
-       /* Shared lock cannot be used with security bits set. */
-       if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
-               btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
-               ilock_flags &= ~BTRFS_ILOCK_SHARED;
-               goto relock;
-       }
-
-       ret = generic_write_checks(iocb, from);
-       if (ret <= 0) {
-               btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
-               return ret;
-       }
-
-       ret = btrfs_write_check(iocb, from, ret);
-       if (ret < 0) {
-               btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
-               goto out;
-       }
-
-       pos = iocb->ki_pos;
-       /*
-        * Re-check since file size may have changed just before taking the
-        * lock or pos may have changed because of O_APPEND in generic_write_check()
-        */
-       if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
-           pos + iov_iter_count(from) > i_size_read(inode)) {
-               btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
-               ilock_flags &= ~BTRFS_ILOCK_SHARED;
-               goto relock;
-       }
-
-       if (check_direct_IO(fs_info, from, pos)) {
-               btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
-               goto buffered;
-       }
-
-       /*
-        * The iov_iter can be mapped to the same file range we are writing to.
-        * If that's the case, then we will deadlock in the iomap code, because
-        * it first calls our callback btrfs_dio_iomap_begin(), which will create
-        * an ordered extent, and after that it will fault in the pages that the
-        * iov_iter refers to. During the fault in we end up in the readahead
-        * pages code (starting at btrfs_readahead()), which will lock the range,
-        * find that ordered extent and then wait for it to complete (at
-        * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
-        * obviously the ordered extent can never complete as we didn't submit
-        * yet the respective bio(s). This always happens when the buffer is
-        * memory mapped to the same file range, since the iomap DIO code always
-        * invalidates pages in the target file range (after starting and waiting
-        * for any writeback).
-        *
-        * So here we disable page faults in the iov_iter and then retry if we
-        * got -EFAULT, faulting in the pages before the retry.
-        */
-       from->nofault = true;
-       dio = btrfs_dio_write(iocb, from, written);
-       from->nofault = false;
-
-       /*
-        * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
-        * iocb, and that needs to lock the inode. So unlock it before calling
-        * iomap_dio_complete() to avoid a deadlock.
-        */
-       btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
-
-       if (IS_ERR_OR_NULL(dio))
-               ret = PTR_ERR_OR_ZERO(dio);
-       else
-               ret = iomap_dio_complete(dio);
-
-       /* No increment (+=) because iomap returns a cumulative value. */
-       if (ret > 0)
-               written = ret;
-
-       if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
-               const size_t left = iov_iter_count(from);
-               /*
-                * We have more data left to write. Try to fault in as many as
-                * possible of the remainder pages and retry. We do this without
-                * releasing and locking again the inode, to prevent races with
-                * truncate.
-                *
-                * Also, in case the iov refers to pages in the file range of the
-                * file we want to write to (due to a mmap), we could enter an
-                * infinite loop if we retry after faulting the pages in, since
-                * iomap will invalidate any pages in the range early on, before
-                * it tries to fault in the pages of the iov. So we keep track of
-                * how much was left of iov in the previous EFAULT and fallback
-                * to buffered IO in case we haven't made any progress.
-                */
-               if (left == prev_left) {
-                       ret = -ENOTBLK;
-               } else {
-                       fault_in_iov_iter_readable(from, left);
-                       prev_left = left;
-                       goto relock;
-               }
-       }
-
-       /*
-        * If 'ret' is -ENOTBLK or we have not written all data, then it means
-        * we must fallback to buffered IO.
-        */
-       if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
-               goto out;
-
-buffered:
-       /*
-        * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
-        * it must retry the operation in a context where blocking is acceptable,
-        * because even if we end up not blocking during the buffered IO attempt
-        * below, we will block when flushing and waiting for the IO.
-        */
-       if (iocb->ki_flags & IOCB_NOWAIT) {
-               ret = -EAGAIN;
-               goto out;
-       }
-
-       pos = iocb->ki_pos;
-       written_buffered = btrfs_buffered_write(iocb, from);
-       if (written_buffered < 0) {
-               ret = written_buffered;
-               goto out;
-       }
-       /*
-        * Ensure all data is persisted. We want the next direct IO read to be
-        * able to read what was just written.
-        */
-       endbyte = pos + written_buffered - 1;
-       ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
-       if (ret)
-               goto out;
-       ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
-       if (ret)
-               goto out;
-       written += written_buffered;
-       iocb->ki_pos = pos + written_buffered;
-       invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
-                                endbyte >> PAGE_SHIFT);
-out:
-       return ret < 0 ? ret : written;
-}
-
 static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
                        const struct btrfs_ioctl_encoded_io_args *encoded)
 {
        return generic_file_open(inode, filp);
 }
 
-static int check_direct_read(struct btrfs_fs_info *fs_info,
-                            const struct iov_iter *iter, loff_t offset)
-{
-       int ret;
-       int i, seg;
-
-       ret = check_direct_IO(fs_info, iter, offset);
-       if (ret < 0)
-               return ret;
-
-       if (!iter_is_iovec(iter))
-               return 0;
-
-       for (seg = 0; seg < iter->nr_segs; seg++) {
-               for (i = seg + 1; i < iter->nr_segs; i++) {
-                       const struct iovec *iov1 = iter_iov(iter) + seg;
-                       const struct iovec *iov2 = iter_iov(iter) + i;
-
-                       if (iov1->iov_base == iov2->iov_base)
-                               return -EINVAL;
-               }
-       }
-       return 0;
-}
-
-static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
-{
-       struct inode *inode = file_inode(iocb->ki_filp);
-       size_t prev_left = 0;
-       ssize_t read = 0;
-       ssize_t ret;
-
-       if (fsverity_active(inode))
-               return 0;
-
-       if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
-               return 0;
-
-       btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
-again:
-       /*
-        * This is similar to what we do for direct IO writes, see the comment
-        * at btrfs_direct_write(), but we also disable page faults in addition
-        * to disabling them only at the iov_iter level. This is because when
-        * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
-        * which can still trigger page fault ins despite having set ->nofault
-        * to true of our 'to' iov_iter.
-        *
-        * The difference to direct IO writes is that we deadlock when trying
-        * to lock the extent range in the inode's tree during he page reads
-        * triggered by the fault in (while for writes it is due to waiting for
-        * our own ordered extent). This is because for direct IO reads,
-        * btrfs_dio_iomap_begin() returns with the extent range locked, which
-        * is only unlocked in the endio callback (end_bio_extent_readpage()).
-        */
-       pagefault_disable();
-       to->nofault = true;
-       ret = btrfs_dio_read(iocb, to, read);
-       to->nofault = false;
-       pagefault_enable();
-
-       /* No increment (+=) because iomap returns a cumulative value. */
-       if (ret > 0)
-               read = ret;
-
-       if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
-               const size_t left = iov_iter_count(to);
-
-               if (left == prev_left) {
-                       /*
-                        * We didn't make any progress since the last attempt,
-                        * fallback to a buffered read for the remainder of the
-                        * range. This is just to avoid any possibility of looping
-                        * for too long.
-                        */
-                       ret = read;
-               } else {
-                       /*
-                        * We made some progress since the last retry or this is
-                        * the first time we are retrying. Fault in as many pages
-                        * as possible and retry.
-                        */
-                       fault_in_iov_iter_writeable(to, left);
-                       prev_left = left;
-                       goto again;
-               }
-       }
-       btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
-       return ret < 0 ? ret : read;
-}
-
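
For context on how the short-read semantics above are consumed, here is a hedged sketch of the (otherwise unchanged) read_iter caller: when the direct read returns early with bytes still left in the iterator, the remainder is expected to be read through the page cache. The helper name example_read_iter is hypothetical and not part of this patch.

static ssize_t example_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        ssize_t ret = 0;

        if (iocb->ki_flags & IOCB_DIRECT) {
                ret = btrfs_direct_read(iocb, to);
                /* Done if we failed, drained the iter or reached EOF. */
                if (ret < 0 || !iov_iter_count(to) ||
                    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
                        return ret;
        }

        /* Let the page cache serve whatever direct I/O did not. */
        return filemap_read(iocb, to, ret);
}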
 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
        ssize_t ret = 0;
 
 bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
                                  struct extent_state **cached_state,
                                  u64 *delalloc_start_ret, u64 *delalloc_end_ret);
+int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count);
+ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i);
 
 #endif
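
A hedged note on why these two helpers are now declared here: the relocated direct write path still needs to fall back to buffered I/O when iomap cannot service a request (the tail of that fallback is the hunk at the top of this section). A rough, illustrative sketch of that use follows; example_dio_write_fallback is a hypothetical name and btrfs_write_check() is assumed to have validated the write earlier.

static ssize_t example_dio_write_fallback(struct kiocb *iocb, struct iov_iter *from)
{
        ssize_t written;

        /*
         * The direct I/O attempt (omitted here) returned -ENOTBLK or stopped
         * short. Push the remainder through the page cache.
         */
        written = btrfs_buffered_write(iocb, from);
        if (written < 0)
                return written;

        /* Then flush, wait and invalidate the range, as in the hunk above. */
        return written;
}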
 
        struct btrfs_root *root;
 };
 
-struct btrfs_dio_data {
-       ssize_t submitted;
-       struct extent_changeset *data_reserved;
-       struct btrfs_ordered_extent *ordered;
-       bool data_space_reserved;
-       bool nocow_done;
-};
-
-struct btrfs_dio_private {
-       /* Range of I/O */
-       u64 file_offset;
-       u32 bytes;
-
-       /* This must be last */
-       struct btrfs_bio bbio;
-};
-
-static struct bio_set btrfs_dio_bioset;
-
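
The bioset removed here is recreated in direct-io.c (see the bioset_init() call removed further down, which uses offsetof(struct btrfs_dio_private, bbio.bio) as the front pad). That layout is what lets the submit and end_io paths recover the private data with container_of(); a small illustrative helper, not part of the patch:

static inline struct btrfs_dio_private *example_bio_to_dip(struct btrfs_bio *bbio)
{
        /* Works because bbio is the last member and the bio is front-padded. */
        return container_of(bbio, struct btrfs_dio_private, bbio);
}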
 struct btrfs_rename_ctx {
        /* Output field. Stores the index number of the old directory entry. */
        u64 index;
                                     struct page *locked_page, u64 start,
                                     u64 end, struct writeback_control *wbc,
                                     bool pages_dirty);
-static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
-                                      const struct btrfs_file_extent *file_extent,
-                                      int type);
 
 static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
                                          u64 root, void *warn_ctx)
        file_extent.offset = 0;
        file_extent.compression = async_extent->compress_type;
 
-       em = create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
+       em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
        if (IS_ERR(em)) {
                ret = PTR_ERR(em);
                goto out_free_reserve;
        kfree(async_extent);
 }
 
-static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
-                                     u64 num_bytes)
+u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
+                                    u64 num_bytes)
 {
        struct extent_map_tree *em_tree = &inode->extent_tree;
        struct extent_map *em;
                }
        }
 
-       alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
+       alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
 
        /*
         * Relocation relies on the relocated extents to have exactly the same
                lock_extent(&inode->io_tree, start, start + ram_size - 1,
                            &cached);
 
-               em = create_io_em(inode, start, &file_extent, BTRFS_ORDERED_REGULAR);
+               em = btrfs_create_io_em(inode, start, &file_extent,
+                                       BTRFS_ORDERED_REGULAR);
                if (IS_ERR(em)) {
                        unlock_extent(&inode->io_tree, start,
                                      start + ram_size - 1, &cached);
                if (is_prealloc) {
                        struct extent_map *em;
 
-                       em = create_io_em(inode, cur_offset, &nocow_args.file_extent,
-                                         BTRFS_ORDERED_PREALLOC);
+                       em = btrfs_create_io_em(inode, cur_offset,
+                                               &nocow_args.file_extent,
+                                               BTRFS_ORDERED_PREALLOC);
                        if (IS_ERR(em)) {
                                unlock_extent(&inode->io_tree, cur_offset,
                                              nocow_end, &cached_state);
        }
 }
 
-static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
-                                       struct btrfs_ordered_extent *ordered)
-{
-       u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
-       u64 len = bbio->bio.bi_iter.bi_size;
-       struct btrfs_ordered_extent *new;
-       int ret;
-
-       /* Must always be called for the beginning of an ordered extent. */
-       if (WARN_ON_ONCE(start != ordered->disk_bytenr))
-               return -EINVAL;
-
-       /* No need to split if the ordered extent covers the entire bio. */
-       if (ordered->disk_num_bytes == len) {
-               refcount_inc(&ordered->refs);
-               bbio->ordered = ordered;
-               return 0;
-       }
-
-       /*
-        * Don't split the extent_map for NOCOW extents, as we're writing into
-        * a pre-existing one.
-        */
-       if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
-               ret = split_extent_map(bbio->inode, bbio->file_offset,
-                                      ordered->num_bytes, len,
-                                      ordered->disk_bytenr);
-               if (ret)
-                       return ret;
-       }
-
-       new = btrfs_split_ordered_extent(ordered, len);
-       if (IS_ERR(new))
-               return PTR_ERR(new);
-       bbio->ordered = new;
-       return 0;
-}
-
 /*
  * given a list of ordered sums record them in the inode.  This happens
  * at IO completion time based on sums calculated at bio submission time.
        return em;
 }
 
-static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
-                                                 struct btrfs_dio_data *dio_data,
-                                                 const u64 start,
-                                                 const struct btrfs_file_extent *file_extent,
-                                                 const int type)
-{
-       struct extent_map *em = NULL;
-       struct btrfs_ordered_extent *ordered;
-
-       if (type != BTRFS_ORDERED_NOCOW) {
-               em = create_io_em(inode, start, file_extent, type);
-               if (IS_ERR(em))
-                       goto out;
-       }
-
-       ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
-                                            (1 << type) |
-                                            (1 << BTRFS_ORDERED_DIRECT));
-       if (IS_ERR(ordered)) {
-               if (em) {
-                       free_extent_map(em);
-                       btrfs_drop_extent_map_range(inode, start,
-                                       start + file_extent->num_bytes - 1, false);
-               }
-               em = ERR_CAST(ordered);
-       } else {
-               ASSERT(!dio_data->ordered);
-               dio_data->ordered = ordered;
-       }
- out:
-
-       return em;
-}
-
-static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
-                                                 struct btrfs_dio_data *dio_data,
-                                                 u64 start, u64 len)
-{
-       struct btrfs_root *root = inode->root;
-       struct btrfs_fs_info *fs_info = root->fs_info;
-       struct btrfs_file_extent file_extent;
-       struct extent_map *em;
-       struct btrfs_key ins;
-       u64 alloc_hint;
-       int ret;
-
-       alloc_hint = get_extent_allocation_hint(inode, start, len);
-again:
-       ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
-                                  0, alloc_hint, &ins, 1, 1);
-       if (ret == -EAGAIN) {
-               ASSERT(btrfs_is_zoned(fs_info));
-               wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
-                              TASK_UNINTERRUPTIBLE);
-               goto again;
-       }
-       if (ret)
-               return ERR_PTR(ret);
-
-       file_extent.disk_bytenr = ins.objectid;
-       file_extent.disk_num_bytes = ins.offset;
-       file_extent.num_bytes = ins.offset;
-       file_extent.ram_bytes = ins.offset;
-       file_extent.offset = 0;
-       file_extent.compression = BTRFS_COMPRESS_NONE;
-       em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
-                                    BTRFS_ORDERED_REGULAR);
-       btrfs_dec_block_group_reservations(fs_info, ins.objectid);
-       if (IS_ERR(em))
-               btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
-                                          1);
-
-       return em;
-}
-
 static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
 {
        struct btrfs_block_group *block_group;
        return ret;
 }
 
-static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
-                             struct extent_state **cached_state,
-                             unsigned int iomap_flags)
-{
-       const bool writing = (iomap_flags & IOMAP_WRITE);
-       const bool nowait = (iomap_flags & IOMAP_NOWAIT);
-       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-       struct btrfs_ordered_extent *ordered;
-       int ret = 0;
-
-       while (1) {
-               if (nowait) {
-                       if (!try_lock_extent(io_tree, lockstart, lockend,
-                                            cached_state))
-                               return -EAGAIN;
-               } else {
-                       lock_extent(io_tree, lockstart, lockend, cached_state);
-               }
-               /*
-                * We're concerned with the entire range that we're going to be
-                * doing DIO to, so we need to make sure there are no ordered
-                * extents in this range.
-                */
-               ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
-                                                    lockend - lockstart + 1);
-
-               /*
-                * We need to make sure there are no buffered pages in this
-                * range either, we could have raced between the invalidate in
-                * generic_file_direct_write and locking the extent.  The
-                * invalidate needs to happen so that reads after a write do not
-                * get stale data.
-                */
-               if (!ordered &&
-                   (!writing || !filemap_range_has_page(inode->i_mapping,
-                                                        lockstart, lockend)))
-                       break;
-
-               unlock_extent(io_tree, lockstart, lockend, cached_state);
-
-               if (ordered) {
-                       if (nowait) {
-                               btrfs_put_ordered_extent(ordered);
-                               ret = -EAGAIN;
-                               break;
-                       }
-                       /*
-                        * If we are doing a DIO read and the ordered extent we
-                        * found is for a buffered write, we can not wait for it
-                        * to complete and retry, because if we do so we can
-                        * deadlock with concurrent buffered writes on page
-                        * locks. This happens only if our DIO read covers more
-                        * than one extent map, if at this point it has already
-                        * created an ordered extent for a previous extent map
-                        * and locked its range in the inode's io tree, and a
-                        * concurrent write against that previous extent map's
-                        * range and this range started (we unlock the ranges
-                        * in the io tree only when the bios complete and
-                        * buffered writes always lock pages before attempting
-                        * to lock range in the io tree).
-                        */
-                       if (writing ||
-                           test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
-                               btrfs_start_ordered_extent(ordered);
-                       else
-                               ret = nowait ? -EAGAIN : -ENOTBLK;
-                       btrfs_put_ordered_extent(ordered);
-               } else {
-                       /*
-                        * We could trigger writeback for this range (and wait
-                        * for it to complete) and then invalidate the pages for
-                        * this range (through invalidate_inode_pages2_range()),
-                        * but that can lead us to a deadlock with a concurrent
-                        * call to readahead (a buffered read or a defrag call
-                        * triggered a readahead) on a page lock due to an
-                        * ordered dio extent we created before but for which we
-                        * have not yet submitted a corresponding bio (so it cannot
-                        * complete), which makes readahead wait for that
-                        * ordered extent to complete while holding a lock on
-                        * that page.
-                        */
-                       ret = nowait ? -EAGAIN : -ENOTBLK;
-               }
-
-               if (ret)
-                       break;
-
-               cond_resched();
-       }
-
-       return ret;
-}
-
 /* The callers of this must take lock_extent() */
-static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
-                                      const struct btrfs_file_extent *file_extent,
-                                      int type)
+struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
+                                     const struct btrfs_file_extent *file_extent,
+                                     int type)
 {
        struct extent_map *em;
        int ret;
        return em;
 }
 
-
-static int btrfs_get_blocks_direct_write(struct extent_map **map,
-                                        struct inode *inode,
-                                        struct btrfs_dio_data *dio_data,
-                                        u64 start, u64 *lenp,
-                                        unsigned int iomap_flags)
-{
-       const bool nowait = (iomap_flags & IOMAP_NOWAIT);
-       struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
-       struct btrfs_file_extent file_extent;
-       struct extent_map *em = *map;
-       int type;
-       u64 block_start;
-       struct btrfs_block_group *bg;
-       bool can_nocow = false;
-       bool space_reserved = false;
-       u64 len = *lenp;
-       u64 prev_len;
-       int ret = 0;
-
-       /*
-        * We don't allocate a new extent in the following cases
-        *
-        * 1) The inode is marked as NODATACOW. In this case we'll just use the
-        * existing extent.
-        * 2) The extent is marked as PREALLOC. We're good to go here and can
-        * just use the extent.
-        *
-        */
-       if ((em->flags & EXTENT_FLAG_PREALLOC) ||
-           ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
-            em->disk_bytenr != EXTENT_MAP_HOLE)) {
-               if (em->flags & EXTENT_FLAG_PREALLOC)
-                       type = BTRFS_ORDERED_PREALLOC;
-               else
-                       type = BTRFS_ORDERED_NOCOW;
-               len = min(len, em->len - (start - em->start));
-               block_start = extent_map_block_start(em) + (start - em->start);
-
-               if (can_nocow_extent(inode, start, &len,
-                                    &file_extent, false, false) == 1) {
-                       bg = btrfs_inc_nocow_writers(fs_info, block_start);
-                       if (bg)
-                               can_nocow = true;
-               }
-       }
-
-       prev_len = len;
-       if (can_nocow) {
-               struct extent_map *em2;
-
-               /* We can NOCOW, so only need to reserve metadata space. */
-               ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
-                                                     nowait);
-               if (ret < 0) {
-                       /* Our caller expects us to free the input extent map. */
-                       free_extent_map(em);
-                       *map = NULL;
-                       btrfs_dec_nocow_writers(bg);
-                       if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
-                               ret = -EAGAIN;
-                       goto out;
-               }
-               space_reserved = true;
-
-               em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
-                                             &file_extent, type);
-               btrfs_dec_nocow_writers(bg);
-               if (type == BTRFS_ORDERED_PREALLOC) {
-                       free_extent_map(em);
-                       *map = em2;
-                       em = em2;
-               }
-
-               if (IS_ERR(em2)) {
-                       ret = PTR_ERR(em2);
-                       goto out;
-               }
-
-               dio_data->nocow_done = true;
-       } else {
-               /* Our caller expects us to free the input extent map. */
-               free_extent_map(em);
-               *map = NULL;
-
-               if (nowait) {
-                       ret = -EAGAIN;
-                       goto out;
-               }
-
-               /*
-                * If we could not allocate data space before locking the file
-                * range and we can't do a NOCOW write, then we have to fail.
-                */
-               if (!dio_data->data_space_reserved) {
-                       ret = -ENOSPC;
-                       goto out;
-               }
-
-               /*
-                * We have to COW and we have already reserved data space before,
-                * so now we reserve only metadata.
-                */
-               ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
-                                                     false);
-               if (ret < 0)
-                       goto out;
-               space_reserved = true;
-
-               em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
-               if (IS_ERR(em)) {
-                       ret = PTR_ERR(em);
-                       goto out;
-               }
-               *map = em;
-               len = min(len, em->len - (start - em->start));
-               if (len < prev_len)
-                       btrfs_delalloc_release_metadata(BTRFS_I(inode),
-                                                       prev_len - len, true);
-       }
-
-       /*
-        * We have created our ordered extent, so we can now release our reservation
-        * for an outstanding extent.
-        */
-       btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
-
-       /*
-        * Need to update the i_size under the extent lock so buffered
-        * readers will get the updated i_size when we unlock.
-        */
-       if (start + len > i_size_read(inode))
-               i_size_write(inode, start + len);
-out:
-       if (ret && space_reserved) {
-               btrfs_delalloc_release_extents(BTRFS_I(inode), len);
-               btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
-       }
-       *lenp = len;
-       return ret;
-}
-
-static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
-               loff_t length, unsigned int flags, struct iomap *iomap,
-               struct iomap *srcmap)
-{
-       struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
-       struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
-       struct extent_map *em;
-       struct extent_state *cached_state = NULL;
-       struct btrfs_dio_data *dio_data = iter->private;
-       u64 lockstart, lockend;
-       const bool write = !!(flags & IOMAP_WRITE);
-       int ret = 0;
-       u64 len = length;
-       const u64 data_alloc_len = length;
-       bool unlock_extents = false;
-
-       /*
-        * We could potentially fault if we have a buffer > PAGE_SIZE, and if
-        * we're NOWAIT we may submit a bio for a partial range and return
-        * EIOCBQUEUED, which would result in an errant short read.
-        *
-        * The best way to handle this would be to allow for partial completions
-        * of iocb's, so we could submit the partial bio, return and fault in
-        * the rest of the pages, and then submit the io for the rest of the
-        * range.  However we don't have that currently, so simply return
-        * -EAGAIN at this point so that the normal path is used.
-        */
-       if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
-               return -EAGAIN;
-
-       /*
-        * Cap the size of reads to that usually seen in buffered I/O as we need
-        * to allocate a contiguous array for the checksums.
-        */
-       if (!write)
-               len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
-
-       lockstart = start;
-       lockend = start + len - 1;
-
-       /*
-        * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
-        * enough if we've written compressed pages to this area, so we need to
-        * flush the dirty pages again to make absolutely sure that any
-        * outstanding dirty pages are on disk - the first flush only starts
-        * compression on the data, while keeping the pages locked, so by the
-        * time the second flush returns we know bios for the compressed pages
-        * were submitted and finished, and the pages are no longer under writeback.
-        *
-        * If we have a NOWAIT request and we have any pages in the range that
-        * are locked, likely due to compression still in progress, we don't want
-        * to block on page locks. We also don't want to block on pages marked as
-        * dirty or under writeback (same as for the non-compression case).
-        * iomap_dio_rw() did the same check, but after that and before we got
-        * here, mmap'ed writes may have happened or buffered reads started
-        * (readpage() and readahead(), which lock pages), as we haven't locked
-        * the file range yet.
-        */
-       if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-                    &BTRFS_I(inode)->runtime_flags)) {
-               if (flags & IOMAP_NOWAIT) {
-                       if (filemap_range_needs_writeback(inode->i_mapping,
-                                                         lockstart, lockend))
-                               return -EAGAIN;
-               } else {
-                       ret = filemap_fdatawrite_range(inode->i_mapping, start,
-                                                      start + length - 1);
-                       if (ret)
-                               return ret;
-               }
-       }
-
-       memset(dio_data, 0, sizeof(*dio_data));
-
-       /*
-        * We always try to allocate data space and must do it before locking
-        * the file range, to avoid deadlocks with concurrent writes to the same
-        * range if the range has several extents and the writes don't expand the
-        * current i_size (the inode lock is taken in shared mode). If we fail to
-        * allocate data space here we continue and later, after locking the
-        * file range, we fail with ENOSPC only if we figure out we can not do a
-        * NOCOW write.
-        */
-       if (write && !(flags & IOMAP_NOWAIT)) {
-               ret = btrfs_check_data_free_space(BTRFS_I(inode),
-                                                 &dio_data->data_reserved,
-                                                 start, data_alloc_len, false);
-               if (!ret)
-                       dio_data->data_space_reserved = true;
-               else if (ret && !(BTRFS_I(inode)->flags &
-                                 (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
-                       goto err;
-       }
-
-       /*
-        * If this errors out it's because we couldn't invalidate pagecache for
-        * this range and we need to fallback to buffered IO, or we are doing a
-        * NOWAIT read/write and we need to block.
-        */
-       ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
-       if (ret < 0)
-               goto err;
-
-       em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
-       if (IS_ERR(em)) {
-               ret = PTR_ERR(em);
-               goto unlock_err;
-       }
-
-       /*
-        * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
-        * io.  INLINE is special, and we could probably kludge it in here, but
-        * it's still buffered so for safety let's just fall back to the generic
-        * buffered path.
-        *
-        * For COMPRESSED we _have_ to read the entire extent in so we can
-        * decompress it, so there will be buffering required no matter what we
-        * do, so go ahead and fallback to buffered.
-        *
-        * We return -ENOTBLK because that's what makes DIO go ahead and go back
-        * to buffered IO.  Don't blame me, this is the price we pay for using
-        * the generic code.
-        */
-       if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
-               free_extent_map(em);
-               /*
-                * If we are in a NOWAIT context, return -EAGAIN in order to
-                * fallback to buffered IO. This is not only because we can
-                * block with buffered IO (no support for NOWAIT semantics at
-                * the moment) but also to avoid returning short reads to user
-                * space - this happens if we were able to read some data from
-                * previous non-compressed extents and then when we fallback to
-                * buffered IO, at btrfs_file_read_iter() by calling
-                * filemap_read(), we fail to fault in pages for the read buffer,
-                * in which case filemap_read() returns a short read (the number
-                * of bytes previously read is > 0, so it does not return -EFAULT).
-                */
-               ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
-               goto unlock_err;
-       }
-
-       len = min(len, em->len - (start - em->start));
-
-       /*
-        * If we have a NOWAIT request and the range contains multiple extents
-        * (or a mix of extents and holes), then we return -EAGAIN to make the
-        * caller fallback to a context where it can do a blocking (without
-        * NOWAIT) request. This way we avoid doing partial IO and returning
-        * success to the caller, which is not optimal for writes and for reads
-        * it can result in unexpected behaviour for an application.
-        *
-        * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
-        * iomap_dio_rw(), we can end up returning less data than what the caller
-        * asked for, resulting in an unexpected, and incorrect, short read.
-        * That is, the caller asked to read N bytes and we return less than that,
-        * which is wrong unless we are crossing EOF. This happens if we get a
-        * page fault error when trying to fault in pages for the buffer that is
-        * associated to the struct iov_iter passed to iomap_dio_rw(), and we
-        * have previously submitted bios for other extents in the range, in
-        * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
-        * those bios have completed by the time we get the page fault error,
-        * which we return back to our caller - we should only return EIOCBQUEUED
-        * after we have submitted bios for all the extents in the range.
-        */
-       if ((flags & IOMAP_NOWAIT) && len < length) {
-               free_extent_map(em);
-               ret = -EAGAIN;
-               goto unlock_err;
-       }
-
-       if (write) {
-               ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
-                                                   start, &len, flags);
-               if (ret < 0)
-                       goto unlock_err;
-               unlock_extents = true;
-               /* Recalc len in case the new em is smaller than requested */
-               len = min(len, em->len - (start - em->start));
-               if (dio_data->data_space_reserved) {
-                       u64 release_offset;
-                       u64 release_len = 0;
-
-                       if (dio_data->nocow_done) {
-                               release_offset = start;
-                               release_len = data_alloc_len;
-                       } else if (len < data_alloc_len) {
-                               release_offset = start + len;
-                               release_len = data_alloc_len - len;
-                       }
-
-                       if (release_len > 0)
-                               btrfs_free_reserved_data_space(BTRFS_I(inode),
-                                                              dio_data->data_reserved,
-                                                              release_offset,
-                                                              release_len);
-               }
-       } else {
-               /*
-                * We need to unlock only the end area that we aren't using.
-                * The rest is going to be unlocked by the endio routine.
-                */
-               lockstart = start + len;
-               if (lockstart < lockend)
-                       unlock_extents = true;
-       }
-
-       if (unlock_extents)
-               unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                             &cached_state);
-       else
-               free_extent_state(cached_state);
-
-       /*
-        * Translate extent map information to iomap.
-        * We trim the extents (and move the addr) even though iomap code does
-        * that, since we have locked only the parts we are performing I/O in.
-        */
-       if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
-           ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
-               iomap->addr = IOMAP_NULL_ADDR;
-               iomap->type = IOMAP_HOLE;
-       } else {
-               iomap->addr = extent_map_block_start(em) + (start - em->start);
-               iomap->type = IOMAP_MAPPED;
-       }
-       iomap->offset = start;
-       iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
-       iomap->length = len;
-       free_extent_map(em);
-
-       return 0;
-
-unlock_err:
-       unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                     &cached_state);
-err:
-       if (dio_data->data_space_reserved) {
-               btrfs_free_reserved_data_space(BTRFS_I(inode),
-                                              dio_data->data_reserved,
-                                              start, data_alloc_len);
-               extent_changeset_free(dio_data->data_reserved);
-       }
-
-       return ret;
-}
-
-static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
-               ssize_t written, unsigned int flags, struct iomap *iomap)
-{
-       struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
-       struct btrfs_dio_data *dio_data = iter->private;
-       size_t submitted = dio_data->submitted;
-       const bool write = !!(flags & IOMAP_WRITE);
-       int ret = 0;
-
-       if (!write && (iomap->type == IOMAP_HOLE)) {
-               /* If reading from a hole, unlock and return */
-               unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
-                             NULL);
-               return 0;
-       }
-
-       if (submitted < length) {
-               pos += submitted;
-               length -= submitted;
-               if (write)
-                       btrfs_finish_ordered_extent(dio_data->ordered, NULL,
-                                                   pos, length, false);
-               else
-                       unlock_extent(&BTRFS_I(inode)->io_tree, pos,
-                                     pos + length - 1, NULL);
-               ret = -ENOTBLK;
-       }
-       if (write) {
-               btrfs_put_ordered_extent(dio_data->ordered);
-               dio_data->ordered = NULL;
-       }
-
-       if (write)
-               extent_changeset_free(dio_data->data_reserved);
-       return ret;
-}
-
-static void btrfs_dio_end_io(struct btrfs_bio *bbio)
-{
-       struct btrfs_dio_private *dip =
-               container_of(bbio, struct btrfs_dio_private, bbio);
-       struct btrfs_inode *inode = bbio->inode;
-       struct bio *bio = &bbio->bio;
-
-       if (bio->bi_status) {
-               btrfs_warn(inode->root->fs_info,
-               "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
-                          btrfs_ino(inode), bio->bi_opf,
-                          dip->file_offset, dip->bytes, bio->bi_status);
-       }
-
-       if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
-               btrfs_finish_ordered_extent(bbio->ordered, NULL,
-                                           dip->file_offset, dip->bytes,
-                                           !bio->bi_status);
-       } else {
-               unlock_extent(&inode->io_tree, dip->file_offset,
-                             dip->file_offset + dip->bytes - 1, NULL);
-       }
-
-       bbio->bio.bi_private = bbio->private;
-       iomap_dio_bio_end_io(bio);
-}
-
-static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
-                               loff_t file_offset)
-{
-       struct btrfs_bio *bbio = btrfs_bio(bio);
-       struct btrfs_dio_private *dip =
-               container_of(bbio, struct btrfs_dio_private, bbio);
-       struct btrfs_dio_data *dio_data = iter->private;
-
-       btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
-                      btrfs_dio_end_io, bio->bi_private);
-       bbio->inode = BTRFS_I(iter->inode);
-       bbio->file_offset = file_offset;
-
-       dip->file_offset = file_offset;
-       dip->bytes = bio->bi_iter.bi_size;
-
-       dio_data->submitted += bio->bi_iter.bi_size;
-
-       /*
-        * Check if we are doing a partial write.  If we are, we need to split
-        * the ordered extent to match the submitted bio.  Hang on to the
-        * remaining unfinishable ordered_extent in dio_data so that it can be
-        * cancelled in iomap_end to avoid a deadlock wherein faulting the
-        * remaining pages is blocked on the outstanding ordered extent.
-        */
-       if (iter->flags & IOMAP_WRITE) {
-               int ret;
-
-               ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
-               if (ret) {
-                       btrfs_finish_ordered_extent(dio_data->ordered, NULL,
-                                                   file_offset, dip->bytes,
-                                                   !ret);
-                       bio->bi_status = errno_to_blk_status(ret);
-                       iomap_dio_bio_end_io(bio);
-                       return;
-               }
-       }
-
-       btrfs_submit_bio(bbio, 0);
-}
-
-static const struct iomap_ops btrfs_dio_iomap_ops = {
-       .iomap_begin            = btrfs_dio_iomap_begin,
-       .iomap_end              = btrfs_dio_iomap_end,
-};
-
-static const struct iomap_dio_ops btrfs_dio_ops = {
-       .submit_io              = btrfs_dio_submit_io,
-       .bio_set                = &btrfs_dio_bioset,
-};
-
-ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
-{
-       struct btrfs_dio_data data = { 0 };
-
-       return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
-                           IOMAP_DIO_PARTIAL, &data, done_before);
-}
-
-struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
-                                 size_t done_before)
-{
-       struct btrfs_dio_data data = { 0 };
-
-       return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
-                           IOMAP_DIO_PARTIAL, &data, done_before);
-}
-
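
Unlike btrfs_dio_read(), btrfs_dio_write() returns the iomap_dio handle without completing it, so the write path can choose when completion runs (for example, relative to the inode lock). A hedged usage sketch; example_complete_dio_write is hypothetical and not part of this patch:

static ssize_t example_complete_dio_write(struct kiocb *iocb, struct iov_iter *from,
                                          size_t done_before)
{
        struct iomap_dio *dio;

        dio = btrfs_dio_write(iocb, from, done_before);
        if (IS_ERR_OR_NULL(dio))
                return PTR_ERR_OR_ZERO(dio);

        /* Complete the direct I/O; returns bytes written or a negative error. */
        return iomap_dio_complete(dio);
}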
 /*
  * For release_folio() and invalidate_folio() we have a race window where
  * folio_end_writeback() is called but the subpage spinlock is not yet released.
         * destroy cache.
         */
        rcu_barrier();
-       bioset_exit(&btrfs_dio_bioset);
        kmem_cache_destroy(btrfs_inode_cachep);
 }
 
                        SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
                        init_once);
        if (!btrfs_inode_cachep)
-               goto fail;
-
-       if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
-                       offsetof(struct btrfs_dio_private, bbio.bio),
-                       BIOSET_NEED_BVECS))
-               goto fail;
+               return -ENOMEM;
 
        return 0;
-fail:
-       btrfs_destroy_cachep();
-       return -ENOMEM;
 }
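
The bioset setup dropped from btrfs_init_cachep() above presumably moves behind the new btrfs_init_dio()/btrfs_destroy_dio() helpers registered in the init table further down; a minimal sketch, assuming they only wrap the bioset calls:

int __init btrfs_init_dio(void)
{
        if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
                        offsetof(struct btrfs_dio_private, bbio.bio),
                        BIOSET_NEED_BVECS))
                return -ENOMEM;

        return 0;
}

void __cold btrfs_destroy_dio(void)
{
        bioset_exit(&btrfs_dio_bioset);
}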
 
 static int btrfs_getattr(struct mnt_idmap *idmap,
        file_extent.ram_bytes = ram_bytes;
        file_extent.offset = encoded->unencoded_offset;
        file_extent.compression = compression;
-       em = create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
+       em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
        if (IS_ERR(em)) {
                ret = PTR_ERR(em);
                goto out_free_reserved;
 
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
+#include "direct-io.h"
 #include "props.h"
 #include "xattr.h"
 #include "bio.h"
        }, {
                .init_func = btrfs_init_cachep,
                .exit_func = btrfs_destroy_cachep,
+       }, {
+               .init_func = btrfs_init_dio,
+               .exit_func = btrfs_destroy_dio,
        }, {
                .init_func = btrfs_transaction_init,
                .exit_func = btrfs_transaction_exit,