From aaa5ae8f6d15c7f662b639ac8907c34794a4d52c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 24 Feb 2025 09:15:17 +0100 Subject: [PATCH 01/16] btrfs: use BTRFS_PATH_AUTO_FREE in load_global_roots() This is the trivial pattern for path auto free, initialize at the beginning and free at the end with simple goto -> return conversions. Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9c9a48aeac5f..28ff9c524fba 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2196,8 +2196,8 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, static int load_global_roots(struct btrfs_root *tree_root) { - struct btrfs_path *path; - int ret = 0; + BTRFS_PATH_AUTO_FREE(path); + int ret; path = btrfs_alloc_path(); if (!path) @@ -2206,18 +2206,17 @@ static int load_global_roots(struct btrfs_root *tree_root) ret = load_global_roots_objectid(tree_root, path, BTRFS_EXTENT_TREE_OBJECTID, "extent"); if (ret) - goto out; + return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_CSUM_TREE_OBJECTID, "csum"); if (ret) - goto out; + return ret; if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) - goto out; + return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_FREE_SPACE_TREE_OBJECTID, "free space"); -out: - btrfs_free_path(path); + return ret; } -- 2.51.0 From 72f2bae3c15ce54b0c38d7ee348de1ca74ef6de9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 24 Feb 2025 09:15:17 +0100 Subject: [PATCH 02/16] btrfs: use BTRFS_PATH_AUTO_FREE in btrfs_init_root_free_objectid() This is the trivial pattern for path auto free, initialize at the beginning and free at the end with simple goto -> return conversions. Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 28ff9c524fba..52c2335ef62f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4896,7 +4896,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) int btrfs_init_root_free_objectid(struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; struct extent_buffer *l; struct btrfs_key search_key; @@ -4912,14 +4912,13 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) search_key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) - goto error; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. */ - ret = -EUCLEAN; - goto error; + return -EUCLEAN; } if (path->slots[0] > 0) { slot = path->slots[0] - 1; @@ -4930,10 +4929,8 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) } else { root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; } - ret = 0; -error: - btrfs_free_path(path); - return ret; + + return 0; } int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) -- 2.51.0 From 899c8798b59b16f7ffdc73e3766bfcf126cad483 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 24 Feb 2025 09:15:17 +0100 Subject: [PATCH 03/16] btrfs: use BTRFS_PATH_AUTO_FREE in btrfs_get_name() This is the trivial pattern for path auto free, initialize at the beginning and free at the end with some return simplifications. 
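For reference, the BTRFS_PATH_AUTO_FREE() macro itself is not defined anywhere in this series; it is built on the scope-based cleanup helpers from linux/cleanup.h, and the exact definition lives in fs/btrfs/ctree.h and may differ in detail. A minimal sketch of the pattern these conversions rely on, with example_lookup() being a made-up function:

	/*
	 * Illustrative only (assumes fs/btrfs/ctree.h context):
	 * BTRFS_PATH_AUTO_FREE(path) expands roughly to
	 *     struct btrfs_path *path __free(btrfs_free_path) = NULL;
	 * so btrfs_free_path() runs automatically when 'path' goes out of scope.
	 */
	static int example_lookup(struct btrfs_root *root, const struct btrfs_key *key)
	{
		BTRFS_PATH_AUTO_FREE(path);

		path = btrfs_alloc_path();
		if (!path)
			return -ENOMEM;

		/* Any early return frees the path; no goto/out label is needed. */
		return btrfs_search_slot(NULL, root, key, path, 0, 0);
	}
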
Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/export.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 0c0b8db82df6..a91eaf0ca34e 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -223,7 +223,7 @@ static int btrfs_get_name(struct dentry *parent, char *name, struct btrfs_inode *dir = BTRFS_I(d_inode(parent)); struct btrfs_root *root = dir->root; struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_inode_ref *iref; struct btrfs_root_ref *rref; struct extent_buffer *leaf; @@ -255,15 +255,12 @@ static int btrfs_get_name(struct dentry *parent, char *name, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { - btrfs_free_path(path); return ret; } else if (ret > 0) { - if (ino == BTRFS_FIRST_FREE_OBJECTID) { + if (ino == BTRFS_FIRST_FREE_OBJECTID) path->slots[0]--; - } else { - btrfs_free_path(path); + else return -ENOENT; - } } leaf = path->nodes[0]; @@ -280,7 +277,6 @@ static int btrfs_get_name(struct dentry *parent, char *name, } read_extent_buffer(leaf, name, name_ptr, name_len); - btrfs_free_path(path); /* * have to add the null termination to make sure that reconnect_path -- 2.51.0 From 2267214a05ca845385af500e79f271b8d25fe63c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 24 Feb 2025 09:15:17 +0100 Subject: [PATCH 04/16] btrfs: use BTRFS_PATH_AUTO_FREE in btrfs_lookup_extent_info() This is the trivial pattern for path auto free, initialize at the beginning and free at the end with simple goto -> return conversions. Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ea440b87a013..6e82d627f12c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -100,7 +100,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 num_refs; u64 extent_flags; @@ -131,7 +131,7 @@ search_again: extent_root = btrfs_extent_root(fs_info, bytenr); ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto out_free; + return ret; if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) { if (path->slots[0]) { @@ -156,7 +156,7 @@ search_again: "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out_free; + return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -167,7 +167,7 @@ search_again: "unexpected zero reference count for extent item (%llu %u %llu)", key.objectid, key.type, key.offset); btrfs_abort_transaction(trans, ret); - goto out_free; + return ret; } extent_flags = btrfs_extent_flags(leaf, ei); owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]); @@ -213,8 +213,7 @@ search_again: *flags = extent_flags; if (owning_root) *owning_root = owner; -out_free: - btrfs_free_path(path); + return ret; } -- 2.51.0 From e5344080cfcca41e8d32b3f016a444e5974d6760 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 24 Feb 2025 09:15:17 +0100 Subject: [PATCH 05/16] btrfs: use BTRFS_PATH_AUTO_FREE in run_delayed_extent_op() This is the trivial pattern for path auto free, initialize at the beginning and free at the 
end with simple goto -> return conversions. Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6e82d627f12c..5de1a1293c93 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1626,7 +1626,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_item *ei; struct extent_buffer *leaf; u32 item_size; @@ -1657,7 +1657,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - goto out; + return ret; } else if (ret > 0) { if (metadata) { if (path->slots[0] > 0) { @@ -1683,7 +1683,7 @@ again: btrfs_err(fs_info, "missing extent item for extent %llu num_bytes %llu level %d", head->bytenr, head->num_bytes, head->level); - goto out; + return ret; } } @@ -1696,13 +1696,12 @@ again: "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out; + return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); -out: - btrfs_free_path(path); + return ret; } -- 2.51.0 From 3349ae34b75c790cf78aed64893599d0bec0e8ec Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 24 Feb 2025 09:15:17 +0100 Subject: [PATCH 06/16] btrfs: use BTRFS_PATH_AUTO_FREE in btrfs_lookup_bio_sums() This is the trivial pattern for path auto free, initialize at the beginning and free at the end. Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 2d2244b32506..344b4db487a0 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -341,7 +341,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = &bbio->bio; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; u32 orig_len = bio->bi_iter.bi_size; @@ -373,10 +373,8 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); - if (!bbio->csum) { - btrfs_free_path(path); + if (!bbio->csum) return BLK_STS_RESOURCE; - } } else { bbio->csum = bbio->csum_inline; } @@ -444,7 +442,6 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) bio_offset += count * sectorsize; } - btrfs_free_path(path); return ret; } -- 2.51.0 From c42c0db1bbcca13c130c5f1b1b802e1ded50b2e9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 24 Feb 2025 09:15:17 +0100 Subject: [PATCH 07/16] btrfs: use BTRFS_PATH_AUTO_FREE in btrfs_remove_free_space_inode() This is the trivial pattern for path auto free, initialize at the beginning and free at the end with simple goto -> return conversions. 
Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 056546bf9abd..3095cce904b5 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -244,7 +244,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_block_group *block_group) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; @@ -257,12 +257,12 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) { if (PTR_ERR(inode) != -ENOENT) ret = PTR_ERR(inode); - goto out; + return ret; } ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { btrfs_add_delayed_iput(BTRFS_I(inode)); - goto out; + return ret; } clear_nlink(inode); /* One for the block groups ref */ @@ -285,12 +285,9 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, if (ret) { if (ret > 0) ret = 0; - goto out; + return ret; } - ret = btrfs_del_item(trans, trans->fs_info->tree_root, path); -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, trans->fs_info->tree_root, path); } int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, -- 2.51.0 From 3bfd9ead8131e02516b3ac96a5df06788485d736 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 24 Feb 2025 09:15:17 +0100 Subject: [PATCH 08/16] btrfs: use BTRFS_PATH_AUTO_FREE in populate_free_space_tree() This is the trivial pattern for path auto free, initialize at the beginning and free at the end with simple goto -> return conversions. This applies to both path and path2. Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index cae540ec15ed..d1ec02d5e1f6 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1062,7 +1062,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_root *extent_root; - struct btrfs_path *path, *path2; + BTRFS_PATH_AUTO_FREE(path); + BTRFS_PATH_AUTO_FREE(path2); struct btrfs_key key; u64 start, end; int ret; @@ -1070,17 +1071,16 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = READA_FORWARD; path2 = btrfs_alloc_path(); - if (!path2) { - btrfs_free_path(path); + if (!path2) return -ENOMEM; - } + + path->reada = READA_FORWARD; ret = add_new_free_space_info(trans, block_group, path2); if (ret) - goto out; + return ret; mutex_lock(&block_group->free_space_lock); @@ -1146,9 +1146,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = 0; out_locked: mutex_unlock(&block_group->free_space_lock); -out: - btrfs_free_path(path2); - btrfs_free_path(path); + return ret; } -- 2.51.0 From 2e70d126f9dcef0efba3711c7da8820ac1d036d2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 24 Feb 2025 09:15:17 +0100 Subject: [PATCH 09/16] btrfs: use BTRFS_PATH_AUTO_FREE in clear_free_space_tree() This is the trivial pattern for path auto free, initialize at the beginning and free at the end with simple goto -> return conversions. 
Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index d1ec02d5e1f6..589ff73ed13a 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1215,7 +1215,7 @@ out_clear: static int clear_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int nr; int ret; @@ -1231,7 +1231,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; nr = btrfs_header_nritems(path->nodes[0]); if (!nr) @@ -1240,15 +1240,12 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, path->slots[0] = 0; ret = btrfs_del_items(trans, root, path, 0, nr); if (ret) - goto out; + return ret; btrfs_release_path(path); } - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) -- 2.51.0 From 19eaf5fd8c73437bb1f70bc5a062bdc14f152c9a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 24 Feb 2025 09:15:17 +0100 Subject: [PATCH 10/16] btrfs: use BTRFS_PATH_AUTO_FREE in load_free_space_tree() This is the trivial pattern for path auto free, initialize at the beginning and free at the end with simple goto -> return conversions. Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 589ff73ed13a..39c6b96a4c25 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1633,9 +1633,8 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group; struct btrfs_free_space_info *info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); u32 extent_count, flags; - int ret; block_group = caching_ctl->block_group; @@ -1652,10 +1651,9 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) path->reada = READA_FORWARD; info = search_free_space_info(NULL, block_group, path, 0); - if (IS_ERR(info)) { - ret = PTR_ERR(info); - goto out; - } + if (IS_ERR(info)) + return PTR_ERR(info); + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); flags = btrfs_free_space_flags(path->nodes[0], info); @@ -1665,11 +1663,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) * there. */ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) - ret = load_free_space_bitmaps(caching_ctl, path, extent_count); + return load_free_space_bitmaps(caching_ctl, path, extent_count); else - ret = load_free_space_extents(caching_ctl, path, extent_count); - -out: - btrfs_free_path(path); - return ret; + return load_free_space_extents(caching_ctl, path, extent_count); } -- 2.51.0 From a66b39f699cbd9125c0f248d18fc1381265ee42c Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 29 Jan 2025 23:21:46 +0800 Subject: [PATCH 11/16] btrfs: sysfs: accept size suffixes for read policy values We now parse human-friendly size values (e.g. '1G', '2M') when setting read policies. 
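For context, memparse() from linux/kernel.h parses a number with an optional K/M/G/T suffix and advances a caller-supplied pointer past the suffix, and skip_spaces() lets us tolerate trailing whitespace. A minimal sketch of the validation added here, with parse_policy_value() being a hypothetical stand-alone helper:

	#include <linux/kernel.h>   /* memparse() */
	#include <linux/string.h>   /* skip_spaces() */
	#include <linux/types.h>
	#include <linux/errno.h>

	/* Accepts e.g. "4096", "4k" or "1G"; rejects trailing garbage and values <= 0. */
	static int parse_policy_value(const char *value_str, s64 *value_ret)
	{
		char *retptr;

		*value_ret = memparse(value_str, &retptr);
		retptr = skip_spaces(retptr);
		if (*retptr != 0 || *value_ret <= 0)
			return -EINVAL;

		return 0;   /* "1G" yields 1073741824, "2M" yields 2097152 */
	}
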
Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 14f53f757555..974e8a75e3ab 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1342,17 +1342,18 @@ int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) /* Separate value from input in policy:value format. */ value_str = strchr(param, ':'); if (value_str) { - int ret; + char *retptr; *value_str = 0; value_str++; if (!value_ret) return -EINVAL; - ret = kstrtos64(value_str, 10, value_ret); - if (ret) + + *value_ret = memparse(value_str, &retptr); + /* There could be any trailing typos after the value. */ + retptr = skip_spaces(retptr); + if (*retptr != 0 || *value_ret <= 0) return -EINVAL; - if (*value_ret < 0) - return -ERANGE; } #endif -- 2.51.0 From 1a5b5668d711d3d1ef447446beab920826decec3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 15 Nov 2024 19:15:34 +1030 Subject: [PATCH 12/16] btrfs: prevent inline data extents read from touching blocks beyond its range Currently reading an inline data extent will zero out the remaining range in the page. This is not yet causing problems even for block size < page size (subpage) cases because: 1) An inline data extent always starts at file offset 0 Meaning at page read, we always read the inline extent first, before any other blocks in the page. Then later blocks are properly read out and re-fill the zeroed out ranges. 2) Currently btrfs will read out the whole page if a buffered write is not page aligned So a page is either fully uptodate at buffered write time (covers the whole page), or we will read out the whole page first. Meaning there is nothing to lose for such an inline extent read. But it's still not ideal: - We're zeroing out the page twice Once done by read_inline_extent()/uncompress_inline(), once done by btrfs_do_readpage() for ranges beyond i_size. - We're touching blocks that don't belong to the inline extent In the incoming patches, we can have a partial uptodate folio, of which some dirty blocks can exist while the page is not fully uptodate: The page size is 16K and block size is 4K: 0 4K 8K 12K 16K | | |/////////| | And range [8K, 12K) is dirtied by a buffered write, the remaining blocks are not uptodate. If range [0, 4K) contains an inline data extent, and we try to read the whole page, the current behavior will overwrite range [8K, 12K) with zero and cause data loss. So to make the behavior more consistent and in preparation for future changes, limit the inline data extents read to only zero out the range inside the first block, not the whole page. 
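As a worked example (illustrative numbers only, not taken from the patch): with a 64K page, a 4K block size and an uncompressed inline extent of ram_bytes == 3000, the read now copies bytes [0, 3000), zeroes [3000, 4K) and leaves blocks [4K, 64K) of the folio untouched. A rough sketch of that clamp, with fill_first_block() being a made-up helper operating on a plain buffer rather than a folio:

	#include <linux/minmax.h>
	#include <linux/string.h>
	#include <linux/types.h>

	static void fill_first_block(char *block, const char *inline_data,
				     u32 blocksize /* 4096 */, u64 ram_bytes /* 3000 */)
	{
		size_t copy_size = min_t(u64, blocksize, ram_bytes); /* was clamped to PAGE_SIZE */

		memcpy(block, inline_data, copy_size);               /* bytes [0, 3000)  */
		memset(block + copy_size, 0, blocksize - copy_size); /* zero [3000, 4K)  */
		/* Nothing beyond the first block is written here anymore. */
	}
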
Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 84ee6e5c5a81..97f0d91cda07 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6785,6 +6785,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, { int ret; struct extent_buffer *leaf = path->nodes[0]; + const u32 blocksize = leaf->fs_info->sectorsize; char *tmp; size_t max_size; unsigned long inline_size; @@ -6801,7 +6802,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); - max_size = min_t(unsigned long, PAGE_SIZE, max_size); + max_size = min_t(unsigned long, blocksize, max_size); ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, max_size); @@ -6813,14 +6814,15 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. */ - if (max_size < PAGE_SIZE) - folio_zero_range(folio, max_size, PAGE_SIZE - max_size); + if (max_size < blocksize) + folio_zero_range(folio, max_size, blocksize - max_size); kfree(tmp); return ret; } static int read_inline_extent(struct btrfs_path *path, struct folio *folio) { + const u32 blocksize = path->nodes[0]->fs_info->sectorsize; struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; @@ -6835,14 +6837,14 @@ static int read_inline_extent(struct btrfs_path *path, struct folio *folio) if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) return uncompress_inline(path, folio, fi); - copy_size = min_t(u64, PAGE_SIZE, + copy_size = min_t(u64, blocksize, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); kaddr = kmap_local_folio(folio, 0); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); - if (copy_size < PAGE_SIZE) - folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size); + if (copy_size < blocksize) + folio_zero_range(folio, copy_size, blocksize - copy_size); return 0; } -- 2.51.0 From 0bb067ca64e35536f1f5d9ef6aaafc40f4833623 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 15 Nov 2024 16:33:43 +1030 Subject: [PATCH 13/16] btrfs: fix the qgroup data free range for inline data extents Inside function __cow_file_range_inline() since the inlined data no longer take any data space, we need to free up the reserved space. However the code is still using the old page size == sector size assumption, and will not handle subpage case well. Thankfully it is not going to cause any problems because we have two extra safe nets: - Inline data extents creation is disabled for sector size < page size cases for now But it won't stay that for long. - btrfs_qgroup_free_data() will only clear ranges which have been already reserved So even if we pass a range larger than what we need, it should still be fine, especially there is only reserved space for a single block at file offset 0 of an inline data extent. But just for the sake of consistency, fix the call site to use sectorsize instead of page size. Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 97f0d91cda07..c541183e3d22 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -672,7 +672,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. 
*/ - btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); + btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; -- 2.51.0 From d2da21a6e06c40b9fd380ada93f1b48279e48b16 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 10 Feb 2025 18:52:33 +1030 Subject: [PATCH 14/16] btrfs: introduce a read path dedicated extent lock helper Currently we're using btrfs_lock_and_flush_ordered_range() for both btrfs_read_folio() and btrfs_readahead(), but it has one critical problem for future subpage optimizations: - It will call btrfs_start_ordered_extent() to writeback the involved folios But remember we're calling btrfs_lock_and_flush_ordered_range() at read paths, meaning the folio is already locked by read path. If we really trigger writeback for those already locked folios, this will lead to a deadlock and writeback cannot get the folio lock. Such dead lock is prevented by the fact that btrfs always keeps a dirty folio also uptodate, by either dirtying all blocks of the folio, or by reading the whole folio before dirtying. To prepare for the incoming patch which allows btrfs to skip full folio read if the buffered write is block aligned, we have to start by solving the possible deadlock first. Instead of blindly calling btrfs_start_ordered_extent(), introduce a new helper, which is smarter in the following ways: - Only wait and flush the ordered extent if * The folio doesn't even have private bit set * Part of the blocks of the ordered extent are not uptodate This can happen by: * The folio writeback finished, then got invalidated. There are a lot of reasons that a folio can get invalidated, from memory pressure to direct IO (which invalidates all folios of the range). But OE not yet finished. We have to wait for the ordered extent, as the OE may contain to-be-inserted data checksum. Without waiting, our read can fail due to the missing checksum. But either way, the OE should not need any extra flush inside the locked folio range. - Skip the ordered extent completely if * All the blocks are dirty This happens when OE creation is caused by a folio writeback whose file offset is before our folio. E.g. 16K page size and 4K block size 0 8K 16K 24K 32K |//////////////||///////| | The writeback of folio 0 created an OE for range [0, 24K), but since folio 16K is not fully uptodate, a read is triggered for folio 16K. The writeback will never happen (we're holding the folio lock for read), nor will the OE finish. Thus we must skip the range. * All the blocks are uptodate This happens when the writeback finished, but OE not yet finished. Since the blocks are already uptodate, we can skip the OE range. The new helper lock_extents_for_read() will do a loop for the target range by: 1) Lock the full range 2) If there is no ordered extent in the remaining range, exit 3) If there is an ordered extent that we can skip Skip to the end of the OE, and continue checking We do not trigger writeback nor wait for the OE. 4) If there is an ordered extent that we cannot skip Unlock the whole extent range and start the ordered extent. And also update btrfs_start_ordered_extent() to add two more parameters: @nowriteback_start and @nowriteback_len, to prevent triggering flush for a certain range. 
This will allow us to handle the following case properly in the future: 16K page size, 4K btrfs block size: 0 4K 8K 12K 16K 20K 24K 28K 32K |/////////////////////////////||////////////////| | | |<-------------------- OE 2 ------------------->| |< OE 1 >| The folio has been written back before, thus we have an OE at [28K, 32K). Although the OE 1 finished its IO, the OE is not yet removed from IO tree. The folio got invalidated after writeback completed and before the ordered extent finished. And [16K, 24K) range is dirty and uptodate, caused by a block aligned buffered write (and future enhancements allowing btrfs to skip full folio read for such case). But writeback for folio 0 has began, thus it generated OE 2, covering range [0, 24K). Since the full folio 16K is not uptodate, if we want to read the folio, the existing btrfs_lock_and_flush_ordered_range() will dead lock, by: btrfs_read_folio() | Folio 16K is already locked |- btrfs_lock_and_flush_ordered_range() |- btrfs_start_ordered_extent() for range [16K, 24K) |- filemap_fdatawrite_range() for range [16K, 24K) |- extent_write_cache_pages() folio_lock() on folio 16K, deadlock. But now we will have the following sequence: btrfs_read_folio() | Folio 16K is already locked |- lock_extents_for_read() |- can_skip_ordered_extent() for range [16K, 24K) | Returned true, the range [16K, 24K) will be skipped. |- can_skip_ordered_extent() for range [28K, 32K) | Returned false. |- btrfs_start_ordered_extent() for range [28K, 32K) with [16K, 32K) as no writeback range No writeback for folio 16K will be triggered. And there will be no more possible deadlock on the same folio. Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 182 +++++++++++++++++++++++++++++++++++++++- fs/btrfs/ordered-data.c | 23 +++-- fs/btrfs/ordered-data.h | 8 +- 3 files changed, 205 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bdf75435b5f9..bfc1882ac439 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1075,6 +1075,184 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, return 0; } +/* + * Check if we can skip waiting the @ordered extent covering the block at @fileoff. + * + * @fileoff: Both input and output. + * Input as the file offset where the check should start at. + * Output as where the next check should start at, + * if the function returns true. + * + * Return true if we can skip to @fileoff. The caller needs to check the new + * @fileoff value to make sure it covers the full range, before skipping the + * full OE. + * + * Return false if we must wait for the ordered extent. + */ +static bool can_skip_one_ordered_range(struct btrfs_inode *inode, + struct btrfs_ordered_extent *ordered, + u64 *fileoff) +{ + const struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct folio *folio; + const u32 blocksize = fs_info->sectorsize; + u64 cur = *fileoff; + bool ret; + + folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT); + + /* + * We should have locked the folio(s) for range [start, end], thus + * there must be a folio and it must be locked. + */ + ASSERT(!IS_ERR(folio)); + ASSERT(folio_test_locked(folio)); + + /* + * There are several cases for the folio and OE combination: + * + * 1) Folio has no private flag + * The OE has all its IO done but not yet finished, and folio got + * invalidated. + * + * Have we have to wait for the OE to finish, as it may contain the + * to-be-inserted data checksum. 
+ * Without the data checksum inserted into the csum tree, read will + * just fail with missing csum. + */ + if (!folio_test_private(folio)) { + ret = false; + goto out; + } + + /* + * 2) The first block is DIRTY. + * + * This means the OE is created by some other folios whose file pos is + * before this one. And since we are holding the folio lock, the writeback + * of this folio cannot start. + * + * We must skip the whole OE, because it will never start until we + * finished our folio read and unlocked the folio. + */ + if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) { + u64 range_len = min(folio_pos(folio) + folio_size(folio), + ordered->file_offset + ordered->num_bytes) - cur; + + ret = true; + /* + * At least inside the folio, all the remaining blocks should + * also be dirty. + */ + ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len)); + *fileoff = ordered->file_offset + ordered->num_bytes; + goto out; + } + + /* + * 3) The first block is uptodate. + * + * At least the first block can be skipped, but we are still not fully + * sure. E.g. if the OE has some other folios in the range that cannot + * be skipped. + * So we return true and update @next_ret to the OE/folio boundary. + */ + if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { + u64 range_len = min(folio_pos(folio) + folio_size(folio), + ordered->file_offset + ordered->num_bytes) - cur; + + /* + * The whole range to the OE end or folio boundary should also + * be uptodate. + */ + ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len)); + ret = true; + *fileoff = cur + range_len; + goto out; + } + + /* + * 4) The first block is not uptodate. + * + * This means the folio is invalidated after the writeback was finished, + * but by some other operations (e.g. block aligned buffered write) the + * folio is inserted into filemap. + * Very much the same as case 1). + */ + ret = false; +out: + folio_put(folio); + return ret; +} + +static bool can_skip_ordered_extent(struct btrfs_inode *inode, + struct btrfs_ordered_extent *ordered, + u64 start, u64 end) +{ + const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1); + u64 cur = max(start, ordered->file_offset); + + while (cur < range_end) { + bool can_skip; + + can_skip = can_skip_one_ordered_range(inode, ordered, &cur); + if (!can_skip) + return false; + } + return true; +} + +/* + * Locking helper to make sure we get a stable view of extent maps for the + * involved range. + * + * This is for folio read paths (read and readahead), thus the involved range + * should have all the folios locked. + */ +static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state) +{ + u64 cur_pos; + + /* Caller must provide a valid @cached_state. */ + ASSERT(cached_state); + + /* The range must at least be page aligned, as all read paths are folio based. */ + ASSERT(IS_ALIGNED(start, PAGE_SIZE)); + ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE)); + +again: + lock_extent(&inode->io_tree, start, end, cached_state); + cur_pos = start; + while (cur_pos < end) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_ordered_range(inode, cur_pos, + end - cur_pos + 1); + /* + * No ordered extents in the range, and we hold the extent lock, + * no one can modify the extent maps in the range, we're safe to return. + */ + if (!ordered) + break; + + /* Check if we can skip waiting for the whole OE. 
*/ + if (can_skip_ordered_extent(inode, ordered, start, end)) { + cur_pos = min(ordered->file_offset + ordered->num_bytes, + end + 1); + btrfs_put_ordered_extent(ordered); + continue; + } + + /* Now wait for the OE to finish. */ + unlock_extent(&inode->io_tree, start, end, cached_state); + btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start); + btrfs_put_ordered_extent(ordered); + /* We have unlocked the whole range, restart from the beginning. */ + goto again; + } +} + int btrfs_read_folio(struct file *file, struct folio *folio) { struct btrfs_inode *inode = folio_to_inode(folio); @@ -1085,7 +1263,7 @@ int btrfs_read_folio(struct file *file, struct folio *folio) struct extent_map *em_cached = NULL; int ret; - btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + lock_extents_for_read(inode, start, end, &cached_state); ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); unlock_extent(&inode->io_tree, start, end, &cached_state); @@ -2370,7 +2548,7 @@ void btrfs_readahead(struct readahead_control *rac) struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; - btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + lock_extents_for_read(inode, start, end, &cached_state); while ((folio = readahead_folio(rac)) != NULL) btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 4aca7475fd82..03c945711003 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -842,10 +842,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, /* * Start IO and wait for a given ordered extent to finish. * - * Wait on page writeback for all the pages in the extent and the IO completion - * code to insert metadata into the btree corresponding to the extent. + * Wait on page writeback for all the pages in the extent but not in + * [@nowriteback_start, @nowriteback_start + @nowriteback_len) and the + * IO completion code to insert metadata into the btree corresponding to the extent. 
*/ -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) +void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry, + u64 nowriteback_start, u32 nowriteback_len) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; @@ -865,8 +867,19 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) * start IO on any dirty ones so the wait doesn't stall waiting * for the flusher thread to find them */ - if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) { + if (!nowriteback_len) { + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); + } else { + if (start < nowriteback_start) + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, + nowriteback_start - 1); + if (nowriteback_start + nowriteback_len < end) + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, + nowriteback_start + nowriteback_len, + end); + } + } if (!freespace_inode) btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index be36083297a7..1e6b0b182b29 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -192,7 +192,13 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); +void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry, + u64 nowriteback_start, u32 nowriteback_len); +static inline void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) +{ + return btrfs_start_ordered_extent_nowriteback(entry, 0, 0); +} + int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -- 2.51.0 From b2e743927fdd7ef83b865cb1a4ffd04faeecbfaa Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 30 Oct 2024 17:04:00 +1030 Subject: [PATCH 15/16] btrfs: make btrfs_do_readpage() to do block-by-block read Currently if btrfs has its block size (the older sector size) smaller than the page size, btrfs_do_readpage() will handle the range extent by extent, this is good for performance as it doesn't need to re-lookup the same extent map again and again. (Although get_extent_map() already does extra cached em check, thus the optimization is not that obvious.) This is totally fine and is a valid optimization, but it has an assumption that there is no partial uptodate range in the page. Meanwhile there is an incoming feature, requiring btrfs to skip the full page read if a buffered write range covers a full block but not a full page. In that case, we can have a page that is partially uptodate, and the current per-extent lookup cannot handle such case. So here we change btrfs_do_readpage() to do block-by-block read, this simplifies the following things: - Remove the need for @iosize variable Because we just use sectorsize as our increment. - Remove @pg_offset, and calculate it inside the loop when needed It's just offset_in_folio(). - Use a for() loop instead of a while() loop This will slightly reduce the read performance for subpage cases, but for the future where we need to skip already uptodate blocks, it should still be worth. 
For block size == page size, this brings no performance change. Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bfc1882ac439..0dffff4be5c4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -936,14 +936,11 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 start = folio_pos(folio); const u64 end = start + PAGE_SIZE - 1; - u64 cur = start; u64 extent_offset; u64 last_byte = i_size_read(inode); struct extent_map *em; int ret = 0; - size_t pg_offset = 0; - size_t iosize; - size_t blocksize = fs_info->sectorsize; + const size_t blocksize = fs_info->sectorsize; ret = set_folio_extent_mapped(folio); if (ret < 0) { @@ -954,24 +951,23 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { size_t zero_offset = offset_in_folio(folio, last_byte); - if (zero_offset) { - iosize = folio_size(folio) - zero_offset; - folio_zero_range(folio, zero_offset, iosize); - } + if (zero_offset) + folio_zero_range(folio, zero_offset, + folio_size(folio) - zero_offset); } bio_ctrl->end_io_func = end_bbio_data_read; begin_folio_read(fs_info, folio); - while (cur <= end) { + for (u64 cur = start; cur <= end; cur += blocksize) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; + unsigned long pg_offset = offset_in_folio(folio, cur); bool force_bio_submit = false; u64 disk_bytenr; u64 block_start; ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - iosize = folio_size(folio) - pg_offset; - folio_zero_range(folio, pg_offset, iosize); - end_folio_read(folio, true, cur, iosize); + folio_zero_range(folio, pg_offset, end - cur + 1); + end_folio_read(folio, true, cur, end - cur + 1); break; } em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); @@ -985,8 +981,6 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, compress_type = extent_map_compression(em); - iosize = min(extent_map_end(em) - cur, end - cur + 1); - iosize = ALIGN(iosize, blocksize); if (compress_type != BTRFS_COMPRESS_NONE) disk_bytenr = em->disk_bytenr; else @@ -1044,18 +1038,13 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - folio_zero_range(folio, pg_offset, iosize); - - end_folio_read(folio, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; + folio_zero_range(folio, pg_offset, blocksize); + end_folio_read(folio, true, cur, blocksize); continue; } /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) { - end_folio_read(folio, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; + end_folio_read(folio, true, cur, blocksize); continue; } @@ -1066,12 +1055,9 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); - submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize, + submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize, pg_offset); - cur = cur + iosize; - pg_offset += iosize; } - return 0; } -- 2.51.0 From 0d31ca6584f21821c708752d379871b9fce2dc48 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 30 
Oct 2024 17:04:02 +1030 Subject: [PATCH 16/16] btrfs: allow buffered write to avoid full page read if it's block aligned [BUG] Since the support of block size (sector size) < page size for btrfs, test case generic/563 fails with 4K block size and 64K page size: --- tests/generic/563.out 2024-04-25 18:13:45.178550333 +0930 +++ /home/adam/xfstests-dev/results//generic/563.out.bad 2024-09-30 09:09:16.155312379 +0930 @@ -3,7 +3,8 @@ read is in range write is in range write -> read/write -read is in range +read has value of 8388608 +read is NOT in range -33792 .. 33792 write is in range ... [CAUSE] The test case creates a 8MiB file, then does buffered write into the 8MiB using 4K block size, to overwrite the whole file. On 4K page sized systems, since the write range covers the full block and page, btrfs will not bother reading the page, just like what XFS and EXT4 do. But on 64K page sized systems, although the 4K sized write is still block aligned, it's not page aligned anymore, thus btrfs will read the full page, which will be accounted by cgroup and fail the test. As the test case itself expects such 4K block aligned write should not trigger any read. Such expected behavior is an optimization to reduce folio reads when possible, and unfortunately btrfs does not implement such optimization. [FIX] To skip the full page read, we need to do the following modification: - Do not trigger full page read as long as the buffered write is block aligned This is pretty simple by modifying the check inside prepare_uptodate_page(). - Skip already uptodate blocks during full page read Or we can lead to the following data corruption: 0 32K 64K |///////| | Where the file range [0, 32K) is dirtied by buffered write, the remaining range [32K, 64K) is not. When reading the full page, since [0,32K) is only dirtied but not written back, there is no data extent map for it, but a hole covering [0, 64k). If we continue reading the full page range [0, 64K), the dirtied range will be filled with 0 (since there is only a hole covering the whole range). This causes the dirtied range to get lost. With this optimization, btrfs can pass generic/563 even if the page size is larger than fs block size. 
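Distilled into a stand-alone sketch (the helper name write_needs_folio_read() is made up; the real check sits in prepare_uptodate_folio() below), the new decision is simply:

	#include <linux/kernel.h>   /* IS_ALIGNED() */
	#include <linux/types.h>

	static bool write_needs_folio_read(u64 pos, u64 len, u32 blocksize,
					   bool folio_uptodate, bool force_uptodate)
	{
		if (folio_uptodate)
			return false;
		/*
		 * Block-aligned writes skip the read even when they are not page
		 * aligned, e.g. a 4K write at a 4K boundary on a 64K page system.
		 */
		if (!force_uptodate &&
		    IS_ALIGNED(pos, blocksize) && IS_ALIGNED(pos + len, blocksize))
			return false;
		return true;
	}
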
Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++++ fs/btrfs/file.c | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0dffff4be5c4..2f1df6e29d4a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -970,6 +970,10 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, end_folio_read(folio, true, cur, end - cur + 1); break; } + if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { + end_folio_read(folio, true, cur, blocksize); + continue; + } em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { end_folio_read(folio, false, cur, end + 1 - cur); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f78cfe5dfa2c..643f101c7340 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -804,14 +804,15 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 { u64 clamp_start = max_t(u64, pos, folio_pos(folio)); u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); + const u32 blocksize = inode_to_fs_info(inode)->sectorsize; int ret = 0; if (folio_test_uptodate(folio)) return 0; if (!force_uptodate && - IS_ALIGNED(clamp_start, PAGE_SIZE) && - IS_ALIGNED(clamp_end, PAGE_SIZE)) + IS_ALIGNED(clamp_start, blocksize) && + IS_ALIGNED(clamp_end, blocksize)) return 0; ret = btrfs_read_folio(NULL, folio); -- 2.51.0