From 287d1cf303dc1521de531b63f4123ba9f5b792dc Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 24 Sep 2024 17:50:34 +0100 Subject: [PATCH 01/16] btrfs: remove pointless initialization at btrfs_qgroup_trace_extent() The qgroup record was allocated with kzalloc(), so it's pointless to set its old_roots member to NULL. Remove the assignment. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f7b05c7ee428..8773f989c3be 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2156,7 +2156,6 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, } record->num_bytes = num_bytes; - record->old_roots = NULL; ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr); if (ret) { -- 2.50.1 From 522945b3424297699a5d62e0f39f9537a2a451a2 Mon Sep 17 00:00:00 2001 From: Riyan Dhiman Date: Thu, 26 Sep 2024 13:20:34 +0530 Subject: [PATCH 02/16] btrfs: remove redundant stop_loop variable in scrub_stripe() The variable stop_loop was originally introduced in commit 625f1c8dc66d7 ("Btrfs: improve the loop of scrub_stripe"). It was initialized to 0 in commit 3b080b2564287 ("Btrfs: scrub raid56 stripes in the right way"). However, in a later commit 18d30ab961497 ("btrfs: scrub: use scrub_simple_mirror() to handle RAID56 data stripe scrub"), the code that modified stop_loop was removed, making the variable redundant. Currently, stop_loop is only initialized with 0 and is never used or modified within the scrub_stripe() function. As a result, this patch removes the stop_loop variable to clean up the code and eliminate unnecessary redundancy. This change has no impact on functionality, as stop_loop was never utilized in any meaningful way in the final version of the code. 
Reviewed-by: Filipe Manana Signed-off-by: Riyan Dhiman Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3a3427428074..43431065d981 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2256,7 +2256,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* Offset inside the chunk */ u64 offset; u64 stripe_logical; - int stop_loop = 0; /* Extent_path should be released by now. */ ASSERT(sctx->extent_path.nodes[0] == NULL); @@ -2370,14 +2369,8 @@ next: logical += increment; physical += BTRFS_STRIPE_LEN; spin_lock(&sctx->stat_lock); - if (stop_loop) - sctx->stat.last_physical = - map->stripes[stripe_index].physical + dev_stripe_len; - else - sctx->stat.last_physical = physical; + sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); - if (stop_loop) - break; } out: ret2 = flush_scrub_stripes(sctx); -- 2.50.1 From fa984c9e625e4d8375024e949c13fd3bb48a350b Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Tue, 24 Sep 2024 10:31:35 +0800 Subject: [PATCH 03/16] btrfs: remove unused page_to_inode and page_to_fs_info macros These macros are no longer used after the "btrfs: Cleaned up folio->page conversion" patch series [1] was applied, so remove them. 
[1]: https://patchwork.kernel.org/project/linux-btrfs/cover/20240828182908.3735344-1-lizetao1@huawei.com/ Reviewed-by: Neal Gompa Signed-off-by: Youling Tang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 785ec15c1b84..85e6a644aff2 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -878,12 +878,9 @@ struct btrfs_fs_info { #endif }; -#define page_to_inode(_page) (BTRFS_I(_Generic((_page), \ - struct page *: (_page))->mapping->host)) #define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \ struct folio *: (_folio))->mapping->host)) -#define page_to_fs_info(_page) (page_to_inode(_page)->root->fs_info) #define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info) #define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ -- 2.50.1 From 2144e1f23f986977acf0ff695a93931517c374d8 Mon Sep 17 00:00:00 2001 From: Shen Lichuan Date: Tue, 24 Sep 2024 11:09:44 +0800 Subject: [PATCH 04/16] btrfs: correct typos in multiple comments across various files Fix some confusing spelling errors that were currently identified, the details are as follows: block-group.c: 2800: uncompressible ==> incompressible extent-tree.c: 3131: EXTEMT ==> EXTENT extent_io.c: 3124: utlizing ==> utilizing extent_map.c: 1323: ealier ==> earlier extent_map.c: 1325: possiblity ==> possibility fiemap.c: 189: emmitted ==> emitted fiemap.c: 197: emmitted ==> emitted fiemap.c: 203: emmitted ==> emitted transaction.h: 36: trasaction ==> transaction volumes.c: 5312: filesysmte ==> filesystem zoned.c: 1977: trasnsaction ==> transaction Signed-off-by: Shen Lichuan Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/extent_map.c | 4 ++-- fs/btrfs/fiemap.c | 6 +++--- fs/btrfs/inode.c | 2 +- fs/btrfs/qgroup.c | 2 +- fs/btrfs/scrub.c | 2 +- fs/btrfs/space-info.c 
| 2 +- fs/btrfs/transaction.h | 2 +- fs/btrfs/volumes.c | 2 +- fs/btrfs/zoned.c | 2 +- 13 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 4423d8b716a5..4427c1b835e8 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2797,7 +2797,7 @@ next: * uncompressed data size, because the compression is only done * when writeback triggered and we don't know how much space we * are actually going to need, so we reserve the uncompressed - * size because the data may be uncompressible in the worst case. + * size because the data may be incompressible in the worst case. */ if (ret == 0) { bool used; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 604399e59a3d..ac8e97ed13f7 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -45,7 +45,7 @@ * * - Copy existing extents * - * This happens by re-using scrub facility, as scrub also iterates through + * This happens by reusing scrub facility, as scrub also iterates through * existing extents from commit root. * * Location: scrub_write_block_to_dev_replace() from diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d9f511babd89..79373f0ab6ce 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3144,7 +3144,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, break; } - /* Quick path didn't find the EXTEMT/METADATA_ITEM */ + /* Quick path didn't find the EXTENT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 89a7e85f2b38..6aa39e0be2e8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3186,7 +3186,7 @@ out: } /* * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, - * so it can be cleaned up without utlizing page->mapping. + * so it can be cleaned up without utilizing page->mapping. 
*/ set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 1d93e1202c33..a8b86f12b00d 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1326,9 +1326,9 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) * not possible to know which task made more progress because we can * cycle back to the first root and first inode if it's not the first * time the shrinker ran, see the above logic. Also a task that started - * later may finish ealier than another task and made less progress. So + * later may finish earlier than another task and made less progress. So * make this simple and update to the progress of the last task that - * finished, with the occasional possiblity of having two consecutive + * finished, with the occasional possibility of having two consecutive * runs of the shrinker process the same inodes. */ spin_lock(&fs_info->extent_map_shrinker_lock); diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index df7f09f3b02e..b80c07ad8c5e 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -186,7 +186,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * we have in the cache is the last delalloc range we * found while the file extent item we found can be * either for a whole delalloc range we previously - * emmitted or only a part of that range. + * emitted or only a part of that range. * * We have two cases here: * @@ -194,13 +194,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * cached extent's end. In this case just ignore the * current file extent item because we don't want to * overlap with previous ranges that may have been - * emmitted already; + * emitted already; * * 2) The file extent item starts behind the currently * cached extent but its end offset goes beyond the * end offset of the cached extent. 
We don't want to * overlap with a previous range that may have been - * emmitted already, so we emit the currently cached + * emitted already, so we emit the currently cached * extent and then partially store the current file * extent item's range in the cache, for the subrange * going the cached extent's end to the end of the diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 20336b1bf4a5..5d8da882d487 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5987,7 +5987,7 @@ again: * offset. This means that new entries created during readdir * are *guaranteed* to be seen in the future by that readdir. * This has broken buggy programs which operate on names as - * they're returned by readdir. Until we re-use freed offsets + * they're returned by readdir. Until we reuse freed offsets * we have this hack to stop new entries from being returned * under the assumption that they'll never reach this huge * offset. diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 8773f989c3be..4276c4607c56 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -469,7 +469,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) /* * If a qgroup exists for a subvolume ID, it is possible * that subvolume has been deleted, in which case - * re-using that ID would lead to incorrect accounting. + * reusing that ID would lead to incorrect accounting. * * Ensure that we skip any such subvol ids. * diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 43431065d981..e141132b5c8d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1954,7 +1954,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ASSERT(sctx->raid56_data_stripes); /* - * For data stripe search, we cannot re-use the same extent/csum paths, + * For data stripe search, we cannot reuse the same extent/csum paths, * as the data stripe bytenr may be smaller than previous extent. Thus * we have to use our own extent/csum paths. 
*/ diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d5a9cd8a4fd8..ee23fae73f47 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1279,7 +1279,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * If we are freeing inodes, we want to make sure all delayed iputs have * completed, because they could have been on an inode with i_nlink == 0, and * thus have been truncated and freed up space. But again this space is not - * immediately re-usable, it comes in the form of a delayed ref, which must be + * immediately reusable, it comes in the form of a delayed ref, which must be * run and then the transaction must be committed. * * COMMIT_TRANS diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index dd9ce9b9f69e..184fa5c0062a 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -33,7 +33,7 @@ struct btrfs_path; */ #define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1) -/* Radix-tree tag for roots that are part of the trasaction. */ +/* Radix-tree tag for roots that are part of the transaction. */ #define BTRFS_ROOT_TRANS_TAG 0 enum btrfs_trans_state { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5895397364aa..82f3a2ed2d9c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5431,7 +5431,7 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; - /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */ + /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. 
*/ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, ctl->stripe_size) + ctl->nparity, diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 69d03feea4e0..dbcbf754d284 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1973,7 +1973,7 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, if (block_group->meta_write_pointer > eb->start) return -EBUSY; - /* If for_sync, this hole will be filled with trasnsaction commit. */ + /* If for_sync, this hole will be filled with transaction commit. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) return -EAGAIN; return -EBUSY; -- 2.50.1 From 506be4d5657569e7683cad9de17f980d264a60ec Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 2 Oct 2024 12:11:48 +0200 Subject: [PATCH 05/16] btrfs: tests: add selftests for raid-stripe-tree Add first stash of very basic self tests for the RAID stripe-tree. More test cases will follow exercising the tree. 
Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/Makefile | 3 +- fs/btrfs/raid-stripe-tree.c | 5 +- fs/btrfs/raid-stripe-tree.h | 5 + fs/btrfs/tests/btrfs-tests.c | 4 + fs/btrfs/tests/btrfs-tests.h | 2 + fs/btrfs/tests/raid-stripe-tree-tests.c | 317 ++++++++++++++++++++++++ fs/btrfs/volumes.c | 6 +- fs/btrfs/volumes.h | 5 + 8 files changed, 341 insertions(+), 6 deletions(-) create mode 100644 fs/btrfs/tests/raid-stripe-tree-tests.c diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 87617f2968bc..3cfc440c636c 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -43,4 +43,5 @@ btrfs-$(CONFIG_FS_VERITY) += verity.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ - tests/free-space-tree-tests.o tests/extent-map-tests.o + tests/free-space-tree-tests.o tests/extent-map-tests.o \ + tests/raid-stripe-tree-tests.o diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 4c859b550f6c..b7787a8e4af2 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -108,8 +108,9 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, return ret; } -static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, - struct btrfs_io_context *bioc) +EXPORT_FOR_TESTS +int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_key stripe_key; diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index 1ac1c21aac2f..541836421778 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -28,6 +28,11 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *ordered_extent); +#ifdef 
CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc); +#endif + static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info, u64 map_type) { diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index ce50847e1e01..e607b5d52fb1 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -29,6 +29,7 @@ const char *test_error[] = { [TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group", [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map", + [TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context", }; static const struct super_operations btrfs_test_super_ops = { @@ -291,6 +292,9 @@ int btrfs_run_sanity_tests(void) ret = btrfs_test_free_space_tree(sectorsize, nodesize); if (ret) goto out; + ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize); + if (ret) + goto out; } } ret = btrfs_test_extent_map(); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index dc2f2ab15fa5..b524ecf2f452 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -24,6 +24,7 @@ enum { TEST_ALLOC_BLOCK_GROUP, TEST_ALLOC_EXTENT_MAP, TEST_ALLOC_CHUNK_MAP, + TEST_ALLOC_IO_CONTEXT, }; extern const char *test_error[]; @@ -37,6 +38,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); int btrfs_test_inodes(u32 sectorsize, u32 nodesize); int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); +int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize); int btrfs_test_extent_map(void); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c new file mode 100644 index 000000000000..b8013ab13c43 --- /dev/null +++ 
b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 Western Digital Corporation or its affiliates. + */ + +#include <linux/sizes.h> +#include "../fs.h" +#include "../disk-io.h" +#include "../transaction.h" +#include "../volumes.h" +#include "../raid-stripe-tree.h" +#include "btrfs-tests.h" + +#define RST_TEST_NUM_DEVICES (2) +#define RST_TEST_RAID1_TYPE (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1) + +typedef int (*test_func_t)(struct btrfs_trans_handle *trans); + +static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices, + u64 devid) +{ + struct btrfs_device *dev; + + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + if (dev->devid == devid) + return dev; + } + + return NULL; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * overwrite the whole range giving it new physical address at an offset of 1G. + * The intent of this test is to exercise the 'update_raid_extent_item()' + * function called by btrfs_insert_one_raid_extent(). 
+ */ +static int test_create_update_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = SZ_1G + logical + i * 
SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("updating RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical + SZ_1G) { + test_err("invalid physical address, expected %llu, got %llu", + logical + SZ_1G, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a simple 64K RST write on a 2 disk RAID1 at a logical address of 1M. + * The "physical" copy on device 0 is at 1M, on device 1 it is at 1G+1M. 
+ */ +static int test_simple_create_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + bioc->map_type = map_type; + bioc->size = SZ_64K; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +static const test_func_t tests[] = { + test_simple_create_delete, + test_create_update_delete, +}; + +static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) +{ + struct 
btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root = NULL; + int ret; + + fs_info = btrfs_alloc_dummy_fs_info(sectorsize, nodesize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + ret = -ENOMEM; + goto out; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(root); + goto out; + } + btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, + BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE); + root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + fs_info->stripe_root = root; + root->fs_info->tree_root = root; + + root->node = alloc_test_extent_buffer(root->fs_info, nodesize); + if (IS_ERR(root->node)) { + test_std_err(TEST_ALLOC_EXTENT_BUFFER); + ret = PTR_ERR(root->node); + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 2 * nodesize; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_device *dev; + + dev = btrfs_alloc_dummy_device(fs_info); + if (IS_ERR(dev)) { + test_err("cannot allocate device"); + ret = PTR_ERR(dev); + goto out; + } + dev->devid = i; + } + + btrfs_init_dummy_trans(&trans, root->fs_info); + ret = test(&trans); + if (ret) + goto out; + +out: + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); + + return ret; +} + +int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize) +{ + int ret = 0; + + test_msg("running raid-stripe-tree tests"); + for (int i = 0; i < ARRAY_SIZE(tests); i++) { + ret = run_test(tests[i], sectorsize, nodesize); + if (ret) { + test_err("test-case %ps failed with %d\n", tests[i], ret); + goto out; + } + } + +out: + return ret; +} diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 82f3a2ed2d9c..edb2dd6bb4f4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6041,9 +6041,9 @@ static int find_live_mirror(struct 
btrfs_fs_info *fs_info, return preferred_mirror; } -static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, - u64 logical, - u16 total_stripes) +EXPORT_FOR_TESTS +struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes) { struct btrfs_io_context *bioc; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 26e35fc1c8fd..3ebb3c2732b0 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -840,4 +840,9 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes); +#endif + #endif -- 2.50.1 From 004641bd06405611f5636a0f923c2b4da3e3ef07 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 3 Oct 2024 21:33:19 +0100 Subject: [PATCH 06/16] btrfs: remove unused btrfs_free_squota_rsv() btrfs_free_squota_rsv() was added in commit e85a0adacf17 ("btrfs: ensure releasing squota reserve on head refs") but has remained unused since then. Remove it as we don't seem to need it and it was probably a leftover. Reviewed-by: Qu Wenruo Signed-off-by: Dr. 
David Alan Gilbert Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 11 ----------- fs/btrfs/qgroup.h | 1 - 2 files changed, 12 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4276c4607c56..5a42fefd3d11 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4893,17 +4893,6 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) xa_destroy(&trans->delayed_refs.dirty_extents); } -void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) -{ - if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) - return; - - if (!is_fstree(root)) - return; - - btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA); -} - int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, const struct btrfs_squota_delta *delta) { diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index c36019abc82f..afb184c4d744 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -449,7 +449,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info); -void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes); int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, const struct btrfs_squota_delta *delta); -- 2.50.1 From 441ffe8a98302f96e497272df1ff6aacd8842d6c Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 3 Oct 2024 15:27:26 +0100 Subject: [PATCH 07/16] btrfs: remove unused btrfs_is_parity_mirror() btrfs_is_parity_mirror() has been unused since commit 4886ff7b50f6 ("btrfs: introduce a new helper to submit write bio for repair"). Remove it as the code was refactored and we don't need the helper anymore. Reviewed-by: Qu Wenruo Signed-off-by: Dr. 
David Alan Gilbert Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 18 ------------------ fs/btrfs/volumes.h | 2 -- 2 files changed, 20 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index edb2dd6bb4f4..f5bfb4c8adb2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5963,24 +5963,6 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } -int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) -{ - struct btrfs_chunk_map *map; - int ret = 0; - - if (!btrfs_fs_incompat(fs_info, RAID56)) - return 0; - - map = btrfs_get_chunk_map(fs_info, logical, len); - - if (!WARN_ON(IS_ERR(map))) { - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - ret = 1; - btrfs_free_chunk_map(map); - } - return ret; -} - static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 3ebb3c2732b0..3a416b1bc24c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -741,8 +741,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); -int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, - u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map); -- 2.50.1 From b628c139519ae0e5453e5327161a41bae966201d Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 3 Oct 2024 15:27:27 +0100 Subject: [PATCH 08/16] btrfs: remove unused btrfs_try_tree_write_lock() btrfs_try_tree_write_lock() has been unused since commit 50b21d7a066f ("btrfs: submit a writeback bio per extent_buffer"). Remove it as we don't need it anymore. 
Reviewed-by: Christoph Hellwig Reviewed-by: Qu Wenruo Signed-off-by: Dr. David Alan Gilbert Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/locking.c | 15 --------------- fs/btrfs/locking.h | 1 - include/trace/events/btrfs.h | 1 - 3 files changed, 17 deletions(-) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 6a0b7abb5bd9..9a7a7b723305 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -161,21 +161,6 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) return 0; } -/* - * Try-lock for write. - * - * Return 1 if the rwlock has been taken, 0 otherwise - */ -int btrfs_try_tree_write_lock(struct extent_buffer *eb) -{ - if (down_write_trylock(&eb->lock)) { - btrfs_set_eb_lock_owner(eb, current->pid); - trace_btrfs_try_tree_write_lock(eb); - return 1; - } - return 0; -} - /* * Release read lock. */ diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 3c15c75e0582..46c8be2afab1 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -180,7 +180,6 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb) void btrfs_tree_read_unlock(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); -int btrfs_try_tree_write_lock(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 8d2ff32fb3b0..bd515415ea8b 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2344,7 +2344,6 @@ DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_unlock_blocking); DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_read); DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_write); DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_read_lock); -DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_write_lock); DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_lock_atomic); 
DECLARE_EVENT_CLASS(btrfs__space_info_update, -- 2.50.1 From 00c5135dceaf57b212a808444d719d321444c819 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Oct 2024 08:47:48 +0930 Subject: [PATCH 09/16] btrfs: remove the dirty_page local variable Inside btrfs_buffered_write(), we have a local variable @dirty_pages, recording the number of pages we dirtied in the current iteration. However, we do not really need that variable, since it can be calculated from @pos and @copied. In fact there is already a problem inside the short copy path, where we use @dirty_pages to calculate the range we need to release. But that usage assumes sectorsize == PAGE_SIZE, which is no longer true. Instead of keeping @dirty_pages and causing incorrect usage, just calculate the number of dirtied pages inside btrfs_dirty_pages(). Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/file.c | 19 +++++++------------ fs/btrfs/file.h | 2 +- fs/btrfs/free-space-cache.c | 3 +-- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4fb521d91b06..9555a3485670 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -124,12 +124,14 @@ static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, * - Update inode size for past EOF write */ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, + loff_t pos, size_t write_bytes, struct extent_state **cached, bool noreserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; int i; + const int num_pages = (round_up(pos + write_bytes, PAGE_SIZE) - + round_down(pos, PAGE_SIZE)) >> PAGE_SHIFT; u64 num_bytes; u64 start_pos; u64 end_of_last_block; @@ -1242,7 +1244,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) offset); size_t num_pages; size_t reserve_bytes; - size_t dirty_pages; size_t copied; size_t dirty_sectors; size_t num_sectors; @@ 
-1361,11 +1362,8 @@ again: if (copied == 0) { force_page_uptodate = true; dirty_sectors = 0; - dirty_pages = 0; } else { force_page_uptodate = false; - dirty_pages = DIV_ROUND_UP(copied + offset, - PAGE_SIZE); } if (num_sectors > dirty_sectors) { @@ -1375,13 +1373,10 @@ again: btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes, true); } else { - u64 __pos; - - __pos = round_down(pos, - fs_info->sectorsize) + - (dirty_pages << PAGE_SHIFT); + u64 release_start = round_up(pos + copied, + fs_info->sectorsize); btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, __pos, + data_reserved, release_start, release_bytes, true); } } @@ -1390,7 +1385,7 @@ again: fs_info->sectorsize); ret = btrfs_dirty_pages(BTRFS_I(inode), pages, - dirty_pages, pos, copied, + pos, copied, &cached_state, only_release_metadata); /* diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index 912254e653cf..c23d0bf42598 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -35,7 +35,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); int btrfs_release_file(struct inode *inode, struct file *file); int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, + loff_t pos, size_t write_bytes, struct extent_state **cached, bool noreserve); int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end); int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f4bcb2530660..0d2db205b9f6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1457,8 +1457,7 @@ static int __btrfs_write_out_cache(struct inode *inode, io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. 
*/ - ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, - io_ctl->num_pages, 0, i_size_read(inode), + ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, 0, i_size_read(inode), &cached_state, false); if (ret) goto out_nospc; -- 2.50.1 From 7f91c6a78a0e0125e69f6aef05914aeb2d91a2eb Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Oct 2024 08:47:49 +0930 Subject: [PATCH 10/16] btrfs: simplify the page uptodate preparation for prepare_pages() Currently inside prepare_pages(), we handle the leading and trailing page differently, and skip the middle pages (if any). This is to avoid reading pages which are fully covered by the dirty range. Refactor the code by moving all checks (alignment check, range check, force read check) into prepare_uptodate_page(), so that prepare_pages() only needs to iterate over all the pages unconditionally. And since we're here, also update prepare_uptodate_page() to use the folio API rather than the old page API. Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/file.c | 64 +++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9555a3485670..160d77f8eb6f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -858,36 +858,42 @@ out: */ static int prepare_uptodate_page(struct inode *inode, struct page *page, u64 pos, - bool force_uptodate) + u64 len, bool force_uptodate) { struct folio *folio = page_folio(page); + u64 clamp_start = max_t(u64, pos, folio_pos(folio)); + u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); int ret = 0; - if (((pos & (PAGE_SIZE - 1)) || force_uptodate) && - !PageUptodate(page)) { - ret = btrfs_read_folio(NULL, folio); - if (ret) - return ret; - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - return -EIO; - } + if (folio_test_uptodate(folio)) + return 0; - /* - * Since btrfs_read_folio() will unlock the folio before it - * 
returns, there is a window where btrfs_release_folio() can be - * called to release the page. Here we check both inode - * mapping and PagePrivate() to make sure the page was not - * released. - * - * The private flag check is essential for subpage as we need - * to store extra bitmap using folio private. - */ - if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { - unlock_page(page); - return -EAGAIN; - } + if (!force_uptodate && + IS_ALIGNED(clamp_start, PAGE_SIZE) && + IS_ALIGNED(clamp_end, PAGE_SIZE)) + return 0; + + ret = btrfs_read_folio(NULL, folio); + if (ret) + return ret; + folio_lock(folio); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + return -EIO; + } + + /* + * Since btrfs_read_folio() will unlock the folio before it returns, + * there is a window where btrfs_release_folio() can be called to + * release the page. Here we check both inode mapping and page + * private to make sure the page was not released. + * + * The private flag check is essential for subpage as we need to store + * extra bitmap using folio private. + */ + if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { + folio_unlock(folio); + return -EAGAIN; } return 0; } @@ -949,12 +955,8 @@ again: goto fail; } - if (i == 0) - ret = prepare_uptodate_page(inode, pages[i], pos, - force_uptodate); - if (!ret && i == num_pages - 1) - ret = prepare_uptodate_page(inode, pages[i], - pos + write_bytes, false); + ret = prepare_uptodate_page(inode, pages[i], pos, write_bytes, + force_uptodate); if (ret) { put_page(pages[i]); if (!nowait && ret == -EAGAIN) { -- 2.50.1 From 61b4d75e3c5c10d523d95e19728dd5a6e2fd58f9 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 4 Oct 2024 15:19:01 +0200 Subject: [PATCH 11/16] btrfs: handle empty list of NOCOW ordered extents with checksum list Currently we BUG_ON() in btrfs_finish_one_ordered() if we are finishing an ordered extent that is flagged as NOCOW, but its checksum list is not empty. 
This is clearly a logic error which we can recover from by aborting the transaction. For developer builds which enable CONFIG_BTRFS_ASSERT, also ASSERT() that the list is empty. Suggested-by: Filipe Manana Reviewed-by: Qu Wenruo Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5d8da882d487..8da5e47db751 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3088,7 +3088,12 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { /* Logic error */ - BUG_ON(!list_empty(&ordered_extent->list)); + ASSERT(list_empty(&ordered_extent->list)); + if (!list_empty(&ordered_extent->list)) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); -- 2.50.1 From 5e72aabc1fffe9d713276974b0533d10354d0a13 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 7 Oct 2024 13:52:47 +0200 Subject: [PATCH 12/16] btrfs: return ENODATA in case RST lookup fails In case a lookup in the RAID stripe-tree fails, return ENODATA instead of ENOENT to better distinguish stripe-tree lookups from other code paths where we return ENOENT. 
Suggested-by: Josef Bacik Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index b7787a8e4af2..41970bbdb05f 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -234,7 +234,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, found_end = found_logical + found_length; if (found_logical > end) { - ret = -ENOENT; + ret = -ENODATA; goto out; } @@ -280,10 +280,10 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, } /* If we're here, we haven't found the requested devid in the stripe. */ - ret = -ENOENT; + ret = -ENODATA; out: if (ret > 0) - ret = -ENOENT; + ret = -ENODATA; if (ret && ret != -EIO && !stripe->rst_search_commit_root) { btrfs_debug(fs_info, "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", -- 2.50.1 From 9fde8a67b9786f31cbc77c23b0e468d259ce82d1 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 7 Oct 2024 13:52:48 +0200 Subject: [PATCH 13/16] btrfs: scrub: skip initial RST lookup errors Performing the initial extent sector read on a RAID stripe-tree backed filesystem with pre-allocated extents will cause the RAID stripe-tree lookup code to return ENODATA, as pre-allocated extents do not have any on-disk bytes and thus no RAID stripe-tree entries. But the current scrub read code marks these extents as errors, because the lookup fails. If btrfs_map_block() returns -ENODATA, it means that the call to btrfs_get_raid_extent_offset() returned -ENODATA, because there is no entry for the corresponding range in the RAID stripe-tree. But as this range is in the extent tree it means we've hit a pre-allocated extent. In this case, don't mark the sector in the stripe's error bitmaps as faulty and carry on to the next. 
Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e141132b5c8d..52e09f307462 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1704,8 +1704,18 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, &stripe_len, &bioc, &io_stripe, &mirror); btrfs_put_bioc(bioc); if (err < 0) { - set_bit(i, &stripe->io_error_bitmap); - set_bit(i, &stripe->error_bitmap); + if (err != -ENODATA) { + /* + * Earlier btrfs_get_raid_extent_offset() + * returned -ENODATA, which means there's + * no entry for the corresponding range + * in the stripe tree. But if it's in + * the extent tree, then it's a preallocated + * extent and not an error. + */ + set_bit(i, &stripe->io_error_bitmap); + set_bit(i, &stripe->error_bitmap); + } continue; } -- 2.50.1 From dd4028315e5dfdfe9ecde68db90681313822f906 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 7 Oct 2024 15:55:43 +0100 Subject: [PATCH 14/16] btrfs: qgroup: run delayed iputs after ordered extent completion When trying to flush qgroups in order to release space we run delayed iputs in order to release space from recently deleted files (their link count reached zero), and then we start delalloc and wait for any existing ordered extents to complete. 
However, there's a time window here where we end up not doing the final iput on a deleted file which could release necessary space: 1) An unlink operation starts; 2) During the unlink, or right before it completes, delalloc is flushed and an ordered extent is created; 3) When the ordered extent is created, the inode's ref count is incremented (with igrab() at alloc_ordered_extent()); 4) When the unlink finishes it doesn't drop the last reference on the inode and so it doesn't trigger inode eviction to delete all of the inode's items in its root and drop all references on its data extents; 5) Another task enters try_flush_qgroup() to try to release space and runs all delayed iputs, but there's no delayed iput yet for that deleted file because the ordered extent hasn't completed yet; 6) Then at try_flush_qgroup() we wait for the ordered extent to complete and that results in adding a delayed iput at btrfs_put_ordered_extent() when called from btrfs_finish_one_ordered(); 7) Adding the delayed iput results in waking the cleaner kthread if it's not running already. However, it may take some time for it to be scheduled, or it may be running but busy running auto defrag, dropping deleted snapshots or doing other work, so by the time we return from try_flush_qgroup() the space for the deleted file isn't released. Improve on this by running delayed iputs only after flushing delalloc and waiting for ordered extent completion. 
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5a42fefd3d11..3ba650c06bfe 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4195,13 +4195,20 @@ static int try_flush_qgroup(struct btrfs_root *root) return 0; } - btrfs_run_delayed_iputs(root->fs_info); - btrfs_wait_on_delayed_iputs(root->fs_info); ret = btrfs_start_delalloc_snapshot(root, true); if (ret < 0) goto out; btrfs_wait_ordered_extents(root, U64_MAX, NULL); + /* + * After waiting for ordered extents run delayed iputs in order to free + * space from unlinked files before committing the current transaction, + * as ordered extents may have been holding the last reference of an + * inode and they add a delayed iput when they complete. + */ + btrfs_run_delayed_iputs(root->fs_info); + btrfs_wait_on_delayed_iputs(root->fs_info); + ret = btrfs_commit_current_transaction(root); out: clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); -- 2.50.1 From 0fcaf926ad7650f3f4badaca355e59ebc2773045 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 6 Oct 2024 10:36:20 +1030 Subject: [PATCH 15/16] btrfs: remove btrfs_set_range_writeback() The function btrfs_set_range_writeback() was originally a callback for metadata and data, to mark a range with the writeback flag. Then it was converted into a common function call for both metadata and data. From the very beginning, the function had only been called on a full page, and was later converted to handle a range inside a page. But it never needed to handle multiple pages, and since commit 8189197425e7 ("btrfs: refactor __extent_writepage_io() to do sector-by-sector submission") the function was only called on a sector-by-sector basis. This makes the function unnecessary, and it can be converted to a simple btrfs_folio_set_writeback() call instead. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 1 - fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 22 ---------------------- 3 files changed, 1 insertion(+), 24 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index e152fde888fc..c514bab532fa 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -577,7 +577,6 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state struct extent_state *other); void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split); -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); void btrfs_evict_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6aa39e0be2e8..bfa745258e9b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1360,7 +1360,7 @@ static int submit_one_sector(struct btrfs_inode *inode, * a folio for a range already written to disk. */ btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); - btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1); + btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); /* * Above call should set the whole folio with writeback flag, even * just for a single subpage sector. 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8da5e47db751..3404e7043dac 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8941,28 +8941,6 @@ out_inode: return finish_open_simple(file, ret); } -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct folio *folio; - u32 len; - - ASSERT(end + 1 - start <= U32_MAX); - len = end + 1 - start; - while (index <= end_index) { - folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); - ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */ - - /* This is for data, which doesn't yet support larger folio. */ - ASSERT(folio_order(folio) == 0); - btrfs_folio_set_writeback(fs_info, folio, start, len); - folio_put(folio); - index++; - } -} - int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type) { -- 2.50.1 From 2fac7e163d24f77476399c5e646edcf86db57d0c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Oct 2024 16:30:50 +0200 Subject: [PATCH 16/16] btrfs: zstd: assert the timer pointer in callback Make sure we got the right timer struct for the zstd workspace reclaim work. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/zstd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 15f8a83165a3..5232b56d5892 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -111,6 +111,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; + ASSERT(timer == &wsm.timer); + spin_lock(&wsm.lock); if (list_empty(&wsm.lru_list)) { -- 2.50.1