From bdf1660b221abb8660a809ab8de6b6b0e6ff0320 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:52:28 +0000 Subject: [PATCH 01/16] btrfs: raid-stripe-tree: remove unnecessary call to btrfs_mark_buffer_dirty() The call to btrfs_mark_buffer_dirty() at update_raid_extent_item() is not necessary as we have a path setup for writing with btrfs_search_slot() having a 'cow' argument set to 1. This just makes the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove it. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 45b823a0913a..0bf3c032d9dc 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -169,7 +169,6 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot), item_size); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); return ret; -- 2.51.0 From 5a8293a1cc466e2effce6fd3a61880acea37adf2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:56:38 +0000 Subject: [PATCH 02/16] btrfs: relocation: remove unnecessary calls to btrfs_mark_buffer_dirty() We have several places explicitly calling btrfs_mark_buffer_dirty() but that is not necessarily since the target leaf came from a path that was obtained for a btree search function that modifies the btree, something like btrfs_insert_empty_item() or anything else that ends up calling btrfs_search_slot() with a value of 1 for its 'cow' argument. These just make the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove them. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index cdd9a7b15a11..d4100e58172f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -856,7 +856,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans, u32 i; int ret = 0; int first = 1; - int dirty = 0; if (rc->stage != UPDATE_DATA_PTRS) return 0; @@ -936,7 +935,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans, } btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); - dirty = 1; key.offset -= btrfs_file_extent_offset(leaf, fi); ref.action = BTRFS_ADD_DELAYED_REF; @@ -967,8 +965,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans, break; } } - if (dirty) - btrfs_mark_buffer_dirty(trans, leaf); if (inode) btrfs_add_delayed_iput(inode); return ret; @@ -1161,13 +1157,11 @@ again: */ btrfs_set_node_blockptr(parent, slot, new_bytenr); btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); - btrfs_mark_buffer_dirty(trans, parent); btrfs_set_node_blockptr(path->nodes[level], path->slots[level], old_bytenr); btrfs_set_node_ptr_generation(path->nodes[level], path->slots[level], old_ptr_gen); - btrfs_mark_buffer_dirty(trans, path->nodes[level]); ref.action = BTRFS_ADD_DELAYED_REF; ref.bytenr = old_bytenr; @@ -3728,7 +3722,6 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC); - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; -- 2.51.0 From 65733e8d6cc334565891719ed6bc9239695d63d9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:57:56 +0000 Subject: [PATCH 03/16] btrfs: root-tree: remove unnecessary calls to btrfs_mark_buffer_dirty() We have several places explicitly calling btrfs_mark_buffer_dirty() but that is not necessarily since the target leaf came from a path that was obtained for a btree search function that modifies the btree, something like btrfs_insert_empty_item() or anything else that ends up calling btrfs_search_slot() with a value of 1 for its 'cow' argument. These just make the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove them. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/root-tree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 33962671a96c..e22e6b06927a 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -197,7 +197,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); write_extent_buffer(l, item, ptr, sizeof(*item)); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); out: btrfs_free_path(path); return ret; @@ -447,7 +446,6 @@ again: btrfs_set_root_ref_name_len(leaf, ref, name->len); ptr = (unsigned long)(ref + 1); write_extent_buffer(leaf, name->name, ptr, name->len); - btrfs_mark_buffer_dirty(trans, leaf); if (key.type == BTRFS_ROOT_BACKREF_KEY) { btrfs_release_path(path); -- 2.51.0 From c9a4390707c8b6a6f022526a5618e76d24bc6cb2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 12:59:51 +0000 Subject: [PATCH 04/16] btrfs: uuid-tree: remove unnecessary call to btrfs_mark_buffer_dirty() The call to btrfs_mark_buffer_dirty() at btrfs_uuid_tree_add() is not necessary as we have a path setup for writing with btrfs_search_slot() having a 'cow' argument set to 1 (through btrfs_insert_empty_item()). This just makes the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove it. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/uuid-tree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index aca2861f2187..17b5e81123a1 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -140,8 +140,6 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ ret = 0; subid_le = cpu_to_le64(subid_cpu); write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); - btrfs_mark_buffer_dirty(trans, eb); - out: btrfs_free_path(path); return ret; -- 2.51.0 From 1ca4e15f41f2e3ad99016b3b13b3713af9f40eb2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 13:02:59 +0000 Subject: [PATCH 05/16] btrfs: volumes: remove unnecessary calls to btrfs_mark_buffer_dirty() We have several places explicitly calling btrfs_mark_buffer_dirty() but that is not necessarily since the target leaf came from a path that was obtained for a btree search function that modifies the btree, something like btrfs_insert_empty_item() or anything else that ends up calling btrfs_search_slot() with a value of 1 for its 'cow' argument. These just make the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove them. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a58cf494b3d0..ccbfea163390 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2046,7 +2046,6 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, ptr = btrfs_device_fsid(dev_item); write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, ptr, BTRFS_FSID_SIZE); - btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -2742,11 +2741,9 @@ next_slot: device = btrfs_find_device(fs_info->fs_devices, &args); BUG_ON(!device); /* Logic error */ - if (device->fs_devices->seeding) { + if (device->fs_devices->seeding) btrfs_set_device_generation(leaf, dev_item, device->generation); - btrfs_mark_buffer_dirty(trans, leaf); - } path->slots[0]++; goto next_slot; @@ -3039,8 +3036,6 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_device_get_disk_total_bytes(device)); btrfs_set_device_bytes_used(leaf, dev_item, btrfs_device_get_bytes_used(device)); - btrfs_mark_buffer_dirty(trans, leaf); - out: btrfs_free_path(path); return ret; @@ -3749,10 +3744,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info, btrfs_set_balance_meta(leaf, item, &disk_bargs); btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); btrfs_set_balance_sys(leaf, item, &disk_bargs); - btrfs_set_balance_flags(leaf, item, bctl->flags); - - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); err = btrfs_commit_transaction(trans); @@ -7700,8 +7692,6 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) btrfs_set_dev_stats_value(eb, ptr, i, btrfs_dev_stat_read(device, i)); - btrfs_mark_buffer_dirty(trans, eb); - out: btrfs_free_path(path); return ret; -- 2.51.0 From 74973b45a69b8f805e12e50cf85fa2ad500754f0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Dec 2024 13:04:01 +0000 Subject: [PATCH 06/16] btrfs: xattr: remove unnecessary call to btrfs_mark_buffer_dirty() The call to btrfs_mark_buffer_dirty() at btrfs_setxattr() is not necessary as we have a path setup for writing with btrfs_search_slot() having a 'cow' argument set to 1. This just makes the code more verbose, confusing and add a little extra overhead and well as increase the module's text size, so remove it. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/xattr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index bc18710d1dcf..3e0edbcf73e1 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -204,7 +204,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, btrfs_set_dir_data_len(leaf, di, size); data_ptr = ((unsigned long)(di + 1)) + name_len; write_extent_buffer(leaf, value, data_ptr, size); - btrfs_mark_buffer_dirty(trans, leaf); } else { /* * Insert, and we had space for the xattr, so path->slots[0] is -- 2.51.0 From a5019b70704a8cbea4c295ae7a61abd87300ff29 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:30 +0800 Subject: [PATCH 07/16] btrfs: initialize fs_devices->fs_info earlier in btrfs_init_devices_late() Currently, fs_devices->fs_info is initialized in btrfs_init_devices_late(), but this occurs too late for find_live_mirror(), which is invoked by load_super_root() much earlier than btrfs_init_devices_late(). Fix this by moving the initialization to open_ctree(), before load_super_root(). Reviewed-by: Naohiro Aota Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 + fs/btrfs/volumes.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 04d68f253940..4928bf2cd07f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3388,6 +3388,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; + fs_info->fs_devices->fs_info = fs_info; /* * Handle the space caching options appropriately now that we have the diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ccbfea163390..e5d5cfb2d239 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7515,8 +7515,6 @@ int btrfs_init_devices_late(struct btrfs_fs_info *fs_info) struct btrfs_device *device; int ret = 0; - fs_devices->fs_info = fs_info; - mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) device->fs_info = fs_info; -- 2.51.0 From 83be7f8b9c24bd040a348577c7c84fd08911707f Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:31 +0800 Subject: [PATCH 08/16] btrfs: sysfs: refactor output formatting in btrfs_read_policy_show() Refactor the logic in btrfs_read_policy_show() for easier extension with more balancing methods. Streamline the space and bracket handling around the active policy without altering the functional output. This is in preparation to add more methods. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 7f09b6c9cc2d..ab18b4e59468 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1316,14 +1316,16 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, int i; for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (policy == i) - ret += sysfs_emit_at(buf, ret, "%s[%s]", - (ret == 0 ? "" : " "), - btrfs_read_policy_name[i]); - else - ret += sysfs_emit_at(buf, ret, "%s%s", - (ret == 0 ? "" : " "), - btrfs_read_policy_name[i]); + if (ret != 0) + ret += sysfs_emit_at(buf, ret, " "); + + if (i == policy) + ret += sysfs_emit_at(buf, ret, "["); + + ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); + + if (i == policy) + ret += sysfs_emit_at(buf, ret, "]"); } ret += sysfs_emit_at(buf, ret, "\n"); -- 2.51.0 From 38cae63137d5e13dc3c2ba88c4f393be4a6bf4bb Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:32 +0800 Subject: [PATCH 09/16] btrfs: sysfs: add btrfs_read_policy_to_enum() helper and refactor read policy store Introduce btrfs_read_policy_to_enum() helper to simplify the conversion of a string read policy to its corresponding enum value. This reduces duplication and improves code clarity in btrfs_read_policy_store(). The parameter is copied locally to allow modification, enabling the separation of the method and its value. This prepares for the addition of more functionality in subsequent patches. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index ab18b4e59468..78b4af72997b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1307,6 +1307,18 @@ BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); static const char * const btrfs_read_policy_name[] = { "pid" }; +static int btrfs_read_policy_to_enum(const char *str) +{ + char param[32] = { 0 }; + + if (!str || strlen(str) == 0) + return 0; + + strncpy(param, str, sizeof(param) - 1); + + return sysfs_match_string(btrfs_read_policy_name, param); +} + static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1338,21 +1350,19 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, const char *buf, size_t len) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); - int i; + int index; - for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (sysfs_streq(buf, btrfs_read_policy_name[i])) { - if (i != READ_ONCE(fs_devices->read_policy)) { - WRITE_ONCE(fs_devices->read_policy, i); - btrfs_info(fs_devices->fs_info, - "read policy set to '%s'", - btrfs_read_policy_name[i]); - } - return len; - } + index = btrfs_read_policy_to_enum(buf); + if (index < 0) + return -EINVAL; + + if (index != READ_ONCE(fs_devices->read_policy)) { + WRITE_ONCE(fs_devices->read_policy, index); + btrfs_info(fs_devices->fs_info, "read policy set to '%s'", + btrfs_read_policy_name[index]); } - return -EINVAL; + return len; } BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); -- 2.51.0 From b6bed20ed398f71069bfd2cd769bb91fa15859b5 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:33 +0800 Subject: [PATCH 10/16] btrfs: sysfs: handle value associated with read balancing policy Enable specifying additional configuration values along the RAID1 balancing read policy in a single input string. Update btrfs_read_policy_to_enum() to parse and handle a value associated with the policy in the format "policy:value", the value part if present is converted to 64-bit integer. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 78b4af72997b..2880407d0dd3 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1307,15 +1307,34 @@ BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); static const char * const btrfs_read_policy_name[] = { "pid" }; -static int btrfs_read_policy_to_enum(const char *str) +static int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) { char param[32] = { 0 }; + char __maybe_unused *value_str; if (!str || strlen(str) == 0) return 0; strncpy(param, str, sizeof(param) - 1); +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Separate value from input in policy:value format. */ + value_str = strchr(param, ':'); + if (value_str) { + int ret; + + *value_str = 0; + value_str++; + if (!value_ret) + return -EINVAL; + ret = kstrtos64(value_str, 10, value_ret); + if (ret) + return -EINVAL; + if (*value_ret < 0) + return -ERANGE; + } +#endif + return sysfs_match_string(btrfs_read_policy_name, param); } @@ -1351,8 +1370,9 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); int index; + s64 value = -1; - index = btrfs_read_policy_to_enum(buf); + index = btrfs_read_policy_to_enum(buf, &value); if (index < 0) return -EINVAL; -- 2.51.0 From 22fb0d99c90583e5b32a2a54e614bce221d31a8a Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:34 +0800 Subject: [PATCH 11/16] btrfs: add tracking of read blocks for read policy Track number of read blocks in the whole filesystem. The counter is initialized when devices are opened. The counter is increased at btrfs_submit_dev_bio() if the stats tracking is enabled (depends on the read policy). Stats tracking is disabled by default and is enabled through fs_devices::collect_fs_stats when required. The code is not under the EXPERIMENTAL define, as stats can be expanded to include write counts and other performance counters, with the user interface independent of its internal use. This is an in-memory-only feature, not related to the dev error stats. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 8 ++++++++ fs/btrfs/disk-io.c | 5 +++++ fs/btrfs/fs.h | 3 +++ fs/btrfs/volumes.h | 2 ++ 4 files changed, 18 insertions(+) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index bc80ee4f95a5..bc2555c44a12 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -453,6 +453,14 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), dev->devid, bio->bi_iter.bi_size); + /* + * Track reads if tracking is enabled; ignore I/O operations before the + * filesystem is fully initialized. + */ + if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info) + percpu_counter_add(&dev->fs_info->stats_read_blocks, + bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits); + if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) blkcg_punt_bio_submit(bio); else diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4928bf2cd07f..ef3121b55c50 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1258,6 +1258,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { struct percpu_counter *em_counter = &fs_info->evictable_extent_maps; + percpu_counter_destroy(&fs_info->stats_read_blocks); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->ordered_bytes); @@ -2923,6 +2924,10 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; + ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL); + if (ret) + return ret; + fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index be8c32d1a7bb..b572d6b9730b 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -627,6 +627,9 @@ struct btrfs_fs_info { struct kobject *qgroups_kobj; struct kobject *discard_kobj; + /* Track the number of blocks (sectors) read by the filesystem. */ + struct percpu_counter stats_read_blocks; + /* Used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 10bdd731e3fc..77926fdb6b0d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -417,6 +417,8 @@ struct btrfs_fs_devices { bool seeding; /* The mount needs to use a randomly generated fsid. */ bool temp_fsid; + /* Enable/disable the filesystem stats tracking. */ + bool collect_fs_stats; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ -- 2.51.0 From 6d7a9154955e50c0b991063c65f86ab24796754e Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:35 +0800 Subject: [PATCH 12/16] btrfs: introduce RAID1 round-robin read balancing Add round-robin read policy that balances reads over available devices (all RAID1 block group profiles). Switch to the next devices is done after a number of blocks is read, which is 256K by default and is configurable in sysfs. The format is "round-robin:" and can be set in file /sys/fs/btrfs/FSID/read_policy Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 48 +++++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 13 ++++++++++ 3 files changed, 125 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 2880407d0dd3..e155b7ce1ee5 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1305,7 +1305,12 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, } BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); -static const char * const btrfs_read_policy_name[] = { "pid" }; +static const char *btrfs_read_policy_name[] = { + "pid", +#ifdef CONFIG_BTRFS_EXPERIMENTAL + "round-robin", +#endif +}; static int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) { @@ -1355,6 +1360,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (i == BTRFS_READ_POLICY_RR) + ret += sysfs_emit_at(buf, ret, ":%u", + READ_ONCE(fs_devices->rr_min_contig_read)); +#endif + if (i == policy) ret += sysfs_emit_at(buf, ret, "]"); } @@ -1376,6 +1387,41 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, if (index < 0) return -EINVAL; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* If moving from RR then disable collecting fs stats. */ + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && index != BTRFS_READ_POLICY_RR) + fs_devices->collect_fs_stats = false; + + if (index == BTRFS_READ_POLICY_RR) { + if (value != -1) { + const u32 sectorsize = fs_devices->fs_info->sectorsize; + + if (!IS_ALIGNED(value, sectorsize)) { + u64 temp_value = round_up(value, sectorsize); + + btrfs_debug(fs_devices->fs_info, +"read_policy: min contig read %lld should be multiple of sectorsize %u, rounded to %llu", + value, sectorsize, temp_value); + value = temp_value; + } + } else { + value = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; + } + + if (index != READ_ONCE(fs_devices->read_policy) || + value != READ_ONCE(fs_devices->rr_min_contig_read)) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->rr_min_contig_read, value); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'", + btrfs_read_policy_name[index], value); + } + + fs_devices->collect_fs_stats = true; + + return len; + } +#endif if (index != READ_ONCE(fs_devices->read_policy)) { WRITE_ONCE(fs_devices->read_policy, index); btrfs_info(fs_devices->fs_info, "read policy set to '%s'", diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e5d5cfb2d239..cfe1d5ada5f2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1329,6 +1329,9 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; fs_devices->read_policy = BTRFS_READ_POLICY_PID; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; +#endif return 0; } @@ -5953,6 +5956,63 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +struct stripe_mirror { + u64 devid; + int num; +}; + +static int btrfs_cmp_devid(const void *a, const void *b) +{ + const struct stripe_mirror *s1 = (const struct stripe_mirror *)a; + const struct stripe_mirror *s2 = (const struct stripe_mirror *)b; + + if (s1->devid < s2->devid) + return -1; + if (s1->devid > s2->devid) + return 1; + return 0; +} + +/* + * Select a stripe for reading using the round-robin algorithm. + * + * 1. Compute the read cycle as the total sectors read divided by the minimum + * sectors per device. + * 2. Determine the stripe number for the current read by taking the modulus + * of the read cycle with the total number of stripes: + * + * stripe index = (total sectors / min sectors per dev) % num stripes + * + * The calculated stripe index is then used to select the corresponding device + * from the list of devices, which is ordered by devid. + */ +static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_stripes) +{ + struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 }; + struct btrfs_device *device = map->stripes[first].dev; + struct btrfs_fs_info *fs_info = device->fs_devices->fs_info; + unsigned int read_cycle; + unsigned int total_reads; + unsigned int min_reads_per_dev; + + total_reads = percpu_counter_sum(&fs_info->stats_read_blocks); + min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >> + fs_info->sectorsize_bits; + + for (int index = 0, i = first; i < first + num_stripes; i++) { + stripes[index].devid = map->stripes[i].dev->devid; + stripes[index].num = i; + index++; + } + sort(stripes, num_stripes, sizeof(struct stripe_mirror), + btrfs_cmp_devid, NULL); + + read_cycle = total_reads / min_reads_per_dev; + return stripes[read_cycle % num_stripes].num; +} +#endif + static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) @@ -5982,6 +6042,11 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); break; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + case BTRFS_READ_POLICY_RR: + preferred_mirror = btrfs_read_rr(map, first, num_stripes); + break; +#endif } if (dev_replace_is_ongoing && diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 77926fdb6b0d..f9fe698a9b4b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -296,6 +296,9 @@ enum btrfs_chunk_allocation_policy { BTRFS_CHUNK_ALLOC_ZONED, }; +#define BTRFS_DEFAULT_RR_MIN_CONTIG_READ (SZ_256K) +/* Keep in sync with raid_attr table, current maximum is RAID1C4. */ +#define BTRFS_RAID1_MAX_MIRRORS (4) /* * Read policies for mirrored block group profiles, read picks the stripe based * on these policies. @@ -303,6 +306,10 @@ enum btrfs_chunk_allocation_policy { enum btrfs_read_policy { /* Use process PID to choose the stripe */ BTRFS_READ_POLICY_PID, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Balancing RAID1 reads across all striped devices (round-robin). */ + BTRFS_READ_POLICY_RR, +#endif BTRFS_NR_READ_POLICY, }; @@ -433,6 +440,12 @@ struct btrfs_fs_devices { enum btrfs_read_policy read_policy; #ifdef CONFIG_BTRFS_EXPERIMENTAL + /* + * Minimum contiguous reads before switching to next device, the unit + * is one block/sectorsize. + */ + u32 rr_min_contig_read; + /* Checksum mode - offload it or do it synchronously. */ enum btrfs_offload_csum_mode offload_csum_mode; #endif -- 2.51.0 From c86aae73bd5882e4a6b4e1b6ed448ea902551f80 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:36 +0800 Subject: [PATCH 13/16] btrfs: add read policy to set a preferred device Add read policy that will force all reads to go to the given device (specified by devid) on the RAID1 profiles. This will be used for testing, e.g. to read from stale device. Users may find other use cases. Can be set in sysfs, the value format is "devid:" to the file /sys/fs/btrfs/FSID/read_policy Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 31 ++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 17 +++++++++++++++++ fs/btrfs/volumes.h | 5 +++++ 3 files changed, 52 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index e155b7ce1ee5..5211d13d73f8 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1309,6 +1309,7 @@ static const char *btrfs_read_policy_name[] = { "pid", #ifdef CONFIG_BTRFS_EXPERIMENTAL "round-robin", + "devid", #endif }; @@ -1364,8 +1365,11 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, if (i == BTRFS_READ_POLICY_RR) ret += sysfs_emit_at(buf, ret, ":%u", READ_ONCE(fs_devices->rr_min_contig_read)); -#endif + if (i == BTRFS_READ_POLICY_DEVID) + ret += sysfs_emit_at(buf, ret, ":%llu", + READ_ONCE(fs_devices->read_devid)); +#endif if (i == policy) ret += sysfs_emit_at(buf, ret, "]"); } @@ -1421,6 +1425,31 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, return len; } + + if (index == BTRFS_READ_POLICY_DEVID) { + if (value != -1) { + BTRFS_DEV_LOOKUP_ARGS(args); + + /* Validate input devid. */ + args.devid = value; + if (btrfs_find_device(fs_devices, &args) == NULL) + return -EINVAL; + } else { + /* Set default devid to the devid of the latest device. */ + value = fs_devices->latest_dev->devid; + } + + if (index != READ_ONCE(fs_devices->read_policy) || + value != READ_ONCE(fs_devices->read_devid)) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->read_devid, value); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'", + btrfs_read_policy_name[index], value); + } + + return len; + } #endif if (index != READ_ONCE(fs_devices->read_policy)) { WRITE_ONCE(fs_devices->read_policy, index); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cfe1d5ada5f2..b5fd1aa45c4c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1331,6 +1331,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->read_policy = BTRFS_READ_POLICY_PID; #ifdef CONFIG_BTRFS_EXPERIMENTAL fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; + fs_devices->read_devid = latest_dev->devid; #endif return 0; @@ -5957,6 +5958,19 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, } #ifdef CONFIG_BTRFS_EXPERIMENTAL +static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes) +{ + for (int index = first; index < first + num_stripes; index++) { + const struct btrfs_device *device = map->stripes[index].dev; + + if (device->devid == READ_ONCE(device->fs_devices->read_devid)) + return index; + } + + /* If no read-preferred device is set use the first stripe. */ + return first; +} + struct stripe_mirror { u64 devid; int num; @@ -6046,6 +6060,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, case BTRFS_READ_POLICY_RR: preferred_mirror = btrfs_read_rr(map, first, num_stripes); break; + case BTRFS_READ_POLICY_DEVID: + preferred_mirror = btrfs_read_preferred(map, first, num_stripes); + break; #endif } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f9fe698a9b4b..120f65e21eeb 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -309,6 +309,8 @@ enum btrfs_read_policy { #ifdef CONFIG_BTRFS_EXPERIMENTAL /* Balancing RAID1 reads across all striped devices (round-robin). */ BTRFS_READ_POLICY_RR, + /* Read from a specific device. */ + BTRFS_READ_POLICY_DEVID, #endif BTRFS_NR_READ_POLICY, }; @@ -446,6 +448,9 @@ struct btrfs_fs_devices { */ u32 rr_min_contig_read; + /* Device to be used for reading in case of RAID1. */ + u64 read_devid; + /* Checksum mode - offload it or do it synchronously. */ enum btrfs_offload_csum_mode offload_csum_mode; #endif -- 2.51.0 From bb4715e967cf3b2eb8550eda73886208f1fc805d Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:37 +0800 Subject: [PATCH 14/16] btrfs: print status of experimental mode when loading module Commit c9c49e8f157e ("btrfs: split out CONFIG_BTRFS_EXPERIMENTAL from CONFIG_BTRFS_DEBUG") introduces a way to enable or disable experimental features, print its status during module load, like: Btrfs loaded, experimental=on, debug=on, assert=on, zoned=yes, fsverity=yes Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f6eaaf20229d..5157037a0048 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2446,6 +2446,9 @@ static __cold void btrfs_interface_exit(void) static int __init btrfs_print_mod_info(void) { static const char options[] = "" +#ifdef CONFIG_BTRFS_EXPERIMENTAL + ", experimental=on" +#endif #ifdef CONFIG_BTRFS_DEBUG ", debug=on" #endif -- 2.51.0 From e426286cfa6f85e51006f6151b309a395ada6540 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:38 +0800 Subject: [PATCH 15/16] btrfs: configure read policy via module parameter For testing purposes allow to configure the read policy via module parameter from the beginning. Available only with CONFIG_BTRFS_EXPERIMENTAL Examples: - Set the RAID1 balancing method to round-robin with a custom min_contig_read of 4k: $ modprobe btrfs read_policy=round-robin:4096 - Set the round-robin balancing method with the default min_contiguous_read: $ modprobe btrfs read_policy=round-robin - Set the "devid" balancing method, defaulting to the latest device: $ modprobe btrfs read_policy=devid Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/super.c | 5 +++++ fs/btrfs/sysfs.c | 31 ++++++++++++++++++++++++++++++- fs/btrfs/sysfs.h | 6 ++++++ fs/btrfs/volumes.c | 15 ++++++++++++++- 4 files changed, 55 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5157037a0048..f310cfa0b5b4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2527,6 +2527,11 @@ static const struct init_sequence mod_init_seq[] = { }, { .init_func = extent_map_init, .exit_func = extent_map_exit, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + }, { + .init_func = btrfs_read_policy_init, + .exit_func = NULL, +#endif }, { .init_func = ordered_data_init, .exit_func = ordered_data_exit, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5211d13d73f8..53b846d99ece 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1313,7 +1313,22 @@ static const char *btrfs_read_policy_name[] = { #endif }; -static int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) +#ifdef CONFIG_BTRFS_EXPERIMENTAL + +/* Global module configuration parameters. */ +static char *read_policy; +char *btrfs_get_mod_read_policy(void) +{ + return read_policy; +} + +/* Set perms to 0, disable /sys/module/btrfs/parameter/read_policy interface. */ +module_param(read_policy, charp, 0); +MODULE_PARM_DESC(read_policy, +"Global read policy: pid (default), round-robin[:], devid[:]"); +#endif + +int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) { char param[32] = { 0 }; char __maybe_unused *value_str; @@ -1344,6 +1359,20 @@ static int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) return sysfs_match_string(btrfs_read_policy_name, param); } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +int __init btrfs_read_policy_init(void) +{ + s64 value; + + if (btrfs_read_policy_to_enum(read_policy, &value) == -EINVAL) { + btrfs_err(NULL, "invalid read policy or value %s", read_policy); + return -EINVAL; + } + + return 0; +} +#endif + static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index e6a284c59809..3fc5c6f90dc4 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -47,5 +47,11 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info); int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info); void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup); +int btrfs_read_policy_to_enum(const char *str, s64 *value); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +int __init btrfs_read_policy_init(void); +char *btrfs_get_mod_read_policy(void); +#endif #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b5fd1aa45c4c..a594f66daedf 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1299,6 +1299,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; struct btrfs_device *tmp_device; + s64 __maybe_unused value = 0; int ret = 0; list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, @@ -1328,10 +1329,22 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->latest_dev = latest_dev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; - fs_devices->read_policy = BTRFS_READ_POLICY_PID; #ifdef CONFIG_BTRFS_EXPERIMENTAL fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; fs_devices->read_devid = latest_dev->devid; + fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(), + &value); + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) + fs_devices->collect_fs_stats = true; + + if (value) { + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) + fs_devices->rr_min_contig_read = value; + if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID) + fs_devices->read_devid = value; + } +#else + fs_devices->read_policy = BTRFS_READ_POLICY_PID; #endif return 0; -- 2.51.0 From 3681dbe0afeef3946b71a7af05e31375d6e70b90 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:39 +0800 Subject: [PATCH 16/16] btrfs: print read policy on module load Print the read read policy if set as module parameter (with CONFIG_BTRFS_EXPERIMENTAL). Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f310cfa0b5b4..f809c3200c21 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2469,7 +2469,17 @@ static int __init btrfs_print_mod_info(void) ", fsverity=no" #endif ; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (btrfs_get_mod_read_policy() == NULL) + pr_info("Btrfs loaded%s\n", options); + else + pr_info("Btrfs loaded%s, read_policy=%s\n", + options, btrfs_get_mod_read_policy()); +#else pr_info("Btrfs loaded%s\n", options); +#endif + return 0; } -- 2.51.0