From a5019b70704a8cbea4c295ae7a61abd87300ff29 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:30 +0800 Subject: [PATCH 01/16] btrfs: initialize fs_devices->fs_info earlier in btrfs_init_devices_late() Currently, fs_devices->fs_info is initialized in btrfs_init_devices_late(), but this occurs too late for find_live_mirror(), which is invoked by load_super_root() much earlier than btrfs_init_devices_late(). Fix this by moving the initialization to open_ctree(), before load_super_root(). Reviewed-by: Naohiro Aota Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 + fs/btrfs/volumes.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 04d68f253940..4928bf2cd07f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3388,6 +3388,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; + fs_info->fs_devices->fs_info = fs_info; /* * Handle the space caching options appropriately now that we have the diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ccbfea163390..e5d5cfb2d239 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7515,8 +7515,6 @@ int btrfs_init_devices_late(struct btrfs_fs_info *fs_info) struct btrfs_device *device; int ret = 0; - fs_devices->fs_info = fs_info; - mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) device->fs_info = fs_info; -- 2.51.0 From 83be7f8b9c24bd040a348577c7c84fd08911707f Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:31 +0800 Subject: [PATCH 02/16] btrfs: sysfs: refactor output formatting in btrfs_read_policy_show() Refactor the logic in btrfs_read_policy_show() for easier extension with more balancing methods. Streamline the space and bracket handling around the active policy without altering the functional output. This is in preparation to add more methods. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 7f09b6c9cc2d..ab18b4e59468 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1316,14 +1316,16 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, int i; for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (policy == i) - ret += sysfs_emit_at(buf, ret, "%s[%s]", - (ret == 0 ? "" : " "), - btrfs_read_policy_name[i]); - else - ret += sysfs_emit_at(buf, ret, "%s%s", - (ret == 0 ? "" : " "), - btrfs_read_policy_name[i]); + if (ret != 0) + ret += sysfs_emit_at(buf, ret, " "); + + if (i == policy) + ret += sysfs_emit_at(buf, ret, "["); + + ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); + + if (i == policy) + ret += sysfs_emit_at(buf, ret, "]"); } ret += sysfs_emit_at(buf, ret, "\n"); -- 2.51.0 From 38cae63137d5e13dc3c2ba88c4f393be4a6bf4bb Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:32 +0800 Subject: [PATCH 03/16] btrfs: sysfs: add btrfs_read_policy_to_enum() helper and refactor read policy store Introduce btrfs_read_policy_to_enum() helper to simplify the conversion of a string read policy to its corresponding enum value. This reduces duplication and improves code clarity in btrfs_read_policy_store(). The parameter is copied locally to allow modification, enabling the separation of the method and its value. This prepares for the addition of more functionality in subsequent patches. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index ab18b4e59468..78b4af72997b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1307,6 +1307,18 @@ BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); static const char * const btrfs_read_policy_name[] = { "pid" }; +static int btrfs_read_policy_to_enum(const char *str) +{ + char param[32] = { 0 }; + + if (!str || strlen(str) == 0) + return 0; + + strncpy(param, str, sizeof(param) - 1); + + return sysfs_match_string(btrfs_read_policy_name, param); +} + static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1338,21 +1350,19 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, const char *buf, size_t len) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); - int i; + int index; - for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (sysfs_streq(buf, btrfs_read_policy_name[i])) { - if (i != READ_ONCE(fs_devices->read_policy)) { - WRITE_ONCE(fs_devices->read_policy, i); - btrfs_info(fs_devices->fs_info, - "read policy set to '%s'", - btrfs_read_policy_name[i]); - } - return len; - } + index = btrfs_read_policy_to_enum(buf); + if (index < 0) + return -EINVAL; + + if (index != READ_ONCE(fs_devices->read_policy)) { + WRITE_ONCE(fs_devices->read_policy, index); + btrfs_info(fs_devices->fs_info, "read policy set to '%s'", + btrfs_read_policy_name[index]); } - return -EINVAL; + return len; } BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); -- 2.51.0 From b6bed20ed398f71069bfd2cd769bb91fa15859b5 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:33 +0800 Subject: [PATCH 04/16] btrfs: sysfs: handle value associated with read balancing policy Enable specifying additional configuration values along the RAID1 balancing read policy in a single input string. Update btrfs_read_policy_to_enum() to parse and handle a value associated with the policy in the format "policy:value", the value part if present is converted to 64-bit integer. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 78b4af72997b..2880407d0dd3 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1307,15 +1307,34 @@ BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); static const char * const btrfs_read_policy_name[] = { "pid" }; -static int btrfs_read_policy_to_enum(const char *str) +static int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) { char param[32] = { 0 }; + char __maybe_unused *value_str; if (!str || strlen(str) == 0) return 0; strncpy(param, str, sizeof(param) - 1); +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Separate value from input in policy:value format. */ + value_str = strchr(param, ':'); + if (value_str) { + int ret; + + *value_str = 0; + value_str++; + if (!value_ret) + return -EINVAL; + ret = kstrtos64(value_str, 10, value_ret); + if (ret) + return -EINVAL; + if (*value_ret < 0) + return -ERANGE; + } +#endif + return sysfs_match_string(btrfs_read_policy_name, param); } @@ -1351,8 +1370,9 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); int index; + s64 value = -1; - index = btrfs_read_policy_to_enum(buf); + index = btrfs_read_policy_to_enum(buf, &value); if (index < 0) return -EINVAL; -- 2.51.0 From 22fb0d99c90583e5b32a2a54e614bce221d31a8a Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:34 +0800 Subject: [PATCH 05/16] btrfs: add tracking of read blocks for read policy Track number of read blocks in the whole filesystem. The counter is initialized when devices are opened. The counter is increased at btrfs_submit_dev_bio() if the stats tracking is enabled (depends on the read policy). Stats tracking is disabled by default and is enabled through fs_devices::collect_fs_stats when required. The code is not under the EXPERIMENTAL define, as stats can be expanded to include write counts and other performance counters, with the user interface independent of its internal use. This is an in-memory-only feature, not related to the dev error stats. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 8 ++++++++ fs/btrfs/disk-io.c | 5 +++++ fs/btrfs/fs.h | 3 +++ fs/btrfs/volumes.h | 2 ++ 4 files changed, 18 insertions(+) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index bc80ee4f95a5..bc2555c44a12 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -453,6 +453,14 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), dev->devid, bio->bi_iter.bi_size); + /* + * Track reads if tracking is enabled; ignore I/O operations before the + * filesystem is fully initialized. + */ + if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info) + percpu_counter_add(&dev->fs_info->stats_read_blocks, + bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits); + if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) blkcg_punt_bio_submit(bio); else diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4928bf2cd07f..ef3121b55c50 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1258,6 +1258,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { struct percpu_counter *em_counter = &fs_info->evictable_extent_maps; + percpu_counter_destroy(&fs_info->stats_read_blocks); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->ordered_bytes); @@ -2923,6 +2924,10 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; + ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL); + if (ret) + return ret; + fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index be8c32d1a7bb..b572d6b9730b 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -627,6 +627,9 @@ struct btrfs_fs_info { struct kobject *qgroups_kobj; struct kobject *discard_kobj; + /* Track the number of blocks (sectors) read by the filesystem. */ + struct percpu_counter stats_read_blocks; + /* Used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 10bdd731e3fc..77926fdb6b0d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -417,6 +417,8 @@ struct btrfs_fs_devices { bool seeding; /* The mount needs to use a randomly generated fsid. */ bool temp_fsid; + /* Enable/disable the filesystem stats tracking. */ + bool collect_fs_stats; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ -- 2.51.0 From 6d7a9154955e50c0b991063c65f86ab24796754e Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:35 +0800 Subject: [PATCH 06/16] btrfs: introduce RAID1 round-robin read balancing Add round-robin read policy that balances reads over available devices (all RAID1 block group profiles). Switch to the next devices is done after a number of blocks is read, which is 256K by default and is configurable in sysfs. The format is "round-robin:" and can be set in file /sys/fs/btrfs/FSID/read_policy Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 48 +++++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 13 ++++++++++ 3 files changed, 125 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 2880407d0dd3..e155b7ce1ee5 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1305,7 +1305,12 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, } BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); -static const char * const btrfs_read_policy_name[] = { "pid" }; +static const char *btrfs_read_policy_name[] = { + "pid", +#ifdef CONFIG_BTRFS_EXPERIMENTAL + "round-robin", +#endif +}; static int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) { @@ -1355,6 +1360,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (i == BTRFS_READ_POLICY_RR) + ret += sysfs_emit_at(buf, ret, ":%u", + READ_ONCE(fs_devices->rr_min_contig_read)); +#endif + if (i == policy) ret += sysfs_emit_at(buf, ret, "]"); } @@ -1376,6 +1387,41 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, if (index < 0) return -EINVAL; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* If moving from RR then disable collecting fs stats. */ + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && index != BTRFS_READ_POLICY_RR) + fs_devices->collect_fs_stats = false; + + if (index == BTRFS_READ_POLICY_RR) { + if (value != -1) { + const u32 sectorsize = fs_devices->fs_info->sectorsize; + + if (!IS_ALIGNED(value, sectorsize)) { + u64 temp_value = round_up(value, sectorsize); + + btrfs_debug(fs_devices->fs_info, +"read_policy: min contig read %lld should be multiple of sectorsize %u, rounded to %llu", + value, sectorsize, temp_value); + value = temp_value; + } + } else { + value = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; + } + + if (index != READ_ONCE(fs_devices->read_policy) || + value != READ_ONCE(fs_devices->rr_min_contig_read)) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->rr_min_contig_read, value); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'", + btrfs_read_policy_name[index], value); + } + + fs_devices->collect_fs_stats = true; + + return len; + } +#endif if (index != READ_ONCE(fs_devices->read_policy)) { WRITE_ONCE(fs_devices->read_policy, index); btrfs_info(fs_devices->fs_info, "read policy set to '%s'", diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e5d5cfb2d239..cfe1d5ada5f2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1329,6 +1329,9 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; fs_devices->read_policy = BTRFS_READ_POLICY_PID; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; +#endif return 0; } @@ -5953,6 +5956,63 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +struct stripe_mirror { + u64 devid; + int num; +}; + +static int btrfs_cmp_devid(const void *a, const void *b) +{ + const struct stripe_mirror *s1 = (const struct stripe_mirror *)a; + const struct stripe_mirror *s2 = (const struct stripe_mirror *)b; + + if (s1->devid < s2->devid) + return -1; + if (s1->devid > s2->devid) + return 1; + return 0; +} + +/* + * Select a stripe for reading using the round-robin algorithm. + * + * 1. Compute the read cycle as the total sectors read divided by the minimum + * sectors per device. + * 2. Determine the stripe number for the current read by taking the modulus + * of the read cycle with the total number of stripes: + * + * stripe index = (total sectors / min sectors per dev) % num stripes + * + * The calculated stripe index is then used to select the corresponding device + * from the list of devices, which is ordered by devid. + */ +static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_stripes) +{ + struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 }; + struct btrfs_device *device = map->stripes[first].dev; + struct btrfs_fs_info *fs_info = device->fs_devices->fs_info; + unsigned int read_cycle; + unsigned int total_reads; + unsigned int min_reads_per_dev; + + total_reads = percpu_counter_sum(&fs_info->stats_read_blocks); + min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >> + fs_info->sectorsize_bits; + + for (int index = 0, i = first; i < first + num_stripes; i++) { + stripes[index].devid = map->stripes[i].dev->devid; + stripes[index].num = i; + index++; + } + sort(stripes, num_stripes, sizeof(struct stripe_mirror), + btrfs_cmp_devid, NULL); + + read_cycle = total_reads / min_reads_per_dev; + return stripes[read_cycle % num_stripes].num; +} +#endif + static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) @@ -5982,6 +6042,11 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); break; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + case BTRFS_READ_POLICY_RR: + preferred_mirror = btrfs_read_rr(map, first, num_stripes); + break; +#endif } if (dev_replace_is_ongoing && diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 77926fdb6b0d..f9fe698a9b4b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -296,6 +296,9 @@ enum btrfs_chunk_allocation_policy { BTRFS_CHUNK_ALLOC_ZONED, }; +#define BTRFS_DEFAULT_RR_MIN_CONTIG_READ (SZ_256K) +/* Keep in sync with raid_attr table, current maximum is RAID1C4. */ +#define BTRFS_RAID1_MAX_MIRRORS (4) /* * Read policies for mirrored block group profiles, read picks the stripe based * on these policies. @@ -303,6 +306,10 @@ enum btrfs_chunk_allocation_policy { enum btrfs_read_policy { /* Use process PID to choose the stripe */ BTRFS_READ_POLICY_PID, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Balancing RAID1 reads across all striped devices (round-robin). */ + BTRFS_READ_POLICY_RR, +#endif BTRFS_NR_READ_POLICY, }; @@ -433,6 +440,12 @@ struct btrfs_fs_devices { enum btrfs_read_policy read_policy; #ifdef CONFIG_BTRFS_EXPERIMENTAL + /* + * Minimum contiguous reads before switching to next device, the unit + * is one block/sectorsize. + */ + u32 rr_min_contig_read; + /* Checksum mode - offload it or do it synchronously. */ enum btrfs_offload_csum_mode offload_csum_mode; #endif -- 2.51.0 From c86aae73bd5882e4a6b4e1b6ed448ea902551f80 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:36 +0800 Subject: [PATCH 07/16] btrfs: add read policy to set a preferred device Add read policy that will force all reads to go to the given device (specified by devid) on the RAID1 profiles. This will be used for testing, e.g. to read from stale device. Users may find other use cases. Can be set in sysfs, the value format is "devid:" to the file /sys/fs/btrfs/FSID/read_policy Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 31 ++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 17 +++++++++++++++++ fs/btrfs/volumes.h | 5 +++++ 3 files changed, 52 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index e155b7ce1ee5..5211d13d73f8 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1309,6 +1309,7 @@ static const char *btrfs_read_policy_name[] = { "pid", #ifdef CONFIG_BTRFS_EXPERIMENTAL "round-robin", + "devid", #endif }; @@ -1364,8 +1365,11 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, if (i == BTRFS_READ_POLICY_RR) ret += sysfs_emit_at(buf, ret, ":%u", READ_ONCE(fs_devices->rr_min_contig_read)); -#endif + if (i == BTRFS_READ_POLICY_DEVID) + ret += sysfs_emit_at(buf, ret, ":%llu", + READ_ONCE(fs_devices->read_devid)); +#endif if (i == policy) ret += sysfs_emit_at(buf, ret, "]"); } @@ -1421,6 +1425,31 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, return len; } + + if (index == BTRFS_READ_POLICY_DEVID) { + if (value != -1) { + BTRFS_DEV_LOOKUP_ARGS(args); + + /* Validate input devid. */ + args.devid = value; + if (btrfs_find_device(fs_devices, &args) == NULL) + return -EINVAL; + } else { + /* Set default devid to the devid of the latest device. */ + value = fs_devices->latest_dev->devid; + } + + if (index != READ_ONCE(fs_devices->read_policy) || + value != READ_ONCE(fs_devices->read_devid)) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->read_devid, value); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'", + btrfs_read_policy_name[index], value); + } + + return len; + } #endif if (index != READ_ONCE(fs_devices->read_policy)) { WRITE_ONCE(fs_devices->read_policy, index); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cfe1d5ada5f2..b5fd1aa45c4c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1331,6 +1331,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->read_policy = BTRFS_READ_POLICY_PID; #ifdef CONFIG_BTRFS_EXPERIMENTAL fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; + fs_devices->read_devid = latest_dev->devid; #endif return 0; @@ -5957,6 +5958,19 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, } #ifdef CONFIG_BTRFS_EXPERIMENTAL +static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes) +{ + for (int index = first; index < first + num_stripes; index++) { + const struct btrfs_device *device = map->stripes[index].dev; + + if (device->devid == READ_ONCE(device->fs_devices->read_devid)) + return index; + } + + /* If no read-preferred device is set use the first stripe. */ + return first; +} + struct stripe_mirror { u64 devid; int num; @@ -6046,6 +6060,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, case BTRFS_READ_POLICY_RR: preferred_mirror = btrfs_read_rr(map, first, num_stripes); break; + case BTRFS_READ_POLICY_DEVID: + preferred_mirror = btrfs_read_preferred(map, first, num_stripes); + break; #endif } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f9fe698a9b4b..120f65e21eeb 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -309,6 +309,8 @@ enum btrfs_read_policy { #ifdef CONFIG_BTRFS_EXPERIMENTAL /* Balancing RAID1 reads across all striped devices (round-robin). */ BTRFS_READ_POLICY_RR, + /* Read from a specific device. */ + BTRFS_READ_POLICY_DEVID, #endif BTRFS_NR_READ_POLICY, }; @@ -446,6 +448,9 @@ struct btrfs_fs_devices { */ u32 rr_min_contig_read; + /* Device to be used for reading in case of RAID1. */ + u64 read_devid; + /* Checksum mode - offload it or do it synchronously. */ enum btrfs_offload_csum_mode offload_csum_mode; #endif -- 2.51.0 From bb4715e967cf3b2eb8550eda73886208f1fc805d Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:37 +0800 Subject: [PATCH 08/16] btrfs: print status of experimental mode when loading module Commit c9c49e8f157e ("btrfs: split out CONFIG_BTRFS_EXPERIMENTAL from CONFIG_BTRFS_DEBUG") introduces a way to enable or disable experimental features, print its status during module load, like: Btrfs loaded, experimental=on, debug=on, assert=on, zoned=yes, fsverity=yes Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f6eaaf20229d..5157037a0048 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2446,6 +2446,9 @@ static __cold void btrfs_interface_exit(void) static int __init btrfs_print_mod_info(void) { static const char options[] = "" +#ifdef CONFIG_BTRFS_EXPERIMENTAL + ", experimental=on" +#endif #ifdef CONFIG_BTRFS_DEBUG ", debug=on" #endif -- 2.51.0 From e426286cfa6f85e51006f6151b309a395ada6540 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:38 +0800 Subject: [PATCH 09/16] btrfs: configure read policy via module parameter For testing purposes allow to configure the read policy via module parameter from the beginning. Available only with CONFIG_BTRFS_EXPERIMENTAL Examples: - Set the RAID1 balancing method to round-robin with a custom min_contig_read of 4k: $ modprobe btrfs read_policy=round-robin:4096 - Set the round-robin balancing method with the default min_contiguous_read: $ modprobe btrfs read_policy=round-robin - Set the "devid" balancing method, defaulting to the latest device: $ modprobe btrfs read_policy=devid Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/super.c | 5 +++++ fs/btrfs/sysfs.c | 31 ++++++++++++++++++++++++++++++- fs/btrfs/sysfs.h | 6 ++++++ fs/btrfs/volumes.c | 15 ++++++++++++++- 4 files changed, 55 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5157037a0048..f310cfa0b5b4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2527,6 +2527,11 @@ static const struct init_sequence mod_init_seq[] = { }, { .init_func = extent_map_init, .exit_func = extent_map_exit, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + }, { + .init_func = btrfs_read_policy_init, + .exit_func = NULL, +#endif }, { .init_func = ordered_data_init, .exit_func = ordered_data_exit, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5211d13d73f8..53b846d99ece 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1313,7 +1313,22 @@ static const char *btrfs_read_policy_name[] = { #endif }; -static int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) +#ifdef CONFIG_BTRFS_EXPERIMENTAL + +/* Global module configuration parameters. */ +static char *read_policy; +char *btrfs_get_mod_read_policy(void) +{ + return read_policy; +} + +/* Set perms to 0, disable /sys/module/btrfs/parameter/read_policy interface. */ +module_param(read_policy, charp, 0); +MODULE_PARM_DESC(read_policy, +"Global read policy: pid (default), round-robin[:], devid[:]"); +#endif + +int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) { char param[32] = { 0 }; char __maybe_unused *value_str; @@ -1344,6 +1359,20 @@ static int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) return sysfs_match_string(btrfs_read_policy_name, param); } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +int __init btrfs_read_policy_init(void) +{ + s64 value; + + if (btrfs_read_policy_to_enum(read_policy, &value) == -EINVAL) { + btrfs_err(NULL, "invalid read policy or value %s", read_policy); + return -EINVAL; + } + + return 0; +} +#endif + static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index e6a284c59809..3fc5c6f90dc4 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -47,5 +47,11 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info); int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info); void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup); +int btrfs_read_policy_to_enum(const char *str, s64 *value); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +int __init btrfs_read_policy_init(void); +char *btrfs_get_mod_read_policy(void); +#endif #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b5fd1aa45c4c..a594f66daedf 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1299,6 +1299,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; struct btrfs_device *tmp_device; + s64 __maybe_unused value = 0; int ret = 0; list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, @@ -1328,10 +1329,22 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->latest_dev = latest_dev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; - fs_devices->read_policy = BTRFS_READ_POLICY_PID; #ifdef CONFIG_BTRFS_EXPERIMENTAL fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; fs_devices->read_devid = latest_dev->devid; + fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(), + &value); + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) + fs_devices->collect_fs_stats = true; + + if (value) { + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) + fs_devices->rr_min_contig_read = value; + if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID) + fs_devices->read_devid = value; + } +#else + fs_devices->read_policy = BTRFS_READ_POLICY_PID; #endif return 0; -- 2.51.0 From 3681dbe0afeef3946b71a7af05e31375d6e70b90 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:39 +0800 Subject: [PATCH 10/16] btrfs: print read policy on module load Print the read read policy if set as module parameter (with CONFIG_BTRFS_EXPERIMENTAL). Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f310cfa0b5b4..f809c3200c21 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2469,7 +2469,17 @@ static int __init btrfs_print_mod_info(void) ", fsverity=no" #endif ; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (btrfs_get_mod_read_policy() == NULL) + pr_info("Btrfs loaded%s\n", options); + else + pr_info("Btrfs loaded%s, read_policy=%s\n", + options, btrfs_get_mod_read_policy()); +#else pr_info("Btrfs loaded%s\n", options); +#endif + return 0; } -- 2.51.0 From 2fa07d7a0f0084d9777f076a154ad10e759ba731 Mon Sep 17 00:00:00 2001 From: Jing Xia Date: Tue, 3 Sep 2024 13:40:12 +0800 Subject: [PATCH 11/16] btrfs: pass write-hint for buffered IO Commit 449813515d3e ("block, fs: Restore the per-bio/request data lifetime fields") restored write-hint support in btrfs. But that is applicable only for direct IO. This patch supports passing write-hint for buffered IO from btrfs file system to block layer by filling bi_write_hint of struct bio in alloc_new_bio(). There's an ongoing discussion which devices can use that, https://lore.kernel.org/all/20240910150200.6589-6-joshi.k@samsung.com, in SCSI there's support using sd_group_number() and sd_setup_rw32_cmnd(). The hint goes from the application directly to the block device so it's up to the application to set up everything properly to utilize the different hint classes. Link: https://lore.kernel.org/all/20240910150200.6589-6-joshi.k@samsung.com Reviewed-by: Johannes Thumshirn Signed-off-by: Jing Xia [ Add more context and use case. ] Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9725ff7f274d..74f2775db51e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -710,6 +710,7 @@ static void alloc_new_bio(struct btrfs_inode *inode, bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info, bio_ctrl->end_io_func, NULL); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint; bbio->inode = inode; bbio->file_offset = file_offset; bio_ctrl->bbio = bbio; -- 2.51.0 From 5f14eb12a3be1628809141759e46c381925b5ef1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:23:59 +0100 Subject: [PATCH 12/16] btrfs: drop unused parameter fs_info to btrfs_delete_delayed_insertion_item() Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index f9f1a972a6f7..0b4933c6a889 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1555,8 +1555,7 @@ release_node: return ret; } -static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_node *node, +static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node, u64 index) { struct btrfs_delayed_item *item; @@ -1614,7 +1613,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, if (IS_ERR(node)) return PTR_ERR(node); - ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index); + ret = btrfs_delete_delayed_insertion_item(node, index); if (!ret) goto end; -- 2.51.0 From 6d67ff1c0be3c04e01c6b20a0c8286e99df05b2d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:01 +0100 Subject: [PATCH 13/16] btrfs: remove stray comment about SRCU The subvol_srcu was removed in c75e839414d361 ("btrfs: kill the subvol_srcu") years ago. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a7051e2570c1..587842991b24 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -96,9 +96,6 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); /* * This function is used to grab the root, and avoid it is freed when we * access it. But it doesn't ensure that the tree is not dropped. - * - * If you want to ensure the whole tree is safe, you should use - * fs_info->subvol_srcu */ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) { -- 2.51.0 From 2a1e8378dc3814e37deee2be495600da0c98b175 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:07 +0100 Subject: [PATCH 14/16] btrfs: use SECTOR_SIZE defines in btrfs_issue_discard() Use the existing define for single sector size. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1cb1bd45f7ec..3014a1a23efd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1256,12 +1256,12 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, { int j, ret = 0; u64 bytes_left, end; - u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); + u64 aligned_start = ALIGN(start, SECTOR_SIZE); /* Adjust the range to be aligned to 512B sectors if necessary. */ if (start != aligned_start) { len -= aligned_start - start; - len = round_down(len, 1 << SECTOR_SHIFT); + len = round_down(len, SECTOR_SIZE); start = aligned_start; } -- 2.51.0 From 2b41599bff1714df957c82821b8b17113ea44054 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:10 +0100 Subject: [PATCH 15/16] btrfs: rename __unlock_for_delalloc() and drop underscores Drop the leading underscores in '__unlock_for_delalloc()' and rename it to 'unlock_delalloc_folio()'. This also ensures naming parity with 'lock_delalloc_folios()'. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 74f2775db51e..bc4d41985c4b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -221,7 +221,7 @@ static void __process_folios_contig(struct address_space *mapping, } } -static noinline void __unlock_for_delalloc(const struct inode *inode, +static noinline void unlock_delalloc_folio(const struct inode *inode, const struct folio *locked_folio, u64 start, u64 end) { @@ -288,8 +288,7 @@ static noinline int lock_delalloc_folios(struct inode *inode, out: folio_batch_release(&fbatch); if (processed_end > start) - __unlock_for_delalloc(inode, locked_folio, start, - processed_end); + unlock_delalloc_folio(inode, locked_folio, start, processed_end); return -EAGAIN; } @@ -390,7 +389,7 @@ again: unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { - __unlock_for_delalloc(inode, locked_folio, delalloc_start, + unlock_delalloc_folio(inode, locked_folio, delalloc_start, delalloc_end); cond_resched(); goto again; @@ -1248,7 +1247,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, */ unlock_extent(&inode->io_tree, found_start, found_start + found_len - 1, NULL); - __unlock_for_delalloc(&inode->vfs_inode, folio, + unlock_delalloc_folio(&inode->vfs_inode, folio, found_start, found_start + found_len - 1); } -- 2.51.0 From 3a1c46dbc9856a286808170b58c35ff5f50afa30 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jan 2025 11:24:12 +0100 Subject: [PATCH 16/16] btrfs: open code set_page_extent_mapped() The function set_page_extent_mapped() is now a simple wrapper so use the folio helper. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 5 ----- fs/btrfs/extent_io.h | 1 - fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/relocation.c | 2 +- 4 files changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bc4d41985c4b..532a5f09121f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -862,11 +862,6 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, return ret; } -int set_page_extent_mapped(struct page *page) -{ - return set_folio_extent_mapped(page_folio(page)); -} - int set_folio_extent_mapped(struct folio *folio) { struct btrfs_fs_info *fs_info; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 8a36117ed453..d14bda928e7b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -248,7 +248,6 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); void btrfs_readahead(struct readahead_control *rac); int set_folio_extent_mapped(struct folio *folio); -int set_page_extent_mapped(struct page *page); void clear_folio_extent_mapped(struct folio *folio); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 3048cb38dc80..d42b6f882f57 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -461,7 +461,7 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) return -ENOMEM; } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(page_folio(page)); if (ret < 0) { unlock_page(page); put_page(page); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d4100e58172f..af0969b70b53 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2870,7 +2870,7 @@ again: /* * We could have lost folio private when we dropped the lock to read the - * folio above, make sure we set_page_extent_mapped here so we have any + * folio above, make sure we set_folio_extent_mapped() here so we have any * of the subpage blocksize stuff we need in place. */ ret = set_folio_extent_mapped(folio); -- 2.51.0