From 530112d88ebd7405fb61711892b2a680048984c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Apr 2025 15:15:36 -0400 Subject: [PATCH 01/16] bcachefs: BCH_FEATURE_small_image We can't go RW if it's an image file that hasn't been resized. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 8 +++++--- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/errcode.h | 1 + fs/bcachefs/journal.c | 9 ++++++++- fs/bcachefs/journal_reclaim.c | 26 ++++++++++++++------------ fs/bcachefs/recovery.c | 5 +++++ fs/bcachefs/super.c | 5 +++++ 7 files changed, 40 insertions(+), 17 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 8b8c2344855f..6ac8bd49c629 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2392,14 +2392,16 @@ bkey_err: int bch2_fs_freespace_init(struct bch_fs *c) { - int ret = 0; - bool doing_init = false; + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) + return 0; + /* * We can crash during the device add path, so we need to check this on * every mount: */ + bool doing_init = false; for_each_member_device(c, ca) { if (ca->mi.freespace_initialized) continue; @@ -2409,7 +2411,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) doing_init = true; } - ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); + int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); if (ret) { bch2_dev_put(ca); bch_err_fn(c, ret); diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c0041391e2e8..7ce475c565b5 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -924,7 +924,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u x(extents_across_btree_nodes, 18) \ x(incompat_version_field, 19) \ x(casefolding, 20) \ - x(no_alloc_info, 21) + x(no_alloc_info, 21) \ + x(small_image, 22) #define BCH_SB_FEATURES_ALWAYS \ (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 8a4435660d86..6a4b3fe9ea99 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -222,6 +222,7 @@ x(EROFS, erofs_norecovery) \ x(EROFS, erofs_nochanges) \ x(EROFS, erofs_no_alloc_info) \ + x(EROFS, erofs_filesystem_full) \ x(EROFS, insufficient_devices) \ x(0, operation_blocked) \ x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 5442d526a448..3694b83af8cc 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1295,9 +1295,16 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) { + struct bch_fs *c = ca->fs; + if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal))) return 0; + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { + bch_err(c, "cannot allocate journal, filesystem is an unresized image file"); + return -BCH_ERR_erofs_filesystem_full; + } + unsigned nr; int ret; @@ -1318,7 +1325,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) min(1 << 13, (1 << 24) / ca->mi.bucket_size)); - ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs); + ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs); err: bch_err_fn(ca, ret); return ret; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index cc00b0fc40d8..a02f483a016a 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -215,18 +215,20 @@ void bch2_journal_space_available(struct journal *j) j->can_discard = can_discard; if (nr_online < metadata_replicas_required(c)) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" - "rw journal devs:", nr_online, metadata_replicas_required(c)); - - rcu_read_lock(); - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) - prt_printf(&buf, " %s", ca->name); - rcu_read_unlock(); - - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); + if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) { + struct printbuf buf = PRINTBUF; + buf.atomic++; + prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" + "rw journal devs:", nr_online, metadata_replicas_required(c)); + + rcu_read_lock(); + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) + prt_printf(&buf, " %s", ca->name); + rcu_read_unlock(); + + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); + } ret = -BCH_ERR_insufficient_journal_devices; goto out; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b5ab77f3c692..2436f334dde4 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -734,6 +734,11 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.read_only = true; } + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { + bch_info(c, "filesystem is an unresized image file, mounting ro"); + c->opts.read_only = true; + } + mutex_lock(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); bool write_sb = false; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6ab3e63ef139..7cd075303f95 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -451,6 +451,11 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) return -BCH_ERR_erofs_unfixed_errors; } + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { + bch_err(c, "cannot go rw, filesystem is an unresized image file"); + return -BCH_ERR_erofs_filesystem_full; + } + if (test_bit(BCH_FS_rw, &c->flags)) return 0; -- 2.51.0 From 0ca375b1779f22f703f956b22ea2bdbc69c247eb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Apr 2025 14:09:34 -0400 Subject: [PATCH 02/16] bcachefs: BCH_MEMBER_RESIZE_ON_MOUNT Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 18 +++++++--- fs/bcachefs/sb-members.c | 1 + fs/bcachefs/sb-members.h | 1 + fs/bcachefs/sb-members_format.h | 2 ++ fs/bcachefs/sb-members_types.h | 1 + fs/bcachefs/super.c | 64 +++++++++++++++++++++++++++++---- fs/bcachefs/super.h | 2 ++ 7 files changed, 77 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2436f334dde4..2a8bcb9b1dd2 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -734,11 +734,6 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.read_only = true; } - if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { - bch_info(c, "filesystem is an unresized image file, mounting ro"); - c->opts.read_only = true; - } - mutex_lock(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); bool write_sb = false; @@ -892,6 +887,17 @@ use_clean: if (ret) goto err; + ret = bch2_fs_resize_on_mount(c); + if (ret) { + up_write(&c->state_lock); + goto err; + } + + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { + bch_info(c, "filesystem is an unresized image file, mounting ro"); + c->opts.read_only = true; + } + if (!c->opts.read_only && (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) { bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); @@ -954,6 +960,8 @@ use_clean: set_bit(BCH_FS_btree_running, &c->flags); ret = bch2_sb_set_upgrade_extra(c); + if (ret) + goto err; ret = bch2_run_recovery_passes(c); if (ret) diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 39ce94875dde..462a2c21a9de 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -294,6 +294,7 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m)); prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); + prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(&m)); printbuf_indent_sub(out, 2); } diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 0f1741fffcb6..424143f5e330 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -353,6 +353,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ? BCH_MEMBER_DURABILITY(mi) - 1 : 1, .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), + .resize_on_mount = BCH_MEMBER_RESIZE_ON_MOUNT(mi), .valid = bch2_member_alive(mi), .btree_bitmap_shift = mi->btree_bitmap_shift, .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h index 3affec823b3f..472218a59102 100644 --- a/fs/bcachefs/sb-members_format.h +++ b/fs/bcachefs/sb-members_format.h @@ -88,6 +88,8 @@ LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, struct bch_member, flags, 30, 31) +LE64_BITMASK(BCH_MEMBER_RESIZE_ON_MOUNT, + struct bch_member, flags, 31, 32) #if 0 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h index c0eda888fe39..d6443e186872 100644 --- a/fs/bcachefs/sb-members_types.h +++ b/fs/bcachefs/sb-members_types.h @@ -13,6 +13,7 @@ struct bch_member_cpu { u8 data_allowed; u8 durability; u8 freespace_initialized; + u8 resize_on_mount; u8 valid; u8 btree_bitmap_shift; u64 btree_allocated_bitmap; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7cd075303f95..839b1582c1f1 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1141,6 +1141,9 @@ int bch2_fs_start(struct bch_fs *c) for_each_online_member(c, ca) bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); + /* + * Dno't write superblock yet: recovery might have to downgrade + */ mutex_unlock(&c->sb_lock); for_each_rw_member(c, ca) @@ -2039,6 +2042,18 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) return 0; } +static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets) +{ + struct bch_fs *c = ca->fs; + u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 }; + + return bch2_trans_commit_do(ca->fs, NULL, NULL, 0, + bch2_disk_accounting_mod2(trans, false, v, dev_data_type, + .dev = ca->dev_idx, + .data_type = BCH_DATA_free)) ?: + bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets); +} + int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bch_member *m; @@ -2086,13 +2101,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) mutex_unlock(&c->sb_lock); if (ca->mi.freespace_initialized) { - u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; - - ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, - bch2_disk_accounting_mod2(trans, false, v, dev_data_type, - .dev = ca->dev_idx, - .data_type = BCH_DATA_free)) ?: - bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); + ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets); if (ret) goto err; } @@ -2103,6 +2112,47 @@ err: return ret; } +int bch2_fs_resize_on_mount(struct bch_fs *c) +{ + for_each_online_member(c, ca) { + u64 old_nbuckets = ca->mi.nbuckets; + u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk), + ca->mi.bucket_size); + + if (ca->mi.resize_on_mount && + new_nbuckets > ca->mi.nbuckets) { + bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size); + int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets); + bch_err_fn(ca, ret); + if (ret) { + percpu_ref_put(&ca->io_ref[READ]); + up_write(&c->state_lock); + return ret; + } + + mutex_lock(&c->sb_lock); + struct bch_member *m = + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + m->nbuckets = cpu_to_le64(new_nbuckets); + SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false); + + c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image)); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + if (ca->mi.freespace_initialized) { + ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets); + if (ret) { + percpu_ref_put(&ca->io_ref[READ]); + up_write(&c->state_lock); + return ret; + } + } + } + } + return 0; +} + /* return with ref on ca->ref: */ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) { diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 23533bce5709..50588ab20be2 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -35,6 +35,8 @@ void bch2_fs_read_only(struct bch_fs *); int bch2_fs_read_write(struct bch_fs *); int bch2_fs_read_write_early(struct bch_fs *); +int bch2_fs_resize_on_mount(struct bch_fs *); + void __bch2_fs_stop(struct bch_fs *); void bch2_fs_free(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); -- 2.51.0 From ecedc87cfaf016e7e857a209e1b2685a28d59566 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Apr 2025 19:33:54 -0400 Subject: [PATCH 03/16] bcachefs: export bch2_move_data_phys() Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 20 ++++++++++---------- fs/bcachefs/move.h | 5 +++++ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index a4678a205da6..29981ebcb972 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -900,16 +900,16 @@ err: return ret; } -static int bch2_move_data_phys(struct bch_fs *c, - unsigned dev, - u64 start, - u64 end, - unsigned data_types, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc, - move_pred_fn pred, void *arg) +int bch2_move_data_phys(struct bch_fs *c, + unsigned dev, + u64 start, + u64 end, + unsigned data_types, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc, + move_pred_fn pred, void *arg) { struct moving_context ctxt; diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 51e0505a8156..1ab6dd4621d6 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -135,6 +135,11 @@ int bch2_move_data(struct bch_fs *, bool, move_pred_fn, void *); +int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned, + struct bch_ratelimit *, struct bch_move_stats *, + struct write_point_specifier, bool, + move_pred_fn, void *); + int bch2_evacuate_bucket(struct moving_context *, struct move_bucket_in_flight *, struct bpos, int, -- 2.51.0 From f3c8eaf7a133ef122dfd97e6f6f972265cc84fb0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Apr 2025 19:42:02 -0400 Subject: [PATCH 04/16] bcachefs: Plumb target parameter through btree_node_rewrite_pos() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_update_interior.c | 37 +++++++++++++++++++---------- fs/bcachefs/btree_update_interior.h | 4 ++-- fs/bcachefs/move.c | 5 ++-- 4 files changed, 30 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index b6f5e0dfc9f1..c1c671e340c7 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1918,7 +1918,7 @@ static void btree_node_scrub_work(struct work_struct *work) bch_err(c, "error validating btree node during scrub on %s at btree %s", scrub->ca->name, err.buf); - ret = bch2_btree_node_rewrite(trans, &iter, b, 0); + ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0); } err: bch2_trans_iter_exit(trans, &iter); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 2be7c10fc59c..3155b4360fbc 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -284,6 +284,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct disk_reservation *res, struct closure *cl, bool interior_node, + unsigned target, unsigned flags) { struct bch_fs *c = trans->c; @@ -317,6 +318,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, mutex_unlock(&c->btree_reserve_cache_lock); retry: ret = bch2_alloc_sectors_start_trans(trans, + target ?: c->opts.metadata_target ?: c->opts.foreground_target, 0, @@ -325,7 +327,9 @@ retry: res->nr_replicas, min(res->nr_replicas, c->opts.metadata_replicas_required), - watermark, 0, cl, &wp); + watermark, + target ? BCH_WRITE_only_specified_devs : 0, + cl, &wp); if (unlikely(ret)) goto err; @@ -505,6 +509,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans * static int bch2_btree_reserve_get(struct btree_trans *trans, struct btree_update *as, unsigned nr_nodes[2], + unsigned target, unsigned flags, struct closure *cl) { @@ -527,7 +532,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, while (p->nr < nr_nodes[interior]) { b = __bch2_btree_node_alloc(trans, &as->disk_res, cl, - interior, flags); + interior, target, flags); if (IS_ERR(b)) { ret = PTR_ERR(b); goto err; @@ -1116,7 +1121,8 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level_start, bool split, unsigned flags) + unsigned level_start, bool split, + unsigned target, unsigned flags) { struct bch_fs *c = trans->c; struct btree_update *as; @@ -1226,7 +1232,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (ret) goto err; - ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); + ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL); if (bch2_err_matches(ret, ENOSPC) || bch2_err_matches(ret, ENOMEM)) { struct closure cl; @@ -1245,7 +1251,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, closure_init_stack(&cl); do { - ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); + ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl); bch2_trans_unlock(trans); bch2_wait_on_allocator(c, &cl); @@ -1878,7 +1884,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, as = bch2_btree_update_start(trans, trans->paths + path, trans->paths[path].level, - true, flags); + true, 0, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -1948,7 +1954,8 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, return bch2_btree_split_leaf(trans, path, flags); struct btree_update *as = - bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags); + bch2_btree_update_start(trans, trans->paths + path, b->c.level, + true, 0, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -2077,7 +2084,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, parent = btree_node_parent(trans->paths + path, b); as = bch2_btree_update_start(trans, trans->paths + path, level, false, - BCH_TRANS_COMMIT_no_enospc|flags); + 0, BCH_TRANS_COMMIT_no_enospc|flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto err; @@ -2184,6 +2191,7 @@ err: int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, + unsigned target, unsigned flags) { struct bch_fs *c = trans->c; @@ -2196,7 +2204,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_path *path = btree_iter_path(trans, iter); parent = btree_node_parent(path, b); - as = bch2_btree_update_start(trans, path, b->c.level, false, flags); + as = bch2_btree_update_start(trans, path, b->c.level, + false, target, flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto out; @@ -2261,7 +2270,7 @@ static int bch2_btree_node_rewrite_key(struct btree_trans *trans, bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k); ret = found - ? bch2_btree_node_rewrite(trans, &iter, b, flags) + ? bch2_btree_node_rewrite(trans, &iter, b, 0, flags) : -ENOENT; out: bch2_trans_iter_exit(trans, &iter); @@ -2270,7 +2279,9 @@ out: int bch2_btree_node_rewrite_pos(struct btree_trans *trans, enum btree_id btree, unsigned level, - struct bpos pos, unsigned flags) + struct bpos pos, + unsigned target, + unsigned flags) { BUG_ON(!level); @@ -2282,7 +2293,7 @@ int bch2_btree_node_rewrite_pos(struct btree_trans *trans, if (ret) goto err; - ret = bch2_btree_node_rewrite(trans, &iter, b, flags); + ret = bch2_btree_node_rewrite(trans, &iter, b, target, flags); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -2296,7 +2307,7 @@ int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans, if (ret) return ret == -BCH_ERR_btree_node_dying ? 0 : ret; - ret = bch2_btree_node_rewrite(trans, &iter, b, flags); + ret = bch2_btree_node_rewrite(trans, &iter, b, 0, flags); bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index be71cd73b864..ff9b95aac554 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -168,10 +168,10 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, } int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, - struct btree *, unsigned); + struct btree *, unsigned, unsigned); int bch2_btree_node_rewrite_pos(struct btree_trans *, enum btree_id, unsigned, - struct bpos, unsigned); + struct bpos, unsigned, unsigned); int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *, struct btree *, unsigned); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 29981ebcb972..d40e2d14ec52 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -872,7 +872,8 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (!bp.v->level) ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); else if (!data_opts.scrub) - ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0); + ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, + k.k->p, data_opts.target, 0); else ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev); @@ -1022,7 +1023,7 @@ retry: if (!pred(c, arg, b, &io_opts, &data_opts)) goto next; - ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret; + ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) -- 2.51.0 From 7a274285d3706608d788efcbd9982f08531dd9ec Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Apr 2025 19:51:05 -0400 Subject: [PATCH 05/16] bcachefs: plumb btree_id through move_pred_fd Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 20 +++++++++++--------- fs/bcachefs/move.h | 2 +- fs/bcachefs/rebalance.c | 2 +- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index d40e2d14ec52..07cea68b04f0 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -667,7 +667,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, continue; memset(&data_opts, 0, sizeof(data_opts)); - if (!pred(c, arg, k, io_opts, &data_opts)) + if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts)) goto next; /* @@ -851,7 +851,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, } struct data_update_opts data_opts = {}; - if (!pred(c, arg, k, &io_opts, &data_opts)) { + if (!pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts)) { bch2_trans_iter_exit(trans, &iter); goto next; } @@ -934,7 +934,8 @@ struct evacuate_bucket_arg { struct data_update_opts data_opts; }; -static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k, +static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -1048,7 +1049,7 @@ next: } static bool rereplicate_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -1080,7 +1081,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, } static bool migrate_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -1107,7 +1108,7 @@ static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); + return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts); } /* @@ -1163,7 +1164,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) } static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -1196,11 +1197,12 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); + return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), + io_opts, data_opts); } static bool scrub_pred(struct bch_fs *c, void *_arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 1ab6dd4621d6..9c6c229e583e 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -72,7 +72,7 @@ do { \ break; \ } while (1) -typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, +typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c, struct bch_io_opts *, struct data_update_opts *); extern const char * const bch2_data_ops_strs[]; diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 3c45500c1a28..d2a7001cf872 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -454,7 +454,7 @@ out: } static bool rebalance_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { -- 2.51.0 From 3484840ece849ee700c7cf8e0d44d5536b29fa08 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Apr 2025 16:31:34 -0400 Subject: [PATCH 06/16] bcachefs: bch2_move_data_btree() can move btree nodes Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 37 ++++++++++++++++++++++++------------- fs/bcachefs/move.h | 2 ++ 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 07cea68b04f0..a8ad8d4538e0 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -423,6 +423,9 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; int ret = 0; + if (extent_iter->min_depth) + return opts_ret; + if (extent_k.k->type == KEY_TYPE_reflink_v) goto out; @@ -573,11 +576,11 @@ static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans * return k; } -static int bch2_move_data_btree(struct moving_context *ctxt, - struct bpos start, - struct bpos end, - move_pred_fn pred, void *arg, - enum btree_id btree_id) +int bch2_move_data_btree(struct moving_context *ctxt, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + enum btree_id btree_id, unsigned level) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -604,10 +607,10 @@ static int bch2_move_data_btree(struct moving_context *ctxt, } bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &iter, btree_id, start, - BTREE_ITER_prefetch| - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); + bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level, + BTREE_ITER_prefetch| + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); if (ctxt->rate) bch2_ratelimit_reset(ctxt->rate); @@ -627,7 +630,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (ret) break; - if (bkey_ge(bkey_start_pos(k.k), end)) + if (bkey_gt(bkey_start_pos(k.k), end)) break; if (ctxt->stats) @@ -677,7 +680,14 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); + if (!level) + ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); + else if (!data_opts.scrub) + ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level, + k.k->p, data_opts.target, 0); + else + ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); + if (ret2) { if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; @@ -695,7 +705,8 @@ next: if (ctxt->stats) atomic64_add(k.k->size, &ctxt->stats->sectors_seen); next_nondata: - bch2_btree_iter_advance(trans, &iter); + if (!bch2_btree_iter_advance(trans, &iter)) + break; } bch2_trans_iter_exit(trans, &reflink_iter); @@ -727,7 +738,7 @@ int __bch2_move_data(struct moving_context *ctxt, ret = bch2_move_data_btree(ctxt, id == start.btree ? start.pos : POS_MIN, id == end.btree ? end.pos : POS_MAX, - pred, arg, id); + pred, arg, id, 0); if (ret) break; } diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 9c6c229e583e..0c620a5f728d 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -122,6 +122,8 @@ int bch2_move_extent(struct moving_context *, struct bch_io_opts, struct data_update_opts); +int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos, + move_pred_fn, void *, enum btree_id, unsigned); int __bch2_move_data(struct moving_context *, struct bbpos, struct bbpos, -- 2.51.0 From fe27298b92001d51797ddc26ca0d7c3d4a0f04d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Apr 2025 21:35:28 -0400 Subject: [PATCH 07/16] bcachefs: bch2_move_data_btree() can now walk roots Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 47 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index a8ad8d4538e0..ff56d8886c32 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -606,7 +606,52 @@ int bch2_move_data_btree(struct moving_context *ctxt, ctxt->stats->pos = BBPOS(btree_id, start); } +retry_root: bch2_trans_begin(trans); + + if (level == bch2_btree_id_root(c, btree_id)->level + 1) { + bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1, + BTREE_ITER_prefetch| + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); + struct btree *b = bch2_btree_iter_peek_node(trans, &iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto root_err; + + if (b != btree_node_root(c, b)) { + bch2_trans_iter_exit(trans, &iter); + goto retry_root; + } + + k = bkey_i_to_s_c(&b->key); + + io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, + iter.pos, &iter, k); + ret = PTR_ERR_OR_ZERO(io_opts); + if (ret) + goto root_err; + + memset(&data_opts, 0, sizeof(data_opts)); + if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts)) + goto out; + + + if (!data_opts.scrub) + ret = bch2_btree_node_rewrite_pos(trans, btree_id, level, + k.k->p, data_opts.target, 0); + else + ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); + +root_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + bch2_trans_iter_exit(trans, &iter); + goto retry_root; + } + + goto out; + } + bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level, BTREE_ITER_prefetch| BTREE_ITER_not_extents| @@ -708,7 +753,7 @@ next_nondata: if (!bch2_btree_iter_advance(trans, &iter)) break; } - +out: bch2_trans_iter_exit(trans, &reflink_iter); bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&sk, c); -- 2.51.0 From 9e260e4590e044dc5887f9eb21dfaf479226e7d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Apr 2024 17:40:47 -0400 Subject: [PATCH 08/16] docs: bcachefs: idle work scheduling design doc People have been asking to see the plan for this, so - bcachefs has various background tasks that need to be scheduled to balance efficiency, predictability of performance, etc. The design and philosophy hasn't changed too much since bcache, which was primarily designed for server usage, with sustained load in mind. These days we're seeing more desktop usage - where we really want to let the system idle effictively, to reduce total power usage - while also still balancing previous concerns, we still want to let work accumulate to a degree. This lays out all the requirements and starts to sketch out the algorithm I have in mind. Signed-off-by: Kent Overstreet --- .../filesystems/bcachefs/future/idle_work.rst | 78 +++++++++++++++++++ Documentation/filesystems/bcachefs/index.rst | 7 ++ 2 files changed, 85 insertions(+) create mode 100644 Documentation/filesystems/bcachefs/future/idle_work.rst diff --git a/Documentation/filesystems/bcachefs/future/idle_work.rst b/Documentation/filesystems/bcachefs/future/idle_work.rst new file mode 100644 index 000000000000..59a332509dcd --- /dev/null +++ b/Documentation/filesystems/bcachefs/future/idle_work.rst @@ -0,0 +1,78 @@ +Idle/background work classes design doc: + +Right now, our behaviour at idle isn't ideal, it was designed for servers that +would be under sustained load, to keep pending work at a "medium" level, to +let work build up so we can process it in more efficient batches, while also +giving headroom for bursts in load. + +But for desktops or mobile - scenarios where work is less sustained and power +usage is more important - we want to operate differently, with a "rush to +idle" so the system can go to sleep. We don't want to be dribbling out +background work while the system should be idle. + +The complicating factor is that there are a number of background tasks, which +form a heirarchy (or a digraph, depending on how you divide it up) - one +background task may generate work for another. + +Thus proper idle detection needs to model this heirarchy. + +- Foreground writes +- Page cache writeback +- Copygc, rebalance +- Journal reclaim + +When we implement idle detection and rush to idle, we need to be careful not +to disturb too much the existing behaviour that works reasonably well when the +system is under sustained load (or perhaps improve it in the case of +rebalance, which currently does not actively attempt to let work batch up). + +SUSTAINED LOAD REGIME +--------------------- + +When the system is under continuous load, we want these jobs to run +continuously - this is perhaps best modelled with a P/D controller, where +they'll be trying to keep a target value (i.e. fragmented disk space, +available journal space) roughly in the middle of some range. + +The goal under sustained load is to balance our ability to handle load spikes +without running out of x resource (free disk space, free space in the +journal), while also letting some work accumululate to be batched (or become +unnecessary). + +For example, we don't want to run copygc too aggressively, because then it +will be evacuating buckets that would have become empty (been overwritten or +deleted) anyways, and we don't want to wait until we're almost out of free +space because then the system will behave unpredicably - suddenly we're doing +a lot more work to service each write and the system becomes much slower. + +IDLE REGIME +----------- + +When the system becomes idle, we should start flushing our pending work +quicker so the system can go to sleep. + +Note that the definition of "idle" depends on where in the heirarchy a task +is - a task should start flushing work more quickly when the task above it has +stopped generating new work. + +e.g. rebalance should start flushing more quickly when page cache writeback is +idle, and journal reclaim should only start flushing more quickly when both +copygc and rebalance are idle. + +It's important to let work accumulate when more work is still incoming and we +still have room, because flushing is always more efficient if we let it batch +up. New writes may overwrite data before rebalance moves it, and tasks may be +generating more updates for the btree nodes that journal reclaim needs to flush. + +On idle, how much work we do at each interval should be proportional to the +length of time we have been idle for. If we're idle only for a short duration, +we shouldn't flush everything right away; the system might wake up and start +generating new work soon, and flushing immediately might end up doing a lot of +work that would have been unnecessary if we'd allowed things to batch more. + +To summarize, we will need: + + - A list of classes for background tasks that generate work, which will + include one "foreground" class. + - Tracking for each class - "Am I doing work, or have I gone to sleep?" + - And each class should check the class above it when deciding how much work to issue. diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst index 3864d0ae89c1..e5c4c2120b93 100644 --- a/Documentation/filesystems/bcachefs/index.rst +++ b/Documentation/filesystems/bcachefs/index.rst @@ -29,3 +29,10 @@ At this moment, only a few of these are described here. casefolding errorcodes + +Future design +------------- +.. toctree:: + :maxdepth: 1 + + future/idle_work -- 2.51.0 From 62095464e9d2a2340a6b08a90fb280ea2b091a28 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 17 Apr 2025 12:42:13 -0400 Subject: [PATCH 09/16] bcachefs: Fix struct with flex member ABI warning This pops up when buliding in userspace. The structs aren't actually variable length, but no way to tell the compiler that... Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 24 ++++++++++++------------ fs/bcachefs/disk_accounting.h | 8 ++++---- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 1f0422bfae35..e399237e124a 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -631,17 +631,17 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) } static int bch2_disk_accounting_validate_late(struct btree_trans *trans, - struct disk_accounting_pos acc, + struct disk_accounting_pos *acc, u64 *v, unsigned nr) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0, invalid_dev = -1; - switch (acc.type) { + switch (acc->type) { case BCH_DISK_ACCOUNTING_replicas: { struct bch_replicas_padded r; - __accounting_to_replicas(&r.e, &acc); + __accounting_to_replicas(&r.e, acc); for (unsigned i = 0; i < r.e.nr_devs; i++) if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && @@ -660,7 +660,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, trans, accounting_replicas_not_marked, "accounting not marked in superblock replicas\n%s", (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, &acc), + bch2_accounting_key_to_text(&buf, acc), buf.buf))) { /* * We're not RW yet and still single threaded, dropping @@ -676,8 +676,8 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, } case BCH_DISK_ACCOUNTING_dev_data_type: - if (!bch2_dev_exists(c, acc.dev_data_type.dev)) { - invalid_dev = acc.dev_data_type.dev; + if (!bch2_dev_exists(c, acc->dev_data_type.dev)) { + invalid_dev = acc->dev_data_type.dev; goto invalid_device; } break; @@ -691,13 +691,13 @@ invalid_device: "accounting entry points to invalid device %i\n%s", invalid_dev, (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, &acc), + bch2_accounting_key_to_text(&buf, acc), buf.buf))) { for (unsigned i = 0; i < nr; i++) v[i] = -v[i]; ret = commit_do(trans, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?: + bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?: -BCH_ERR_remove_disk_accounting_entry; } else { ret = -BCH_ERR_remove_disk_accounting_entry; @@ -748,7 +748,7 @@ int bch2_accounting_read(struct bch_fs *c) if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) break; - if (!bch2_accounting_is_mem(acc_k)) { + if (!bch2_accounting_is_mem(&acc_k)) { struct disk_accounting_pos next; memset(&next, 0, sizeof(next)); next.type = acc_k.type + 1; @@ -770,7 +770,7 @@ int bch2_accounting_read(struct bch_fs *c) struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); - if (!bch2_accounting_is_mem(acc_k)) + if (!bch2_accounting_is_mem(&acc_k)) continue; struct bkey_s_c k = bkey_i_to_s_c(i->k); @@ -826,7 +826,7 @@ int bch2_accounting_read(struct bch_fs *c) */ ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) ? -BCH_ERR_remove_disk_accounting_entry - : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters); + : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters); if (ret == -BCH_ERR_remove_disk_accounting_entry) { free_percpu(i->v[0]); @@ -939,7 +939,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c) if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) break; - if (!bch2_accounting_is_mem(acc_k)) { + if (!bch2_accounting_is_mem(&acc_k)) { struct disk_accounting_pos next; memset(&next, 0, sizeof(next)); next.type = acc_k.type + 1; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index d557b99b3c0a..54cb8a5b117d 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -139,10 +139,10 @@ int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); void bch2_accounting_mem_gc(struct bch_fs *); -static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) +static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc) { - return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR && - acc.type != BCH_DISK_ACCOUNTING_inum; + return acc->type < BCH_DISK_ACCOUNTING_TYPE_NR && + acc->type != BCH_DISK_ACCOUNTING_inum; } /* @@ -163,7 +163,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, if (gc && !acc->gc_running) return 0; - if (!bch2_accounting_is_mem(acc_k)) + if (!bch2_accounting_is_mem(&acc_k)) return 0; if (mode == BCH_ACCOUNTING_normal) { -- 2.51.0 From 09279bba72f809eeb1f02d39a462e8e1d06fa32a Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Fri, 18 Apr 2025 15:52:10 +0800 Subject: [PATCH 10/16] bcachefs: Kill dead code Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index a02f483a016a..fd7a140c9fd6 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -637,8 +637,6 @@ static u64 journal_seq_to_flush(struct journal *j) /* Try to keep the journal at most half full: */ nr_buckets = ja->nr / 2; - nr_buckets = min(nr_buckets, ja->nr); - bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; seq_to_flush = max(seq_to_flush, ja->bucket_seq[bucket_to_flush]); -- 2.51.0 From 834f9475aabd84f60760ac8ceffc45eedff4a176 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 14 Mar 2025 09:46:25 -0400 Subject: [PATCH 11/16] bcachefs: bch2_check_rebalance_work() Add a pass for checking the rebalance_work btree. Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 116 ++++++++++++++++++++++++++++ fs/bcachefs/rebalance.h | 2 + fs/bcachefs/recovery_passes_types.h | 1 + 3 files changed, 119 insertions(+) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index d2a7001cf872..26c87ab019e8 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -712,3 +712,119 @@ void bch2_fs_rebalance_init(struct bch_fs *c) { bch2_pd_controller_init(&c->rebalance.pd); } + +static int check_rebalance_work_one(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct btree_iter *rebalance_iter, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c extent_k, rebalance_k; + struct printbuf buf = PRINTBUF; + + int ret = bkey_err(extent_k = bch2_btree_iter_peek(trans, extent_iter)) ?: + bkey_err(rebalance_k = bch2_btree_iter_peek(trans, rebalance_iter)); + if (ret) + return ret; + + if (!extent_k.k && + extent_iter->btree_id == BTREE_ID_reflink && + (!rebalance_k.k || + rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) { + bch2_trans_iter_exit(trans, extent_iter); + bch2_trans_iter_init(trans, extent_iter, + BTREE_ID_extents, POS_MIN, + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots); + return -BCH_ERR_transaction_restart_nested; + } + + if (!extent_k.k && !rebalance_k.k) + return 1; + + int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX, + rebalance_k.k ? rebalance_k.k->p : SPOS_MAX); + + struct bkey deleted; + bkey_init(&deleted); + + if (cmp < 0) { + deleted.p = extent_k.k->p; + rebalance_k.k = &deleted; + } else if (cmp > 0) { + deleted.p = rebalance_k.k->p; + extent_k.k = &deleted; + } + + bool should_have_rebalance = + bch2_bkey_sectors_need_rebalance(c, extent_k) != 0; + bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set; + + if (should_have_rebalance != have_rebalance) { + ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed); + if (ret) + return ret; + + bch2_bkey_val_to_text(&buf, c, extent_k); + } + + if (fsck_err_on(!should_have_rebalance && have_rebalance, + trans, rebalance_work_incorrectly_set, + "rebalance work incorrectly set\n%s", buf.buf)) { + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, + extent_k.k->p, false); + if (ret) + goto err; + } + + if (fsck_err_on(should_have_rebalance && !have_rebalance, + trans, rebalance_work_incorrectly_unset, + "rebalance work incorrectly unset\n%s", buf.buf)) { + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, + extent_k.k->p, true); + if (ret) + goto err; + } + + if (cmp <= 0) + bch2_btree_iter_advance(trans, extent_iter); + if (cmp >= 0) + bch2_btree_iter_advance(trans, rebalance_iter); +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_check_rebalance_work(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter rebalance_iter, extent_iter; + int ret = 0; + + bch2_trans_iter_init(trans, &extent_iter, + BTREE_ID_reflink, POS_MIN, + BTREE_ITER_prefetch); + bch2_trans_iter_init(trans, &rebalance_iter, + BTREE_ID_rebalance_work, POS_MIN, + BTREE_ITER_prefetch); + + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + while (!ret) { + bch2_trans_begin(trans); + + ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + } + + bch2_bkey_buf_exit(&last_flushed, c); + bch2_trans_iter_exit(trans, &extent_iter); + bch2_trans_iter_exit(trans, &rebalance_iter); + bch2_trans_put(trans); + return ret < 0 ? ret : 0; +} diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index e5e8eb4a2dd1..b7c8c0652ad6 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -54,4 +54,6 @@ void bch2_rebalance_stop(struct bch_fs *); int bch2_rebalance_start(struct bch_fs *); void bch2_fs_rebalance_init(struct bch_fs *); +int bch2_check_rebalance_work(struct bch_fs *); + #endif /* _BCACHEFS_REBALANCE_H */ diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index f9d565bb50dd..be3185fc6ef4 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -59,6 +59,7 @@ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ + x(check_rebalance_work, 43, PASS_ONLINE|PASS_FSCK) \ x(resume_logged_ops, 23, PASS_ALWAYS) \ x(delete_dead_inodes, 32, PASS_ALWAYS) \ x(fix_reflink_p, 33, 0) \ -- 2.51.0 From c53be0ffaa501d25f58ac2e56b7e5710f3408a50 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Apr 2025 00:57:55 -0400 Subject: [PATCH 12/16] bcachefs: bch2_target_to_text() no longer depends on io_ref Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 2ca3cbf12b71..4e2f237338c2 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -554,14 +554,12 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) ? rcu_dereference(c->devs[t.dev]) : NULL; - if (ca && percpu_ref_tryget(&ca->io_ref[READ])) { + if (ca && ca->disk_sb.bdev) prt_printf(out, "/dev/%s", ca->name); - percpu_ref_put(&ca->io_ref[READ]); - } else if (ca) { + else if (ca) prt_printf(out, "offline device %u", t.dev); - } else { + else prt_printf(out, "invalid device %u", t.dev); - } rcu_read_unlock(); out->atomic--; -- 2.51.0 From 2483dd1243584432c8b407f09673500c00095cd3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Apr 2025 11:23:53 -0400 Subject: [PATCH 13/16] bcachefs: recalc_capacity() no longer depends on io_ref Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 6ac8bd49c629..0494d188605f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2505,15 +2505,15 @@ void bch2_recalc_capacity(struct bch_fs *c) lockdep_assert_held(&c->state_lock); - for_each_online_member(c, ca) { - struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; - - ra_pages += bdi->ra_pages; - } + rcu_read_lock(); + for_each_member_device_rcu(c, ca, NULL) { + struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev); + if (bdev) + ra_pages += bdev->bd_disk->bdi->ra_pages; - bch2_set_ra_pages(c, ra_pages); + if (ca->mi.state != BCH_MEMBER_STATE_rw) + continue; - __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { u64 dev_reserve = 0; /* @@ -2550,6 +2550,9 @@ void bch2_recalc_capacity(struct bch_fs *c) bucket_size_max = max_t(unsigned, bucket_size_max, ca->mi.bucket_size); } + rcu_read_unlock(); + + bch2_set_ra_pages(c, ra_pages); gc_reserve = c->opts.gc_reserve_bytes ? c->opts.gc_reserve_bytes >> 9 -- 2.51.0 From 9fa4a8a3bdb14c1a36ff439643e00ca416e04b66 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 18 Apr 2025 22:11:15 -0400 Subject: [PATCH 14/16] bcachefs: for_each_online_member_rcu() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 10 +++++++++- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/chardev.c | 6 ++++-- fs/bcachefs/fs.c | 9 ++++++--- fs/bcachefs/sb-members.h | 14 +++----------- fs/bcachefs/super.c | 16 ++++++++++++---- 6 files changed, 35 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index d56cee7e8cb5..e87b95f609c5 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1645,7 +1645,12 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) printbuf_indent_sub(&buf, 2); prt_newline(&buf); - for_each_online_member(c, ca) { + bch2_printbuf_make_room(&buf, 4096); + + rcu_read_lock(); + buf.atomic++; + + for_each_online_member_rcu(c, ca) { prt_printf(&buf, "Dev %u:\n", ca->dev_idx); printbuf_indent_add(&buf, 2); bch2_dev_alloc_debug_to_text(&buf, ca); @@ -1653,6 +1658,9 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) prt_newline(&buf); } + --buf.atomic; + rcu_read_unlock(); + prt_printf(&buf, "Copygc debug:\n"); printbuf_indent_add(&buf, 2); bch2_copygc_wait_to_text(&buf, c); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 4fd096349790..1597259b708c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -891,6 +891,7 @@ struct bch_fs { struct workqueue_struct *write_ref_wq; /* ALLOCATION */ + struct bch_devs_mask online_devs; struct bch_devs_mask rw_devs[BCH_DATA_NR]; unsigned long rw_devs_change_count; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 5891b3a1e61c..4066946b26bc 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -613,11 +613,13 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, if (!dev) return -EINVAL; - for_each_online_member(c, ca) + rcu_read_lock(); + for_each_online_member_rcu(c, ca) if (ca->dev == dev) { - percpu_ref_put(&ca->io_ref[READ]); + rcu_read_unlock(); return ca->dev_idx; } + rcu_read_unlock(); return -BCH_ERR_ENOENT_dev_idx_not_found; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 17a27d6d8c9d..cdf84180829a 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2327,12 +2327,14 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root) struct bch_fs *c = root->d_sb->s_fs_info; bool first = true; - for_each_online_member(c, ca) { + rcu_read_lock(); + for_each_online_member_rcu(c, ca) { if (!first) seq_putc(seq, ':'); first = false; seq_puts(seq, ca->disk_sb.sb_name); } + rcu_read_unlock(); return 0; } @@ -2529,15 +2531,16 @@ got_sb: sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; - for_each_online_member(c, ca) { + rcu_read_lock(); + for_each_online_member_rcu(c, ca) { struct block_device *bdev = ca->disk_sb.bdev; /* XXX: create an anonymous device for multi device filesystems */ sb->s_bdev = bdev; sb->s_dev = bdev->bd_dev; - percpu_ref_put(&ca->io_ref[READ]); break; } + rcu_read_unlock(); c->dev = sb->s_dev; diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 424143f5e330..28c6fc25c32c 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -104,6 +104,9 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev * for (struct bch_dev *_ca = NULL; \ (_ca = __bch2_next_dev((_c), _ca, (_mask)));) +#define for_each_online_member_rcu(_c, _ca) \ + for_each_member_device_rcu(_c, _ca, &(_c)->online_devs) + static inline void bch2_dev_get(struct bch_dev *ca) { #ifdef CONFIG_BCACHEFS_DEBUG @@ -307,17 +310,6 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, return NULL; } -/* XXX kill, move to struct bch_fs */ -static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) -{ - struct bch_devs_mask devs; - - memset(&devs, 0, sizeof(devs)); - for_each_online_member(c, ca) - __set_bit(ca->dev_idx, devs.d); - return devs; -} - extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 839b1582c1f1..834c68a273b4 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1105,7 +1105,7 @@ static bool bch2_fs_may_start(struct bch_fs *c) break; } - return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); + return bch2_have_enough_devs(c, c->online_devs, flags, true); } int bch2_fs_start(struct bch_fs *c) @@ -1138,8 +1138,11 @@ int bch2_fs_start(struct bch_fs *c) goto err; } - for_each_online_member(c, ca) - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); + rcu_read_lock(); + for_each_online_member_rcu(c, ca) + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = + cpu_to_le64(now); + rcu_read_unlock(); /* * Dno't write superblock yet: recovery might have to downgrade @@ -1294,6 +1297,9 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw) { + if (rw == READ) + clear_bit(ca->dev_idx, ca->fs->online_devs.d); + if (!percpu_ref_is_zero(&ca->io_ref[rw])) { reinit_completion(&ca->io_ref_completion[rw]); percpu_ref_kill(&ca->io_ref[rw]); @@ -1577,6 +1583,8 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) if (ret) return ret; + set_bit(ca->dev_idx, c->online_devs.d); + bch2_dev_sysfs_online(c, ca); struct printbuf name = PRINTBUF; @@ -1634,7 +1642,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, return true; /* do we have enough devices to read from? */ - new_online_devs = bch2_online_devs(c); + new_online_devs = c->online_devs; __clear_bit(ca->dev_idx, new_online_devs.d); return bch2_have_enough_devs(c, new_online_devs, flags, false); -- 2.51.0 From e14e06e91dadcd1c65f08ba5a02716d3e855fc74 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Apr 2025 11:27:18 -0400 Subject: [PATCH 15/16] bcachefs: __bch2_fs_read_write() no longer depends on io_ref Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 834c68a273b4..cdcfed4dd283 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -471,10 +471,14 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) clear_bit(BCH_FS_clean_shutdown, &c->flags); - __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { - bch2_dev_allocator_add(c, ca); - percpu_ref_reinit(&ca->io_ref[WRITE]); - } + rcu_read_lock(); + for_each_online_member_rcu(c, ca) + if (ca->mi.state == BCH_MEMBER_STATE_rw) { + bch2_dev_allocator_add(c, ca); + percpu_ref_reinit(&ca->io_ref[WRITE]); + } + rcu_read_unlock(); + bch2_recalc_capacity(c); /* @@ -1149,8 +1153,11 @@ int bch2_fs_start(struct bch_fs *c) */ mutex_unlock(&c->sb_lock); - for_each_rw_member(c, ca) - bch2_dev_allocator_add(c, ca); + rcu_read_lock(); + for_each_online_member_rcu(c, ca) + if (ca->mi.state == BCH_MEMBER_STATE_rw) + bch2_dev_allocator_add(c, ca); + rcu_read_unlock(); bch2_recalc_capacity(c); up_write(&c->state_lock); -- 2.51.0 From 6d67de1079993e09e7a867a6936a8163ece98792 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Apr 2025 19:13:40 -0400 Subject: [PATCH 16/16] bcachefs: for_each_rw_member_rcu() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 10 ++++++++-- fs/bcachefs/journal.c | 4 +++- fs/bcachefs/journal_io.c | 5 +---- fs/bcachefs/journal_reclaim.c | 4 +++- fs/bcachefs/movinggc.c | 4 +++- fs/bcachefs/sb-members.h | 3 +++ 6 files changed, 21 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 0494d188605f..195d20220b7d 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2575,8 +2575,10 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c) { u64 ret = U64_MAX; - for_each_rw_member(c, ca) + rcu_read_lock(); + for_each_rw_member_rcu(c, ca) ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); + rcu_read_unlock(); return ret; } @@ -2600,8 +2602,12 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw) { + /* BCH_DATA_free == all rw devs */ + for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) - if (rw && (ca->mi.data_allowed & BIT(i))) + if (rw && + (i == BCH_DATA_free || + (ca->mi.data_allowed & BIT(i)))) set_bit(ca->dev_idx, c->rw_devs[i].d); else clear_bit(ca->dev_idx, c->rw_devs[i].d); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 3694b83af8cc..e1cd6e8e37cf 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -699,8 +699,10 @@ static unsigned max_dev_latency(struct bch_fs *c) { u64 nsecs = 0; - for_each_rw_member(c, ca) + rcu_read_lock(); + for_each_rw_member_rcu(c, ca) nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); + rcu_read_unlock(); return nsecs_to_jiffies(nsecs); } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 28fa381cd589..438ad32ba242 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -2055,12 +2055,9 @@ CLOSURE_CALLBACK(bch2_journal_write) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_replicas_padded replicas; - unsigned nr_rw_members = 0; + unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_journal]); int ret; - for_each_rw_member(c, ca) - nr_rw_members++; - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); BUG_ON(!w->write_started); BUG_ON(w->write_allocated); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index fd7a140c9fd6..dc8169a970dd 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -627,7 +627,8 @@ static u64 journal_seq_to_flush(struct journal *j) spin_lock(&j->lock); - for_each_rw_member(c, ca) { + rcu_read_lock(); + for_each_rw_member_rcu(c, ca) { struct journal_device *ja = &ca->journal; unsigned nr_buckets, bucket_to_flush; @@ -641,6 +642,7 @@ static u64 journal_seq_to_flush(struct journal *j) seq_to_flush = max(seq_to_flush, ja->bucket_seq[bucket_to_flush]); } + rcu_read_unlock(); /* Also flush if the pin fifo is more than half full */ seq_to_flush = max_t(s64, seq_to_flush, diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 96873372b516..e97e87ebe312 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -279,7 +279,8 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) { s64 wait = S64_MAX, fragmented_allowed, fragmented; - for_each_rw_member(c, ca) { + rcu_read_lock(); + for_each_rw_member_rcu(c, ca) { struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); struct bch_dev_usage usage; @@ -296,6 +297,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) wait = min(wait, max(0LL, fragmented_allowed - fragmented)); } + rcu_read_unlock(); return wait; } diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 28c6fc25c32c..c71a1ba61525 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -107,6 +107,9 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev * #define for_each_online_member_rcu(_c, _ca) \ for_each_member_device_rcu(_c, _ca, &(_c)->online_devs) +#define for_each_rw_member_rcu(_c, _ca) \ + for_each_member_device_rcu(_c, _ca, &(_c)->rw_devs[BCH_DATA_free]) + static inline void bch2_dev_get(struct bch_dev *ca) { #ifdef CONFIG_BCACHEFS_DEBUG -- 2.51.0