From 8d5ac187da10a8a5599b481588951470a2fb792b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 May 2025 20:05:45 -0400 Subject: [PATCH 01/16] bcachefs: Fix casefold opt via xattr interface Changing the casefold option requires extra checks/work - factor out a helper from bch2_fileattr_set() for the xattr code to use. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 26 +------------------------- fs/bcachefs/inode.c | 36 ++++++++++++++++++++++++++++++++++++ fs/bcachefs/inode.h | 4 +++- fs/bcachefs/xattr.c | 6 ++++++ 4 files changed, 46 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 4b742e62255b..47f1a64c5c8d 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1664,33 +1664,9 @@ static int fssetxattr_inode_update_fn(struct btree_trans *trans, return -EINVAL; if (s->casefold != bch2_inode_casefold(c, bi)) { -#ifdef CONFIG_UNICODE - int ret = 0; - /* Not supported on individual files. */ - if (!S_ISDIR(bi->bi_mode)) - return -EOPNOTSUPP; - - /* - * Make sure the dir is empty, as otherwise we'd need to - * rehash everything and update the dirent keys. - */ - ret = bch2_empty_dir_trans(trans, inode_inum(inode)); - if (ret < 0) - return ret; - - ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding); + int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->casefold); if (ret) return ret; - - bch2_check_set_feature(c, BCH_FEATURE_casefolding); - - bi->bi_casefold = s->casefold + 1; - bi->bi_fields_set |= BIT(Inode_opt_casefold); - -#else - printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n"); - return -EOPNOTSUPP; -#endif } if (s->set_project) { diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index b51d98cf8a80..490b85841de9 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -14,6 +14,7 @@ #include "extent_update.h" #include "fs.h" #include "inode.h" +#include "namei.h" #include "opts.h" #include "str_hash.h" #include "snapshot.h" @@ -1204,6 +1205,41 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i return 0; } +int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *bi, unsigned v) +{ + struct bch_fs *c = trans->c; + +#ifdef CONFIG_UNICODE + int ret = 0; + /* Not supported on individual files. */ + if (!S_ISDIR(bi->bi_mode)) + return -EOPNOTSUPP; + + /* + * Make sure the dir is empty, as otherwise we'd need to + * rehash everything and update the dirent keys. 
+ */ + ret = bch2_empty_dir_trans(trans, inum); + if (ret < 0) + return ret; + + ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding); + if (ret) + return ret; + + bch2_check_set_feature(c, BCH_FEATURE_casefolding); + + bi->bi_casefold = v + 1; + bi->bi_fields_set |= BIT(Inode_opt_casefold); + + return 0; +#else + bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE"); + return -EOPNOTSUPP; +#endif +} + static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) { struct bch_fs *c = trans->c; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index c74af15b14f2..5cfba9e98966 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -292,7 +292,9 @@ static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *i struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); -int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *); +int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *); +int bch2_inode_set_casefold(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, unsigned); #include "rebalance.h" diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 651da52b2cbc..e6be32003f3b 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -473,6 +473,12 @@ static int inode_opt_set_fn(struct btree_trans *trans, { struct inode_opt_set *s = p; + if (s->id == Inode_opt_casefold) { + int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->v); + if (ret) + return ret; + } + if (s->defined) bi->bi_fields_set |= 1U << s->id; else -- 2.51.0 From ecd76c5f108eec3eea0a5b32007cda0dca6c92ac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 May 2025 00:41:07 -0400 Subject: [PATCH 02/16] bcachefs: Fix bch2_dirent_create_snapshot() for casefolding bch2_dirent_create_snapshot(), used in fsck, neglected to create a casefolded dirent. Just move this into dirent_create_key(). Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 33 +++++++++++++++------------------ fs/bcachefs/dirent.h | 2 +- fs/bcachefs/fsck.c | 2 ++ fs/bcachefs/namei.c | 2 -- 4 files changed, 18 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 8a680e52c1ed..a51195088227 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -288,6 +288,7 @@ static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent, } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, + const struct bch_hash_info *hash_info, subvol_inum dir, u8 type, const struct qstr *name, @@ -295,10 +296,19 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, u64 dst) { struct bkey_i_dirent *dirent; + struct qstr _cf_name; if (name->len > BCH_NAME_MAX) return ERR_PTR(-ENAMETOOLONG); + if (hash_info->cf_encoding && !cf_name) { + int ret = bch2_casefold(trans, hash_info, name, &_cf_name); + if (ret) + return ERR_PTR(ret); + + cf_name = &_cf_name; + } + dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? 
cf_name->len : 0, dst); if (IS_ERR(dirent)) return dirent; @@ -324,7 +334,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, dir_inum, type, name, NULL, dst_inum); + dirent = dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -333,8 +343,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, dirent->k.p.snapshot = snapshot; ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, snapshot, &dirent->k_i, - flags|BTREE_UPDATE_internal_snapshot_node); + dir_inum, snapshot, &dirent->k_i, flags); *dir_offset = dirent->k.p.offset; return ret; @@ -344,28 +353,16 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, - u64 *i_size, enum btree_iter_update_trigger_flags flags) { struct bkey_i_dirent *dirent; int ret; - if (hash_info->cf_encoding) { - struct qstr cf_name; - ret = bch2_casefold(trans, hash_info, name, &cf_name); - if (ret) - return ret; - dirent = dirent_create_key(trans, dir, type, name, &cf_name, dst_inum); - } else { - dirent = dirent_create_key(trans, dir, type, name, NULL, dst_inum); - } - + dirent = dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; - *i_size += bkey_bytes(&dirent->k); - ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, dir, &dirent->k_i, flags); *dir_offset = dirent->k.p.offset; @@ -466,7 +463,7 @@ int bch2_dirent_rename(struct btree_trans *trans, *src_offset = dst_iter.pos.offset; /* Create new dst key: */ - new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, + new_dst = dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name, dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_dst); if (ret) @@ -477,7 +474,7 @@ int bch2_dirent_rename(struct btree_trans *trans, /* Create new src key: */ if (mode == BCH_RENAME_EXCHANGE) { - new_src = dirent_create_key(trans, src_dir, 0, src_name, + new_src = dirent_create_key(trans, src_hash, src_dir, 0, src_name, src_hash->cf_encoding ? 
&src_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_src); if (ret) diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 9838a7ba7ed1..d3e7ae669575 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -65,7 +65,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, enum btree_iter_update_trigger_flags); int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, - const struct qstr *, u64, u64 *, u64 *, + const struct qstr *, u64, u64 *, enum btree_iter_update_trigger_flags); static inline unsigned vfs_d_type(unsigned type) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 71d428f376a5..0cf9fd4c17bc 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -306,6 +306,7 @@ create_lostfound: &lostfound_str, lostfound->bi_inum, &lostfound->bi_dir_offset, + BTREE_UPDATE_internal_snapshot_node| STR_HASH_must_create) ?: bch2_inode_write_flags(trans, &lostfound_iter, lostfound, BTREE_UPDATE_internal_snapshot_node); @@ -431,6 +432,7 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * &name, inode->bi_subvol ?: inode->bi_inum, &inode->bi_dir_offset, + BTREE_UPDATE_internal_snapshot_node| STR_HASH_must_create); if (ret) { bch_err_msg(c, ret, "error creating dirent"); diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index 52c58c6d53d2..9136a9097789 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -158,7 +158,6 @@ int bch2_create_trans(struct btree_trans *trans, name, dir_target, &dir_offset, - &dir_u->bi_size, STR_HASH_must_create|BTREE_ITER_with_updates) ?: bch2_inode_write(trans, &dir_iter, dir_u); if (ret) @@ -225,7 +224,6 @@ int bch2_link_trans(struct btree_trans *trans, mode_to_type(inode_u->bi_mode), name, inum.inum, &dir_offset, - &dir_u->bi_size, STR_HASH_must_create); if (ret) goto err; -- 2.51.0 From 010c89468134d1991b87122379f86feae23d512f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 May 2025 00:38:04 -0400 Subject: [PATCH 03/16] bcachefs: Check for casefolded dirents in non casefolded dirs Check for mismatches between casefold dirents and casefold directories. A mismatch will cause lookups to fail, as we'll be doing the lookup with the casefolded name, which won't match the non-casefolded dirent, and vice versa. Reported-by: Christopher Snowhill Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 35 ++++++++++++++++++++++++++++++++++ fs/bcachefs/sb-errors_format.h | 8 +++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0cf9fd4c17bc..aaf187085276 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2190,6 +2190,41 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + /* check casefold */ + if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding, + trans, dirent_casefold_mismatch, + "dirent casefold does not match dir casefold\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { + struct qstr name = bch2_dirent_get_name(d); + u32 subvol = d.v->d_type == DT_SUBVOL + ? d.v->d_parent_subvol + : 0; + u64 target = d.v->d_type == DT_SUBVOL + ? 
d.v->d_child_subvol + : d.v->d_inum; + u64 dir_offset; + + ret = bch2_hash_delete_at(trans, + bch2_dirent_hash_desc, hash_info, iter, + BTREE_UPDATE_internal_snapshot_node) ?: + bch2_dirent_create_snapshot(trans, subvol, + d.k->p.inode, d.k->p.snapshot, + hash_info, + d.v->d_type, + &name, + target, + &dir_offset, + BTREE_ITER_with_updates| + BTREE_UPDATE_internal_snapshot_node| + STR_HASH_must_create) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + + /* might need another check_dirents pass */ + goto out; + } + if (d.v->d_type == DT_SUBVOL) { ret = check_dirent_to_subvol(trans, iter, d); if (ret) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 3b69a924086f..4036a20c6adc 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -209,6 +209,7 @@ enum bch_fsck_flags { x(subvol_to_missing_root, 188, 0) \ x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \ x(bkey_in_missing_snapshot, 190, 0) \ + x(bkey_in_deleted_snapshot, 315, 0) \ x(inode_pos_inode_nonzero, 191, 0) \ x(inode_pos_blockdev_range, 192, 0) \ x(inode_alloc_cursor_inode_bad, 301, 0) \ @@ -216,6 +217,7 @@ enum bch_fsck_flags { x(inode_str_hash_invalid, 194, 0) \ x(inode_v3_fields_start_bad, 195, 0) \ x(inode_snapshot_mismatch, 196, 0) \ + x(snapshot_key_missing_inode_snapshot, 314, 0) \ x(inode_unlinked_but_clean, 197, 0) \ x(inode_unlinked_but_nlink_nonzero, 198, 0) \ x(inode_unlinked_and_not_open, 281, 0) \ @@ -237,6 +239,8 @@ enum bch_fsck_flags { x(inode_unreachable, 210, FSCK_AUTOFIX) \ x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \ x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \ + x(inode_has_case_insensitive_not_set, 316, FSCK_AUTOFIX) \ + x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \ x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \ x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ @@ -262,6 +266,7 @@ enum bch_fsck_flags { x(dirent_to_overwritten_inode, 302, 0) \ x(dirent_to_missing_subvol, 230, 0) \ x(dirent_to_itself, 231, 0) \ + x(dirent_casefold_mismatch, 318, FSCK_AUTOFIX) \ x(quota_type_invalid, 232, 0) \ x(xattr_val_size_too_small, 233, 0) \ x(xattr_val_size_too_big, 234, 0) \ @@ -301,6 +306,7 @@ enum bch_fsck_flags { x(btree_ptr_v2_written_0, 268, 0) \ x(subvol_snapshot_bad, 269, 0) \ x(subvol_inode_bad, 270, 0) \ + x(subvol_missing, 308, FSCK_AUTOFIX) \ x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \ x(accounting_mismatch, 272, FSCK_AUTOFIX) \ x(accounting_replicas_not_marked, 273, 0) \ @@ -322,7 +328,7 @@ enum bch_fsck_flags { x(dirent_stray_data_after_cf_name, 305, 0) \ x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ - x(MAX, 314, 0) + x(MAX, 319, 0) enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n,

-- 
2.51.0

From 0e5f1f3f8fad0d195099e4a8e7c43ffe71676047 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Sat, 29 Mar 2025 17:59:30 -0400
Subject: [PATCH 04/16] bcachefs: bch2_subvolume_wait_for_pagecache_and_delete()
 cleanup

Signed-off-by: Kent Overstreet
---
 fs/bcachefs/subvolume.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index d0209f7658bb..239ea783698c 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -479,13 +479,11 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor
 {
 	struct bch_fs *c = container_of(work, struct bch_fs,
 				snapshot_wait_for_pagecache_and_delete_work);
-	snapshot_id_list s;
-	u32 *id;
 	int ret = 0;

 	while (!ret) {
 		mutex_lock(&c->snapshots_unlinked_lock);
-		s = c->snapshots_unlinked;
+		snapshot_id_list s = c->snapshots_unlinked;
 		darray_init(&c->snapshots_unlinked);
 		mutex_unlock(&c->snapshots_unlinked_lock);

@@ -494,7 +492,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor

 		bch2_evict_subvolume_inodes(c, &s);

-		for (id = s.data; id < s.data + s.nr; id++) {
+		darray_for_each(s, id) {
 			ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
 			bch_err_msg(c, ret, "deleting subvolume %u", *id);
 			if (ret)
-- 
2.51.0

From 6659ba3b18f71282c7c54f3bffcfecfac73f202f Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Mon, 10 Mar 2025 13:33:41 -0400
Subject: [PATCH 05/16] bcachefs: Be precise about bch_io_failures

If the extent we're reading from changes, due to being overwritten or
moved (possibly partially), we need to reset bch_io_failures so that we
don't accidentally mark a new extent as poisoned prematurely.

This means we have to separately track (in the retry path) the extent we
previously read from.
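In sketch form, the new retry-path check in __bch2_read() looks like this
(prev_read carries the key from the previous attempt; this mirrors the
hunk below):

	if (unlikely(flags & BCH_READ_in_retry)) {
		/* extent changed since the failed read: old failures don't apply */
		if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k)))
			failed->nr = 0;
		bch2_bkey_buf_copy(prev_read, c, sk.k);
	}

bkey_and_val_eq() also gains a size comparison, so a partially
overwritten extent is not mistaken for the extent we previously read.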
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 1 + fs/bcachefs/io_read.c | 51 ++++++++++++++++++++++++++++++++++++++++--- fs/bcachefs/io_read.h | 5 +++-- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 054e2d5e8448..082632905649 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -191,6 +191,7 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r) static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) { return bpos_eq(l.k->p, r.k->p) && + l.k->size == r.k->size && bkey_bytes(l.k) == bkey_bytes(r.k) && !memcmp(l.v, r.v, bkey_val_bytes(l.k)); } diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index def4a26a3b45..3705b606f675 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -296,6 +296,13 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, bool *read_full, struct bch_io_failures *failed) { + /* + * We're in the retry path, but we don't know what to repair yet, and we + * don't want to do a promote here: + */ + if (failed && !failed->nr) + return NULL; + struct bch_fs *c = trans->c; /* * if failed != NULL we're not actually doing a promote, we're @@ -430,6 +437,28 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) bio_endio(&rbio->bio); } +static void get_rbio_extent(struct btree_trans *trans, + struct bch_read_bio *rbio, + struct bkey_buf *sk) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = lockrestart_do(trans, + bkey_err(k = bch2_bkey_get_iter(trans, &iter, + rbio->data_btree, rbio->data_pos, 0))); + if (ret) + return; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) + if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) { + bch2_bkey_buf_reassemble(sk, trans->c, k); + break; + } + + bch2_trans_iter_exit(trans, &iter); +} + static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, @@ -491,11 +520,18 @@ static void bch2_rbio_retry(struct work_struct *work) struct btree_trans *trans = bch2_trans_get(c); + struct bkey_buf sk; + bch2_bkey_buf_init(&sk); + bkey_init(&sk.k->k); + trace_io_read_retry(&rbio->bio); this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], bvec_iter_sectors(rbio->bvec_iter)); - if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) + get_rbio_extent(trans, rbio, &sk); + + if (!bkey_deleted(&sk.k->k) && + bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) bch2_mark_io_failure(&failed, &rbio->pick, rbio->ret == -BCH_ERR_data_read_retry_csum_err); @@ -516,7 +552,7 @@ static void bch2_rbio_retry(struct work_struct *work) int ret = rbio->data_update ? 
bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) - : __bch2_read(trans, rbio, iter, inum, &failed, flags); + : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags); if (ret) { rbio->ret = ret; @@ -539,6 +575,7 @@ static void bch2_rbio_retry(struct work_struct *work) } bch2_rbio_done(rbio); + bch2_bkey_buf_exit(&sk, c); bch2_trans_put(trans); } @@ -1265,7 +1302,9 @@ out_read_done: int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, subvol_inum inum, - struct bch_io_failures *failed, unsigned flags) + struct bch_io_failures *failed, + struct bkey_buf *prev_read, + unsigned flags) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -1313,6 +1352,12 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, k = bkey_i_to_s_c(sk.k); + if (unlikely(flags & BCH_READ_in_retry)) { + if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k))) + failed->nr = 0; + bch2_bkey_buf_copy(prev_read, c, sk.k); + } + /* * With indirect extents, the amount of data to read is the min * of the original extent and the indirect extent: diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index c78025d863e0..1a85b092fd1d 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -144,7 +144,8 @@ static inline void bch2_read_extent(struct btree_trans *trans, } int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, struct bch_io_failures *, unsigned flags); + subvol_inum, + struct bch_io_failures *, struct bkey_buf *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum) @@ -154,7 +155,7 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, rbio->subvol = inum.subvol; bch2_trans_run(c, - __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, + __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL, BCH_READ_retry_if_stale| BCH_READ_may_promote| BCH_READ_user_mapped)); -- 2.51.0 From 760be1ad5e71b3a23644849cbf3c2245c4dc83f3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Mar 2025 14:03:25 -0400 Subject: [PATCH 06/16] bcachefs: Poison extents that can't be read due to checksum errors Copygc needs to be able to move extents that have bitrotted. We don't want to delete them - in the future we'll have an API for "read me the data even if there's checksum errors", and in general we don't want to delete anything unless the user asks us to. That will require writing it with a new checksum, which means we can't forget that there was a checksum error so we return the correct error to userspace. Rebalance also wants to skip bad extents; we can now use the poison flag for that. This is currently disabled by default, as we want read fua support so that we can distinguish between transient and permanent errors from the device. 
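(Read FUA would let a retry bypass the device's cache, so a repeated
checksum failure could be trusted to be a permanent media error rather
than a transient transfer error.)

A usage sketch - with a modular build, the knob appears in sysfs and can
be flipped at runtime:

  echo 1 > /sys/module/bcachefs/parameters/poison_extents_on_checksum_error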
It may be enabled with the module parameter: poison_extents_on_checksum_error Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 71 ++++++++++++++++++++++++++++++-- fs/bcachefs/sb-counters_format.h | 1 + fs/bcachefs/trace.h | 5 +++ 3 files changed, 73 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 3705b606f675..3f111f918345 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -34,6 +34,12 @@ module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); MODULE_PARM_DESC(read_corrupt_ratio, ""); #endif +static bool bch2_poison_extents_on_checksum_error; +module_param_named(poison_extents_on_checksum_error, + bch2_poison_extents_on_checksum_error, bool, 0644); +MODULE_PARM_DESC(poison_extents_on_checksum_error, + "Extents with checksum errors are marked as poisoned - unsafe without read fua support"); + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static bool bch2_target_congested(struct bch_fs *c, u16 target) @@ -459,6 +465,52 @@ static void get_rbio_extent(struct btree_trans *trans, bch2_trans_iter_exit(trans, &iter); } +static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio, + enum btree_id btree, struct bkey_s_c read_k) +{ + if (!bch2_poison_extents_on_checksum_error) + return 0; + + struct bch_fs *c = trans->c; + + struct data_update *u = rbio_data_update(rbio); + if (u) + read_k = bkey_i_to_s_c(u->k.k); + + u64 flags = bch2_bkey_extent_flags(read_k); + if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k), + BTREE_ITER_intent); + int ret = bkey_err(k); + if (ret) + return ret; + + if (!bkey_and_val_eq(k, read_k)) + goto out; + + struct bkey_i *new = bch2_trans_kmalloc(trans, + bkey_bytes(k.k) + sizeof(struct bch_extent_flags)); + ret = PTR_ERR_OR_ZERO(new) ?: + (bkey_reassemble(new, k), 0) ?: + bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?: + bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + + /* + * Propagate key change back to data update path, in particular so it + * knows the extent has been poisoned and it's safe to change the + * checksum + */ + if (u && !ret) + bch2_bkey_buf_copy(&u->k, c, new); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, @@ -492,7 +544,8 @@ retry: err: bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, BCH_ERR_data_read_retry)) goto retry; if (ret) { @@ -1008,6 +1061,16 @@ retry_pick: goto hole; if (unlikely(ret < 0)) { + if (ret == -BCH_ERR_data_read_csum_err) { + int ret2 = maybe_poison_extent(trans, orig, data_btree, k); + if (ret2) { + ret = ret2; + goto err; + } + + trace_and_count(c, io_read_fail_and_poison, &orig->bio); + } + struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, orig, read_pos); prt_printf(&buf, "%s\n ", bch2_err_str(ret)); @@ -1310,6 +1373,7 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; + enum btree_id data_btree; int ret; EBUG_ON(rbio->data_update); @@ -1320,7 +1384,7 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio 
*rbio, BTREE_ITER_slots); while (1) { - enum btree_id data_btree = BTREE_ID_extents; + data_btree = BTREE_ID_extents; bch2_trans_begin(trans); @@ -1392,8 +1456,6 @@ err: break; } - bch2_trans_iter_exit(trans, &iter); - if (unlikely(ret)) { if (ret != -BCH_ERR_extent_poisoned) { struct printbuf buf = PRINTBUF; @@ -1412,6 +1474,7 @@ err: bch2_rbio_done(rbio); } + bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&sk, c); return ret; } diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index fa27ec59a647..5c4e5de79d81 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -16,6 +16,7 @@ enum counters_flags { x(io_read_split, 33, TYPE_COUNTER) \ x(io_read_reuse_race, 34, TYPE_COUNTER) \ x(io_read_retry, 32, TYPE_COUNTER) \ + x(io_read_fail_and_poison, 82, TYPE_COUNTER) \ x(io_write, 1, TYPE_SECTORS) \ x(io_move, 2, TYPE_SECTORS) \ x(io_move_read, 35, TYPE_SECTORS) \ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 519d00d62ae7..8c07189a080a 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -339,6 +339,11 @@ DEFINE_EVENT(bio, io_read_reuse_race, TP_ARGS(bio) ); +DEFINE_EVENT(bio, io_read_fail_and_poison, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + /* ec.c */ TRACE_EVENT(stripe_create, -- 2.51.0 From cb8336ca42e493a76b3cc05b76a51a2eed26cdaa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Mar 2025 10:02:40 -0400 Subject: [PATCH 07/16] bcachefs: Data move can read from poisoned extents Now, if an extent is poisoned we can move it even if there was a checksum error. We'll have to give it a new checksum, but the poison bit means that userspace will still see the appropriate error when they try to read it. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 6 +----- fs/bcachefs/io_read.c | 4 ++++ fs/bcachefs/move.c | 26 ++++++++++++++++++++------ 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index e597fb9c9823..a369f978ffe6 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -136,12 +136,8 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (k.k->type == KEY_TYPE_error) return -BCH_ERR_key_type_error; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return -BCH_ERR_extent_poisoned; - rcu_read_lock(); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 pick_latency; diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 3f111f918345..751a9679d7e5 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -1053,6 +1053,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, bvec_iter_sectors(iter)); goto out_read_done; } + + if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) && + !orig->data_update) + return -BCH_ERR_extent_poisoned; retry_pick: ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index dfdbb9259985..fe2fa665150b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -126,26 +126,40 @@ static void move_write_done(struct bch_write_op *op) static void move_write(struct moving_io *io) { + struct bch_fs *c = io->write.op.c; struct moving_context *ctxt = io->write.ctxt; + struct bch_read_bio *rbio = &io->write.rbio; if (ctxt->stats) { - if (io->write.rbio.bio.bi_status) + if (rbio->bio.bi_status) 
			atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
				     &ctxt->stats->sectors_error_uncorrected);
-		else if (io->write.rbio.saw_error)
+		else if (rbio->saw_error)
 			atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
				     &ctxt->stats->sectors_error_corrected);
 	}

-	if (unlikely(io->write.rbio.ret ||
-		     io->write.rbio.bio.bi_status ||
-		     io->write.data_opts.scrub)) {
+	/*
+	 * If the extent has been bitrotted, we're going to have to give it a
+	 * new checksum in order to move it - but the poison bit will ensure
+	 * that userspace still gets the appropriate error.
+	 */
+	if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err &&
+		     (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) {
+		struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+		struct nonce nonce = extent_nonce(rbio->version, crc);
+
+		rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type,
							nonce, &rbio->bio);
+		rbio->ret = 0;
+	}
+
+	if (unlikely(rbio->ret || io->write.data_opts.scrub)) {
 		move_free(io);
 		return;
 	}

 	if (trace_io_move_write_enabled()) {
-		struct bch_fs *c = io->write.op.c;
 		struct printbuf buf = PRINTBUF;

 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
-- 
2.51.0

From 8c087d2ddf5d4d8c07bec96531a5f5629066cd00 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Tue, 11 Mar 2025 09:46:06 -0400
Subject: [PATCH 08/16] bcachefs: Rebalance now skips poisoned extents

Let's not move poisoned extents unnecessarily, since we can't guard
against introducing more bitrot.

Signed-off-by: Kent Overstreet
---
 fs/bcachefs/rebalance.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 623273556aa9..3c45500c1a28 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -95,6 +95,9 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
 {
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

+	if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
+		return 0;
+
 	return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
 		bch2_bkey_ptrs_need_move(c, opts, ptrs);
 }
@@ -107,6 +110,9 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
 	if (!opts)
 		return 0;

+	if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
+		return 0;
+
 	const union bch_extent_entry *entry;
 	struct extent_ptr_decoded p;
 	u64 sectors = 0;
-- 
2.51.0

From 4e2caf82ce958d1fae96a6d6ba23ea9e80c597b4 Mon Sep 17 00:00:00 2001
From: Roxana Nicolescu
Date: Thu, 27 Mar 2025 14:50:05 +0000
Subject: [PATCH 09/16] bcachefs: replace strncpy() with memcpy_and_pad in
 journal_transaction_name

strncpy() is now deprecated. The destination buffer is not required to be
NULL-terminated, but we also want to zero out the rest of the buffer, as
is already done in other places.

Link: https://github.com/KSPP/linux/issues/90
Signed-off-by: Roxana Nicolescu
Signed-off-by: Kent Overstreet
---
 fs/bcachefs/btree_trans_commit.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 7d7e52ddde02..4297d8b5eddd 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -20,6 +20,7 @@
 #include "snapshot.h"

 #include
+#include

 static const char * const trans_commit_flags_strs[] = {
 #define x(n, ...)
#n,
@@ -366,7 +367,8 @@ static noinline void journal_transaction_name(struct btree_trans *trans)
 	struct jset_entry_log *l =
 		container_of(entry, struct jset_entry_log, entry);

-	strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
+	memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64),
+		       trans->fn, strlen(trans->fn), 0);
 }

 static inline int btree_key_can_insert(struct btree_trans *trans,
-- 
2.51.0

From caa6baa45f809bd932362030b16a8bb3e1dae083 Mon Sep 17 00:00:00 2001
From: Roxana Nicolescu
Date: Thu, 27 Mar 2025 14:50:09 +0000
Subject: [PATCH 10/16] bcachefs: replace memcpy with memcpy_and_pad for
 jset_entry_log->d buffer

Previously this was achieved by zeroing out the source buffer and then
copying the bytes into the destination buffer. This can also be done with
memcpy_and_pad, which zero-pads the destination buffer when it is bigger
than the source buffer. This is already used in the same way in
journal_transaction_name().

Moreover, zeroing the source buffer was done twice: first in
__bch2_fs_log_msg() and then in bch2_trans_log_msg(). This method may
also require allocating some extra memory for the source buffer.

In conclusion, using memcpy_and_pad is better even though the result is
the same: it brings uniformity with what's already used in
journal_transaction_name(), avoids code duplication, and avoids
reallocating extra memory.

Signed-off-by: Roxana Nicolescu
Signed-off-by: Kent Overstreet
---
 fs/bcachefs/btree_update.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 1e6b7836cc01..2bffd5121c31 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -14,6 +14,8 @@
 #include "snapshot.h"
 #include "trace.h"

+#include
+
 static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
					  const struct btree_insert_entry *r)
 {
@@ -829,7 +831,6 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
 int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf)
 {
 	unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64));
-	prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos);

 	int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
 	if (ret)
@@ -842,7 +843,7 @@ int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf)
 	struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry);

 	journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s);
-	memcpy(l->d, buf->buf, buf->pos);
+	memcpy_and_pad(l->d, u64s * sizeof(u64), buf->buf, buf->pos, 0);

 	return 0;
 }
@@ -868,7 +869,6 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
 	prt_vprintf(&buf, fmt, args);

 	unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
-	prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos);

 	int ret = buf.allocation_failure ?
-BCH_ERR_ENOMEM_trans_log_msg : 0; if (ret) @@ -881,7 +881,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries); journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy(l->d, buf.buf, buf.pos); + memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0); c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, -- 2.51.0 From d02755b8c5f38e737037e0ff0820eb66ab63c4f5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Apr 2025 14:29:31 -0400 Subject: [PATCH 11/16] bcachefs: trace bch2_trans_kmalloc() We're occasionally seeing the WARN_ON() for bump allocator usage exceeding BTREE_TRANS_MEM_MAX; add some tracing so we can see what's going on. Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 4 +++ fs/bcachefs/bcachefs.h | 3 ++ fs/bcachefs/btree_iter.c | 51 ++++++++++++++++++++++++++++++++-- fs/bcachefs/btree_iter.h | 56 +++++++++++++++++++++++++++++--------- fs/bcachefs/btree_types.h | 9 ++++++ fs/bcachefs/btree_update.h | 4 +-- fs/bcachefs/debug.c | 6 ++++ 7 files changed, 115 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 07709b0d7688..a14e4a60b187 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -103,6 +103,10 @@ config BCACHEFS_PATH_TRACEPOINTS Enable extra tracepoints for debugging btree_path operations; we don't normally want these enabled because they happen at very high rates. +config BCACHEFS_TRANS_KMALLOC_TRACE + bool "Trace bch2_trans_kmalloc() calls" + depends on BCACHEFS_FS + config MEAN_AND_VARIANCE_UNIT_TEST tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 75f7408da173..24eed2b3be4d 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -650,6 +650,9 @@ struct btree_transaction_stats { unsigned nr_max_paths; unsigned journal_entries_size; unsigned max_mem; +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_trans_kmalloc_trace trans_kmalloc_trace; +#endif char *max_paths_text; }; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ac5f2046550d..cfd6363dfc39 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3095,7 +3095,19 @@ void bch2_trans_copy_iter(struct btree_trans *trans, dst->key_cache_path = 0; } -void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE +void bch2_trans_kmalloc_trace_to_text(struct printbuf *out, + darray_trans_kmalloc_trace *trace) +{ + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 60); + + darray_for_each(*trace, i) + prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes); +} +#endif + +void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip) { struct bch_fs *c = trans->c; unsigned new_top = trans->mem_top + size; @@ -3105,14 +3117,35 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) void *new_mem; void *p; - WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) { +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + struct printbuf buf = PRINTBUF; + bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); +#endif + } ret = trans_maybe_inject_restart(trans, _RET_IP_); if (ret) return ERR_PTR(ret); struct 
btree_transaction_stats *s = btree_trans_stats(trans); - s->max_mem = max(s->max_mem, new_bytes); + if (new_bytes > s->max_mem) { + mutex_lock(&s->lock); +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr); + s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size, + trans->trans_kmalloc_trace.nr); + + memcpy(s->trans_kmalloc_trace.data, + trans->trans_kmalloc_trace.data, + sizeof(s->trans_kmalloc_trace.data[0]) * + s->trans_kmalloc_trace.nr); +#endif + s->max_mem = new_bytes; + mutex_unlock(&s->lock); + } if (trans->used_mempool) { if (trans->mem_bytes >= new_bytes) @@ -3172,6 +3205,8 @@ out_new_mem: BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); } out_change_top: + bch2_trans_kmalloc_trace(trans, size, ip); + p = trans->mem + trans->mem_top; trans->mem_top += size; memset(p, 0, size); @@ -3285,6 +3320,10 @@ u32 bch2_trans_begin(struct btree_trans *trans) } #endif +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + trans->trans_kmalloc_trace.nr = 0; +#endif + trans_set_locked(trans, false); if (trans->restarted) { @@ -3454,6 +3493,9 @@ void bch2_trans_put(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG darray_exit(&trans->last_restarted_trace); #endif +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_exit(&trans->trans_kmalloc_trace); +#endif unsigned long *paths_allocated = trans->paths_allocated; trans->paths_allocated = NULL; @@ -3608,6 +3650,9 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_exit(&s->trans_kmalloc_trace); +#endif kfree(s->max_paths_text); bch2_time_stats_exit(&s->lock_hold_times); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 9d2cccf5d21a..78a805a89860 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -543,43 +543,73 @@ void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btre void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *); -void *__bch2_trans_kmalloc(struct btree_trans *, size_t); +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE +void bch2_trans_kmalloc_trace_to_text(struct printbuf *, + darray_trans_kmalloc_trace *); +#endif -/** - * bch2_trans_kmalloc - allocate memory for use by the current transaction - * - * Must be called after bch2_trans_begin, which on second and further calls - * frees all memory allocated in this transaction - */ -static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long); + +static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size, + unsigned long ip) +{ +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_push(&trans->trans_kmalloc_trace, + ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size })); +#endif +} + +static __always_inline void *bch2_trans_kmalloc_nomemzero_ip(struct btree_trans *trans, size_t size, + unsigned long ip) { size = roundup(size, 8); + bch2_trans_kmalloc_trace(trans, size, ip); + if (likely(trans->mem_top + size <= trans->mem_bytes)) { void *p = trans->mem + trans->mem_top; trans->mem_top += size; - memset(p, 0, size); return p; } else { - return __bch2_trans_kmalloc(trans, size); + return __bch2_trans_kmalloc(trans, size, ip); } } -static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) +static __always_inline void 
*bch2_trans_kmalloc_ip(struct btree_trans *trans, size_t size, + unsigned long ip) { - size = round_up(size, 8); + size = roundup(size, 8); + + bch2_trans_kmalloc_trace(trans, size, ip); if (likely(trans->mem_top + size <= trans->mem_bytes)) { void *p = trans->mem + trans->mem_top; trans->mem_top += size; + memset(p, 0, size); return p; } else { - return __bch2_trans_kmalloc(trans, size); + return __bch2_trans_kmalloc(trans, size, ip); } } +/** + * bch2_trans_kmalloc - allocate memory for use by the current transaction + * + * Must be called after bch2_trans_begin, which on second and further calls + * frees all memory allocated in this transaction + */ +static __always_inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +{ + return bch2_trans_kmalloc_ip(trans, size, _THIS_IP_); +} + +static __always_inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) +{ + return bch2_trans_kmalloc_nomemzero_ip(trans, size, _THIS_IP_); +} + static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 023c472dc9ee..81175c1344d2 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -477,6 +477,12 @@ struct btree_trans_paths { struct btree_path paths[]; }; +struct trans_kmalloc_trace { + unsigned long ip; + size_t bytes; +}; +typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace; + struct btree_trans { struct bch_fs *c; @@ -488,6 +494,9 @@ struct btree_trans { void *mem; unsigned mem_top; unsigned mem_bytes; +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_trans_kmalloc_trace trans_kmalloc_trace; +#endif btree_path_idx_t nr_sorted; btree_path_idx_t nr_paths; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 568e56c91190..e674419c299e 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -222,7 +222,7 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) trans->extra_disk_res = 0; } -static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, +static __always_inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, unsigned type, unsigned min_bytes) { unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k)); @@ -245,7 +245,7 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t return mut; } -static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) +static __always_inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) { return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0); } diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 5a8bc7013512..2c52a2c6502b 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -770,6 +770,12 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, mutex_lock(&s->lock); prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + printbuf_indent_add(&i->buf, 2); + bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace); + printbuf_indent_sub(&i->buf, 2); +#endif + prt_printf(&i->buf, "Transaction duration:\n"); printbuf_indent_add(&i->buf, 2); -- 2.51.0 From ad63f9f1e9a10792847bc46f0226323f238a171d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 20:42:42 -0400 
Subject: [PATCH 12/16] bcachefs: struct alloc_request Add a struct for common state for satisfying an on disk allocation, instead of passing the same long list of items to every function. This will help with stack usage, performance, and perhaps enable some code cleanups. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 292 ++++++++++++--------------------- fs/bcachefs/alloc_foreground.h | 25 ++- fs/bcachefs/ec.c | 46 +++--- fs/bcachefs/io_write.h | 28 ---- fs/bcachefs/io_write_types.h | 28 ++++ 5 files changed, 179 insertions(+), 240 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 7ec022e9361a..93c91b5706fb 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -693,24 +693,20 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, } static int add_new_bucket(struct bch_fs *c, - struct open_buckets *ptrs, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - struct open_bucket *ob) + struct alloc_request *req, + struct open_bucket *ob) { unsigned durability = ob_dev(c, ob)->mi.durability; - BUG_ON(*nr_effective >= nr_replicas); + BUG_ON(req->nr_effective >= req->nr_replicas); - __clear_bit(ob->dev, devs_may_alloc->d); - *nr_effective += durability; - *have_cache |= !durability; + __clear_bit(ob->dev, req->devs_may_alloc.d); + req->nr_effective += durability; + req->have_cache |= !durability; - ob_push(c, ptrs, ob); + ob_push(c, &req->ptrs, ob); - if (*nr_effective >= nr_replicas) + if (req->nr_effective >= req->nr_replicas) return 1; if (ob->ec) return 1; @@ -718,36 +714,32 @@ static int add_new_bucket(struct bch_fs *c, } int bch2_bucket_alloc_set_trans(struct btree_trans *trans, - struct open_buckets *ptrs, - struct dev_stripe_state *stripe, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_write_flags flags, - enum bch_data_type data_type, - enum bch_watermark watermark, - struct closure *cl) + struct alloc_request *req, + struct dev_stripe_state *stripe, + enum bch_data_type data_type, + struct closure *cl) { struct bch_fs *c = trans->c; int ret = -BCH_ERR_insufficient_devices; - BUG_ON(*nr_effective >= nr_replicas); + BUG_ON(req->nr_effective >= req->nr_replicas); - struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); + struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc); darray_for_each(devs_sorted, i) { struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i); if (!ca) continue; - if (!ca->mi.durability && *have_cache) { + if (!ca->mi.durability && req->have_cache) { bch2_dev_put(ca); continue; } struct bch_dev_usage usage; - struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, - cl, flags & BCH_WRITE_alloc_nowait, &usage); + struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, + req->watermark, data_type, + cl, req->flags & BCH_WRITE_alloc_nowait, + &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); bch2_dev_put(ca); @@ -759,9 +751,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - if (add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob)) { + if (add_new_bucket(c, req, ob)) { ret = 0; break; } @@ -779,34 +769,27 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, */ static int bucket_alloc_from_stripe(struct btree_trans *trans, - struct open_buckets *ptrs, - struct 
write_point *wp, - struct bch_devs_mask *devs_may_alloc, - u16 target, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *cl) + struct alloc_request *req, + struct closure *cl) { struct bch_fs *c = trans->c; int ret = 0; - if (nr_replicas < 2) + if (req->nr_replicas < 2) return 0; - if (ec_open_bucket(c, ptrs)) + if (ec_open_bucket(c, &req->ptrs)) return 0; struct ec_stripe_head *h = - bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); + bch2_ec_stripe_head_get(trans, req->target, 0, req->nr_replicas - 1, req->watermark, cl); if (IS_ERR(h)) return PTR_ERR(h); if (!h) return 0; - struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + struct dev_alloc_list devs_sorted = + bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc); darray_for_each(devs_sorted, i) for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { if (!h->s->blocks[ec_idx]) @@ -818,9 +801,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, ob->ec = h->s; ec_stripe_new_get(h->s, STRIPE_REF_io); - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + ret = add_new_bucket(c, req, ob); goto out; } } @@ -832,65 +813,48 @@ out: /* Sector allocator */ static bool want_bucket(struct bch_fs *c, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - bool *have_cache, bool ec, + struct alloc_request *req, struct open_bucket *ob) { struct bch_dev *ca = ob_dev(c, ob); - if (!test_bit(ob->dev, devs_may_alloc->d)) + if (!test_bit(ob->dev, req->devs_may_alloc.d)) return false; - if (ob->data_type != wp->data_type) + if (ob->data_type != req->wp->data_type) return false; if (!ca->mi.durability && - (wp->data_type == BCH_DATA_btree || ec || *have_cache)) + (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache)) return false; - if (ec != (ob->ec != NULL)) + if (req->ec != (ob->ec != NULL)) return false; return true; } static int bucket_alloc_set_writepoint(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - bool ec) + struct alloc_request *req) { struct open_buckets ptrs_skip = { .nr = 0 }; struct open_bucket *ob; unsigned i; int ret = 0; - open_bucket_for_each(c, &wp->ptrs, ob, i) { - if (!ret && want_bucket(c, wp, devs_may_alloc, - have_cache, ec, ob)) - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + open_bucket_for_each(c, &req->wp->ptrs, ob, i) { + if (!ret && want_bucket(c, req, ob)) + ret = add_new_bucket(c, req, ob); else ob_push(c, &ptrs_skip, ob); } - wp->ptrs = ptrs_skip; + req->wp->ptrs = ptrs_skip; return ret; } static int bucket_alloc_set_partial(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, bool ec, - enum bch_watermark watermark) + struct alloc_request *req) { int i, ret = 0; @@ -905,13 +869,13 @@ static int bucket_alloc_set_partial(struct bch_fs *c, for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; - if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { + if (want_bucket(c, req, ob)) { struct bch_dev *ca = ob_dev(c, ob); struct bch_dev_usage usage; u64 avail; 
bch2_dev_usage_read_fast(ca, &usage); - avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets; + avail = dev_buckets_free(ca, usage, req->watermark) + ca->nr_partial_buckets; if (!avail) continue; @@ -924,9 +888,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; rcu_read_unlock(); - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + ret = add_new_bucket(c, req, ob); if (ret) break; } @@ -937,61 +899,42 @@ unlock: } static int __open_bucket_add_buckets(struct btree_trans *trans, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_list *devs_have, - u16 target, - bool erasure_code, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *_cl) + struct alloc_request *req, + struct closure *_cl) { struct bch_fs *c = trans->c; - struct bch_devs_mask devs; struct open_bucket *ob; struct closure *cl = NULL; unsigned i; int ret; - devs = target_rw_devs(c, wp->data_type, target); + req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target); /* Don't allocate from devices we already have pointers to: */ - darray_for_each(*devs_have, i) - __clear_bit(*i, devs.d); + darray_for_each(*req->devs_have, i) + __clear_bit(*i, req->devs_may_alloc.d); - open_bucket_for_each(c, ptrs, ob, i) - __clear_bit(ob->dev, devs.d); + open_bucket_for_each(c, &req->ptrs, ob, i) + __clear_bit(ob->dev, req->devs_may_alloc.d); - ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, erasure_code); + ret = bucket_alloc_set_writepoint(c, req); if (ret) return ret; - ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, erasure_code, watermark); + ret = bucket_alloc_set_partial(c, req); if (ret) return ret; - if (erasure_code) { - ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs, - target, - nr_replicas, nr_effective, - have_cache, - watermark, flags, _cl); + if (req->ec) { + ret = bucket_alloc_from_stripe(trans, req, _cl); } else { retry_blocking: /* * Try nonblocking first, so that if one device is full we'll try from * other devices: */ - ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, - nr_replicas, nr_effective, have_cache, - flags, wp->data_type, watermark, cl); + ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, + req->wp->data_type, cl); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && @@ -1005,38 +948,27 @@ retry_blocking: } static int open_bucket_add_buckets(struct btree_trans *trans, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_list *devs_have, - u16 target, - unsigned erasure_code, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *cl) + struct alloc_request *req, + struct closure *cl) { int ret; - if (erasure_code && !ec_open_bucket(trans->c, ptrs)) { - ret = __open_bucket_add_buckets(trans, ptrs, wp, - devs_have, target, erasure_code, - nr_replicas, nr_effective, have_cache, - watermark, flags, cl); + if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) { + ret = __open_bucket_add_buckets(trans, req, cl); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, BCH_ERR_operation_blocked) || bch2_err_matches(ret, BCH_ERR_freelist_empty) 
|| bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) return ret; - if (*nr_effective >= nr_replicas) + if (req->nr_effective >= req->nr_replicas) return 0; } - ret = __open_bucket_add_buckets(trans, ptrs, wp, - devs_have, target, false, - nr_replicas, nr_effective, have_cache, - watermark, flags, cl); + bool ec = false; + swap(ec, req->ec); + ret = __open_bucket_add_buckets(trans, req, cl); + swap(ec, req->ec); + return ret < 0 ? ret : 0; } @@ -1327,51 +1259,49 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, struct write_point **wp_ret) { struct bch_fs *c = trans->c; - struct write_point *wp; struct open_bucket *ob; - struct open_buckets ptrs; - unsigned nr_effective, write_points_nr; - bool have_cache; + unsigned write_points_nr; int ret; int i; + struct alloc_request req = { + .nr_replicas = nr_replicas, + .target = target, + .ec = erasure_code, + .watermark = watermark, + .flags = flags, + .devs_have = devs_have, + }; + if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) erasure_code = false; BUG_ON(!nr_replicas || !nr_replicas_required); retry: - ptrs.nr = 0; - nr_effective = 0; - write_points_nr = c->write_points_nr; - have_cache = false; + req.ptrs.nr = 0; + req.nr_effective = 0; + req.have_cache = false; + write_points_nr = c->write_points_nr; - *wp_ret = wp = writepoint_find(trans, write_point.v); + *wp_ret = req.wp = writepoint_find(trans, write_point.v); ret = bch2_trans_relock(trans); if (ret) goto err; /* metadata may not allocate on cache devices: */ - if (wp->data_type != BCH_DATA_user) - have_cache = true; + if (req.wp->data_type != BCH_DATA_user) + req.have_cache = true; if (target && !(flags & BCH_WRITE_only_specified_devs)) { - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, NULL); + ret = open_bucket_add_buckets(trans, &req, NULL); if (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto alloc_done; /* Don't retry from all devices if we're out of open buckets: */ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) { - int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + int ret2 = open_bucket_add_buckets(trans, &req, cl); if (!ret2 || bch2_err_matches(ret2, BCH_ERR_transaction_restart) || bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) { @@ -1384,45 +1314,39 @@ retry: * Only try to allocate cache (durability = 0 devices) from the * specified target: */ - have_cache = true; + req.have_cache = true; + req.target = 0; - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - 0, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + ret = open_bucket_add_buckets(trans, &req, cl); } else { - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + ret = open_bucket_add_buckets(trans, &req, cl); } alloc_done: - BUG_ON(!ret && nr_effective < nr_replicas); + BUG_ON(!ret && req.nr_effective < req.nr_replicas); - if (erasure_code && !ec_open_bucket(c, &ptrs)) + if (erasure_code && !ec_open_bucket(c, &req.ptrs)) pr_debug("failed to get ec bucket: ret %u", ret); if (ret == -BCH_ERR_insufficient_devices && - nr_effective >= nr_replicas_required) + req.nr_effective >= nr_replicas_required) ret = 0; if (ret) goto err; - if (nr_effective > nr_replicas) - deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - 
nr_replicas); + if (req.nr_effective > req.nr_replicas) + deallocate_extra_replicas(c, &req.ptrs, &req.wp->ptrs, + req.nr_effective - req.nr_replicas); /* Free buckets we didn't use: */ - open_bucket_for_each(c, &wp->ptrs, ob, i) + open_bucket_for_each(c, &req.wp->ptrs, ob, i) open_bucket_free_unused(c, ob); - wp->ptrs = ptrs; + req.wp->ptrs = req.ptrs; - wp->sectors_free = UINT_MAX; + req.wp->sectors_free = UINT_MAX; - open_bucket_for_each(c, &wp->ptrs, ob, i) { + open_bucket_for_each(c, &req.wp->ptrs, ob, i) { /* * Ensure proper write alignment - either due to misaligned * bucket sizes (from buggy bcachefs-tools), or writes that mix @@ -1436,29 +1360,29 @@ alloc_done: ob->sectors_free = max_t(int, 0, ob->sectors_free - align); - wp->sectors_free = min(wp->sectors_free, ob->sectors_free); + req.wp->sectors_free = min(req.wp->sectors_free, ob->sectors_free); } - wp->sectors_free = rounddown(wp->sectors_free, block_sectors(c)); + req.wp->sectors_free = rounddown(req.wp->sectors_free, block_sectors(c)); /* Did alignment use up space in an open_bucket? */ - if (unlikely(!wp->sectors_free)) { - bch2_alloc_sectors_done(c, wp); + if (unlikely(!req.wp->sectors_free)) { + bch2_alloc_sectors_done(c, req.wp); goto retry; } - BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); + BUG_ON(!req.wp->sectors_free || req.wp->sectors_free == UINT_MAX); return 0; err: - open_bucket_for_each(c, &wp->ptrs, ob, i) - if (ptrs.nr < ARRAY_SIZE(ptrs.v)) - ob_push(c, &ptrs, ob); + open_bucket_for_each(c, &req.wp->ptrs, ob, i) + if (req.ptrs.nr < ARRAY_SIZE(req.ptrs.v)) + ob_push(c, &req.ptrs, ob); else open_bucket_free_unused(c, ob); - wp->ptrs = ptrs; + req.wp->ptrs = req.ptrs; - mutex_unlock(&wp->lock); + mutex_unlock(&req.wp->lock); if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && try_decrease_writepoints(trans, write_points_nr)) diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 4c1e33cf57c0..874aadf34ebf 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -5,6 +5,7 @@ #include "bcachefs.h" #include "alloc_types.h" #include "extents.h" +#include "io_write_types.h" #include "sb-members.h" #include <linux/hash.h> @@ -23,6 +24,22 @@ struct dev_alloc_list { u8 data[BCH_SB_MEMBERS_MAX]; }; +struct alloc_request { + unsigned nr_replicas; + unsigned target; + bool ec; + enum bch_watermark watermark; + enum bch_write_flags flags; + struct bch_devs_list *devs_have; + + struct write_point *wp; + struct open_buckets ptrs; + unsigned nr_effective; + bool have_cache; + + struct bch_devs_mask devs_may_alloc; +}; + struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, struct dev_stripe_state *, struct bch_devs_mask *); @@ -173,11 +190,9 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 } enum bch_write_flags; -int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, - struct dev_stripe_state *, struct bch_devs_mask *, - unsigned, unsigned *, bool *, enum bch_write_flags, - enum bch_data_type, enum bch_watermark, - struct closure *); +int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *, + struct dev_stripe_state *, enum bch_data_type, + struct closure *); int bch2_alloc_sectors_start_trans(struct btree_trans *, unsigned, unsigned, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c6cb26981923..fc09e0655014 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1714,19 +1714,23 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, enum bch_watermark watermark, struct 
closure *cl) { struct bch_fs *c = trans->c; - struct bch_devs_mask devs = h->devs; struct open_bucket *ob; - struct open_buckets buckets; struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; unsigned i, j, nr_have_parity = 0, nr_have_data = 0; - bool have_cache = true; int ret = 0; + struct alloc_request req = { + .watermark = watermark, + .devs_may_alloc = h->devs, + .have_cache = true, + }; + BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); BUG_ON(v->nr_redundant != s->nr_parity); /* * We bypass the sector allocator which normally does this: */ - bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); + bitmap_and(req.devs_may_alloc.d, req.devs_may_alloc.d, + c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { /* @@ -1736,7 +1740,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, * block when updating the stripe */ if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) - __clear_bit(v->ptrs[i].dev, devs.d); + __clear_bit(v->ptrs[i].dev, req.devs_may_alloc.d); if (i < s->nr_data) nr_have_data++; @@ -1747,25 +1751,23 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, BUG_ON(nr_have_data > s->nr_data); BUG_ON(nr_have_parity > s->nr_parity); - buckets.nr = 0; + req.ptrs.nr = 0; if (nr_have_parity < s->nr_parity) { - ret = bch2_bucket_alloc_set_trans(trans, &buckets, + req.nr_replicas = s->nr_parity; + req.nr_effective = nr_have_parity; + + ret = bch2_bucket_alloc_set_trans(trans, &req, &h->parity_stripe, - &devs, - s->nr_parity, - &nr_have_parity, - &have_cache, 0, BCH_DATA_parity, - watermark, cl); - open_bucket_for_each(c, &buckets, ob, i) { + open_bucket_for_each(c, &req.ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data + s->nr_parity, s->nr_data); BUG_ON(j >= s->nr_data + s->nr_parity); - s->blocks[j] = buckets.v[i]; + s->blocks[j] = req.ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, s->blocks_gotten); } @@ -1774,24 +1776,22 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, return ret; } - buckets.nr = 0; + req.ptrs.nr = 0; if (nr_have_data < s->nr_data) { - ret = bch2_bucket_alloc_set_trans(trans, &buckets, + req.nr_replicas = s->nr_data; + req.nr_effective = nr_have_data; + + ret = bch2_bucket_alloc_set_trans(trans, &req, &h->block_stripe, - &devs, - s->nr_data, - &nr_have_data, - &have_cache, 0, BCH_DATA_user, - watermark, cl); - open_bucket_for_each(c, &buckets, ob, i) { + open_bucket_for_each(c, &req.ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data, 0); BUG_ON(j >= s->nr_data); - s->blocks[j] = buckets.v[i]; + s->blocks[j] = req.ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, s->blocks_gotten); } diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index b8ab19a1e1da..2c0a8f35ee1f 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -17,34 +17,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, __printf(3, 4) void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...); -#define BCH_WRITE_FLAGS() \ - x(alloc_nowait) \ - x(cached) \ - x(data_encoded) \ - x(pages_stable) \ - x(pages_owned) \ - x(only_specified_devs) \ - x(wrote_data_inline) \ - x(check_enospc) \ - x(sync) \ - x(move) \ - x(in_worker) \ - x(submitted) \ - x(io_error) \ - x(convert_unwritten) - -enum __bch_write_flags { -#define x(f) __BCH_WRITE_##f, - BCH_WRITE_FLAGS() -#undef x -}; - -enum bch_write_flags { -#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), - BCH_WRITE_FLAGS() 
-#undef x -}; - static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { return op->watermark == BCH_WATERMARK_copygc diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h index 3ef6df9145ef..b4a6a44a45d0 100644 --- a/fs/bcachefs/io_write_types.h +++ b/fs/bcachefs/io_write_types.h @@ -13,6 +13,34 @@ #include <linux/llist.h> #include <linux/workqueue.h> +#define BCH_WRITE_FLAGS() \ + x(alloc_nowait) \ + x(cached) \ + x(data_encoded) \ + x(pages_stable) \ + x(pages_owned) \ + x(only_specified_devs) \ + x(wrote_data_inline) \ + x(check_enospc) \ + x(sync) \ + x(move) \ + x(in_worker) \ + x(submitted) \ + x(io_error) \ + x(convert_unwritten) + +enum __bch_write_flags { +#define x(f) __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x +}; + +enum bch_write_flags { +#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), + BCH_WRITE_FLAGS() +#undef x +}; + struct bch_write_bio { struct_group(wbio, struct bch_fs *c; -- 2.51.0 From 799c41830332121b067c26b43887a95a4f848c22 Mon Sep 17 00:00:00 2001 From: Kent Overstreet <kent.overstreet@linux.dev> Date: Fri, 21 Mar 2025 21:06:43 -0400 Subject: [PATCH 13/16] bcachefs: alloc_request.data_type Add the allocation's data type to struct alloc_request, and use that instead of passing it as a separate argument to bch2_bucket_alloc_set_trans(). Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev> --- fs/bcachefs/alloc_foreground.c | 10 +++++----- fs/bcachefs/alloc_foreground.h | 4 ++-- fs/bcachefs/ec.c | 12 ++++-------- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 93c91b5706fb..5cca41b28236 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -716,7 +716,6 @@ static int add_new_bucket(struct bch_fs *c, int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct alloc_request *req, struct dev_stripe_state *stripe, - enum bch_data_type data_type, struct closure *cl) { struct bch_fs *c = trans->c; @@ -737,7 +736,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct bch_dev_usage usage; struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, - req->watermark, data_type, + req->watermark, req->data_type, cl, req->flags & BCH_WRITE_alloc_nowait, &usage); if (!IS_ERR(ob)) @@ -933,8 +932,7 @@ retry_blocking: * Try nonblocking first, so that if one device is full we'll try from * other devices: */ - ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, - req->wp->data_type, cl); + ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && @@ -1285,12 +1283,14 @@ retry: *wp_ret = req.wp = writepoint_find(trans, write_point.v); + req.data_type = req.wp->data_type; + ret = bch2_trans_relock(trans); if (ret) goto err; /* metadata may not allocate on cache devices: */ - if (req.wp->data_type != BCH_DATA_user) + if (req.data_type != BCH_DATA_user) req.have_cache = true; if (target && !(flags & BCH_WRITE_only_specified_devs)) { diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 874aadf34ebf..24d6e5863737 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -30,6 +30,7 @@ struct alloc_request { bool ec; enum bch_watermark watermark; enum bch_write_flags flags; + enum bch_data_type data_type; struct bch_devs_list *devs_have; struct write_point *wp; @@ -191,8 +192,7 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 enum bch_write_flags; int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *, - struct dev_stripe_state *, enum bch_data_type, - struct closure *); + struct 
dev_stripe_state *, struct closure *); int bch2_alloc_sectors_start_trans(struct btree_trans *, unsigned, unsigned, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index fc09e0655014..0865fb8a6f36 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1755,11 +1755,9 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, if (nr_have_parity < s->nr_parity) { req.nr_replicas = s->nr_parity; req.nr_effective = nr_have_parity; + req.data_type = BCH_DATA_parity; - ret = bch2_bucket_alloc_set_trans(trans, &req, - &h->parity_stripe, - BCH_DATA_parity, - cl); + ret = bch2_bucket_alloc_set_trans(trans, &req, &h->parity_stripe, cl); open_bucket_for_each(c, &req.ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, @@ -1780,11 +1778,9 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, if (nr_have_data < s->nr_data) { req.nr_replicas = s->nr_data; req.nr_effective = nr_have_data; + req.data_type = BCH_DATA_user; - ret = bch2_bucket_alloc_set_trans(trans, &req, - &h->block_stripe, - BCH_DATA_user, - cl); + ret = bch2_bucket_alloc_set_trans(trans, &req, &h->block_stripe, cl); open_bucket_for_each(c, &req.ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, -- 2.51.0 From 9259883b79e14aaf127c0ef59b5ccca8e04e77ae Mon Sep 17 00:00:00 2001 From: Kent Overstreet <kent.overstreet@linux.dev> Date: Fri, 21 Mar 2025 21:13:53 -0400 Subject: [PATCH 14/16] bcachefs: bch2_bucket_alloc_trans() takes alloc_request Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev> --- fs/bcachefs/alloc_foreground.c | 37 +++++++++++++++++----------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 5cca41b28236..ca8df935f198 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -505,24 +505,23 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, * Returns: an open_bucket on success, or an ERR_PTR() on failure. */ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, - struct bch_dev *ca, - enum bch_watermark watermark, - enum bch_data_type data_type, - struct closure *cl, - bool nowait, - struct bch_dev_usage *usage) + struct alloc_request *req, + struct bch_dev *ca, + struct closure *cl, + bool nowait, + struct bch_dev_usage *usage) { struct bch_fs *c = trans->c; struct open_bucket *ob = NULL; bool freespace = READ_ONCE(ca->mi.freespace_initialized); u64 avail; struct bucket_alloc_state s = { - .btree_bitmap = data_type == BCH_DATA_btree, + .btree_bitmap = req->data_type == BCH_DATA_btree, }; bool waiting = nowait; again: bch2_dev_usage_read_fast(ca, usage); - avail = dev_buckets_free(ca, *usage, watermark); + avail = dev_buckets_free(ca, *usage, req->watermark); if (usage->buckets[BCH_DATA_need_discard] > avail) bch2_dev_do_discards(ca); @@ -534,7 +533,7 @@ again: bch2_dev_do_invalidates(ca); if (!avail) { - if (watermark > BCH_WATERMARK_normal && + if (req->watermark > BCH_WATERMARK_normal && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) goto alloc; @@ -554,8 +553,8 @@ again: closure_wake_up(&c->freelist_wait); alloc: ob = likely(freespace) - ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) - : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); + ? 
bch2_bucket_alloc_freelist(trans, ca, req->watermark, &s, cl) + : bch2_bucket_alloc_early(trans, ca, req->watermark, &s, cl); if (s.need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); @@ -574,7 +573,7 @@ err: ob = ERR_PTR(-BCH_ERR_no_buckets_found); if (!IS_ERR(ob)) - ob->data_type = data_type; + ob->data_type = req->data_type; if (!IS_ERR(ob)) count_event(c, bucket_alloc); @@ -584,7 +583,7 @@ err: if (!IS_ERR(ob) ? trace_bucket_alloc_enabled() : trace_bucket_alloc_fail_enabled()) - trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob); + trace_bucket_alloc2(c, ca, req->watermark, req->data_type, cl, usage, &s, ob); return ob; } @@ -596,10 +595,13 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, { struct bch_dev_usage usage; struct open_bucket *ob; + struct alloc_request req = { + .watermark = watermark, + .data_type = data_type, + }; bch2_trans_do(c, - PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, - data_type, cl, false, &usage))); + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, ca, cl, false, &usage))); return ob; } @@ -735,9 +737,8 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, } struct bch_dev_usage usage; - struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, - req->watermark, req->data_type, - cl, req->flags & BCH_WRITE_alloc_nowait, + struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, ca, cl, + req->flags & BCH_WRITE_alloc_nowait, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); -- 2.51.0 From 7100344301d80f09ea64c37c35ea10163d35d433 Mon Sep 17 00:00:00 2001 From: Kent Overstreet <kent.overstreet@linux.dev> Date: Mon, 31 Mar 2025 15:37:28 -0400 Subject: [PATCH 15/16] bcachefs: bch2_ec_stripe_head_get() takes alloc_request Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev> --- fs/bcachefs/alloc_foreground.c | 5 ++--- fs/bcachefs/alloc_foreground.h | 7 +++---- fs/bcachefs/ec.c | 20 ++++++++++---------- fs/bcachefs/ec.h | 5 +++-- 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index ca8df935f198..aef27d40d354 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -495,9 +495,8 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, /** * bch2_bucket_alloc_trans - allocate a single bucket from a specific device * @trans: transaction object + * @req: state for the entire allocation * @ca: device to allocate from - * @watermark: how important is this allocation? - * @data_type: BCH_DATA_journal, btree, user... 
* @cl: if not NULL, closure to be used to wait if buckets not available * @nowait: if true, do not wait for buckets to become available * @usage: for secondarily also returning the current device usage @@ -782,7 +781,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, return 0; struct ec_stripe_head *h = - bch2_ec_stripe_head_get(trans, req->target, 0, req->nr_replicas - 1, req->watermark, cl); + bch2_ec_stripe_head_get(trans, req, 0, cl); if (IS_ERR(h)) return PTR_ERR(h); if (!h) diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 24d6e5863737..27219cd1368f 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -32,12 +32,11 @@ struct alloc_request { enum bch_write_flags flags; enum bch_data_type data_type; struct bch_devs_list *devs_have; - struct write_point *wp; - struct open_buckets ptrs; - unsigned nr_effective; - bool have_cache; + struct open_buckets ptrs; + unsigned nr_effective; /* sum of @ptrs durability */ + bool have_cache; /* have we allocated from a 0 durability dev */ struct bch_devs_mask devs_may_alloc; }; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 0865fb8a6f36..6f977e134d08 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1977,17 +1977,15 @@ err: } struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, - unsigned target, + struct alloc_request *req, unsigned algo, - unsigned redundancy, - enum bch_watermark watermark, struct closure *cl) { struct bch_fs *c = trans->c; - struct ec_stripe_head *h; - bool waiting = false; + unsigned redundancy = req->nr_replicas - 1; unsigned disk_label = 0; - struct target t = target_decode(target); + struct target t = target_decode(req->target); + bool waiting = false; int ret; if (t.type == TARGET_GROUP) { @@ -1998,7 +1996,9 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, disk_label = t.group + 1; /* 0 == no label */ } - h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark); + struct ec_stripe_head *h = + __bch2_ec_stripe_head_get(trans, disk_label, algo, + redundancy, req->watermark); if (IS_ERR_OR_NULL(h)) return h; @@ -2041,8 +2041,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) goto err; - if (watermark == BCH_WATERMARK_copygc) { - ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?: + if (req->watermark == BCH_WATERMARK_copygc) { + ret = new_stripe_alloc_buckets(trans, h, s, req->watermark, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h, s); if (ret) goto err; @@ -2061,7 +2061,7 @@ alloc_existing: * Retry allocating buckets, with the watermark for this * particular write: */ - ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl); + ret = new_stripe_alloc_buckets(trans, h, s, req->watermark, cl); if (ret) goto err; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 51893e1ee874..83d37bcb548a 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -255,9 +255,10 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); + +struct alloc_request; struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, - unsigned, unsigned, unsigned, - enum bch_watermark, struct closure *); + struct alloc_request *, unsigned, struct closure *); void bch2_do_stripe_deletes(struct bch_fs *); void 
bch2_ec_do_stripe_creates(struct bch_fs *); -- 2.51.0 From ac0952b0e50934a15fdd67a2ff376e6ab8152c39 Mon Sep 17 00:00:00 2001 From: Kent Overstreet <kent.overstreet@linux.dev> Date: Mon, 31 Mar 2025 15:46:45 -0400 Subject: [PATCH 16/16] bcachefs: new_stripe_alloc_buckets() takes alloc_request More stack usage improvements: instead of creating a new alloc_request (currently on the stack), save/restore just the fields we need to reuse. This is a bit tricky, because we're doing a normal alloc_foreground.c allocation, which calls into ec.c to get a stripe, which then does more normal allocations - some of the fields get reused, and used differently. So we have to save and restore them - but the stack usage improvements will be well worth it. Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev> --- fs/bcachefs/ec.c | 73 +++++++++++++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 6f977e134d08..11f46dccc14f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1710,8 +1710,9 @@ err: } static int new_stripe_alloc_buckets(struct btree_trans *trans, + struct alloc_request *req, struct ec_stripe_head *h, struct ec_stripe_new *s, - enum bch_watermark watermark, struct closure *cl) + struct closure *cl) { struct bch_fs *c = trans->c; struct open_bucket *ob; @@ -1719,17 +1720,21 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, unsigned i, j, nr_have_parity = 0, nr_have_data = 0; int ret = 0; - struct alloc_request req = { - .watermark = watermark, - .devs_may_alloc = h->devs, - .have_cache = true, - }; + enum bch_data_type saved_data_type = req->data_type; + struct open_buckets saved_ptrs = req->ptrs; + unsigned saved_nr_replicas = req->nr_replicas; + unsigned saved_nr_effective = req->nr_effective; + bool saved_have_cache = req->have_cache; + struct bch_devs_mask saved_devs_may_alloc = req->devs_may_alloc; + + req->devs_may_alloc = h->devs; + req->have_cache = true; BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); BUG_ON(v->nr_redundant != s->nr_parity); /* * We bypass the sector allocator which normally does this: */ - bitmap_and(req.devs_may_alloc.d, req.devs_may_alloc.d, + bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { @@ -1740,7 +1745,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, * block when updating the stripe */ if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) - __clear_bit(v->ptrs[i].dev, req.devs_may_alloc.d); + __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d); if (i < s->nr_data) nr_have_data++; @@ -1751,52 +1756,58 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, BUG_ON(nr_have_data > s->nr_data); BUG_ON(nr_have_parity > s->nr_parity); - req.ptrs.nr = 0; + req->ptrs.nr = 0; if (nr_have_parity < s->nr_parity) { - req.nr_replicas = s->nr_parity; - req.nr_effective = nr_have_parity; - req.data_type = BCH_DATA_parity; + req->nr_replicas = s->nr_parity; + req->nr_effective = nr_have_parity; + req->data_type = BCH_DATA_parity; - ret = bch2_bucket_alloc_set_trans(trans, &req, &h->parity_stripe, cl); + ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl); - open_bucket_for_each(c, &req.ptrs, ob, i) { + open_bucket_for_each(c, &req->ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data + s->nr_parity, s->nr_data); BUG_ON(j >= s->nr_data + s->nr_parity); - s->blocks[j] = req.ptrs.v[i]; + s->blocks[j] = req->ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); 
__set_bit(j, s->blocks_gotten); } if (ret) - return ret; + goto err; } - req.ptrs.nr = 0; + req->ptrs.nr = 0; if (nr_have_data < s->nr_data) { - req.nr_replicas = s->nr_data; - req.nr_effective = nr_have_data; - req.data_type = BCH_DATA_user; + req->nr_replicas = s->nr_data; + req->nr_effective = nr_have_data; + req->data_type = BCH_DATA_user; - ret = bch2_bucket_alloc_set_trans(trans, &req, &h->block_stripe, cl); + ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl); - open_bucket_for_each(c, &req.ptrs, ob, i) { + open_bucket_for_each(c, &req->ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data, 0); BUG_ON(j >= s->nr_data); - s->blocks[j] = req.ptrs.v[i]; + s->blocks[j] = req->ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, s->blocks_gotten); } if (ret) - return ret; + goto err; } - - return 0; +err: + req->data_type = saved_data_type; + req->ptrs = saved_ptrs; + req->nr_replicas = saved_nr_replicas; + req->nr_effective = saved_nr_effective; + req->have_cache = saved_have_cache; + req->devs_may_alloc = saved_devs_may_alloc; + return ret; } static int __get_existing_stripe(struct btree_trans *trans, @@ -2022,8 +2033,12 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, goto alloc_existing; /* First, try to allocate a full stripe: */ - ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?: + enum bch_watermark saved_watermark = BCH_WATERMARK_stripe; + swap(req->watermark, saved_watermark); + ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h, s); + swap(req->watermark, saved_watermark); + if (!ret) goto allocate_buf; if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || @@ -2042,7 +2057,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, goto err; if (req->watermark == BCH_WATERMARK_copygc) { - ret = new_stripe_alloc_buckets(trans, h, s, req->watermark, NULL) ?: + ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h, s); if (ret) goto err; @@ -2061,7 +2076,7 @@ alloc_existing: * Retry allocating buckets, with the watermark for this * particular write: */ - ret = new_stripe_alloc_buckets(trans, h, s, req->watermark, cl); + ret = new_stripe_alloc_buckets(trans, req, h, s, cl); if (ret) goto err; -- 2.51.0
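
The save/restore discipline that patch 16 describes is easy to get wrong, so below is a minimal, self-contained sketch of the idiom, assuming invented names (struct alloc_req, nested_alloc). It is not bcachefs code, only an illustration of how restoring every repurposed field on every exit path lets a nested allocation borrow the caller's request instead of declaring a second one on the stack:

#include <stdbool.h>
#include <stdio.h>

struct alloc_req {
	unsigned	nr_replicas;	/* copies the caller wants */
	unsigned	nr_effective;	/* copies obtained so far */
	bool		have_cache;	/* allocated from a durability-0 device? */
};

/*
 * Borrow the caller's request for a nested allocation: save each field
 * we repurpose on entry, restore all of them on every return path,
 * mirroring the saved_* locals and the err: label in
 * new_stripe_alloc_buckets() above.
 */
static int nested_alloc(struct alloc_req *req, unsigned want)
{
	unsigned saved_nr_replicas	= req->nr_replicas;
	unsigned saved_nr_effective	= req->nr_effective;
	bool	 saved_have_cache	= req->have_cache;
	int ret = 0;

	req->nr_replicas	= want;
	req->nr_effective	= 0;
	req->have_cache		= true;

	/* ... a real allocation loop would bump req->nr_effective ... */
	if (req->nr_effective < req->nr_replicas)
		ret = -1;	/* stand-in for a real error code */

	/* Restore on success and failure alike: */
	req->nr_replicas	= saved_nr_replicas;
	req->nr_effective	= saved_nr_effective;
	req->have_cache		= saved_have_cache;
	return ret;
}

int main(void)
{
	struct alloc_req req = { .nr_replicas = 2, .nr_effective = 2 };

	int ret = nested_alloc(&req, 3);

	/* The caller's state is untouched by the nested call: */
	printf("ret %d nr_replicas %u nr_effective %u\n",
	       ret, req.nr_replicas, req.nr_effective);
	return 0;
}

The cost of this pattern, compared with a second on-stack request, is that the save/restore set must be kept in sync by hand: any new request field a nested path clobbers has to be added to it. That is also why bch2_ec_stripe_head_get() pairs its swap() of the watermark symmetrically around the call into new_stripe_alloc_buckets().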