From a678543e609dfb145f0498f895bee05bbc7994a5 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Mon, 13 Jan 2025 20:31:45 +0100
Subject: [PATCH 01/16] btrfs: fix front delete range calculation for RAID stripe extents

When deleting the front of a RAID stripe-extent, the delete code
miscalculates how much to pad the front of the remaining part of the
extent. Fix the calculation so we always end up with the sizes we
expect.

Reviewed-by: Filipe Manana
Signed-off-by: Johannes Thumshirn
Signed-off-by: David Sterba
---
 fs/btrfs/raid-stripe-tree.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 0c351eda3551..9e559ad48810 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -136,10 +136,12 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
	 * length to the new size and then re-insert the item.
	 */
	if (found_end > end) {
-		u64 diff = found_end - end;
+		u64 diff_end = found_end - end;

		btrfs_partially_delete_raid_extent(trans, path, &key,
-						   diff, diff);
+						   key.offset - length,
+						   length);
+		ASSERT(key.offset - diff_end == length);
		break;
	}

-- 
2.51.0

From 50cae2ca69561cbd9a90308ad2a14a442d230662 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Mon, 13 Jan 2025 20:31:46 +0100
Subject: [PATCH 02/16] btrfs: fix tail delete of RAID stripe-extents

Fix the tail delete of RAID stripe-extents for the case where a range
still has to be deleted after the tail of the extent has been removed.

Reviewed-by: Filipe Manana
Signed-off-by: Johannes Thumshirn
Signed-off-by: David Sterba
---
 fs/btrfs/raid-stripe-tree.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 9e559ad48810..ef76202c3a38 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -119,11 +119,18 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
	 * length to the new size and then re-insert the item.
	 */
	if (found_start < start) {
-		u64 diff = start - found_start;
+		u64 diff_start = start - found_start;

		btrfs_partially_delete_raid_extent(trans, path, &key,
-						   diff, 0);
-		break;
+						   diff_start, 0);
+
+		start += (key.offset - diff_start);
+		length -= (key.offset - diff_start);
+		if (length == 0)
+			break;
+
+		btrfs_release_path(path);
+		continue;
	}

	/*
-- 
2.51.0

From 76643119045eed639a3334370cba30c54c4074c1 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Mon, 13 Jan 2025 20:31:47 +0100
Subject: [PATCH 03/16] btrfs: fix deletion of a range spanning parts of two RAID stripe extents

When a user requests the deletion of a range that spans multiple stripe
extents and btrfs_search_slot() returns us the second RAID stripe
extent, we need to pick the previous item and truncate it. If there's
still a range left to delete, we move on to the next item.

The following diagram illustrates the operation:

 |--- RAID Stripe Extent ---||--- RAID Stripe Extent ---|
              |--- keep ---|--- drop ---|

While at it, comment the trivial case of a whole item delete as well.
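For illustration, here is a minimal standalone sketch of the interval
comparisons the delete loop makes. It is not the kernel code:
classify_overlap(), the plain u64 typedef and the byte values are made
up for this example:

  #include <stdio.h>

  typedef unsigned long long u64;

  /*
   * Classify how the delete range [start, end) overlaps a stripe
   * extent [found_start, found_end), mirroring the case analysis in
   * btrfs_delete_raid_extent().
   */
  static const char *classify_overlap(u64 found_start, u64 found_end,
                                      u64 start, u64 end)
  {
          if (found_start > start)
                  return "range starts in the previous item: back up one slot";
          if (found_start < start && found_end > end)
                  return "hole punch: keep front and tail, drop the middle";
          if (found_start < start)
                  return "tail delete: keep the front, continue if a range is left";
          if (found_end > end)
                  return "front delete: keep the tail";
          return "whole item covered: plain delete";
  }

  int main(void)
  {
          /* The diagram above: delete [32, 96) over items [0, 64) and [64, 128). */
          printf("item 1: %s\n", classify_overlap(0, 64, 32, 96));
          printf("item 2: %s\n", classify_overlap(64, 128, 32, 96));
          return 0;
  }

With the ranges from the diagram, the search that lands on the second
item reports the "previous item" case that this patch handles.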
Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index ef76202c3a38..bf665fdef18b 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -99,6 +99,37 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le found_end = found_start + key.offset; ret = 0; + /* + * The stripe extent starts before the range we want to delete, + * but the range spans more than one stripe extent: + * + * |--- RAID Stripe Extent ---||--- RAID Stripe Extent ---| + * |--- keep ---|--- drop ---| + * + * This means we have to get the previous item, truncate its + * length and then restart the search. + */ + if (found_start > start) { + if (slot == 0) { + ret = btrfs_previous_item(stripe_root, path, start, + BTRFS_RAID_STRIPE_KEY); + if (ret) { + if (ret > 0) + ret = -ENOENT; + break; + } + } else { + path->slots[0]--; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + found_start = key.objectid; + found_end = found_start + key.offset; + ASSERT(found_start <= start); + } + if (key.type != BTRFS_RAID_STRIPE_KEY) break; @@ -152,6 +183,7 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le break; } + /* Finally we can delete the whole item, no more special cases. */ ret = btrfs_del_item(trans, stripe_root, path); if (ret) break; -- 2.51.0 From 6aa0e7cc569eb24a7a99c70ad7477d454b3ac0ca Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:48 +0100 Subject: [PATCH 04/16] btrfs: implement hole punching for RAID stripe extents If the stripe extent we want to delete starts before the range we want to delete and ends after the range we want to delete we're punching a hole in the stripe extent: |--- RAID Stripe Extent ---| | keep |--- drop ---| keep | This means we need to a) truncate the existing item and b) create a second item for the remaining range. Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 1 + fs/btrfs/raid-stripe-tree.c | 48 +++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c93f52a30a16..92071ca0655f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3833,6 +3833,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_RAID_STRIPE_KEY && key.type != BTRFS_EXTENT_CSUM_KEY); if (btrfs_leaf_free_space(leaf) >= ins_len) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index bf665fdef18b..858abf518e9b 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -140,6 +140,54 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le trace_btrfs_raid_extent_delete(fs_info, start, end, found_start, found_end); + /* + * The stripe extent starts before the range we want to delete + * and ends after the range we want to delete, i.e. we're + * punching a hole in the stripe extent: + * + * |--- RAID Stripe Extent ---| + * | keep |--- drop ---| keep | + * + * This means we need to a) truncate the existing item and b) + * create a second item for the remaining range. 
+ */ + if (found_start < start && found_end > end) { + size_t item_size; + u64 diff_start = start - found_start; + u64 diff_end = found_end - end; + struct btrfs_stripe_extent *extent; + struct btrfs_key newkey = { + .objectid = end, + .type = BTRFS_RAID_STRIPE_KEY, + .offset = diff_end, + }; + + /* The "right" item. */ + ret = btrfs_duplicate_item(trans, stripe_root, path, &newkey); + if (ret) + break; + + item_size = btrfs_item_size(leaf, path->slots[0]); + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_stripe_extent); + + for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { + struct btrfs_raid_stride *stride = &extent->strides[i]; + u64 phys; + + phys = btrfs_raid_stride_physical(leaf, stride); + phys += diff_start + length; + btrfs_set_raid_stride_physical(leaf, stride, phys); + } + + /* The "left" item. */ + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_partially_delete_raid_extent(trans, path, &key, + diff_start, 0); + break; + } + /* * The stripe extent starts before the range we want to delete: * -- 2.51.0 From dc14ba10781bd2629835696b7cc1febf914768e9 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:49 +0100 Subject: [PATCH 05/16] btrfs: don't use btrfs_set_item_key_safe on RAID stripe-extents Don't use btrfs_set_item_key_safe() to modify the keys in the RAID stripe-tree, as this can lead to corruption of the tree, which is caught by the checks in btrfs_set_item_key_safe(): BTRFS info (device nvme1n1): leaf 49168384 gen 15 total ptrs 194 free space 8329 owner 12 BTRFS info (device nvme1n1): refs 2 lock_owner 1030 current 1030 [ snip ] item 105 key (354549760 230 20480) itemoff 14587 itemsize 16 stride 0 devid 5 physical 67502080 item 106 key (354631680 230 4096) itemoff 14571 itemsize 16 stride 0 devid 1 physical 88559616 item 107 key (354631680 230 32768) itemoff 14555 itemsize 16 stride 0 devid 1 physical 88555520 item 108 key (354717696 230 28672) itemoff 14539 itemsize 16 stride 0 devid 2 physical 67604480 [ snip ] BTRFS critical (device nvme1n1): slot 106 key (354631680 230 32768) new key (354635776 230 4096) ------------[ cut here ]------------ kernel BUG at fs/btrfs/ctree.c:2602! Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 1 UID: 0 PID: 1055 Comm: fsstress Not tainted 6.13.0-rc1+ #1464 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014 RIP: 0010:btrfs_set_item_key_safe+0xf7/0x270 Code: RSP: 0018:ffffc90001337ab0 EFLAGS: 00010287 RAX: 0000000000000000 RBX: ffff8881115fd000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000001 RDI: 00000000ffffffff RBP: ffff888110ed6f50 R08: 00000000ffffefff R09: ffffffff8244c500 R10: 00000000ffffefff R11: 00000000ffffffff R12: ffff888100586000 R13: 00000000000000c9 R14: ffffc90001337b1f R15: ffff888110f23b58 FS: 00007f7d75c72740(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa811652c60 CR3: 0000000111398001 CR4: 0000000000370eb0 Call Trace: ? __die_body.cold+0x14/0x1a ? die+0x2e/0x50 ? do_trap+0xca/0x110 ? do_error_trap+0x65/0x80 ? btrfs_set_item_key_safe+0xf7/0x270 ? exc_invalid_op+0x50/0x70 ? btrfs_set_item_key_safe+0xf7/0x270 ? asm_exc_invalid_op+0x1a/0x20 ? btrfs_set_item_key_safe+0xf7/0x270 btrfs_partially_delete_raid_extent+0xc4/0xe0 btrfs_delete_raid_extent+0x227/0x240 __btrfs_free_extent.isra.0+0x57f/0x9c0 ? 
exc_coproc_segment_overrun+0x40/0x40 __btrfs_run_delayed_refs+0x2fa/0xe80 btrfs_run_delayed_refs+0x81/0xe0 btrfs_commit_transaction+0x2dd/0xbe0 ? preempt_count_add+0x52/0xb0 btrfs_sync_file+0x375/0x4c0 do_fsync+0x39/0x70 __x64_sys_fsync+0x13/0x20 do_syscall_64+0x54/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f7d7550ef90 Code: RSP: 002b:00007ffd70237248 EFLAGS: 00000202 ORIG_RAX: 000000000000004a RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f7d7550ef90 RDX: 000000000000013a RSI: 000000000040eb28 RDI: 0000000000000004 RBP: 000000000000001b R08: 0000000000000078 R09: 00007ffd7023725c R10: 00007f7d75400390 R11: 0000000000000202 R12: 028f5c28f5c28f5c R13: 8f5c28f5c28f5c29 R14: 000000000040b520 R15: 00007f7d75c726c8 While the root cause of the tree order corruption isn't clear, using btrfs_duplicate_item() to copy the item and then adjusting both the key and the per-device physical addresses is a safe way to counter this problem. Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 858abf518e9b..1834011ccc49 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -13,12 +13,13 @@ #include "volumes.h" #include "print-tree.h" -static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, +static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, const struct btrfs_key *oldkey, u64 newlen, u64 frontpad) { - struct btrfs_stripe_extent *extent; + struct btrfs_root *stripe_root = trans->fs_info->stripe_root; + struct btrfs_stripe_extent *extent, *newitem; struct extent_buffer *leaf; int slot; size_t item_size; @@ -27,6 +28,7 @@ static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, .type = BTRFS_RAID_STRIPE_KEY, .offset = newlen, }; + int ret; ASSERT(newlen > 0); ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY); @@ -34,17 +36,31 @@ static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; slot = path->slots[0]; item_size = btrfs_item_size(leaf, slot); + + newitem = kzalloc(item_size, GFP_NOFS); + if (!newitem) + return -ENOMEM; + extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { struct btrfs_raid_stride *stride = &extent->strides[i]; u64 phys; - phys = btrfs_raid_stride_physical(leaf, stride); - btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad); + phys = btrfs_raid_stride_physical(leaf, stride) + frontpad; + btrfs_set_stack_raid_stride_physical(&newitem->strides[i], phys); } - btrfs_set_item_key_safe(trans, path, &newkey); + ret = btrfs_del_item(trans, stripe_root, path); + if (ret) + goto out; + + btrfs_release_path(path); + ret = btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size); + +out: + kfree(newitem); + return ret; } int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) -- 2.51.0 From d44d3d724bb24701546c92ed5f341736bc9d832e Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:50 +0100 Subject: [PATCH 06/16] btrfs: selftests: check for correct return value of failed lookup Commit 5e72aabc1fff ("btrfs: return ENODATA in case RST lookup fails") changed btrfs_get_raid_extent_offset()'s return value to ENODATA in case the RAID stripe-tree lookup failed. 
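As a hedged illustration of what the change means for callers, here is
a standalone sketch; stub_lookup() is an assumption standing in for
btrfs_get_raid_extent_offset() on a range with no entry, not the kernel
API itself:

  #include <errno.h>
  #include <stdio.h>

  /* Stand-in for a RAID stripe-tree lookup of an absent range. */
  static int stub_lookup(void)
  {
          return -ENODATA; /* behaviour after commit 5e72aabc1fff */
  }

  int main(void)
  {
          int ret = stub_lookup();

          /*
           * Old check: "if (!ret)" only rules out unexpected success and
           * silently accepts any error code as the expected absence.
           */
          if (!ret)
                  printf("old check: lookup unexpectedly succeeded\n");

          /* New check: absence must be reported as exactly -ENODATA. */
          if (ret != -ENODATA)
                  printf("new check: wrong error %d\n", ret);
          else
                  printf("new check: range correctly reported absent\n");

          return 0;
  }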
Adjust the test cases which check for absence of a given range to check for ENODATA as return value in this case. Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/tests/raid-stripe-tree-tests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index 5801142ba7c3..446c46d89152 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -125,7 +125,7 @@ static int test_front_delete(struct btrfs_trans_handle *trans) } ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); - if (!ret) { + if (ret != -ENODATA) { ret = -EINVAL; test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", logical, logical + SZ_32K); -- 2.51.0 From a0afdec2552cf1ae059d58c3ffaa83aae7ddbfe1 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:51 +0100 Subject: [PATCH 07/16] btrfs: selftests: don't split RAID extents in half The selftests for partially deleting the start or tail of RAID stripe-extents split these extents in half. This can hide errors in the calculation, so don't split the RAID stripe-extents in half but delete the first or last 16K of the 64K extents. Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/tests/raid-stripe-tree-tests.c | 44 ++++++++++++++++--------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index 446c46d89152..da73369a79b4 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -14,6 +14,8 @@ #define RST_TEST_NUM_DEVICES (2) #define RST_TEST_RAID1_TYPE (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1) +#define SZ_48K (SZ_32K + SZ_16K) + typedef int (*test_func_t)(struct btrfs_trans_handle *trans); static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices, @@ -94,32 +96,32 @@ static int test_front_delete(struct btrfs_trans_handle *trans) goto out; } - ret = btrfs_delete_raid_extent(trans, logical, SZ_32K); + ret = btrfs_delete_raid_extent(trans, logical, SZ_16K); if (ret) { test_err("deleting RAID extent [%llu, %llu] failed", logical, - logical + SZ_32K); + logical + SZ_16K); goto out; } - len = SZ_32K; - ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_32K, &len, + len -= SZ_16K; + ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_16K, &len, map_type, 0, &io_stripe); if (ret) { test_err("lookup of RAID extent [%llu, %llu] failed", - logical + SZ_32K, logical + SZ_32K + len); + logical + SZ_16K, logical + SZ_64K); goto out; } - if (io_stripe.physical != logical + SZ_32K) { + if (io_stripe.physical != logical + SZ_16K) { test_err("invalid physical address, expected %llu, got %llu", - logical + SZ_32K, io_stripe.physical); + logical + SZ_16K, io_stripe.physical); ret = -EINVAL; goto out; } - if (len != SZ_32K) { + if (len != SZ_48K) { test_err("invalid stripe length, expected %llu, got %llu", - (u64)SZ_32K, len); + (u64)SZ_48K, len); ret = -EINVAL; goto out; } @@ -128,11 +130,11 @@ static int test_front_delete(struct btrfs_trans_handle *trans) if (ret != -ENODATA) { ret = -EINVAL; test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", - logical, logical + SZ_32K); + logical, logical + SZ_16K); goto out; } - ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); + ret = 
btrfs_delete_raid_extent(trans, logical + SZ_16K, SZ_48K); out: btrfs_put_bioc(bioc); return ret; @@ -209,14 +211,14 @@ static int test_tail_delete(struct btrfs_trans_handle *trans) goto out; } - ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); + ret = btrfs_delete_raid_extent(trans, logical + SZ_48K, SZ_16K); if (ret) { test_err("deleting RAID extent [%llu, %llu] failed", - logical + SZ_32K, logical + SZ_64K); + logical + SZ_48K, logical + SZ_64K); goto out; } - len = SZ_32K; + len = SZ_48K; ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); if (ret) { test_err("lookup of RAID extent [%llu, %llu] failed", logical, @@ -231,9 +233,19 @@ static int test_tail_delete(struct btrfs_trans_handle *trans) goto out; } - if (len != SZ_32K) { + if (len != SZ_48K) { test_err("invalid stripe length, expected %llu, got %llu", - (u64)SZ_32K, len); + (u64)SZ_48K, len); + ret = -EINVAL; + goto out; + } + + len = SZ_16K; + ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_48K, &len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded should fail", + logical + SZ_48K, logical + SZ_64K); ret = -EINVAL; goto out; } -- 2.51.0 From 1d395c3926d8996918ca29a67fe194e7088491d9 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:52 +0100 Subject: [PATCH 08/16] btrfs: selftests: test RAID stripe-tree deletion spanning two items Add a selftest for RAID stripe-tree deletion with a delete range spanning two items, so that we're punching a hole into two adjacent RAID stripe extents truncating the first and "moving" the second to the right. The following diagram illustrates the operation: |--- RAID Stripe Extent ---||--- RAID Stripe Extent ---| |----- keep -----|--- drop ---|----- keep ----| Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/tests/raid-stripe-tree-tests.c | 144 ++++++++++++++++++++++++ 1 file changed, 144 insertions(+) diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index da73369a79b4..8dd609c66e95 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -31,6 +31,149 @@ static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_de return NULL; } +/* + * Test a 1M RST write that spans two adjacent RST items on disk and then + * delete a portion starting in the first item and spanning into the second + * item. This is similar to test_front_delete(), but spanning multiple items. + */ +static int test_front_delete_prev_item(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 logical2 = SZ_2M; + u64 len = SZ_1M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + /* Insert RAID extent 1. 
*/ + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + /* Insert RAID extent 2, directly adjacent to it. */ + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1 + SZ_512K, SZ_1M); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1 + SZ_512K, (u64)SZ_1M); + goto out; + } + + /* Verify item 1 is truncated to 512K. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0, + &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical1, + logical1 + len); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_512K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_512K, len); + ret = -EINVAL; + goto out; + } + + /* Verify item 2's start is moved by 512K. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical2 + SZ_512K, &len, + map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical2 + SZ_512K, logical2 + len); + goto out; + } + + if (io_stripe.physical != logical2 + SZ_512K) { + test_err("invalid physical address, expected %llu got %llu", + logical2 + SZ_512K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_512K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_512K, len); + ret = -EINVAL; + goto out; + } + + /* Verify there's a hole at [1M+512K, 2M+512K] . */ + len = SZ_1M; + ret = btrfs_get_raid_extent_offset(fs_info, logical1 + SZ_512K, &len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID [%llu, %llu] succeeded, should fail", + logical1 + SZ_512K, logical1 + SZ_512K + len); + goto out; + } + + /* Clean up after us. */ + ret = btrfs_delete_raid_extent(trans, logical1, SZ_512K); + if (ret) + goto out; + + ret = btrfs_delete_raid_extent(trans, logical2 + SZ_512K, SZ_512K); + +out: + btrfs_put_bioc(bioc); + return ret; +} + /* * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then * delete the 1st 32K, making the new start address 1M+32K. 
@@ -468,6 +611,7 @@ static const test_func_t tests[] = { test_create_update_delete, test_tail_delete, test_front_delete, + test_front_delete_prev_item, }; static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) -- 2.51.0 From 27ae15b25b6e892a4161bc33c7f2b8a356318a2c Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:53 +0100 Subject: [PATCH 09/16] btrfs: selftests: add selftest for punching holes into the RAID stripe extents Add a selftest for punching a hole into a RAID stripe extent. The test create an 1M extent and punches a 64k bytes long hole at offset of 32k from the start of the extent. Afterwards it verifies the start and length of both resulting new extents "left" and "right" as well as the absence of the hole. Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/tests/raid-stripe-tree-tests.c | 140 ++++++++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index 8dd609c66e95..d28a48470c26 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -31,6 +31,145 @@ static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_de return NULL; } +/* Test punching a hole into a single RAID stripe-extent. */ +static int test_punch_hole(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 hole_start = logical1 + SZ_32K; + u64 hole_len = SZ_64K; + u64 logical2 = hole_start + hole_len; + u64 len = SZ_1M; + u64 len1 = SZ_32K; + u64 len2 = len - len1 - hole_len; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0, + &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical1, + logical1 + len); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_1M) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_1M, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, hole_start, hole_len); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + hole_start, hole_start + hole_len); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, 
expected %llu, got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len1 != SZ_32K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_32K, len1); + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical2, + logical2 + len2); + goto out; + } + + if (io_stripe.physical != logical2) { + test_err("invalid physical address, expected %llu, got %llu", + logical2, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len2 != len - len1 - hole_len) { + test_err("invalid length, expected %llu, got %llu", + len - len1 - hole_len, len2); + ret = -EINVAL; + goto out; + } + + /* Check for the absence of the hole. */ + ret = btrfs_get_raid_extent_offset(fs_info, hole_start, &hole_len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + ret = -EINVAL; + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + hole_start, hole_start + SZ_64K); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1, len1); + if (ret) + goto out; + + ret = btrfs_delete_raid_extent(trans, logical2, len2); +out: + btrfs_put_bioc(bioc); + return ret; +} + /* * Test a 1M RST write that spans two adjacent RST items on disk and then * delete a portion starting in the first item and spanning into the second @@ -612,6 +751,7 @@ static const test_func_t tests[] = { test_tail_delete, test_front_delete, test_front_delete_prev_item, + test_punch_hole, }; static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) -- 2.51.0 From cfda28fb706d53b332d5183d6091224289e96863 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:54 +0100 Subject: [PATCH 10/16] btrfs: selftests: add test for punching a hole into 3 RAID stripe-extents Test creating a range of three RAID stripe-extents and then punch a hole in the middle, deleting all of the middle extents and partially deleting the "book ends". Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/tests/raid-stripe-tree-tests.c | 183 ++++++++++++++++++++++++ 1 file changed, 183 insertions(+) diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index d28a48470c26..482e10b0d7ed 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -31,6 +31,188 @@ static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_de return NULL; } +/* + * Test creating a range of three extents and then punch a hole in the middle, + * deleting all of the middle extents and partially deleting the "book ends". + */ +static int test_punch_hole_3extents(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 len1 = SZ_1M; + u64 logical2 = logical1 + len1; + u64 len2 = SZ_1M; + u64 logical3 = logical2 + len2; + u64 len3 = SZ_1M; + u64 hole_start = logical1 + SZ_256K; + u64 hole_len = SZ_2M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + + /* Prepare for the test, 1st create 3 x 1M extents. 
*/ + bioc->map_type = map_type; + bioc->size = len1; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + bioc->size = len2; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical3; + bioc->size = len3; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical3 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + /* + * Delete a range starting at logical1 + 256K and 2M in length. Extent + * 1 is truncated to 256k length, extent 2 is completely dropped and + * extent 3 is moved 256K to the right. + */ + ret = btrfs_delete_raid_extent(trans, hole_start, hole_len); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + hole_start, hole_start + hole_len); + goto out; + } + + /* Get the first extent and check its size. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu, got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len1 != SZ_256K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_256K, len1); + ret = -EINVAL; + goto out; + } + + /* Get the second extent and check it's absent. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded should fail", + logical2, logical2 + len2); + ret = -EINVAL; + goto out; + } + + /* Get the third extent and check its size. 
*/ + logical3 += SZ_256K; + ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical3, logical3 + len3); + goto out; + } + + if (io_stripe.physical != logical3) { + test_err("invalid physical address, expected %llu, got %llu", + logical3 + SZ_256K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len3 != SZ_1M - SZ_256K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_1M - SZ_256K, len3); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1, len1); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical3, len3); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + +out: + btrfs_put_bioc(bioc); + return ret; +} + /* Test punching a hole into a single RAID stripe-extent. */ static int test_punch_hole(struct btrfs_trans_handle *trans) { @@ -752,6 +934,7 @@ static const test_func_t tests[] = { test_front_delete, test_front_delete_prev_item, test_punch_hole, + test_punch_hole_3extents, }; static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) -- 2.51.0 From 9d0c23db26cb58c9fc6ee8817e8f9ebeb25776e5 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:55 +0100 Subject: [PATCH 11/16] btrfs: selftests: add a selftest for deleting two out of three extents Add a selftest creating three extents and then deleting two out of the three extents. Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/tests/raid-stripe-tree-tests.c | 144 ++++++++++++++++++++++++ 1 file changed, 144 insertions(+) diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index 482e10b0d7ed..a7bc58a5c1e2 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -213,6 +213,149 @@ out: return ret; } +static int test_delete_two_extents(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 len1 = SZ_1M; + u64 logical2 = logical1 + len1; + u64 len2 = SZ_1M; + u64 logical3 = logical2 + len2; + u64 len3 = SZ_1M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + + /* Prepare for the test, 1st create 3 x 1M extents. 
*/ + bioc->map_type = map_type; + bioc->size = len1; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + bioc->size = len2; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical3; + bioc->size = len3; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical3 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + /* + * Delete a range starting at logical1 and 2M in length. Extents 1 + * and 2 are dropped and extent 3 is kept as is. + */ + ret = btrfs_delete_raid_extent(trans, logical1, len1 + len2); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1 + len2); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical1, len1); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical2, len2); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical3, len3); + goto out; + } + + if (io_stripe.physical != logical3) { + test_err("invalid physical address, expected %llu, got %llu", + logical3, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len3 != SZ_1M) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_1M, len3); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical3, len3); +out: + btrfs_put_bioc(bioc); + return ret; +} + /* Test punching a hole into a single RAID stripe-extent. 
*/ static int test_punch_hole(struct btrfs_trans_handle *trans) { @@ -935,6 +1078,7 @@ static const test_func_t tests[] = { test_front_delete_prev_item, test_punch_hole, test_punch_hole_3extents, + test_delete_two_extents, }; static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) -- 2.51.0 From a216542027b892e6651c1b4e076012140d04afaf Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 10 Jan 2025 15:22:24 +0000 Subject: [PATCH 12/16] btrfs: fix lockdep splat while merging a relocation root When COWing a relocation tree path, at relocation.c:replace_path(), we can trigger a lockdep splat while we are in the btrfs_search_slot() call against the relocation root. This happens in that callchain at ctree.c:read_block_for_search() when we happen to find a child extent buffer already loaded through the fs tree with a lockdep class set to the fs tree. So when we attempt to lock that extent buffer through a relocation tree we have to reset the lockdep class to the class for a relocation tree, since a relocation tree has extent buffers that used to belong to a fs tree and may currently be already loaded (we swap extent buffers between the two trees at the end of replace_path()). However we are missing calls to btrfs_maybe_reset_lockdep_class() to reset the lockdep class at ctree.c:read_block_for_search() before we read lock an extent buffer, just like we did for btrfs_search_slot() in commit b40130b23ca4 ("btrfs: fix lockdep splat with reloc root extent buffers"). So add the missing btrfs_maybe_reset_lockdep_class() calls before the attempts to read lock an extent buffer at ctree.c:read_block_for_search(). The lockdep splat was reported by syzbot and it looks like this: ====================================================== WARNING: possible circular locking dependency detected 6.13.0-rc5-syzkaller-00163-gab75170520d4 #0 Not tainted ------------------------------------------------------ syz.0.0/5335 is trying to acquire lock: ffff8880545dbc38 (btrfs-tree-01){++++}-{4:4}, at: btrfs_tree_read_lock_nested+0x2f/0x250 fs/btrfs/locking.c:146 but task is already holding lock: ffff8880545dba58 (btrfs-treloc-02/1){+.+.}-{4:4}, at: btrfs_tree_lock_nested+0x2f/0x250 fs/btrfs/locking.c:189 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (btrfs-treloc-02/1){+.+.}-{4:4}: reacquire_held_locks+0x3eb/0x690 kernel/locking/lockdep.c:5374 __lock_release kernel/locking/lockdep.c:5563 [inline] lock_release+0x396/0xa30 kernel/locking/lockdep.c:5870 up_write+0x79/0x590 kernel/locking/rwsem.c:1629 btrfs_force_cow_block+0x14b3/0x1fd0 fs/btrfs/ctree.c:660 btrfs_cow_block+0x371/0x830 fs/btrfs/ctree.c:755 btrfs_search_slot+0xc01/0x3180 fs/btrfs/ctree.c:2153 replace_path+0x1243/0x2740 fs/btrfs/relocation.c:1224 merge_reloc_root+0xc46/0x1ad0 fs/btrfs/relocation.c:1692 merge_reloc_roots+0x3b3/0x980 fs/btrfs/relocation.c:1942 relocate_block_group+0xb0a/0xd40 fs/btrfs/relocation.c:3754 btrfs_relocate_block_group+0x77d/0xd90 fs/btrfs/relocation.c:4087 btrfs_relocate_chunk+0x12c/0x3b0 fs/btrfs/volumes.c:3494 __btrfs_balance+0x1b0f/0x26b0 fs/btrfs/volumes.c:4278 btrfs_balance+0xbdc/0x10c0 fs/btrfs/volumes.c:4655 btrfs_ioctl_balance+0x493/0x7c0 fs/btrfs/ioctl.c:3670 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl+0xf5/0x170 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #1 (btrfs-tree-01/1){+.+.}-{4:4}: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 down_write_nested+0xa2/0x220 kernel/locking/rwsem.c:1693 btrfs_tree_lock_nested+0x2f/0x250 fs/btrfs/locking.c:189 btrfs_init_new_buffer fs/btrfs/extent-tree.c:5052 [inline] btrfs_alloc_tree_block+0x41c/0x1440 fs/btrfs/extent-tree.c:5132 btrfs_force_cow_block+0x526/0x1fd0 fs/btrfs/ctree.c:573 btrfs_cow_block+0x371/0x830 fs/btrfs/ctree.c:755 btrfs_search_slot+0xc01/0x3180 fs/btrfs/ctree.c:2153 btrfs_insert_empty_items+0x9c/0x1a0 fs/btrfs/ctree.c:4351 btrfs_insert_empty_item fs/btrfs/ctree.h:688 [inline] btrfs_insert_inode_ref+0x2bb/0xf80 fs/btrfs/inode-item.c:330 btrfs_rename_exchange fs/btrfs/inode.c:7990 [inline] btrfs_rename2+0xcb7/0x2b90 fs/btrfs/inode.c:8374 vfs_rename+0xbdb/0xf00 fs/namei.c:5067 do_renameat2+0xd94/0x13f0 fs/namei.c:5224 __do_sys_renameat2 fs/namei.c:5258 [inline] __se_sys_renameat2 fs/namei.c:5255 [inline] __x64_sys_renameat2+0xce/0xe0 fs/namei.c:5255 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #0 (btrfs-tree-01){++++}-{4:4}: check_prev_add kernel/locking/lockdep.c:3161 [inline] check_prevs_add kernel/locking/lockdep.c:3280 [inline] validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904 __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 down_read_nested+0xb5/0xa50 kernel/locking/rwsem.c:1649 btrfs_tree_read_lock_nested+0x2f/0x250 fs/btrfs/locking.c:146 btrfs_tree_read_lock fs/btrfs/locking.h:188 [inline] read_block_for_search+0x718/0xbb0 fs/btrfs/ctree.c:1610 btrfs_search_slot+0x1274/0x3180 fs/btrfs/ctree.c:2237 replace_path+0x1243/0x2740 fs/btrfs/relocation.c:1224 merge_reloc_root+0xc46/0x1ad0 fs/btrfs/relocation.c:1692 merge_reloc_roots+0x3b3/0x980 fs/btrfs/relocation.c:1942 relocate_block_group+0xb0a/0xd40 fs/btrfs/relocation.c:3754 btrfs_relocate_block_group+0x77d/0xd90 fs/btrfs/relocation.c:4087 btrfs_relocate_chunk+0x12c/0x3b0 fs/btrfs/volumes.c:3494 __btrfs_balance+0x1b0f/0x26b0 fs/btrfs/volumes.c:4278 btrfs_balance+0xbdc/0x10c0 fs/btrfs/volumes.c:4655 btrfs_ioctl_balance+0x493/0x7c0 fs/btrfs/ioctl.c:3670 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] 
__se_sys_ioctl+0xf5/0x170 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f other info that might help us debug this: Chain exists of: btrfs-tree-01 --> btrfs-tree-01/1 --> btrfs-treloc-02/1 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(btrfs-treloc-02/1); lock(btrfs-tree-01/1); lock(btrfs-treloc-02/1); rlock(btrfs-tree-01); *** DEADLOCK *** 8 locks held by syz.0.0/5335: #0: ffff88801e3ae420 (sb_writers#13){.+.+}-{0:0}, at: mnt_want_write_file+0x5e/0x200 fs/namespace.c:559 #1: ffff888052c760d0 (&fs_info->reclaim_bgs_lock){+.+.}-{4:4}, at: __btrfs_balance+0x4c2/0x26b0 fs/btrfs/volumes.c:4183 #2: ffff888052c74850 (&fs_info->cleaner_mutex){+.+.}-{4:4}, at: btrfs_relocate_block_group+0x775/0xd90 fs/btrfs/relocation.c:4086 #3: ffff88801e3ae610 (sb_internal#2){.+.+}-{0:0}, at: merge_reloc_root+0xf11/0x1ad0 fs/btrfs/relocation.c:1659 #4: ffff888052c76470 (btrfs_trans_num_writers){++++}-{0:0}, at: join_transaction+0x405/0xda0 fs/btrfs/transaction.c:288 #5: ffff888052c76498 (btrfs_trans_num_extwriters){++++}-{0:0}, at: join_transaction+0x405/0xda0 fs/btrfs/transaction.c:288 #6: ffff8880545db878 (btrfs-tree-01/1){+.+.}-{4:4}, at: btrfs_tree_lock_nested+0x2f/0x250 fs/btrfs/locking.c:189 #7: ffff8880545dba58 (btrfs-treloc-02/1){+.+.}-{4:4}, at: btrfs_tree_lock_nested+0x2f/0x250 fs/btrfs/locking.c:189 stack backtrace: CPU: 0 UID: 0 PID: 5335 Comm: syz.0.0 Not tainted 6.13.0-rc5-syzkaller-00163-gab75170520d4 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_circular_bug+0x13a/0x1b0 kernel/locking/lockdep.c:2074 check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2206 check_prev_add kernel/locking/lockdep.c:3161 [inline] check_prevs_add kernel/locking/lockdep.c:3280 [inline] validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904 __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 down_read_nested+0xb5/0xa50 kernel/locking/rwsem.c:1649 btrfs_tree_read_lock_nested+0x2f/0x250 fs/btrfs/locking.c:146 btrfs_tree_read_lock fs/btrfs/locking.h:188 [inline] read_block_for_search+0x718/0xbb0 fs/btrfs/ctree.c:1610 btrfs_search_slot+0x1274/0x3180 fs/btrfs/ctree.c:2237 replace_path+0x1243/0x2740 fs/btrfs/relocation.c:1224 merge_reloc_root+0xc46/0x1ad0 fs/btrfs/relocation.c:1692 merge_reloc_roots+0x3b3/0x980 fs/btrfs/relocation.c:1942 relocate_block_group+0xb0a/0xd40 fs/btrfs/relocation.c:3754 btrfs_relocate_block_group+0x77d/0xd90 fs/btrfs/relocation.c:4087 btrfs_relocate_chunk+0x12c/0x3b0 fs/btrfs/volumes.c:3494 __btrfs_balance+0x1b0f/0x26b0 fs/btrfs/volumes.c:4278 btrfs_balance+0xbdc/0x10c0 fs/btrfs/volumes.c:4655 btrfs_ioctl_balance+0x493/0x7c0 fs/btrfs/ioctl.c:3670 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl+0xf5/0x170 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f1ac6985d29 Code: ff ff c3 (...) 
RSP: 002b:00007f1ac63fe038 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007f1ac6b76160 RCX: 00007f1ac6985d29 RDX: 0000000020000180 RSI: 00000000c4009420 RDI: 0000000000000007 RBP: 00007f1ac6a01b08 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000001 R14: 00007f1ac6b76160 R15: 00007fffda145a88 Reported-by: syzbot+63913e558c084f7f8fdc@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/677b3014.050a0220.3b53b0.0064.GAE@google.com/ Fixes: 99785998ed1c ("btrfs: reduce lock contention when eb cache miss for btree search") Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 92071ca0655f..3dc5a35dd19b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1496,6 +1496,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (!p->skip_locking) { btrfs_unlock_up_safe(p, parent_level + 1); + btrfs_maybe_reset_lockdep_class(root, tmp); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); @@ -1539,6 +1540,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (!p->skip_locking) { ASSERT(ret == -EAGAIN); + btrfs_maybe_reset_lockdep_class(root, tmp); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); -- 2.51.0 From 0d85f5c2dd91df6b5da454406756f463ba923b69 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Jan 2025 15:01:08 +0000 Subject: [PATCH 13/16] btrfs: fix assertion failure when splitting ordered extent after transaction abort If while we are doing a direct IO write a transaction abort happens, we mark all existing ordered extents with the BTRFS_ORDERED_IOERR flag (done at btrfs_destroy_ordered_extents()), and then after that if we enter btrfs_split_ordered_extent() and the ordered extent has bytes left (meaning we have a bio that doesn't cover the whole ordered extent, see details at btrfs_extract_ordered_extent()), we will fail on the following assertion at btrfs_split_ordered_extent(): ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS)); because the BTRFS_ORDERED_IOERR flag is set and the definition of BTRFS_ORDERED_TYPE_FLAGS is just the union of all flags that identify the type of write (regular, nocow, prealloc, compressed, direct IO, encoded). Fix this by returning an error from btrfs_extract_ordered_extent() if we find the BTRFS_ORDERED_IOERR flag in the ordered extent. The error will be the error that resulted in the transaction abort or -EIO if no transaction abort happened. This was recently reported by syzbot with the following trace: FAULT_INJECTION: forcing a failure. 
name failslab, interval 1, probability 0, space 0, times 1 CPU: 0 UID: 0 PID: 5321 Comm: syz.0.0 Not tainted 6.13.0-rc5-syzkaller #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 fail_dump lib/fault-inject.c:53 [inline] should_fail_ex+0x3b0/0x4e0 lib/fault-inject.c:154 should_failslab+0xac/0x100 mm/failslab.c:46 slab_pre_alloc_hook mm/slub.c:4072 [inline] slab_alloc_node mm/slub.c:4148 [inline] __do_kmalloc_node mm/slub.c:4297 [inline] __kmalloc_noprof+0xdd/0x4c0 mm/slub.c:4310 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1037 [inline] btrfs_chunk_alloc_add_chunk_item+0x244/0x1100 fs/btrfs/volumes.c:5742 reserve_chunk_space+0x1ca/0x2c0 fs/btrfs/block-group.c:4292 check_system_chunk fs/btrfs/block-group.c:4319 [inline] do_chunk_alloc fs/btrfs/block-group.c:3891 [inline] btrfs_chunk_alloc+0x77b/0xf80 fs/btrfs/block-group.c:4187 find_free_extent_update_loop fs/btrfs/extent-tree.c:4166 [inline] find_free_extent+0x42d1/0x5810 fs/btrfs/extent-tree.c:4579 btrfs_reserve_extent+0x422/0x810 fs/btrfs/extent-tree.c:4672 btrfs_new_extent_direct fs/btrfs/direct-io.c:186 [inline] btrfs_get_blocks_direct_write+0x706/0xfa0 fs/btrfs/direct-io.c:321 btrfs_dio_iomap_begin+0xbb7/0x1180 fs/btrfs/direct-io.c:525 iomap_iter+0x697/0xf60 fs/iomap/iter.c:90 __iomap_dio_rw+0xeb9/0x25b0 fs/iomap/direct-io.c:702 btrfs_dio_write fs/btrfs/direct-io.c:775 [inline] btrfs_direct_write+0x610/0xa30 fs/btrfs/direct-io.c:880 btrfs_do_write_iter+0x2a0/0x760 fs/btrfs/file.c:1397 do_iter_readv_writev+0x600/0x880 vfs_writev+0x376/0xba0 fs/read_write.c:1050 do_pwritev fs/read_write.c:1146 [inline] __do_sys_pwritev2 fs/read_write.c:1204 [inline] __se_sys_pwritev2+0x196/0x2b0 fs/read_write.c:1195 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f1281f85d29 RSP: 002b:00007f12819fe038 EFLAGS: 00000246 ORIG_RAX: 0000000000000148 RAX: ffffffffffffffda RBX: 00007f1282176080 RCX: 00007f1281f85d29 RDX: 0000000000000001 RSI: 0000000020000240 RDI: 0000000000000005 RBP: 00007f12819fe090 R08: 0000000000000000 R09: 0000000000000003 R10: 0000000000007000 R11: 0000000000000246 R12: 0000000000000002 R13: 0000000000000000 R14: 00007f1282176080 R15: 00007ffcb9e23328 BTRFS error (device loop0 state A): Transaction aborted (error -12) BTRFS: error (device loop0 state A) in btrfs_chunk_alloc_add_chunk_item:5745: errno=-12 Out of memory BTRFS info (device loop0 state EA): forced readonly assertion failed: !(flags & ~BTRFS_ORDERED_TYPE_FLAGS), in fs/btrfs/ordered-data.c:1234 ------------[ cut here ]------------ kernel BUG at fs/btrfs/ordered-data.c:1234! 
Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI CPU: 0 UID: 0 PID: 5321 Comm: syz.0.0 Not tainted 6.13.0-rc5-syzkaller #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:btrfs_split_ordered_extent+0xd8d/0xe20 fs/btrfs/ordered-data.c:1234 RSP: 0018:ffffc9000d1df2b8 EFLAGS: 00010246 RAX: 0000000000000057 RBX: 000000000006a000 RCX: 9ce21886c4195300 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: 0000000000000091 R08: ffffffff817f0a3c R09: 1ffff92001a3bdf4 R10: dffffc0000000000 R11: fffff52001a3bdf5 R12: 1ffff1100a45f401 R13: ffff8880522fa018 R14: dffffc0000000000 R15: 000000000006a000 FS: 00007f12819fe6c0(0000) GS:ffff88801fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000557750bd7da8 CR3: 00000000400ea000 CR4: 0000000000352ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_extract_ordered_extent fs/btrfs/direct-io.c:702 [inline] btrfs_dio_submit_io+0x4be/0x6d0 fs/btrfs/direct-io.c:737 iomap_dio_submit_bio fs/iomap/direct-io.c:85 [inline] iomap_dio_bio_iter+0x1022/0x1740 fs/iomap/direct-io.c:447 __iomap_dio_rw+0x13b7/0x25b0 fs/iomap/direct-io.c:703 btrfs_dio_write fs/btrfs/direct-io.c:775 [inline] btrfs_direct_write+0x610/0xa30 fs/btrfs/direct-io.c:880 btrfs_do_write_iter+0x2a0/0x760 fs/btrfs/file.c:1397 do_iter_readv_writev+0x600/0x880 vfs_writev+0x376/0xba0 fs/read_write.c:1050 do_pwritev fs/read_write.c:1146 [inline] __do_sys_pwritev2 fs/read_write.c:1204 [inline] __se_sys_pwritev2+0x196/0x2b0 fs/read_write.c:1195 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f1281f85d29 RSP: 002b:00007f12819fe038 EFLAGS: 00000246 ORIG_RAX: 0000000000000148 RAX: ffffffffffffffda RBX: 00007f1282176080 RCX: 00007f1281f85d29 RDX: 0000000000000001 RSI: 0000000020000240 RDI: 0000000000000005 RBP: 00007f12819fe090 R08: 0000000000000000 R09: 0000000000000003 R10: 0000000000007000 R11: 0000000000000246 R12: 0000000000000002 R13: 0000000000000000 R14: 00007f1282176080 R15: 00007ffcb9e23328 Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:btrfs_split_ordered_extent+0xd8d/0xe20 fs/btrfs/ordered-data.c:1234 RSP: 0018:ffffc9000d1df2b8 EFLAGS: 00010246 RAX: 0000000000000057 RBX: 000000000006a000 RCX: 9ce21886c4195300 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: 0000000000000091 R08: ffffffff817f0a3c R09: 1ffff92001a3bdf4 R10: dffffc0000000000 R11: fffff52001a3bdf5 R12: 1ffff1100a45f401 R13: ffff8880522fa018 R14: dffffc0000000000 R15: 000000000006a000 FS: 00007f12819fe6c0(0000) GS:ffff88801fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000557750bd7da8 CR3: 00000000400ea000 CR4: 0000000000352ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 In this case the transaction abort was due to (an injected) memory allocation failure when attempting to allocate a new chunk. 
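To see why the assertion fires, here is a small standalone sketch of
the flag logic; the flag names, the bit positions and the type mask are
illustrative stand-ins for the kernel's enum, not the real definitions:

  #include <stdio.h>

  /* Invented flag bits: the first four describe the type of write, the
   * last one is a state flag set by the transaction abort path. */
  enum {
          ORDERED_REGULAR,
          ORDERED_NOCOW,
          ORDERED_PREALLOC,
          ORDERED_DIRECT,
          ORDERED_IOERR,
  };

  #define ORDERED_TYPE_FLAGS ((1U << ORDERED_REGULAR) | (1U << ORDERED_NOCOW) | \
                              (1U << ORDERED_PREALLOC) | (1U << ORDERED_DIRECT))

  int main(void)
  {
          /* A direct IO write whose transaction has been aborted. */
          unsigned int flags = (1U << ORDERED_DIRECT) | (1U << ORDERED_IOERR);

          /* The fix: bail out before the type-only assertion is reached. */
          if (flags & (1U << ORDERED_IOERR)) {
                  printf("IOERR set: return the abort error instead\n");
                  return 0;
          }

          /* Without the early return this fires, because IOERR is a state
           * flag and therefore not part of the type mask. */
          if (flags & ~ORDERED_TYPE_FLAGS)
                  printf("ASSERT(!(flags & ~ORDERED_TYPE_FLAGS)) would fail\n");

          return 0;
  }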
Reported-by: syzbot+f60d8337a5c8e8d92a77@syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/6777f2dd.050a0220.178762.0045.GAE@google.com/
Fixes: 52b1fdca23ac ("btrfs: handle completed ordered extents in btrfs_split_ordered_extent")
Reviewed-by: Qu Wenruo
Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/ordered-data.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 30eceaf829a7..4aca7475fd82 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -1229,6 +1229,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
 	 */
 	if (WARN_ON_ONCE(len >= ordered->num_bytes))
 		return ERR_PTR(-EINVAL);
+	/*
+	 * If our ordered extent had an error there's no point in continuing.
+	 * The error may have come from a transaction abort done either by this
+	 * task or some other concurrent task, and the transaction abort path
+	 * iterates over all existing ordered extents and sets the flag
+	 * BTRFS_ORDERED_IOERR on them.
+	 */
+	if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) {
+		const int fs_error = BTRFS_FS_ERROR(fs_info);
+
+		return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO);
+	}
 	/* We cannot split partially completed ordered extents. */
 	if (ordered->bytes_left) {
 		ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
--
2.51.0

From c9c863793395cf0a66c2778a29d72c48c02fbb66 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 20 Jan 2025 09:40:43 +1030
Subject: [PATCH 14/16] btrfs: do not output error message if a qgroup has
 been already cleaned up

[BUG]
There is a bug report that btrfs outputs the following error messages:

  BTRFS info (device nvme0n1p2): qgroup scan completed (inconsistency flag cleared)
  BTRFS warning (device nvme0n1p2): failed to cleanup qgroup 0/1179: -2

[CAUSE]
The error itself is pretty harmless and the end user can safely ignore
it.

When a subvolume is fully dropped, btrfs calls
btrfs_qgroup_cleanup_dropped_subvolume() to delete its qgroup.

However if a qgroup rescan happened before the subvolume was fully
dropped, the qgroup for that subvolume will not be re-created, as a
rescan only creates a new qgroup when it finds a BTRFS_ROOT_REF_KEY.
But before we drop a subvolume, the subvolume is unlinked, so there is
no BTRFS_ROOT_REF_KEY for it.

In that case btrfs_remove_qgroup() fails with -ENOENT and triggers the
above error message.

[FIX]
Just ignore an -ENOENT error from btrfs_remove_qgroup() inside
btrfs_qgroup_cleanup_dropped_subvolume().
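In essence the fix only widens the set of return values that are
treated as success. A tiny standalone sketch of that logic (the
function below is illustrative, not the kernel's):

  #include <errno.h>
  #include <stdio.h>

  /*
   * Model of the return-value handling: -EBUSY (squota still needs the
   * numbers) and now also -ENOENT (qgroup already removed by a rescan)
   * are expected outcomes, not failures.
   */
  static int cleanup_dropped_subvolume(int remove_ret)
  {
          if (remove_ret == -EBUSY || remove_ret == -ENOENT)
                  return 0;
          return remove_ret;
  }

  int main(void)
  {
          printf("%d\n", cleanup_dropped_subvolume(-ENOENT)); /*  0: ignored */
          printf("%d\n", cleanup_dropped_subvolume(-EIO));    /* -5: real error */
          return 0;
  }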
 	 */
-	if (ret == -EBUSY)
+	if (ret == -EBUSY || ret == -ENOENT)
 		ret = 0;
 	return ret;
 }
--
2.51.0

From e2f0943cf37305dbdeaf9846e3c941451bcdef63 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Mon, 20 Jan 2025 17:26:10 +0000
Subject: [PATCH 15/16] btrfs: fix use-after-free when attempting to join an
 aborted transaction

When we are trying to join the current transaction and it is aborted,
we read its 'aborted' field after unlocking fs_info->trans_lock and
without holding any extra reference count on it. This means that a
concurrent task that is aborting the transaction may free the
transaction before we read its 'aborted' field, leading to a
use-after-free.

Fix this by reading the 'aborted' field while holding
fs_info->trans_lock, since any freeing task must first acquire that
lock and set fs_info->running_transaction to NULL before freeing the
transaction.
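The pattern behind the fix is generic: copy the field you need while
still holding the lock that keeps the object alive, and only use the
copy after unlocking. A minimal userspace model follows; the mutex and
the names are illustrative, in the kernel the object's lifetime is tied
to fs_info->trans_lock and fs_info->running_transaction:

  #include <pthread.h>
  #include <stdio.h>

  static pthread_mutex_t trans_lock = PTHREAD_MUTEX_INITIALIZER;

  struct transaction {
          int aborted;
  };

  /* Buggy: dereferences trans after dropping the lock that pins it. */
  static int join_aborted_buggy(struct transaction *trans)
  {
          pthread_mutex_lock(&trans_lock);
          pthread_mutex_unlock(&trans_lock);
          /* In the modeled kernel scenario trans may be freed by now. */
          return trans->aborted;
  }

  /* Fixed: copy the field while the lock still keeps trans alive. */
  static int join_aborted_fixed(struct transaction *trans)
  {
          int abort_error;

          pthread_mutex_lock(&trans_lock);
          abort_error = trans->aborted;
          pthread_mutex_unlock(&trans_lock);
          return abort_error;
  }

  int main(void)
  {
          struct transaction t = { .aborted = -5 /* -EIO */ };

          printf("%d %d\n", join_aborted_buggy(&t), join_aborted_fixed(&t));
          return 0;
  }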
This was reported by syzbot and Dmitry with the following stack traces
from KASAN:

==================================================================
BUG: KASAN: slab-use-after-free in join_transaction+0xd9b/0xda0 fs/btrfs/transaction.c:278
Read of size 4 at addr ffff888011839024 by task kworker/u4:9/1128

CPU: 0 UID: 0 PID: 1128 Comm: kworker/u4:9 Not tainted 6.13.0-rc7-syzkaller-00019-gc45323b7560e #0
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014
Workqueue: events_unbound btrfs_async_reclaim_data_space
Call Trace:
 __dump_stack lib/dump_stack.c:94 [inline]
 dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120
 print_address_description mm/kasan/report.c:378 [inline]
 print_report+0x169/0x550 mm/kasan/report.c:489
 kasan_report+0x143/0x180 mm/kasan/report.c:602
 join_transaction+0xd9b/0xda0 fs/btrfs/transaction.c:278
 start_transaction+0xaf8/0x1670 fs/btrfs/transaction.c:697
 flush_space+0x448/0xcf0 fs/btrfs/space-info.c:803
 btrfs_async_reclaim_data_space+0x159/0x510 fs/btrfs/space-info.c:1321
 process_one_work kernel/workqueue.c:3236 [inline]
 process_scheduled_works+0xa66/0x1840 kernel/workqueue.c:3317
 worker_thread+0x870/0xd30 kernel/workqueue.c:3398
 kthread+0x2f0/0x390 kernel/kthread.c:389
 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244

Allocated by task 5315:
 kasan_save_stack mm/kasan/common.c:47 [inline]
 kasan_save_track+0x3f/0x80 mm/kasan/common.c:68
 poison_kmalloc_redzone mm/kasan/common.c:377 [inline]
 __kasan_kmalloc+0x98/0xb0 mm/kasan/common.c:394
 kasan_kmalloc include/linux/kasan.h:260 [inline]
 __kmalloc_cache_noprof+0x243/0x390 mm/slub.c:4329
 kmalloc_noprof include/linux/slab.h:901 [inline]
 join_transaction+0x144/0xda0 fs/btrfs/transaction.c:308
 start_transaction+0xaf8/0x1670 fs/btrfs/transaction.c:697
 btrfs_create_common+0x1b2/0x2e0 fs/btrfs/inode.c:6572
 lookup_open fs/namei.c:3649 [inline]
 open_last_lookups fs/namei.c:3748 [inline]
 path_openat+0x1c03/0x3590 fs/namei.c:3984
 do_filp_open+0x27f/0x4e0 fs/namei.c:4014
 do_sys_openat2+0x13e/0x1d0 fs/open.c:1402
 do_sys_open fs/open.c:1417 [inline]
 __do_sys_creat fs/open.c:1495 [inline]
 __se_sys_creat fs/open.c:1489 [inline]
 __x64_sys_creat+0x123/0x170 fs/open.c:1489
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

Freed by task 5336:
 kasan_save_stack mm/kasan/common.c:47 [inline]
 kasan_save_track+0x3f/0x80 mm/kasan/common.c:68
 kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:582
 poison_slab_object mm/kasan/common.c:247 [inline]
 __kasan_slab_free+0x59/0x70 mm/kasan/common.c:264
 kasan_slab_free include/linux/kasan.h:233 [inline]
 slab_free_hook mm/slub.c:2353 [inline]
 slab_free mm/slub.c:4613 [inline]
 kfree+0x196/0x430 mm/slub.c:4761
 cleanup_transaction fs/btrfs/transaction.c:2063 [inline]
 btrfs_commit_transaction+0x2c97/0x3720 fs/btrfs/transaction.c:2598
 insert_balance_item+0x1284/0x20b0 fs/btrfs/volumes.c:3757
 btrfs_balance+0x992/0x10c0 fs/btrfs/volumes.c:4633
 btrfs_ioctl_balance+0x493/0x7c0 fs/btrfs/ioctl.c:3670
 vfs_ioctl fs/ioctl.c:51 [inline]
 __do_sys_ioctl fs/ioctl.c:906 [inline]
 __se_sys_ioctl+0xf5/0x170 fs/ioctl.c:892
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

The buggy address belongs to the object at ffff888011839000
 which belongs to the cache kmalloc-2k of size 2048
The buggy address is located 36 bytes inside of
 freed 2048-byte region [ffff888011839000, ffff888011839800)

The buggy address belongs to the physical page:
page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11838
head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0
flags: 0xfff00000000040(head|node=0|zone=1|lastcpupid=0x7ff)
page_type: f5(slab)
raw: 00fff00000000040 ffff88801ac42000 ffffea0000493400 dead000000000002
raw: 0000000000000000 0000000000080008 00000001f5000000 0000000000000000
head: 00fff00000000040 ffff88801ac42000 ffffea0000493400 dead000000000002
head: 0000000000000000 0000000000080008 00000001f5000000 0000000000000000
head: 00fff00000000003 ffffea0000460e01 ffffffffffffffff 0000000000000000
head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000
page dumped because: kasan: bad access detected
page_owner tracks the page as allocated
page last allocated via order 3, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 57, tgid 57 (kworker/0:2), ts 67248182943, free_ts 67229742023
 set_page_owner include/linux/page_owner.h:32 [inline]
 post_alloc_hook+0x1f3/0x230 mm/page_alloc.c:1558
 prep_new_page mm/page_alloc.c:1566 [inline]
 get_page_from_freelist+0x365c/0x37a0 mm/page_alloc.c:3476
 __alloc_pages_noprof+0x292/0x710 mm/page_alloc.c:4753
 alloc_pages_mpol_noprof+0x3e1/0x780 mm/mempolicy.c:2269
 alloc_slab_page+0x6a/0x110 mm/slub.c:2423
 allocate_slab+0x5a/0x2b0 mm/slub.c:2589
 new_slab mm/slub.c:2642 [inline]
 ___slab_alloc+0xc27/0x14a0 mm/slub.c:3830
 __slab_alloc+0x58/0xa0 mm/slub.c:3920
 __slab_alloc_node mm/slub.c:3995 [inline]
 slab_alloc_node mm/slub.c:4156 [inline]
 __do_kmalloc_node mm/slub.c:4297 [inline]
 __kmalloc_node_track_caller_noprof+0x2e9/0x4c0 mm/slub.c:4317
 kmalloc_reserve+0x111/0x2a0 net/core/skbuff.c:609
 __alloc_skb+0x1f3/0x440 net/core/skbuff.c:678
 alloc_skb include/linux/skbuff.h:1323 [inline]
 alloc_skb_with_frags+0xc3/0x820 net/core/skbuff.c:6612
 sock_alloc_send_pskb+0x91a/0xa60 net/core/sock.c:2884
 sock_alloc_send_skb include/net/sock.h:1803 [inline]
 mld_newpack+0x1c3/0xaf0 net/ipv6/mcast.c:1747
 add_grhead net/ipv6/mcast.c:1850 [inline]
 add_grec+0x1492/0x19a0 net/ipv6/mcast.c:1988
 mld_send_cr net/ipv6/mcast.c:2114 [inline]
 mld_ifc_work+0x691/0xd90 net/ipv6/mcast.c:2651
page last free pid 5300 tgid 5300 stack trace:
 reset_page_owner include/linux/page_owner.h:25 [inline]
 free_pages_prepare mm/page_alloc.c:1127 [inline]
 free_unref_page+0xd3f/0x1010 mm/page_alloc.c:2659
 __slab_free+0x2c2/0x380 mm/slub.c:4524
 qlink_free mm/kasan/quarantine.c:163 [inline]
 qlist_free_all+0x9a/0x140 mm/kasan/quarantine.c:179
 kasan_quarantine_reduce+0x14f/0x170 mm/kasan/quarantine.c:286
 __kasan_slab_alloc+0x23/0x80 mm/kasan/common.c:329
 kasan_slab_alloc include/linux/kasan.h:250 [inline]
 slab_post_alloc_hook mm/slub.c:4119 [inline]
 slab_alloc_node mm/slub.c:4168 [inline]
 __do_kmalloc_node mm/slub.c:4297 [inline]
 __kmalloc_noprof+0x236/0x4c0 mm/slub.c:4310
 kmalloc_noprof include/linux/slab.h:905 [inline]
 kzalloc_noprof include/linux/slab.h:1037 [inline]
 fib_create_info+0xc14/0x25b0 net/ipv4/fib_semantics.c:1435
 fib_table_insert+0x1f6/0x1f20 net/ipv4/fib_trie.c:1231
 fib_magic+0x3d8/0x620 net/ipv4/fib_frontend.c:1112
 fib_add_ifaddr+0x40c/0x5e0 net/ipv4/fib_frontend.c:1156
 fib_netdev_event+0x375/0x490 net/ipv4/fib_frontend.c:1494
 notifier_call_chain+0x1a5/0x3f0 kernel/notifier.c:85
 __dev_notify_flags+0x207/0x400
 dev_change_flags+0xf0/0x1a0 net/core/dev.c:9045
 do_setlink+0xc90/0x4210 net/core/rtnetlink.c:3109
 rtnl_changelink net/core/rtnetlink.c:3723 [inline]
 __rtnl_newlink net/core/rtnetlink.c:3875 [inline]
 rtnl_newlink+0x1bb6/0x2210 net/core/rtnetlink.c:4012

Memory state around the buggy address:
 ffff888011838f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 ffff888011838f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>ffff888011839000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                               ^
 ffff888011839080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff888011839100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================

Reported-by: syzbot+45212e9d87a98c3f5b42@syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/678e7da5.050a0220.303755.007c.GAE@google.com/
Reported-by: Dmitry Vyukov
Link: https://lore.kernel.org/linux-btrfs/CACT4Y+ZFBdo7pT8L2AzM=vegZwjp-wNkVJZQf0Ta3vZqtExaSw@mail.gmail.com/
Fixes: 871383be592b ("btrfs: add missing unlocks to transaction abort paths")
Reviewed-by: Johannes Thumshirn
Reviewed-by: Qu Wenruo
Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/transaction.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 15312013f2a3..aca83a98b75a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -274,8 +274,10 @@ loop:
 	cur_trans = fs_info->running_transaction;
 	if (cur_trans) {
 		if (TRANS_ABORTED(cur_trans)) {
+			const int abort_error = cur_trans->aborted;
+
 			spin_unlock(&fs_info->trans_lock);
-			return cur_trans->aborted;
+			return abort_error;
 		}
 		if (btrfs_blocked_trans_types[cur_trans->state] & type) {
 			spin_unlock(&fs_info->trans_lock);
--
2.51.0

From fdef89ce6fada462aef9cb90a140c93c8c209f0f Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 21 Jan 2025 12:24:39 +0000
Subject: [PATCH 16/16] btrfs: avoid starting new transaction when cleaning
 qgroup during subvolume drop

At btrfs_qgroup_cleanup_dropped_subvolume() all we want is to commit
the current transaction in order to have all the qgroup rfer/excl
numbers up to date. However we are using btrfs_start_transaction(),
which joins the current transaction if there is one that is not yet
committing, but also starts a new one if there is none or if the
current one is already committing (its state is >=
TRANS_STATE_COMMIT_START). The latter case results in unnecessary IO,
wasting time and a pointless rotation of the backup roots in the super
block.

So instead of using btrfs_start_transaction() followed by
btrfs_commit_transaction(), use btrfs_commit_current_transaction(),
which achieves our purpose and avoids starting and committing new
transactions.
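As a rough userspace model of the difference (names and semantics here
are simplified and illustrative; in particular,
btrfs_commit_current_transaction() also joins and commits a
still-running transaction, which this model collapses into one flag):

  #include <stdbool.h>
  #include <stdio.h>

  struct fs_state {
          bool running_transaction; /* is a transaction currently open? */
          int  commits;             /* how many commits were performed */
  };

  /* Model of btrfs_start_transaction() + btrfs_commit_transaction(). */
  static int start_then_commit(struct fs_state *fs)
  {
          if (!fs->running_transaction)
                  fs->running_transaction = true; /* forced a brand-new transaction */
          fs->running_transaction = false;
          fs->commits++;                          /* always pays for a commit */
          return 0;
  }

  /* Model of btrfs_commit_current_transaction(). */
  static int commit_current(struct fs_state *fs)
  {
          if (!fs->running_transaction)
                  return 0;                       /* nothing running: no IO at all */
          fs->running_transaction = false;
          fs->commits++;
          return 0;
  }

  int main(void)
  {
          struct fs_state a = { .running_transaction = false };
          struct fs_state b = { .running_transaction = false };

          start_then_commit(&a); /* a.commits == 1: pointless commit */
          commit_current(&b);    /* b.commits == 0: commit avoided   */
          printf("%d %d\n", a.commits, b.commits);
          return 0;
  }

Running the model prints "1 0": the old pattern always pays for a
commit, while the new helper does nothing when no transaction is
running.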
Reviewed-by: Qu Wenruo
Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/qgroup.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index aaf16019d829..f9d3766c809b 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1880,11 +1880,7 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su
 	 * Commit current transaction to make sure all the rfer/excl numbers
 	 * get updated.
 	 */
-	trans = btrfs_start_transaction(fs_info->quota_root, 0);
-	if (IS_ERR(trans))
-		return PTR_ERR(trans);
-
-	ret = btrfs_commit_transaction(trans);
+	ret = btrfs_commit_current_transaction(fs_info->quota_root);
 	if (ret < 0)
 		return ret;
 
--
2.51.0