From cfda28fb706d53b332d5183d6091224289e96863 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:54 +0100 Subject: [PATCH 01/16] btrfs: selftests: add test for punching a hole into 3 RAID stripe-extents Test creating a range of three RAID stripe-extents and then punch a hole in the middle, deleting all of the middle extents and partially deleting the "book ends". Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/tests/raid-stripe-tree-tests.c | 183 ++++++++++++++++++++++++ 1 file changed, 183 insertions(+) diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index d28a48470c26..482e10b0d7ed 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -31,6 +31,188 @@ static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_de return NULL; } +/* + * Test creating a range of three extents and then punch a hole in the middle, + * deleting all of the middle extents and partially deleting the "book ends". + */ +static int test_punch_hole_3extents(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 len1 = SZ_1M; + u64 logical2 = logical1 + len1; + u64 len2 = SZ_1M; + u64 logical3 = logical2 + len2; + u64 len3 = SZ_1M; + u64 hole_start = logical1 + SZ_256K; + u64 hole_len = SZ_2M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + + /* Prepare for the test, 1st create 3 x 1M extents. */ + bioc->map_type = map_type; + bioc->size = len1; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + bioc->size = len2; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical3; + bioc->size = len3; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical3 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + /* + * Delete a range starting at logical1 + 256K and 2M in length. Extent + * 1 is truncated to 256k length, extent 2 is completely dropped and + * extent 3 is moved 256K to the right. + */ + ret = btrfs_delete_raid_extent(trans, hole_start, hole_len); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + hole_start, hole_start + hole_len); + goto out; + } + + /* Get the first extent and check its size. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu, got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len1 != SZ_256K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_256K, len1); + ret = -EINVAL; + goto out; + } + + /* Get the second extent and check it's absent. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded should fail", + logical2, logical2 + len2); + ret = -EINVAL; + goto out; + } + + /* Get the third extent and check its size. */ + logical3 += SZ_256K; + ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical3, logical3 + len3); + goto out; + } + + if (io_stripe.physical != logical3) { + test_err("invalid physical address, expected %llu, got %llu", + logical3 + SZ_256K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len3 != SZ_1M - SZ_256K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_1M - SZ_256K, len3); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1, len1); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical3, len3); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + +out: + btrfs_put_bioc(bioc); + return ret; +} + /* Test punching a hole into a single RAID stripe-extent. */ static int test_punch_hole(struct btrfs_trans_handle *trans) { @@ -752,6 +934,7 @@ static const test_func_t tests[] = { test_front_delete, test_front_delete_prev_item, test_punch_hole, + test_punch_hole_3extents, }; static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) -- 2.51.0 From 9d0c23db26cb58c9fc6ee8817e8f9ebeb25776e5 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jan 2025 20:31:55 +0100 Subject: [PATCH 02/16] btrfs: selftests: add a selftest for deleting two out of three extents Add a selftest creating three extents and then deleting two out of the three extents. Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/tests/raid-stripe-tree-tests.c | 144 ++++++++++++++++++++++++ 1 file changed, 144 insertions(+) diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index 482e10b0d7ed..a7bc58a5c1e2 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -213,6 +213,149 @@ out: return ret; } +static int test_delete_two_extents(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 len1 = SZ_1M; + u64 logical2 = logical1 + len1; + u64 len2 = SZ_1M; + u64 logical3 = logical2 + len2; + u64 len3 = SZ_1M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + + /* Prepare for the test, 1st create 3 x 1M extents. */ + bioc->map_type = map_type; + bioc->size = len1; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + bioc->size = len2; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical3; + bioc->size = len3; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical3 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + /* + * Delete a range starting at logical1 and 2M in length. Extents 1 + * and 2 are dropped and extent 3 is kept as is. + */ + ret = btrfs_delete_raid_extent(trans, logical1, len1 + len2); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1 + len2); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical1, len1); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical2, len2); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical3, len3); + goto out; + } + + if (io_stripe.physical != logical3) { + test_err("invalid physical address, expected %llu, got %llu", + logical3, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len3 != SZ_1M) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_1M, len3); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical3, len3); +out: + btrfs_put_bioc(bioc); + return ret; +} + /* Test punching a hole into a single RAID stripe-extent. */ static int test_punch_hole(struct btrfs_trans_handle *trans) { @@ -935,6 +1078,7 @@ static const test_func_t tests[] = { test_front_delete_prev_item, test_punch_hole, test_punch_hole_3extents, + test_delete_two_extents, }; static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) -- 2.51.0 From a216542027b892e6651c1b4e076012140d04afaf Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 10 Jan 2025 15:22:24 +0000 Subject: [PATCH 03/16] btrfs: fix lockdep splat while merging a relocation root When COWing a relocation tree path, at relocation.c:replace_path(), we can trigger a lockdep splat while we are in the btrfs_search_slot() call against the relocation root. This happens in that callchain at ctree.c:read_block_for_search() when we happen to find a child extent buffer already loaded through the fs tree with a lockdep class set to the fs tree. So when we attempt to lock that extent buffer through a relocation tree we have to reset the lockdep class to the class for a relocation tree, since a relocation tree has extent buffers that used to belong to a fs tree and may currently be already loaded (we swap extent buffers between the two trees at the end of replace_path()). However we are missing calls to btrfs_maybe_reset_lockdep_class() to reset the lockdep class at ctree.c:read_block_for_search() before we read lock an extent buffer, just like we did for btrfs_search_slot() in commit b40130b23ca4 ("btrfs: fix lockdep splat with reloc root extent buffers"). So add the missing btrfs_maybe_reset_lockdep_class() calls before the attempts to read lock an extent buffer at ctree.c:read_block_for_search(). The lockdep splat was reported by syzbot and it looks like this: ====================================================== WARNING: possible circular locking dependency detected 6.13.0-rc5-syzkaller-00163-gab75170520d4 #0 Not tainted ------------------------------------------------------ syz.0.0/5335 is trying to acquire lock: ffff8880545dbc38 (btrfs-tree-01){++++}-{4:4}, at: btrfs_tree_read_lock_nested+0x2f/0x250 fs/btrfs/locking.c:146 but task is already holding lock: ffff8880545dba58 (btrfs-treloc-02/1){+.+.}-{4:4}, at: btrfs_tree_lock_nested+0x2f/0x250 fs/btrfs/locking.c:189 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (btrfs-treloc-02/1){+.+.}-{4:4}: reacquire_held_locks+0x3eb/0x690 kernel/locking/lockdep.c:5374 __lock_release kernel/locking/lockdep.c:5563 [inline] lock_release+0x396/0xa30 kernel/locking/lockdep.c:5870 up_write+0x79/0x590 kernel/locking/rwsem.c:1629 btrfs_force_cow_block+0x14b3/0x1fd0 fs/btrfs/ctree.c:660 btrfs_cow_block+0x371/0x830 fs/btrfs/ctree.c:755 btrfs_search_slot+0xc01/0x3180 fs/btrfs/ctree.c:2153 replace_path+0x1243/0x2740 fs/btrfs/relocation.c:1224 merge_reloc_root+0xc46/0x1ad0 fs/btrfs/relocation.c:1692 merge_reloc_roots+0x3b3/0x980 fs/btrfs/relocation.c:1942 relocate_block_group+0xb0a/0xd40 fs/btrfs/relocation.c:3754 btrfs_relocate_block_group+0x77d/0xd90 fs/btrfs/relocation.c:4087 btrfs_relocate_chunk+0x12c/0x3b0 fs/btrfs/volumes.c:3494 __btrfs_balance+0x1b0f/0x26b0 fs/btrfs/volumes.c:4278 btrfs_balance+0xbdc/0x10c0 fs/btrfs/volumes.c:4655 btrfs_ioctl_balance+0x493/0x7c0 fs/btrfs/ioctl.c:3670 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl+0xf5/0x170 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #1 (btrfs-tree-01/1){+.+.}-{4:4}: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 down_write_nested+0xa2/0x220 kernel/locking/rwsem.c:1693 btrfs_tree_lock_nested+0x2f/0x250 fs/btrfs/locking.c:189 btrfs_init_new_buffer fs/btrfs/extent-tree.c:5052 [inline] btrfs_alloc_tree_block+0x41c/0x1440 fs/btrfs/extent-tree.c:5132 btrfs_force_cow_block+0x526/0x1fd0 fs/btrfs/ctree.c:573 btrfs_cow_block+0x371/0x830 fs/btrfs/ctree.c:755 btrfs_search_slot+0xc01/0x3180 fs/btrfs/ctree.c:2153 btrfs_insert_empty_items+0x9c/0x1a0 fs/btrfs/ctree.c:4351 btrfs_insert_empty_item fs/btrfs/ctree.h:688 [inline] btrfs_insert_inode_ref+0x2bb/0xf80 fs/btrfs/inode-item.c:330 btrfs_rename_exchange fs/btrfs/inode.c:7990 [inline] btrfs_rename2+0xcb7/0x2b90 fs/btrfs/inode.c:8374 vfs_rename+0xbdb/0xf00 fs/namei.c:5067 do_renameat2+0xd94/0x13f0 fs/namei.c:5224 __do_sys_renameat2 fs/namei.c:5258 [inline] __se_sys_renameat2 fs/namei.c:5255 [inline] __x64_sys_renameat2+0xce/0xe0 fs/namei.c:5255 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #0 (btrfs-tree-01){++++}-{4:4}: check_prev_add kernel/locking/lockdep.c:3161 [inline] check_prevs_add kernel/locking/lockdep.c:3280 [inline] validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904 __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 down_read_nested+0xb5/0xa50 kernel/locking/rwsem.c:1649 btrfs_tree_read_lock_nested+0x2f/0x250 fs/btrfs/locking.c:146 btrfs_tree_read_lock fs/btrfs/locking.h:188 [inline] read_block_for_search+0x718/0xbb0 fs/btrfs/ctree.c:1610 btrfs_search_slot+0x1274/0x3180 fs/btrfs/ctree.c:2237 replace_path+0x1243/0x2740 fs/btrfs/relocation.c:1224 merge_reloc_root+0xc46/0x1ad0 fs/btrfs/relocation.c:1692 merge_reloc_roots+0x3b3/0x980 fs/btrfs/relocation.c:1942 relocate_block_group+0xb0a/0xd40 fs/btrfs/relocation.c:3754 btrfs_relocate_block_group+0x77d/0xd90 fs/btrfs/relocation.c:4087 btrfs_relocate_chunk+0x12c/0x3b0 fs/btrfs/volumes.c:3494 __btrfs_balance+0x1b0f/0x26b0 fs/btrfs/volumes.c:4278 btrfs_balance+0xbdc/0x10c0 fs/btrfs/volumes.c:4655 btrfs_ioctl_balance+0x493/0x7c0 fs/btrfs/ioctl.c:3670 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl+0xf5/0x170 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f other info that might help us debug this: Chain exists of: btrfs-tree-01 --> btrfs-tree-01/1 --> btrfs-treloc-02/1 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(btrfs-treloc-02/1); lock(btrfs-tree-01/1); lock(btrfs-treloc-02/1); rlock(btrfs-tree-01); *** DEADLOCK *** 8 locks held by syz.0.0/5335: #0: ffff88801e3ae420 (sb_writers#13){.+.+}-{0:0}, at: mnt_want_write_file+0x5e/0x200 fs/namespace.c:559 #1: ffff888052c760d0 (&fs_info->reclaim_bgs_lock){+.+.}-{4:4}, at: __btrfs_balance+0x4c2/0x26b0 fs/btrfs/volumes.c:4183 #2: ffff888052c74850 (&fs_info->cleaner_mutex){+.+.}-{4:4}, at: btrfs_relocate_block_group+0x775/0xd90 fs/btrfs/relocation.c:4086 #3: ffff88801e3ae610 (sb_internal#2){.+.+}-{0:0}, at: merge_reloc_root+0xf11/0x1ad0 fs/btrfs/relocation.c:1659 #4: ffff888052c76470 (btrfs_trans_num_writers){++++}-{0:0}, at: join_transaction+0x405/0xda0 fs/btrfs/transaction.c:288 #5: ffff888052c76498 (btrfs_trans_num_extwriters){++++}-{0:0}, at: join_transaction+0x405/0xda0 fs/btrfs/transaction.c:288 #6: ffff8880545db878 (btrfs-tree-01/1){+.+.}-{4:4}, at: btrfs_tree_lock_nested+0x2f/0x250 fs/btrfs/locking.c:189 #7: ffff8880545dba58 (btrfs-treloc-02/1){+.+.}-{4:4}, at: btrfs_tree_lock_nested+0x2f/0x250 fs/btrfs/locking.c:189 stack backtrace: CPU: 0 UID: 0 PID: 5335 Comm: syz.0.0 Not tainted 6.13.0-rc5-syzkaller-00163-gab75170520d4 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_circular_bug+0x13a/0x1b0 kernel/locking/lockdep.c:2074 check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2206 check_prev_add kernel/locking/lockdep.c:3161 [inline] check_prevs_add kernel/locking/lockdep.c:3280 [inline] validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904 __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 down_read_nested+0xb5/0xa50 kernel/locking/rwsem.c:1649 btrfs_tree_read_lock_nested+0x2f/0x250 fs/btrfs/locking.c:146 btrfs_tree_read_lock fs/btrfs/locking.h:188 [inline] read_block_for_search+0x718/0xbb0 fs/btrfs/ctree.c:1610 btrfs_search_slot+0x1274/0x3180 fs/btrfs/ctree.c:2237 replace_path+0x1243/0x2740 fs/btrfs/relocation.c:1224 merge_reloc_root+0xc46/0x1ad0 fs/btrfs/relocation.c:1692 merge_reloc_roots+0x3b3/0x980 fs/btrfs/relocation.c:1942 relocate_block_group+0xb0a/0xd40 fs/btrfs/relocation.c:3754 btrfs_relocate_block_group+0x77d/0xd90 fs/btrfs/relocation.c:4087 btrfs_relocate_chunk+0x12c/0x3b0 fs/btrfs/volumes.c:3494 __btrfs_balance+0x1b0f/0x26b0 fs/btrfs/volumes.c:4278 btrfs_balance+0xbdc/0x10c0 fs/btrfs/volumes.c:4655 btrfs_ioctl_balance+0x493/0x7c0 fs/btrfs/ioctl.c:3670 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl+0xf5/0x170 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f1ac6985d29 Code: ff ff c3 (...) RSP: 002b:00007f1ac63fe038 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007f1ac6b76160 RCX: 00007f1ac6985d29 RDX: 0000000020000180 RSI: 00000000c4009420 RDI: 0000000000000007 RBP: 00007f1ac6a01b08 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000001 R14: 00007f1ac6b76160 R15: 00007fffda145a88 Reported-by: syzbot+63913e558c084f7f8fdc@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/677b3014.050a0220.3b53b0.0064.GAE@google.com/ Fixes: 99785998ed1c ("btrfs: reduce lock contention when eb cache miss for btree search") Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 92071ca0655f..3dc5a35dd19b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1496,6 +1496,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (!p->skip_locking) { btrfs_unlock_up_safe(p, parent_level + 1); + btrfs_maybe_reset_lockdep_class(root, tmp); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); @@ -1539,6 +1540,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (!p->skip_locking) { ASSERT(ret == -EAGAIN); + btrfs_maybe_reset_lockdep_class(root, tmp); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); -- 2.51.0 From 0d85f5c2dd91df6b5da454406756f463ba923b69 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Jan 2025 15:01:08 +0000 Subject: [PATCH 04/16] btrfs: fix assertion failure when splitting ordered extent after transaction abort If while we are doing a direct IO write a transaction abort happens, we mark all existing ordered extents with the BTRFS_ORDERED_IOERR flag (done at btrfs_destroy_ordered_extents()), and then after that if we enter btrfs_split_ordered_extent() and the ordered extent has bytes left (meaning we have a bio that doesn't cover the whole ordered extent, see details at btrfs_extract_ordered_extent()), we will fail on the following assertion at btrfs_split_ordered_extent(): ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS)); because the BTRFS_ORDERED_IOERR flag is set and the definition of BTRFS_ORDERED_TYPE_FLAGS is just the union of all flags that identify the type of write (regular, nocow, prealloc, compressed, direct IO, encoded). Fix this by returning an error from btrfs_extract_ordered_extent() if we find the BTRFS_ORDERED_IOERR flag in the ordered extent. The error will be the error that resulted in the transaction abort or -EIO if no transaction abort happened. This was recently reported by syzbot with the following trace: FAULT_INJECTION: forcing a failure. name failslab, interval 1, probability 0, space 0, times 1 CPU: 0 UID: 0 PID: 5321 Comm: syz.0.0 Not tainted 6.13.0-rc5-syzkaller #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 fail_dump lib/fault-inject.c:53 [inline] should_fail_ex+0x3b0/0x4e0 lib/fault-inject.c:154 should_failslab+0xac/0x100 mm/failslab.c:46 slab_pre_alloc_hook mm/slub.c:4072 [inline] slab_alloc_node mm/slub.c:4148 [inline] __do_kmalloc_node mm/slub.c:4297 [inline] __kmalloc_noprof+0xdd/0x4c0 mm/slub.c:4310 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1037 [inline] btrfs_chunk_alloc_add_chunk_item+0x244/0x1100 fs/btrfs/volumes.c:5742 reserve_chunk_space+0x1ca/0x2c0 fs/btrfs/block-group.c:4292 check_system_chunk fs/btrfs/block-group.c:4319 [inline] do_chunk_alloc fs/btrfs/block-group.c:3891 [inline] btrfs_chunk_alloc+0x77b/0xf80 fs/btrfs/block-group.c:4187 find_free_extent_update_loop fs/btrfs/extent-tree.c:4166 [inline] find_free_extent+0x42d1/0x5810 fs/btrfs/extent-tree.c:4579 btrfs_reserve_extent+0x422/0x810 fs/btrfs/extent-tree.c:4672 btrfs_new_extent_direct fs/btrfs/direct-io.c:186 [inline] btrfs_get_blocks_direct_write+0x706/0xfa0 fs/btrfs/direct-io.c:321 btrfs_dio_iomap_begin+0xbb7/0x1180 fs/btrfs/direct-io.c:525 iomap_iter+0x697/0xf60 fs/iomap/iter.c:90 __iomap_dio_rw+0xeb9/0x25b0 fs/iomap/direct-io.c:702 btrfs_dio_write fs/btrfs/direct-io.c:775 [inline] btrfs_direct_write+0x610/0xa30 fs/btrfs/direct-io.c:880 btrfs_do_write_iter+0x2a0/0x760 fs/btrfs/file.c:1397 do_iter_readv_writev+0x600/0x880 vfs_writev+0x376/0xba0 fs/read_write.c:1050 do_pwritev fs/read_write.c:1146 [inline] __do_sys_pwritev2 fs/read_write.c:1204 [inline] __se_sys_pwritev2+0x196/0x2b0 fs/read_write.c:1195 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f1281f85d29 RSP: 002b:00007f12819fe038 EFLAGS: 00000246 ORIG_RAX: 0000000000000148 RAX: ffffffffffffffda RBX: 00007f1282176080 RCX: 00007f1281f85d29 RDX: 0000000000000001 RSI: 0000000020000240 RDI: 0000000000000005 RBP: 00007f12819fe090 R08: 0000000000000000 R09: 0000000000000003 R10: 0000000000007000 R11: 0000000000000246 R12: 0000000000000002 R13: 0000000000000000 R14: 00007f1282176080 R15: 00007ffcb9e23328 BTRFS error (device loop0 state A): Transaction aborted (error -12) BTRFS: error (device loop0 state A) in btrfs_chunk_alloc_add_chunk_item:5745: errno=-12 Out of memory BTRFS info (device loop0 state EA): forced readonly assertion failed: !(flags & ~BTRFS_ORDERED_TYPE_FLAGS), in fs/btrfs/ordered-data.c:1234 ------------[ cut here ]------------ kernel BUG at fs/btrfs/ordered-data.c:1234! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI CPU: 0 UID: 0 PID: 5321 Comm: syz.0.0 Not tainted 6.13.0-rc5-syzkaller #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:btrfs_split_ordered_extent+0xd8d/0xe20 fs/btrfs/ordered-data.c:1234 RSP: 0018:ffffc9000d1df2b8 EFLAGS: 00010246 RAX: 0000000000000057 RBX: 000000000006a000 RCX: 9ce21886c4195300 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: 0000000000000091 R08: ffffffff817f0a3c R09: 1ffff92001a3bdf4 R10: dffffc0000000000 R11: fffff52001a3bdf5 R12: 1ffff1100a45f401 R13: ffff8880522fa018 R14: dffffc0000000000 R15: 000000000006a000 FS: 00007f12819fe6c0(0000) GS:ffff88801fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000557750bd7da8 CR3: 00000000400ea000 CR4: 0000000000352ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_extract_ordered_extent fs/btrfs/direct-io.c:702 [inline] btrfs_dio_submit_io+0x4be/0x6d0 fs/btrfs/direct-io.c:737 iomap_dio_submit_bio fs/iomap/direct-io.c:85 [inline] iomap_dio_bio_iter+0x1022/0x1740 fs/iomap/direct-io.c:447 __iomap_dio_rw+0x13b7/0x25b0 fs/iomap/direct-io.c:703 btrfs_dio_write fs/btrfs/direct-io.c:775 [inline] btrfs_direct_write+0x610/0xa30 fs/btrfs/direct-io.c:880 btrfs_do_write_iter+0x2a0/0x760 fs/btrfs/file.c:1397 do_iter_readv_writev+0x600/0x880 vfs_writev+0x376/0xba0 fs/read_write.c:1050 do_pwritev fs/read_write.c:1146 [inline] __do_sys_pwritev2 fs/read_write.c:1204 [inline] __se_sys_pwritev2+0x196/0x2b0 fs/read_write.c:1195 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f1281f85d29 RSP: 002b:00007f12819fe038 EFLAGS: 00000246 ORIG_RAX: 0000000000000148 RAX: ffffffffffffffda RBX: 00007f1282176080 RCX: 00007f1281f85d29 RDX: 0000000000000001 RSI: 0000000020000240 RDI: 0000000000000005 RBP: 00007f12819fe090 R08: 0000000000000000 R09: 0000000000000003 R10: 0000000000007000 R11: 0000000000000246 R12: 0000000000000002 R13: 0000000000000000 R14: 00007f1282176080 R15: 00007ffcb9e23328 Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:btrfs_split_ordered_extent+0xd8d/0xe20 fs/btrfs/ordered-data.c:1234 RSP: 0018:ffffc9000d1df2b8 EFLAGS: 00010246 RAX: 0000000000000057 RBX: 000000000006a000 RCX: 9ce21886c4195300 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: 0000000000000091 R08: ffffffff817f0a3c R09: 1ffff92001a3bdf4 R10: dffffc0000000000 R11: fffff52001a3bdf5 R12: 1ffff1100a45f401 R13: ffff8880522fa018 R14: dffffc0000000000 R15: 000000000006a000 FS: 00007f12819fe6c0(0000) GS:ffff88801fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000557750bd7da8 CR3: 00000000400ea000 CR4: 0000000000352ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 In this case the transaction abort was due to (an injected) memory allocation failure when attempting to allocate a new chunk. Reported-by: syzbot+f60d8337a5c8e8d92a77@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/6777f2dd.050a0220.178762.0045.GAE@google.com/ Fixes: 52b1fdca23ac ("btrfs: handle completed ordered extents in btrfs_split_ordered_extent") Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 30eceaf829a7..4aca7475fd82 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1229,6 +1229,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( */ if (WARN_ON_ONCE(len >= ordered->num_bytes)) return ERR_PTR(-EINVAL); + /* + * If our ordered extent had an error there's no point in continuing. + * The error may have come from a transaction abort done either by this + * task or some other concurrent task, and the transaction abort path + * iterates over all existing ordered extents and sets the flag + * BTRFS_ORDERED_IOERR on them. + */ + if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) { + const int fs_error = BTRFS_FS_ERROR(fs_info); + + return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO); + } /* We cannot split partially completed ordered extents. */ if (ordered->bytes_left) { ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS)); -- 2.51.0 From c9c863793395cf0a66c2778a29d72c48c02fbb66 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 20 Jan 2025 09:40:43 +1030 Subject: [PATCH 05/16] btrfs: do not output error message if a qgroup has been already cleaned up [BUG] There is a bug report that btrfs outputs the following error message: BTRFS info (device nvme0n1p2): qgroup scan completed (inconsistency flag cleared) BTRFS warning (device nvme0n1p2): failed to cleanup qgroup 0/1179: -2 [CAUSE] The error itself is pretty harmless, and the end user should ignore it. When a subvolume is fully dropped, btrfs will call btrfs_qgroup_cleanup_dropped_subvolume() to delete the qgroup. However if a qgroup rescan happened before a subvolume fully dropped, qgroup for that subvolume will not be re-created, as rescan will only create new qgroup if there is a BTRFS_ROOT_REF_KEY found. But before we drop a subvolume, the subvolume is unlinked thus there is no BTRFS_ROOT_REF_KEY. In that case, btrfs_remove_qgroup() will fail with -ENOENT and trigger the above error message. [FIX] Just ignore -ENOENT error from btrfs_remove_qgroup() inside btrfs_qgroup_cleanup_dropped_subvolume(). Reported-by: John Shand Link: https://bugzilla.suse.com/show_bug.cgi?id=1236056 Fixes: 839d6ea4f86d ("btrfs: automatically remove the subvolume qgroup") Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b90fabe302e6..aaf16019d829 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1897,8 +1897,11 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su /* * It's squota and the subvolume still has numbers needed for future * accounting, in this case we can not delete it. Just skip it. + * + * Or the qgroup is already removed by a qgroup rescan. For both cases we're + * safe to ignore them. */ - if (ret == -EBUSY) + if (ret == -EBUSY || ret == -ENOENT) ret = 0; return ret; } -- 2.51.0 From e2f0943cf37305dbdeaf9846e3c941451bcdef63 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 20 Jan 2025 17:26:10 +0000 Subject: [PATCH 06/16] btrfs: fix use-after-free when attempting to join an aborted transaction When we are trying to join the current transaction and if it's aborted, we read its 'aborted' field after unlocking fs_info->trans_lock and without holding any extra reference count on it. This means that a concurrent task that is aborting the transaction may free the transaction before we read its 'aborted' field, leading to a use-after-free. Fix this by reading the 'aborted' field while holding fs_info->trans_lock since any freeing task must first acquire that lock and set fs_info->running_transaction to NULL before freeing the transaction. This was reported by syzbot and Dmitry with the following stack traces from KASAN: ================================================================== BUG: KASAN: slab-use-after-free in join_transaction+0xd9b/0xda0 fs/btrfs/transaction.c:278 Read of size 4 at addr ffff888011839024 by task kworker/u4:9/1128 CPU: 0 UID: 0 PID: 1128 Comm: kworker/u4:9 Not tainted 6.13.0-rc7-syzkaller-00019-gc45323b7560e #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Workqueue: events_unbound btrfs_async_reclaim_data_space Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0x169/0x550 mm/kasan/report.c:489 kasan_report+0x143/0x180 mm/kasan/report.c:602 join_transaction+0xd9b/0xda0 fs/btrfs/transaction.c:278 start_transaction+0xaf8/0x1670 fs/btrfs/transaction.c:697 flush_space+0x448/0xcf0 fs/btrfs/space-info.c:803 btrfs_async_reclaim_data_space+0x159/0x510 fs/btrfs/space-info.c:1321 process_one_work kernel/workqueue.c:3236 [inline] process_scheduled_works+0xa66/0x1840 kernel/workqueue.c:3317 worker_thread+0x870/0xd30 kernel/workqueue.c:3398 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Allocated by task 5315: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x98/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __kmalloc_cache_noprof+0x243/0x390 mm/slub.c:4329 kmalloc_noprof include/linux/slab.h:901 [inline] join_transaction+0x144/0xda0 fs/btrfs/transaction.c:308 start_transaction+0xaf8/0x1670 fs/btrfs/transaction.c:697 btrfs_create_common+0x1b2/0x2e0 fs/btrfs/inode.c:6572 lookup_open fs/namei.c:3649 [inline] open_last_lookups fs/namei.c:3748 [inline] path_openat+0x1c03/0x3590 fs/namei.c:3984 do_filp_open+0x27f/0x4e0 fs/namei.c:4014 do_sys_openat2+0x13e/0x1d0 fs/open.c:1402 do_sys_open fs/open.c:1417 [inline] __do_sys_creat fs/open.c:1495 [inline] __se_sys_creat fs/open.c:1489 [inline] __x64_sys_creat+0x123/0x170 fs/open.c:1489 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 5336: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:582 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x59/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2353 [inline] slab_free mm/slub.c:4613 [inline] kfree+0x196/0x430 mm/slub.c:4761 cleanup_transaction fs/btrfs/transaction.c:2063 [inline] btrfs_commit_transaction+0x2c97/0x3720 fs/btrfs/transaction.c:2598 insert_balance_item+0x1284/0x20b0 fs/btrfs/volumes.c:3757 btrfs_balance+0x992/0x10c0 fs/btrfs/volumes.c:4633 btrfs_ioctl_balance+0x493/0x7c0 fs/btrfs/ioctl.c:3670 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl+0xf5/0x170 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f The buggy address belongs to the object at ffff888011839000 which belongs to the cache kmalloc-2k of size 2048 The buggy address is located 36 bytes inside of freed 2048-byte region [ffff888011839000, ffff888011839800) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11838 head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 flags: 0xfff00000000040(head|node=0|zone=1|lastcpupid=0x7ff) page_type: f5(slab) raw: 00fff00000000040 ffff88801ac42000 ffffea0000493400 dead000000000002 raw: 0000000000000000 0000000000080008 00000001f5000000 0000000000000000 head: 00fff00000000040 ffff88801ac42000 ffffea0000493400 dead000000000002 head: 0000000000000000 0000000000080008 00000001f5000000 0000000000000000 head: 00fff00000000003 ffffea0000460e01 ffffffffffffffff 0000000000000000 head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 3, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 57, tgid 57 (kworker/0:2), ts 67248182943, free_ts 67229742023 set_page_owner include/linux/page_owner.h:32 [inline] post_alloc_hook+0x1f3/0x230 mm/page_alloc.c:1558 prep_new_page mm/page_alloc.c:1566 [inline] get_page_from_freelist+0x365c/0x37a0 mm/page_alloc.c:3476 __alloc_pages_noprof+0x292/0x710 mm/page_alloc.c:4753 alloc_pages_mpol_noprof+0x3e1/0x780 mm/mempolicy.c:2269 alloc_slab_page+0x6a/0x110 mm/slub.c:2423 allocate_slab+0x5a/0x2b0 mm/slub.c:2589 new_slab mm/slub.c:2642 [inline] ___slab_alloc+0xc27/0x14a0 mm/slub.c:3830 __slab_alloc+0x58/0xa0 mm/slub.c:3920 __slab_alloc_node mm/slub.c:3995 [inline] slab_alloc_node mm/slub.c:4156 [inline] __do_kmalloc_node mm/slub.c:4297 [inline] __kmalloc_node_track_caller_noprof+0x2e9/0x4c0 mm/slub.c:4317 kmalloc_reserve+0x111/0x2a0 net/core/skbuff.c:609 __alloc_skb+0x1f3/0x440 net/core/skbuff.c:678 alloc_skb include/linux/skbuff.h:1323 [inline] alloc_skb_with_frags+0xc3/0x820 net/core/skbuff.c:6612 sock_alloc_send_pskb+0x91a/0xa60 net/core/sock.c:2884 sock_alloc_send_skb include/net/sock.h:1803 [inline] mld_newpack+0x1c3/0xaf0 net/ipv6/mcast.c:1747 add_grhead net/ipv6/mcast.c:1850 [inline] add_grec+0x1492/0x19a0 net/ipv6/mcast.c:1988 mld_send_cr net/ipv6/mcast.c:2114 [inline] mld_ifc_work+0x691/0xd90 net/ipv6/mcast.c:2651 page last free pid 5300 tgid 5300 stack trace: reset_page_owner include/linux/page_owner.h:25 [inline] free_pages_prepare mm/page_alloc.c:1127 [inline] free_unref_page+0xd3f/0x1010 mm/page_alloc.c:2659 __slab_free+0x2c2/0x380 mm/slub.c:4524 qlink_free mm/kasan/quarantine.c:163 [inline] qlist_free_all+0x9a/0x140 mm/kasan/quarantine.c:179 kasan_quarantine_reduce+0x14f/0x170 mm/kasan/quarantine.c:286 __kasan_slab_alloc+0x23/0x80 mm/kasan/common.c:329 kasan_slab_alloc include/linux/kasan.h:250 [inline] slab_post_alloc_hook mm/slub.c:4119 [inline] slab_alloc_node mm/slub.c:4168 [inline] __do_kmalloc_node mm/slub.c:4297 [inline] __kmalloc_noprof+0x236/0x4c0 mm/slub.c:4310 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1037 [inline] fib_create_info+0xc14/0x25b0 net/ipv4/fib_semantics.c:1435 fib_table_insert+0x1f6/0x1f20 net/ipv4/fib_trie.c:1231 fib_magic+0x3d8/0x620 net/ipv4/fib_frontend.c:1112 fib_add_ifaddr+0x40c/0x5e0 net/ipv4/fib_frontend.c:1156 fib_netdev_event+0x375/0x490 net/ipv4/fib_frontend.c:1494 notifier_call_chain+0x1a5/0x3f0 kernel/notifier.c:85 __dev_notify_flags+0x207/0x400 dev_change_flags+0xf0/0x1a0 net/core/dev.c:9045 do_setlink+0xc90/0x4210 net/core/rtnetlink.c:3109 rtnl_changelink net/core/rtnetlink.c:3723 [inline] __rtnl_newlink net/core/rtnetlink.c:3875 [inline] rtnl_newlink+0x1bb6/0x2210 net/core/rtnetlink.c:4012 Memory state around the buggy address: ffff888011838f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888011838f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff888011839000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888011839080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888011839100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Reported-by: syzbot+45212e9d87a98c3f5b42@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/678e7da5.050a0220.303755.007c.GAE@google.com/ Reported-by: Dmitry Vyukov Link: https://lore.kernel.org/linux-btrfs/CACT4Y+ZFBdo7pT8L2AzM=vegZwjp-wNkVJZQf0Ta3vZqtExaSw@mail.gmail.com/ Fixes: 871383be592b ("btrfs: add missing unlocks to transaction abort paths") Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 15312013f2a3..aca83a98b75a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -274,8 +274,10 @@ loop: cur_trans = fs_info->running_transaction; if (cur_trans) { if (TRANS_ABORTED(cur_trans)) { + const int abort_error = cur_trans->aborted; + spin_unlock(&fs_info->trans_lock); - return cur_trans->aborted; + return abort_error; } if (btrfs_blocked_trans_types[cur_trans->state] & type) { spin_unlock(&fs_info->trans_lock); -- 2.51.0 From fdef89ce6fada462aef9cb90a140c93c8c209f0f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 21 Jan 2025 12:24:39 +0000 Subject: [PATCH 07/16] btrfs: avoid starting new transaction when cleaning qgroup during subvolume drop At btrfs_qgroup_cleanup_dropped_subvolume() all we want to commit the current transaction in order to have all the qgroup rfer/excl numbers up to date. However we are using btrfs_start_transaction(), which joins the current transaction if there is one that is not yet committing, but also starts a new one if there is none or if the current one is already committing (its state is >= TRANS_STATE_COMMIT_START). This later case results in unnecessary IO, wasting time and a pointless rotation of the backup roots in the super block. So instead of using btrfs_start_transaction() followed by a btrfs_commit_transaction(), use btrfs_commit_current_transaction() which achieves our purpose and avoids starting and committing new transactions. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index aaf16019d829..f9d3766c809b 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1880,11 +1880,7 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su * Commit current transaction to make sure all the rfer/excl numbers * get updated. */ - trans = btrfs_start_transaction(fs_info->quota_root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_commit_transaction(trans); + ret = btrfs_commit_current_transaction(fs_info->quota_root); if (ret < 0) return ret; -- 2.51.0 From 01af106a076352182b2916b143fc50272600bd81 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 21 Jan 2025 05:40:51 +0000 Subject: [PATCH 08/16] btrfs: fix two misuses of folio_shift() It is meaningless to shift a byte count by folio_shift(). The folio index is in units of PAGE_SIZE, not folio_size(). We can use folio_contains() to make this work for arbitrary-order folios, so remove the assertion that the folios are of order 0. Reviewed-by: Qu Wenruo Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d9f856358704..6f64ee16744d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -523,8 +523,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) u64 end; u32 len; - /* For now only order 0 folios are supported for data. */ - ASSERT(folio_order(folio) == 0); btrfs_debug(fs_info, "%s: bi_sector=%llu, err=%d, mirror=%u", __func__, bio->bi_iter.bi_sector, bio->bi_status, @@ -552,7 +550,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) if (likely(uptodate)) { loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> folio_shift(folio); /* * Zero out the remaining part if this range straddles @@ -561,9 +558,11 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) * Here we should only zero the range inside the folio, * not touch anything else. * - * NOTE: i_size is exclusive while end is inclusive. + * NOTE: i_size is exclusive while end is inclusive and + * folio_contains() takes PAGE_SIZE units. */ - if (folio_index(folio) == end_index && i_size <= end) { + if (folio_contains(folio, i_size >> PAGE_SHIFT) && + i_size <= end) { u32 zero_start = max(offset_in_folio(folio, i_size), offset_in_folio(folio, start)); u32 zero_len = offset_in_folio(folio, end) + 1 - @@ -956,7 +955,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, return ret; } - if (folio->index == last_byte >> folio_shift(folio)) { + if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { size_t zero_offset = offset_in_folio(folio, last_byte); if (zero_offset) { -- 2.51.0 From acc18e1c1d8c0d59d793cf87790ccfcafb1bf5f0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 4 Feb 2025 11:02:32 +0000 Subject: [PATCH 09/16] btrfs: fix stale page cache after race between readahead and direct IO write After commit ac325fc2aad5 ("btrfs: do not hold the extent lock for entire read") we can now trigger a race between a task doing a direct IO write and readahead. When this race is triggered it results in tasks getting stale data when they attempt do a buffered read (including the task that did the direct IO write). This race can be sporadically triggered with test case generic/418, failing like this: $ ./check generic/418 FSTYP -- btrfs PLATFORM -- Linux/x86_64 debian0 6.13.0-rc7-btrfs-next-185+ #17 SMP PREEMPT_DYNAMIC Mon Feb 3 12:28:46 WET 2025 MKFS_OPTIONS -- /dev/sdc MOUNT_OPTIONS -- /dev/sdc /home/fdmanana/btrfs-tests/scratch_1 generic/418 14s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//generic/418.out.bad) --- tests/generic/418.out 2020-06-10 19:29:03.850519863 +0100 +++ /home/fdmanana/git/hub/xfstests/results//generic/418.out.bad 2025-02-03 15:42:36.974609476 +0000 @@ -1,2 +1,5 @@ QA output created by 418 +cmpbuf: offset 0: Expected: 0x1, got 0x0 +[6:0] FAIL - comparison failed, offset 24576 +diotest -wp -b 4096 -n 8 -i 4 failed at loop 3 Silence is golden ... (Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/generic/418.out /home/fdmanana/git/hub/xfstests/results//generic/418.out.bad' to see the entire diff) Ran: generic/418 Failures: generic/418 Failed 1 of 1 tests The race happens like this: 1) A file has a prealloc extent for the range [16K, 28K); 2) Task A starts a direct IO write against file range [24K, 28K). At the start of the direct IO write it invalidates the page cache at __iomap_dio_rw() with kiocb_invalidate_pages() for the 4K page at file offset 24K; 3) Task A enters btrfs_dio_iomap_begin() and locks the extent range [24K, 28K); 4) Task B starts a readahead for file range [16K, 28K), entering btrfs_readahead(). First it attempts to read the page at offset 16K by entering btrfs_do_readpage(), where it calls get_extent_map(), locks the range [16K, 20K) and gets the extent map for the range [16K, 28K), caching it into the 'em_cached' variable declared in the local stack of btrfs_readahead(), and then unlocks the range [16K, 20K). Since the extent map has the prealloc flag, at btrfs_do_readpage() we zero out the page's content and don't submit any bio to read the page from the extent. Then it attempts to read the page at offset 20K entering btrfs_do_readpage() where we reuse the previously cached extent map (decided by get_extent_map()) since it spans the page's range and it's still in the inode's extent map tree. Just like for the previous page, we zero out the page's content since the extent map has the prealloc flag set. Then it attempts to read the page at offset 24K entering btrfs_do_readpage() where we reuse the previously cached extent map (decided by get_extent_map()) since it spans the page's range and it's still in the inode's extent map tree. Just like for the previous pages, we zero out the page's content since the extent map has the prealloc flag set. Note that we didn't lock the extent range [24K, 28K), so we didn't synchronize with the ongoing direct IO write being performed by task A; 5) Task A enters btrfs_create_dio_extent() and creates an ordered extent for the range [24K, 28K), with the flags BTRFS_ORDERED_DIRECT and BTRFS_ORDERED_PREALLOC set; 6) Task A unlocks the range [24K, 28K) at btrfs_dio_iomap_begin(); 7) The ordered extent enters btrfs_finish_one_ordered() and locks the range [24K, 28K); 8) Task A enters fs/iomap/direct-io.c:iomap_dio_complete() and it tries to invalidate the page at offset 24K by calling kiocb_invalidate_post_direct_write(), resulting in a call chain that ends up at btrfs_release_folio(). The btrfs_release_folio() call ends up returning false because the range for the page at file offset 24K is currently locked by the task doing the ordered extent completion in the previous step (7), so we have: btrfs_release_folio() -> __btrfs_release_folio() -> try_release_extent_mapping() -> try_release_extent_state() This last function checking that the range is locked and returning false and propagating it up to btrfs_release_folio(). So this results in a failure to invalidate the page and kiocb_invalidate_post_direct_write() triggers this message logged in dmesg: Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O! After this we leave the page cache with stale data for the file range [24K, 28K), filled with zeroes instead of the data written by direct IO write (all bytes with a 0x01 value), so any task attempting to read with buffered IO, including the task that did the direct IO write, will get all bytes in the range with a 0x00 value instead of the written data. Fix this by locking the range, with btrfs_lock_and_flush_ordered_range(), at the two callers of btrfs_do_readpage() instead of doing it at get_extent_map(), just like we did before commit ac325fc2aad5 ("btrfs: do not hold the extent lock for entire read"), and unlocking the range after all the calls to btrfs_do_readpage(). This way we never reuse a cached extent map without flushing any pending ordered extents from a concurrent direct IO write. Fixes: ac325fc2aad5 ("btrfs: do not hold the extent lock for entire read") Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6f64ee16744d..d8ded597edad 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -898,7 +898,6 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, u64 len, struct extent_map **em_cached) { struct extent_map *em; - struct extent_state *cached_state = NULL; ASSERT(em_cached); @@ -914,14 +913,12 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, *em_cached = NULL; } - btrfs_lock_and_flush_ordered_range(inode, start, start + len - 1, &cached_state); em = btrfs_get_extent(inode, folio, start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; } - unlock_extent(&inode->io_tree, start, start + len - 1, &cached_state); return em; } @@ -1078,11 +1075,18 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, int btrfs_read_folio(struct file *file, struct folio *folio) { + struct btrfs_inode *inode = folio_to_inode(folio); + const u64 start = folio_pos(folio); + const u64 end = start + folio_size(folio) - 1; + struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; struct extent_map *em_cached = NULL; int ret; + btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); + unlock_extent(&inode->io_tree, start, end, &cached_state); + free_extent_map(em_cached); /* @@ -2379,12 +2383,20 @@ void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; struct folio *folio; + struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); + const u64 start = readahead_pos(rac); + const u64 end = start + readahead_length(rac) - 1; + struct extent_state *cached_state = NULL; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; + btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + while ((folio = readahead_folio(rac)) != NULL) btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); + unlock_extent(&inode->io_tree, start, end, &cached_state); + if (em_cached) free_extent_map(em_cached); submit_one_bio(&bio_ctrl); -- 2.51.0 From da2dccd7451de62b175fb8f0808d644959e964c7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 5 Feb 2025 17:36:48 +0000 Subject: [PATCH 10/16] btrfs: fix hole expansion when writing at an offset beyond EOF At btrfs_write_check() if our file's i_size is not sector size aligned and we have a write that starts at an offset larger than the i_size that falls within the same page of the i_size, then we end up not zeroing the file range [i_size, write_offset). The code is this: start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); if (start_pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */ loff_t end_pos = round_up(pos + count, fs_info->sectorsize); ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos); if (ret) return ret; } So if our file's i_size is 90269 bytes and a write at offset 90365 bytes comes in, we get 'start_pos' set to 90112 bytes, which is less than the i_size and therefore we don't zero out the range [90269, 90365) by calling btrfs_cont_expand(). This is an old bug introduced in commit 9036c10208e1 ("Btrfs: update hole handling v2"), from 2008, and the buggy code got moved around over the years. Fix this by discarding 'start_pos' and comparing against the write offset ('pos') without any alignment. This bug was recently exposed by test case generic/363 which tests this scenario by polluting ranges beyond EOF with an mmap write and than verify that after a file increases we get zeroes for the range which is supposed to be a hole and not what we wrote with the previous mmaped write. We're only seeing this exposed now because generic/363 used to run only on xfs until last Sunday's fstests update. The test was failing like this: $ ./check generic/363 FSTYP -- btrfs PLATFORM -- Linux/x86_64 debian0 6.13.0-rc7-btrfs-next-185+ #17 SMP PREEMPT_DYNAMIC Mon Feb 3 12:28:46 WET 2025 MKFS_OPTIONS -- /dev/sdc MOUNT_OPTIONS -- /dev/sdc /home/fdmanana/btrfs-tests/scratch_1 generic/363 0s ... [failed, exit status 1]- output mismatch (see /home/fdmanana/git/hub/xfstests/results//generic/363.out.bad) --- tests/generic/363.out 2025-02-05 15:31:14.013646509 +0000 +++ /home/fdmanana/git/hub/xfstests/results//generic/363.out.bad 2025-02-05 17:25:33.112630781 +0000 @@ -1 +1,46 @@ QA output created by 363 +READ BAD DATA: offset = 0xdcad, size = 0xd921, fname = /home/fdmanana/btrfs-tests/dev/junk +OFFSET GOOD BAD RANGE +0x1609d 0x0000 0x3104 0x0 +operation# (mod 256) for the bad data may be 4 +0x1609e 0x0000 0x0472 0x1 +operation# (mod 256) for the bad data may be 4 ... (Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/generic/363.out /home/fdmanana/git/hub/xfstests/results//generic/363.out.bad' to see the entire diff) Ran: generic/363 Failures: generic/363 Failed 1 of 1 tests Fixes: 9036c10208e1 ("Btrfs: update hole handling v2") CC: stable@vger.kernel.org Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 36f51c311bb1..ed3c0d6546c5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1039,7 +1039,6 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) loff_t pos = iocb->ki_pos; int ret; loff_t oldsize; - loff_t start_pos; /* * Quickly bail out on NOWAIT writes if we don't have the nodatacow or @@ -1066,9 +1065,8 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) inode_inc_iversion(inode); } - start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); - if (start_pos > oldsize) { + if (pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */ loff_t end_pos = round_up(pos + count, fs_info->sectorsize); -- 2.51.0 From 290237fde9491ca26cf4020bbf5a2b330452e7db Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Mon, 10 Feb 2025 22:17:29 +1100 Subject: [PATCH 11/16] btrfs: selftests: fix btrfs_test_delayed_refs() leak of transaction The btrfs_transaction struct leaks, which can cause sporadic fstests failures when kmemleak checking is enabled: kmemleak: 5 new suspected memory leaks (see /sys/kernel/debug/kmemleak) > cat /sys/kernel/debug/kmemleak unreferenced object 0xffff88810fdc6c00 (size 512): comm "modprobe", pid 203, jiffies 4294892552 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace (crc 6736050f): __kmalloc_cache_noprof+0x133/0x2c0 btrfs_test_delayed_refs+0x6f/0xbb0 [btrfs] btrfs_run_sanity_tests.cold+0x91/0xf9 [btrfs] 0xffffffffa02fd055 do_one_initcall+0x49/0x1c0 do_init_module+0x5b/0x1f0 init_module_from_file+0x70/0x90 idempotent_init_module+0xe8/0x2c0 __x64_sys_finit_module+0x6b/0xd0 do_syscall_64+0x54/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e The transaction struct was initially stack-allocated but switched to heap following frame size compiler warnings. Fixes: 2b34879d97e27 ("btrfs: selftests: add delayed ref self test cases") Signed-off-by: David Disseldorp Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tests/delayed-refs-tests.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c index 6558508c2ddf..265370e79a54 100644 --- a/fs/btrfs/tests/delayed-refs-tests.c +++ b/fs/btrfs/tests/delayed-refs-tests.c @@ -1009,6 +1009,7 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize) if (!ret) ret = select_delayed_refs_test(&trans); + kfree(transaction); out_free_fs_info: btrfs_free_dummy_fs_info(fs_info); return ret; -- 2.51.0 From 59f37036bb7ab3d554c24abc856aabca01126414 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 15 Feb 2025 11:36:15 +0000 Subject: [PATCH 12/16] btrfs: fix use-after-free on inode when scanning root during em shrinking At btrfs_scan_root() we are accessing the inode's root (and fs_info) in a call to btrfs_fs_closing() after we have scheduled the inode for a delayed iput, and that can result in a use-after-free on the inode in case the cleaner kthread does the iput before we dereference the inode in the call to btrfs_fs_closing(). Fix this by using the fs_info stored already in a local variable instead of doing inode->root->fs_info. Fixes: 102044384056 ("btrfs: make the extent map shrinker run asynchronously as a work queue job") CC: stable@vger.kernel.org # 6.13+ Tested-by: Ivan Shapovalov Link: https://lore.kernel.org/linux-btrfs/0414d690ac5680d0d77dfc930606cdc36e42e12f.camel@intelfx.name/ Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 67ce85ff0ae2..bee1b94a1049 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1222,8 +1222,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx fs_info->em_shrinker_last_ino = btrfs_ino(inode); btrfs_add_delayed_iput(inode); - if (ctx->scanned >= ctx->nr_to_scan || - btrfs_fs_closing(inode->root->fs_info)) + if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info)) break; cond_resched(); -- 2.51.0 From c6c9c4d56483d941f567eb921434c25fc6086dfa Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 15 Feb 2025 11:04:15 +0000 Subject: [PATCH 13/16] btrfs: skip inodes without loaded extent maps when shrinking extent maps If there are inodes that don't have any loaded extent maps, we end up grabbing a reference on them and later adding a delayed iput, which wakes up the cleaner and makes it do unnecessary work. This is common when for example the inodes were open only to run stat(2) or all their extent maps were already released through the folio release callback (btrfs_release_folio()) or released by a previous run of the shrinker, or directories which never have extent maps. Reported-by: Ivan Shapovalov Tested-by: Ivan Shapovalov Link: https://lore.kernel.org/linux-btrfs/0414d690ac5680d0d77dfc930606cdc36e42e12f.camel@intelfx.name/ CC: stable@vger.kernel.org # 6.13+ Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 78 +++++++++++++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index bee1b94a1049..8c6b85ffd18f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1128,6 +1128,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c long nr_dropped = 0; struct rb_node *node; + lockdep_assert_held_write(&tree->lock); + /* * Take the mmap lock so that we serialize with the inode logging phase * of fsync because we may need to set the full sync flag on the inode, @@ -1139,28 +1141,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c * to find new extents, which may not be there yet because ordered * extents haven't completed yet. * - * We also do a try lock because otherwise we could deadlock. This is - * because the shrinker for this filesystem may be invoked while we are - * in a path that is holding the mmap lock in write mode. For example in - * a reflink operation while COWing an extent buffer, when allocating - * pages for a new extent buffer and under memory pressure, the shrinker - * may be invoked, and therefore we would deadlock by attempting to read - * lock the mmap lock while we are holding already a write lock on it. + * We also do a try lock because we don't want to block for too long and + * we are holding the extent map tree's lock in write mode. */ if (!down_read_trylock(&inode->i_mmap_lock)) return 0; - /* - * We want to be fast so if the lock is busy we don't want to spend time - * waiting for it - either some task is about to do IO for the inode or - * we may have another task shrinking extent maps, here in this code, so - * skip this inode. - */ - if (!write_trylock(&tree->lock)) { - up_read(&inode->i_mmap_lock); - return 0; - } - node = rb_first(&tree->root); while (node) { struct rb_node *next = rb_next(node); @@ -1201,12 +1187,61 @@ next: break; node = next; } - write_unlock(&tree->lock); up_read(&inode->i_mmap_lock); return nr_dropped; } +static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root, + u64 min_ino) +{ + struct btrfs_inode *inode; + unsigned long from = min_ino; + + xa_lock(&root->inodes); + while (true) { + struct extent_map_tree *tree; + + inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); + if (!inode) + break; + + tree = &inode->extent_tree; + + /* + * We want to be fast so if the lock is busy we don't want to + * spend time waiting for it (some task is about to do IO for + * the inode). + */ + if (!write_trylock(&tree->lock)) + goto next; + + /* + * Skip inode if it doesn't have loaded extent maps, so we avoid + * getting a reference and doing an iput later. This includes + * cases like files that were opened for things like stat(2), or + * files with all extent maps previously released through the + * release folio callback (btrfs_release_folio()) or released in + * a previous run, or directories which never have extent maps. + */ + if (RB_EMPTY_ROOT(&tree->root)) { + write_unlock(&tree->lock); + goto next; + } + + if (igrab(&inode->vfs_inode)) + break; + + write_unlock(&tree->lock); +next: + from = btrfs_ino(inode) + 1; + cond_resched_lock(&root->inodes.xa_lock); + } + xa_unlock(&root->inodes); + + return inode; +} + static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1214,9 +1249,10 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx long nr_dropped = 0; u64 min_ino = fs_info->em_shrinker_last_ino + 1; - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); + write_unlock(&inode->extent_tree.lock); min_ino = btrfs_ino(inode) + 1; fs_info->em_shrinker_last_ino = btrfs_ino(inode); @@ -1227,7 +1263,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx cond_resched(); - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); } if (inode) { -- 2.51.0 From 15b3b3254d1453a8db038b7d44b311a2d6c71f98 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 15 Feb 2025 11:11:29 +0000 Subject: [PATCH 14/16] btrfs: do regular iput instead of delayed iput during extent map shrinking The extent map shrinker now runs in the system unbound workqueue and no longer in kswapd context so it can directly do an iput() on inodes even if that blocks or needs to acquire any lock (we aren't holding any locks when requesting the delayed iput from the shrinker). So we don't need to add a delayed iput, wake up the cleaner and delegate the iput() to the cleaner, which also adds extra contention on the spinlock that protects the delayed iputs list. Reported-by: Ivan Shapovalov Tested-by: Ivan Shapovalov Link: https://lore.kernel.org/linux-btrfs/0414d690ac5680d0d77dfc930606cdc36e42e12f.camel@intelfx.name/ CC: stable@vger.kernel.org # 6.12+ Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 8c6b85ffd18f..7f46abbd6311 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1256,7 +1256,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx min_ino = btrfs_ino(inode) + 1; fs_info->em_shrinker_last_ino = btrfs_ino(inode); - btrfs_add_delayed_iput(inode); + iput(&inode->vfs_inode); if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info)) break; -- 2.51.0 From b1bf18223a8340cf5d52162d320badcfe07b905d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 17 Feb 2025 20:16:39 +1030 Subject: [PATCH 15/16] btrfs: output an error message if btrfs failed to find the seed fsid [BUG] If btrfs failed to locate the seed device for whatever reason, mounting the sprouted device will fail without any meaning error message: # mkfs.btrfs -f /dev/test/scratch1 # btrfstune -S1 /dev/test/scratch1 # mount /dev/test/scratch1 /mnt/btrfs # btrfs dev add -f /dev/test/scratch2 /mnt/btrfs # umount /mnt/btrfs # btrfs dev scan -u # btrfs mount /dev/test/scratch2 /mnt/btrfs mount: /mnt/btrfs: fsconfig system call failed: No such file or directory. dmesg(1) may have more information after failed mount system call. # dmesg -t | tail -n6 BTRFS info (device dm-5): first mount of filesystem 64252ded-5953-4868-b962-cea48f7ac4ea BTRFS info (device dm-5): using crc32c (crc32c-generic) checksum algorithm BTRFS info (device dm-5): using free-space-tree BTRFS error (device dm-5): failed to read chunk tree: -2 BTRFS error (device dm-5): open_ctree failed: -2 [CAUSE] The failure to mount is pretty straight forward, just unable to find the seed device and its fsid, caused by `btrfs dev scan -u`. But the lack of any useful info is a problem. [FIX] Just add an extra error message in open_seed_devices() to indicate the error. Now the error message would look like this: BTRFS info (device dm-4): first mount of filesystem 7769223d-4db1-4e4c-ac29-0a96f53576ab BTRFS info (device dm-4): using crc32c (crc32c-generic) checksum algorithm BTRFS info (device dm-4): using free-space-tree BTRFS error (device dm-4): failed to find fsid e87c12e6-584b-4e98-8b88-962c33a619ff when attempting to open seed devices BTRFS error (device dm-4): failed to read chunk tree: -2 BTRFS error (device dm-4): open_ctree failed: -2 Link: https://github.com/kdave/btrfs-progs/issues/959 Reviewed-by: Anand Jain Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a594f66daedf..f6ae76815e4b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7196,8 +7196,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { - if (!btrfs_test_opt(fs_info, DEGRADED)) + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_err(fs_info, + "failed to find fsid %pU when attempting to open seed devices", + fsid); return ERR_PTR(-ENOENT); + } fs_devices = alloc_fs_devices(fsid); if (IS_ERR(fs_devices)) -- 2.51.0 From efa11fd269c139e29b71ec21bc9c9c0063fde40d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 19 Feb 2025 09:06:33 +1030 Subject: [PATCH 16/16] btrfs: fix data overwriting bug during buffered write when block size < page size [BUG] When running generic/418 with a btrfs whose block size < page size (subpage cases), it always fails. And the following minimal reproducer is more than enough to trigger it reliably: workload() { mkfs.btrfs -s 4k -f $dev > /dev/null dmesg -C mount $dev $mnt $fsstree_dir/src/dio-invalidate-cache -r -b 4096 -n 3 -i 1 -f $mnt/diotest ret=$? umount $mnt stop_trace if [ $ret -ne 0 ]; then fail fi } for (( i = 0; i < 1024; i++)); do echo "=== $i/$runtime ===" workload done [CAUSE] With extra trace printk added to the following functions: - btrfs_buffered_write() * Which folio is touched * The file offset (start) where the buffered write is at * How many bytes are copied * The content of the write (the first 2 bytes) - submit_one_sector() * Which folio is touched * The position inside the folio * The content of the page cache (the first 2 bytes) - pagecache_isize_extended() * The parameters of the function itself * The parameters of the folio_zero_range() Which are enough to show the problem: 22.158114: btrfs_buffered_write: folio pos=0 start=0 copied=4096 content=0x0101 22.158161: submit_one_sector: r/i=5/257 folio=0 pos=0 content=0x0101 22.158609: btrfs_buffered_write: folio pos=0 start=4096 copied=4096 content=0x0101 22.158634: btrfs_buffered_write: folio pos=0 start=8192 copied=4096 content=0x0101 22.158650: pagecache_isize_extended: folio=0 from=4096 to=8192 bsize=4096 zero off=4096 len=8192 22.158682: submit_one_sector: r/i=5/257 folio=0 pos=4096 content=0x0000 22.158686: submit_one_sector: r/i=5/257 folio=0 pos=8192 content=0x0101 The tool dio-invalidate-cache will start 3 threads, each doing a buffered write with 0x01 at offset 0, 4096 and 8192, do a fsync, then do a direct read, and compare the read buffer with the write buffer. Note that all 3 btrfs_buffered_write() are writing the correct 0x01 into the page cache. But at submit_one_sector(), at file offset 4096, the content is zeroed out, by pagecache_isize_extended(). The race happens like this: Thread A is writing into range [4K, 8K). Thread B is writing into range [8K, 12k). Thread A | Thread B -------------------------------------+------------------------------------ btrfs_buffered_write() | btrfs_buffered_write() |- old_isize = 4K; | |- old_isize = 4096; |- btrfs_inode_lock() | | |- write into folio range [4K, 8K) | | |- pagecache_isize_extended() | | | extend isize from 4096 to 8192 | | | no folio_zero_range() called | | |- btrfs_inode_lock() | | | |- btrfs_inode_lock() | |- write into folio range [8K, 12K) | |- pagecache_isize_extended() | | calling folio_zero_range(4K, 8K) | | This is caused by the old_isize is | | grabbed too early, without any | | inode lock. | |- btrfs_inode_unlock() The @old_isize is grabbed without inode lock, causing race between two buffered write threads and making pagecache_isize_extended() to zero range which is still containing cached data. And this is only affecting subpage btrfs, because for regular blocksize == page size case, the function pagecache_isize_extended() will do nothing if the block size >= page size. [FIX] Grab the old i_size while holding the inode lock. This means each buffered write thread will have a stable view of the old inode size, thus avoid the above race. CC: stable@vger.kernel.org # 5.15+ Fixes: 5e8b9ef30392 ("btrfs: move pos increment and pagecache extension to btrfs_buffered_write") Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/file.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ed3c0d6546c5..0b568c8d24cb 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1090,7 +1090,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) u64 lockend; size_t num_written = 0; ssize_t ret; - loff_t old_isize = i_size_read(inode); + loff_t old_isize; unsigned int ilock_flags = 0; const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); @@ -1103,6 +1103,13 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) if (ret < 0) return ret; + /* + * We can only trust the isize with inode lock held, or it can race with + * other buffered writes and cause incorrect call of + * pagecache_isize_extended() to overwrite existing data. + */ + old_isize = i_size_read(inode); + ret = generic_write_checks(iocb, i); if (ret <= 0) goto out; -- 2.51.0