From ace149e0830c380ddfce7e466fe860ca502fe4ee Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 13 Sep 2024 13:57:04 -0400 Subject: [PATCH 01/16] filemap: Fix bounds checking in filemap_read() If the caller supplies an iocb->ki_pos value that is close to the filesystem upper limit, and an iterator with a count that causes us to overflow that limit, then filemap_read() enters an infinite loop. This behaviour was discovered when testing xfstests generic/525 with the "localio" optimisation for loopback NFS mounts. Reported-by: Mike Snitzer Fixes: c2a9737f45e2 ("vfs,mm: fix a dead loop in truncate_inode_pages_range()") Tested-by: Mike Snitzer Signed-off-by: Trond Myklebust Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 36d22968be9a..56fa431c52af 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2625,7 +2625,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(!iov_iter_count(iter))) return 0; - iov_iter_truncate(iter, inode->i_sb->s_maxbytes); + iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); folio_batch_init(&fbatch); do { -- 2.50.1 From 2d5404caa8c7bb5c4e0435f94b28834ae5456623 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 10 Nov 2024 14:19:35 -0800 Subject: [PATCH 02/16] Linux 6.12-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b8efbfe9da94..79192a3024bf 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 12 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.50.1 From 8cca35cb29f81eba3e96ec44dad8696c8a2f9138 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 10 Sep 2024 09:55:01 +0200 Subject: [PATCH 03/16] btrfs: don't take dev_replace rwsem on task already holding it Running fstests btrfs/011 with MKFS_OPTIONS="-O rst" to force the usage of the RAID stripe-tree, we get the following splat from lockdep: BTRFS info (device sdd): dev_replace from /dev/sdd (devid 1) to /dev/sdb started ============================================ WARNING: possible recursive locking detected 6.11.0-rc3-btrfs-for-next #599 Not tainted -------------------------------------------- btrfs/2326 is trying to acquire lock: ffff88810f215c98 (&fs_info->dev_replace.rwsem){++++}-{3:3}, at: btrfs_map_block+0x39f/0x2250 but task is already holding lock: ffff88810f215c98 (&fs_info->dev_replace.rwsem){++++}-{3:3}, at: btrfs_map_block+0x39f/0x2250 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&fs_info->dev_replace.rwsem); lock(&fs_info->dev_replace.rwsem); *** DEADLOCK *** May be due to missing lock nesting notation 1 lock held by btrfs/2326: #0: ffff88810f215c98 (&fs_info->dev_replace.rwsem){++++}-{3:3}, at: btrfs_map_block+0x39f/0x2250 stack backtrace: CPU: 1 UID: 0 PID: 2326 Comm: btrfs Not tainted 6.11.0-rc3-btrfs-for-next #599 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Call Trace: dump_stack_lvl+0x5b/0x80 __lock_acquire+0x2798/0x69d0 ? __pfx___lock_acquire+0x10/0x10 ? __pfx___lock_acquire+0x10/0x10 lock_acquire+0x19d/0x4a0 ? btrfs_map_block+0x39f/0x2250 ? __pfx_lock_acquire+0x10/0x10 ? find_held_lock+0x2d/0x110 ? lock_is_held_type+0x8f/0x100 down_read+0x8e/0x440 ? btrfs_map_block+0x39f/0x2250 ? __pfx_down_read+0x10/0x10 ? do_raw_read_unlock+0x44/0x70 ? _raw_read_unlock+0x23/0x40 btrfs_map_block+0x39f/0x2250 ? btrfs_dev_replace_by_ioctl+0xd69/0x1d00 ? 
btrfs_bio_counter_inc_blocked+0xd9/0x2e0 ? __kasan_slab_alloc+0x6e/0x70 ? __pfx_btrfs_map_block+0x10/0x10 ? __pfx_btrfs_bio_counter_inc_blocked+0x10/0x10 ? kmem_cache_alloc_noprof+0x1f2/0x300 ? mempool_alloc_noprof+0xed/0x2b0 btrfs_submit_chunk+0x28d/0x17e0 ? __pfx_btrfs_submit_chunk+0x10/0x10 ? bvec_alloc+0xd7/0x1b0 ? bio_add_folio+0x171/0x270 ? __pfx_bio_add_folio+0x10/0x10 ? __kasan_check_read+0x20/0x20 btrfs_submit_bio+0x37/0x80 read_extent_buffer_pages+0x3df/0x6c0 btrfs_read_extent_buffer+0x13e/0x5f0 read_tree_block+0x81/0xe0 read_block_for_search+0x4bd/0x7a0 ? __pfx_read_block_for_search+0x10/0x10 btrfs_search_slot+0x78d/0x2720 ? __pfx_btrfs_search_slot+0x10/0x10 ? lock_is_held_type+0x8f/0x100 ? kasan_save_track+0x14/0x30 ? __kasan_slab_alloc+0x6e/0x70 ? kmem_cache_alloc_noprof+0x1f2/0x300 btrfs_get_raid_extent_offset+0x181/0x820 ? __pfx_lock_acquire+0x10/0x10 ? __pfx_btrfs_get_raid_extent_offset+0x10/0x10 ? down_read+0x194/0x440 ? __pfx_down_read+0x10/0x10 ? do_raw_read_unlock+0x44/0x70 ? _raw_read_unlock+0x23/0x40 btrfs_map_block+0x5b5/0x2250 ? __pfx_btrfs_map_block+0x10/0x10 scrub_submit_initial_read+0x8fe/0x11b0 ? __pfx_scrub_submit_initial_read+0x10/0x10 submit_initial_group_read+0x161/0x3a0 ? lock_release+0x20e/0x710 ? __pfx_submit_initial_group_read+0x10/0x10 ? __pfx_lock_release+0x10/0x10 scrub_simple_mirror.isra.0+0x3eb/0x580 scrub_stripe+0xe4d/0x1440 ? lock_release+0x20e/0x710 ? __pfx_scrub_stripe+0x10/0x10 ? __pfx_lock_release+0x10/0x10 ? do_raw_read_unlock+0x44/0x70 ? _raw_read_unlock+0x23/0x40 scrub_chunk+0x257/0x4a0 scrub_enumerate_chunks+0x64c/0xf70 ? __mutex_unlock_slowpath+0x147/0x5f0 ? __pfx_scrub_enumerate_chunks+0x10/0x10 ? bit_wait_timeout+0xb0/0x170 ? __up_read+0x189/0x700 ? scrub_workers_get+0x231/0x300 ? up_write+0x490/0x4f0 btrfs_scrub_dev+0x52e/0xcd0 ? create_pending_snapshots+0x230/0x250 ? __pfx_btrfs_scrub_dev+0x10/0x10 btrfs_dev_replace_by_ioctl+0xd69/0x1d00 ? lock_acquire+0x19d/0x4a0 ? __pfx_btrfs_dev_replace_by_ioctl+0x10/0x10 ? lock_release+0x20e/0x710 ? btrfs_ioctl+0xa09/0x74f0 ? __pfx_lock_release+0x10/0x10 ? do_raw_spin_lock+0x11e/0x240 ? __pfx_do_raw_spin_lock+0x10/0x10 btrfs_ioctl+0xa14/0x74f0 ? lock_acquire+0x19d/0x4a0 ? find_held_lock+0x2d/0x110 ? __pfx_btrfs_ioctl+0x10/0x10 ? lock_release+0x20e/0x710 ? do_sigaction+0x3f0/0x860 ? __pfx_do_vfs_ioctl+0x10/0x10 ? do_raw_spin_lock+0x11e/0x240 ? lockdep_hardirqs_on_prepare+0x270/0x3e0 ? _raw_spin_unlock_irq+0x28/0x50 ? do_sigaction+0x3f0/0x860 ? __pfx_do_sigaction+0x10/0x10 ? __x64_sys_rt_sigaction+0x18e/0x1e0 ? __pfx___x64_sys_rt_sigaction+0x10/0x10 ? __x64_sys_close+0x7c/0xd0 __x64_sys_ioctl+0x137/0x190 do_syscall_64+0x71/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f0bd1114f9b Code: Unable to access opcode bytes at 0x7f0bd1114f71. RSP: 002b:00007ffc8a8c3130 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f0bd1114f9b RDX: 00007ffc8a8c35e0 RSI: 00000000ca289435 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000007 R10: 0000000000000008 R11: 0000000000000246 R12: 00007ffc8a8c6c85 R13: 00000000398e72a0 R14: 0000000000004361 R15: 0000000000000004 This happens because on RAID stripe-tree filesystems we recurse back into btrfs_map_block() on scrub to perform the logical to device physical mapping. But as the device replace task is already holding the dev_replace::rwsem we deadlock. So don't take the dev_replace::rwsem in case our task is the task performing the device replace. 
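The shape of the fix can be illustrated outside btrfs: record which task is running the replace, and have the read-side helpers skip the lock when called from that same task. The sketch below is a hypothetical userspace analogue using a POSIX rwlock (the struct, the replace_running flag and the helper names are invented for the example; the actual patch simply compares dev_replace->replace_task against current):

    #include <pthread.h>
    #include <stdbool.h>

    /* Hypothetical analogue of btrfs_dev_replace: a rwlock plus the task
     * owning the long-running replace operation. */
    struct replace_ctx {
            pthread_rwlock_t rwsem;
            pthread_t replace_task;    /* valid only while replace_running */
            bool replace_running;
    };

    /* Read-side helpers: skip the lock when called from the replace task
     * itself, since that task may already hold the rwsem further up its
     * call chain - the recursion the lockdep splat above complains about. */
    static void ctx_read_lock(struct replace_ctx *ctx)
    {
            if (ctx->replace_running &&
                pthread_equal(ctx->replace_task, pthread_self()))
                    return;
            pthread_rwlock_rdlock(&ctx->rwsem);
    }

    static void ctx_read_unlock(struct replace_ctx *ctx)
    {
            if (ctx->replace_running &&
                pthread_equal(ctx->replace_task, pthread_self()))
                    return;
            pthread_rwlock_unlock(&ctx->rwsem);
    }

    /* The replace task records itself as the owner while the operation runs. */
    static void ctx_start_replace(struct replace_ctx *ctx)
    {
            pthread_rwlock_wrlock(&ctx->rwsem);
            ctx->replace_task = pthread_self();
            ctx->replace_running = true;
    }

    int main(void)
    {
            struct replace_ctx ctx = { .replace_running = false };

            pthread_rwlock_init(&ctx.rwsem, NULL);
            ctx_start_replace(&ctx);
            /* Recursing from the same task does not block on our own hold. */
            ctx_read_lock(&ctx);
            ctx_read_unlock(&ctx);
            return 0;
    }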
Suggested-by: Filipe Manana Signed-off-by: Johannes Thumshirn Reviewed-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 ++ fs/btrfs/fs.h | 2 ++ fs/btrfs/volumes.c | 8 +++++--- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 83d5cdd77f29..604399e59a3d 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -641,6 +641,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return ret; down_write(&dev_replace->rwsem); + dev_replace->replace_task = current; switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -994,6 +995,7 @@ error: list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); fs_devices->rw_devices++; + dev_replace->replace_task = NULL; up_write(&dev_replace->rwsem); btrfs_rm_dev_replace_blocked(fs_info); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 79f64e383edd..cbfb225858a5 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -317,6 +317,8 @@ struct btrfs_dev_replace { struct percpu_counter bio_counter; wait_queue_head_t replace_wait; + + struct task_struct *replace_task; }; /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index eb51b609190f..920df7585b0d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6481,13 +6481,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, max_len = btrfs_max_io_len(map, map_offset, &io_geom); *length = min_t(u64, map->chunk_len - map_offset, max_len); - down_read(&dev_replace->rwsem); + if (dev_replace->replace_task != current) + down_read(&dev_replace->rwsem); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); /* * Hold the semaphore for read during the whole operation, write is * requested at commit time but must wait. */ - if (!dev_replace_is_ongoing) + if (!dev_replace_is_ongoing && dev_replace->replace_task != current) up_read(&dev_replace->rwsem); switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { @@ -6627,7 +6629,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, bioc->mirror_num = io_geom.mirror_num; out: - if (dev_replace_is_ongoing) { + if (dev_replace_is_ongoing && dev_replace->replace_task != current) { lockdep_assert_held(&dev_replace->rwsem); /* Unlock and let waiting writers proceed */ up_read(&dev_replace->rwsem); -- 2.50.1 From c186345a6b4b8ff082e5ef9515b782704dbba6be Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 16 Sep 2024 17:55:42 +0930 Subject: [PATCH 04/16] btrfs: make assert_rbio() to only check CONFIG_BTRFS_ASSERT According to the description, CONFIG_BTRFS_DEBUG is only for extra debug info, meanwhile sanity checks should be managed by CONFIG_BTRFS_ASSERT. There is no need to check both to enable assert_rbio(). Just remove the check for CONFIG_BTRFS_DEBUG. 
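For readers unfamiliar with this pattern: IS_ENABLED() evaluates to an ordinary compile-time constant, so a single "if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return;" is enough for the compiler to drop the checks entirely in non-assert builds; gating on a second option adds nothing. A minimal userspace analogue (the FOO_ASSERT_ENABLED switch and assert_positive() are made-up names for illustration):

    #include <stdio.h>
    #include <stdlib.h>

    /* Compile-time switch standing in for CONFIG_BTRFS_ASSERT. */
    #ifndef FOO_ASSERT_ENABLED
    #define FOO_ASSERT_ENABLED 0
    #endif

    /* Analogue of assert_rbio(): gate sanity checks on exactly one option.
     * FOO_ASSERT_ENABLED is a constant, so the dead branch is removed by
     * the optimizer, the same effect as IS_ENABLED(CONFIG_BTRFS_ASSERT). */
    static void assert_positive(int value)
    {
            if (!FOO_ASSERT_ENABLED)
                    return;

            if (value <= 0) {
                    fprintf(stderr, "assertion failed: value=%d\n", value);
                    abort();
            }
    }

    int main(void)
    {
            assert_positive(42); /* no-op unless built with -DFOO_ASSERT_ENABLED=1 */
            return 0;
    }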
Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 39bec672df0c..cdd373c27784 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1272,8 +1272,7 @@ static inline void bio_list_put(struct bio_list *bio_list) static void assert_rbio(struct btrfs_raid_bio *rbio) { - if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) || - !IS_ENABLED(CONFIG_BTRFS_ASSERT)) + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; /* -- 2.50.1 From 67cd3f22176904e7445fea5f307f6fa2ad1d89e4 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 16 Sep 2024 18:18:25 +0930 Subject: [PATCH 05/16] btrfs: split out CONFIG_BTRFS_EXPERIMENTAL from CONFIG_BTRFS_DEBUG Currently CONFIG_BTRFS_EXPERIMENTAL is not only for the extra debugging output, but also for experimental features. This is not ideal to distinguish planned but not yet stable features from those purely designed for debugging. This patch splits the following features into CONFIG_BTRFS_EXPERIMENTAL: - Extent map shrinker This seems to be the first one to exit experimental. - Extent tree v2 This seems to be the last one to graduate from experimental. - Raid stripe tree - Csum offload mode - Send protocol v3 Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Kconfig | 26 ++++++++++++++++++++++++++ fs/btrfs/bio.c | 2 +- fs/btrfs/fs.h | 4 ++-- fs/btrfs/send.h | 2 +- fs/btrfs/super.c | 6 +++--- fs/btrfs/sysfs.c | 4 ++-- fs/btrfs/volumes.h | 4 ++-- 7 files changed, 37 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 4fb925e8c981..fa8515598341 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -78,6 +78,32 @@ config BTRFS_ASSERT If unsure, say N. +config BTRFS_EXPERIMENTAL + bool "Btrfs experimental features" + depends on BTRFS_FS + default n + help + Enable experimental features. These features may not be stable enough + for end users. This is meant for btrfs developers or users who wish + to test the functionality and report problems. + + Current list: + + - extent map shrinker - performance problems with too frequent shrinks + + - send stream protocol v3 - fs-verity support + + - checksum offload mode - sysfs knob to affect when checksums are + calculated (at IO time, or in a thread) + + - raid-stripe-tree - additional mapping of extents to devices to + support RAID1* profiles on zoned devices, + RAID56 not yet supported + + - extent tree v2 - complex rework of extent tracking + + If unsure, say N. 
+ config BTRFS_FS_REF_VERIFY bool "Btrfs with the ref verify tool compiled in" depends on BTRFS_FS diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 7e0f9600b80c..1f216d07eff6 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -587,7 +587,7 @@ static bool should_async_write(struct btrfs_bio *bbio) { bool auto_csum_mode = true; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices; enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index cbfb225858a5..785ec15c1b84 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -263,10 +263,10 @@ enum { BTRFS_FEATURE_INCOMPAT_ZONED | \ BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* * Features under developmen like Extent tree v2 support is enabled - * only under CONFIG_BTRFS_DEBUG. + * only under CONFIG_BTRFS_EXPERIMENTAL */ #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index b07f4aa66878..9309886c5ea1 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -16,7 +16,7 @@ struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" /* Conditional support for the upcoming protocol version. */ -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL #define BTRFS_SEND_STREAM_VERSION 3 #else #define BTRFS_SEND_STREAM_VERSION 2 diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c64d07134122..029e2bb4466b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2396,10 +2396,10 @@ static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_contro trace_btrfs_extent_map_shrinker_count(fs_info, nr); /* - * Only report the real number for DEBUG builds, as there are reports of - * serious performance degradation caused by too frequent shrinks. + * Only report the real number for EXPERIMENTAL builds, as there are + * reports of serious performance degradation caused by too frequent shrinks. */ - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) + if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) return nr; return 0; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 03926ad467c9..b843308e2bc6 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1390,7 +1390,7 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL static ssize_t btrfs_offload_csum_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1450,7 +1450,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_ATTR_PTR(, offload_csum), #endif NULL, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 4481575dd70f..26e35fc1c8fd 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -306,7 +306,7 @@ enum btrfs_read_policy { BTRFS_NR_READ_POLICY, }; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* * Checksum mode - offload it to workqueues or do it synchronously in * btrfs_submit_chunk(). @@ -430,7 +430,7 @@ struct btrfs_fs_devices { /* Policy used to read the mirrored stripes. 
*/ enum btrfs_read_policy read_policy; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* Checksum mode - offload it or do it synchronously. */ enum btrfs_offload_csum_mode offload_csum_mode; #endif -- 2.50.1 From f6ebedb09bb276256e084196e2322562dc4aac10 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 6 Sep 2024 14:14:55 +0930 Subject: [PATCH 06/16] btrfs: zlib: make the compression path to handle sector size < page size Inside zlib_compress_folios(), each time we switch the input page cache, the @start is increased by PAGE_SIZE. But for the incoming compression support for sector size < page size (previously we support compression only when the range is fully page aligned), this is not going to handle the following case: 0 32K 64K 96K | |///////////||///////////| @start has the initial value 32K, indicating the start filepos of the to-be-compressed range. And when grabbing the first page as input, we always call "start += PAGE_SIZE;". But since @start is starting at 32K, it will be increased by 64K, resulting it to be 96K for the next range, causing incorrect input range and corruption for the future subpage compression. Fix it by only increase @start by the input size. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 100abc00b794..ddf0d5a448a7 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -194,7 +194,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, pg_off = offset_in_page(start); cur_len = btrfs_calc_input_length(orig_end, start); data_in = kmap_local_folio(in_folio, pg_off); - start += PAGE_SIZE; + start += cur_len; workspace->strm.next_in = data_in; workspace->strm.avail_in = cur_len; } -- 2.50.1 From 90275a7762c85bde21c0884404993ed20e265d86 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 6 Sep 2024 13:40:30 +0930 Subject: [PATCH 07/16] btrfs: zstd: make the compression path to handle sector size < page size Inside zstd_compress_folios(), after exhausted one input page, we need to switch to the next page as input. However when counting the total input bytes (@tot_in), we always increase it by PAGE_SIZE. For the following case, it can cause incorrect value: 0 32K 64K 96K | |///////////||///////////| After compressing range [32K, 64K), we switch to the next page, and increasing @tot_in by 64K, while we only read 32K. This will cause the @total_in to return a value larger than the input length. Fix it by only increase @tot_in by the input size. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/zstd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 866607fd3e58..15f8a83165a3 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -495,7 +495,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { - tot_in += PAGE_SIZE; + tot_in += workspace->in_buf.size; kunmap_local(workspace->in_buf.src); workspace->in_buf.src = NULL; folio_put(in_folio); -- 2.50.1 From dd5e2762544d9bd59c101de0afaad1317c2876a0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 6 Sep 2024 14:27:56 +0930 Subject: [PATCH 08/16] btrfs: compression: add an ASSERT() to ensure the read-in length is sane There are already two bugs (one in zlib, one in zstd) that involved compression path is not handling sector size < page size cases well. 
So it makes more sense to make sure that btrfs_compress_folios() returns Since we already have two bugs (one in zlib, one in zstd) in the compression path resulting the @total_in be to larger than the to-be-compressed range length, there is enough reason to add an ASSERT() to make sure the total read-in length doesn't exceed the input length. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/compression.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 90aef2627ca2..6e9c4a5e0d51 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1030,6 +1030,7 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping { int type = btrfs_compress_type(type_level); int level = btrfs_compress_level(type_level); + const unsigned long orig_len = *total_out; struct list_head *workspace; int ret; @@ -1037,6 +1038,8 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping workspace = get_workspace(type, level); ret = compression_compress_pages(type, workspace, mapping, start, folios, out_folios, total_in, total_out); + /* The total read-in bytes should be no larger than the input. */ + ASSERT(*total_in <= orig_len); put_workspace(type, workspace); return ret; } -- 2.50.1 From a8706d0271a8ef7d8d0916d78ab4821e9ccb7464 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 9 Sep 2024 08:22:06 +0930 Subject: [PATCH 09/16] btrfs: wait for writeback if sector size is smaller than page size [PROBLEM] If sector perfect compression is enabled for sector size < page size case, the following case can lead dirty ranges not being written back: 0 32K 64K 96K 128K | |///////||//////| |/| 124K In above example, the page size is 64K, and we need to write back above two pages. - Submit for page 0 (main thread) We found delalloc range [32K, 96K), which can be compressed. So we queue an async range for [32K, 96K). This means, the page unlock/clearing dirty/setting writeback will all happen in a workqueue context. - The compression is done, and compressed range is submitted (workqueue) Since the compression is done in asynchronously, the compression can be done before the main thread to submit for page 64K. Now the whole range [32K, 96K), involving two pages, will be marked writeback. - Submit for page 64K (main thread) extent_write_cache_pages() got its wbc->sync_mode is WB_SYNC_NONE, so it skips the writeback wait. And unlock the page and exit. This means the dirty range [124K, 128K) will never be submitted, until next writeback happens for page 64K. This will never happen for previous kernels because: - For sector size == page size case Since one page is one sector, if a page is marked writeback it will not have dirty flags. So this corner case will never hit. - For sector size < page size case We never do subpage compression, a range can only be submitted for compression if the range is fully page aligned. This change makes the subpage behavior mostly the same as non-subpage cases. [ENHANCEMENT] Instead of relying WB_SYNC_NONE check only, if it's a subpage case, then always wait for writeback flags. 
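To make the mixed writeback/dirty state concrete, here is an illustrative, self-contained userspace sketch (not btrfs code; the 4K-sector/64K-page sizes and the range_bits() helper are assumptions made for the example). It models the second page, covering file range [64K, 128K): the compressed range leaves [64K, 96K) under writeback while [124K, 128K) is still dirty, so the page must be waited on rather than skipped.

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE   (64 * 1024)
    #define SECTOR_SIZE (4 * 1024)
    #define SECTORS     (PAGE_SIZE / SECTOR_SIZE)  /* 16 sectors per page */

    /* Set bits for [start, start + len), offsets relative to the page. */
    static uint32_t range_bits(uint32_t start, uint32_t len)
    {
            uint32_t first = start / SECTOR_SIZE;
            uint32_t nbits = len / SECTOR_SIZE;

            return ((1u << nbits) - 1) << first;
    }

    int main(void)
    {
            /* Offsets below are relative to the page covering [64K, 128K). */
            uint32_t writeback = range_bits(0, 32 * 1024);          /* file [64K, 96K)   */
            uint32_t dirty     = range_bits(60 * 1024, 4 * 1024);   /* file [124K, 128K) */

            printf("writeback bitmap: 0x%04x\n", (unsigned)writeback);
            printf("dirty bitmap:     0x%04x\n", (unsigned)dirty);

            /* Both bitmaps are non-zero at once: the page is under writeback
             * *and* still has a dirty sector, which is why the subpage path
             * has to wait for writeback instead of skipping the page. */
            if (writeback && dirty)
                    printf("page has mixed writeback and dirty sectors\n");
            return 0;
    }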
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 872cca54cc6c..7620d335f1ec 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2116,7 +2116,27 @@ retry: continue; } - if (wbc->sync_mode != WB_SYNC_NONE) { + /* + * For subpage case, compression can lead to mixed + * writeback and dirty flags, e.g: + * 0 32K 64K 96K 128K + * | |//////||/////| |//| + * + * In above case, [32K, 96K) is asynchronously submitted + * for compression, and [124K, 128K) needs to be written back. + * + * If we didn't wait wrtiteback for page 64K, [128K, 128K) + * won't be submitted as the page still has writeback flag + * and will be skipped in the next check. + * + * This mixed writeback and dirty case is only possible for + * subpage case. + * + * TODO: Remove this check after migrating compression to + * regular submission. + */ + if (wbc->sync_mode != WB_SYNC_NONE || + btrfs_is_subpage(inode_to_fs_info(inode), mapping)) { if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); folio_wait_writeback(folio); -- 2.50.1 From a4ef54dbb576032ba31a646a5ffc8a26a83cb92c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 10 Sep 2024 16:12:57 +0930 Subject: [PATCH 10/16] btrfs: make extent_range_clear_dirty_for_io() to handle sector size < page size cases For btrfs with sector size < page size (e.g. 4K sector size, 64K page size), and enable the sector perfect compression support, then the following dirty range can lead to problems: 0 32K 64K 96K 128K | |///////||//////| |/| 124K In above case, if we start writeback for that inode, the last dirty range [124K, 128K) will not be submitted and cause reserved space leakage: - Start writeback for page 0 We find the range [32K, 96K) is suitable for compression, and queue it into a workqueue to do the delayed compression and submission. - Compression happens for range [32K, 96K) Function extent_range_clear_dirty_for_io() is called, however it is only doing full page handling, not considering any the extra bitmaps for subpage cases. That function will clear page dirty for both page 0 and page 64K. - Writeback for the inode is done Because page 64K has its dirty flag cleared, it will not be considered as a writeback target. This means the range [124K, 128K) will not be submitted, and reserved space for it will be leaked. Fix this problem by using the subpage helper to clear the dirty flag. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1e4ca1e7d2e5..686d39309410 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -902,7 +902,8 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e ret = PTR_ERR(folio); continue; } - folio_clear_dirty_for_io(folio); + btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start, + end + 1 - start); folio_put(folio); } return ret; -- 2.50.1 From 928b4de66ed3b0d9a6f201ce41ab2eed6ea2e7ef Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 11 Sep 2024 08:52:36 +0930 Subject: [PATCH 11/16] btrfs: do not assume the full page range is not dirty in extent_writepage_io() The function extent_writepage_io() will submit the dirty sectors inside the page for the write. 
But recently to co-operate with the incoming subpage compression enhancement, a new bitmap is introduced to btrfs_bio_ctrl::submit_bitmap, to only avoid a subset of the dirty range. This is because we can have the following cases with 64K page size: 0 16K 32K 48K 64K | |/////////| |/| 52K For range [16K, 32K), we queue the dirty range for compression, which is ran in a delayed workqueue. Then for range [48K, 52K), we go through the regular submission path. In that case, our btrfs_bio_ctrl::submit_bitmap will exclude the range [16K, 32K). The dirty flags for the range [16K, 32K) is only cleared when the compression is done, by the extent_clear_unlock_delalloc() call inside submit_one_async_extent(). This patch fix the false alert by removing the btrfs_folio_assert_not_dirty() check, since it's no longer correct for subpage compression cases. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7620d335f1ec..e5075b95cfd9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1391,8 +1391,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, goto out; submitted_io = true; } - - btrfs_folio_assert_not_dirty(fs_info, folio, start, len); out: /* * If we didn't submitted any sector (>= i_size), folio dirty get -- 2.50.1 From 2bca8eb0774d271b1077b72f1be135073e0a898f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 16 Sep 2024 08:03:00 +0930 Subject: [PATCH 12/16] btrfs: move the delalloc range bitmap search into extent_io.c Currently for subpage (sector size < page size) cases, we reuse subpage locked bitmap to find out all delalloc ranges we have locked, and run all those found ranges. However such reuse is not perfect, e.g.: 0 32K 64K 96K 128K | |////////||///////| |////| 120K For above range, writepage_delalloc() for page 0 will handle the range [32K, 96k), note delalloc range can be beyond the page boundary. But writepage_delalloc() for page 64K will only handle range [120K, 128K), as the previous run on page 0 has already handled range [64K, 96K). Meanwhile for the writeback we should expect range [64K, 96K) to also be locked, this leads to the mismatch from locked bitmap and delalloc range. This is not causing problems yet, but it's still an inconsistent behavior. So instead of relying on the subpage locked bitmap, move the delalloc range search using local @delalloc_bitmap, so that we can remove the existing btrfs_folio_find_writer_locked(). 
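The bitmap search this introduces boils down to "find the next run of set bits and translate it back to a byte range". Below is an illustrative standalone sketch of that logic (userspace C, not the btrfs helpers; next_bit() and the hard-coded 4K-sector/64K-page geometry are assumptions for the example, with simple loops standing in for find_next_bit()/find_next_zero_bit()):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SECTOR_SHIFT 12   /* 4K sectors */
    #define SECTORS      16   /* 64K page   */

    static unsigned int next_bit(uint32_t map, unsigned int from, bool set)
    {
            for (unsigned int i = from; i < SECTORS; i++)
                    if (!!(map & (1u << i)) == set)
                            return i;
            return SECTORS;
    }

    /*
     * Analogue of find_next_delalloc_bitmap(): starting at @start (relative
     * to the page), return the next run of set bits as a byte range.
     */
    static bool next_delalloc_range(uint32_t bitmap, uint64_t start,
                                    uint64_t *found_start, uint32_t *found_len)
    {
            unsigned int start_bit = start >> SECTOR_SHIFT;
            unsigned int first_set = next_bit(bitmap, start_bit, true);
            unsigned int first_zero;

            if (first_set >= SECTORS)
                    return false;
            first_zero = next_bit(bitmap, first_set, false);

            *found_start = (uint64_t)first_set << SECTOR_SHIFT;
            *found_len = (first_zero - first_set) << SECTOR_SHIFT;
            return true;
    }

    int main(void)
    {
            uint32_t bitmap = 0x00f6;  /* sectors 1,2 and 4-7 marked delalloc */
            uint64_t start = 0, found_start;
            uint32_t found_len;

            while (next_delalloc_range(bitmap, start, &found_start, &found_len)) {
                    printf("delalloc range [%llu, %llu)\n",
                           (unsigned long long)found_start,
                           (unsigned long long)(found_start + found_len));
                    start = found_start + found_len;
            }
            return 0;
    }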
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 44 ++++++++++++++++++++++++++++++++++++++++- fs/btrfs/subpage.c | 47 -------------------------------------------- fs/btrfs/subpage.h | 4 ---- 3 files changed, 43 insertions(+), 52 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e5075b95cfd9..40297345b89d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1102,6 +1102,45 @@ int btrfs_read_folio(struct file *file, struct folio *folio) return ret; } +static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap, + u64 start, u32 len) +{ + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + const u64 folio_start = folio_pos(folio); + unsigned int start_bit; + unsigned int nbits; + + ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE); + start_bit = (start - folio_start) >> fs_info->sectorsize_bits; + nbits = len >> fs_info->sectorsize_bits; + ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); + bitmap_set(delalloc_bitmap, start_bit, nbits); +} + +static bool find_next_delalloc_bitmap(struct folio *folio, + unsigned long *delalloc_bitmap, u64 start, + u64 *found_start, u32 *found_len) +{ + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + const u64 folio_start = folio_pos(folio); + const unsigned int bitmap_size = fs_info->sectors_per_page; + unsigned int start_bit; + unsigned int first_zero; + unsigned int first_set; + + ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE); + + start_bit = (start - folio_start) >> fs_info->sectorsize_bits; + first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); + if (first_set >= bitmap_size) + return false; + + *found_start = folio_start + (first_set << fs_info->sectorsize_bits); + first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set); + *found_len = (first_zero - first_set) << fs_info->sectorsize_bits; + return true; +} + /* * helper for extent_writepage(), doing all of the delayed allocation setup. * @@ -1121,6 +1160,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; + unsigned long delalloc_bitmap = 0; /* * Save the last found delalloc end. 
As the delalloc end can go beyond * page boundary, thus we cannot rely on subpage bitmap to locate the @@ -1148,6 +1188,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, delalloc_start = delalloc_end + 1; continue; } + set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, + min(delalloc_end, page_end) + 1 - delalloc_start); btrfs_folio_set_writer_lock(fs_info, folio, delalloc_start, min(delalloc_end, page_end) + 1 - delalloc_start); @@ -1175,7 +1217,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, found_len = last_delalloc_end + 1 - found_start; found = true; } else { - found = btrfs_subpage_find_writer_locked(fs_info, folio, + found = find_next_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, &found_start, &found_len); } if (!found) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index fe4d719d506b..17bc53a8df01 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -801,53 +801,6 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, spin_unlock_irqrestore(&subpage->lock, flags); } -/* - * Find any subpage writer locked range inside @folio, starting at file offset - * @search_start. The caller should ensure the folio is locked. - * - * Return true and update @found_start_ret and @found_len_ret to the first - * writer locked range. - * Return false if there is no writer locked range. - */ -bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 search_start, - u64 *found_start_ret, u32 *found_len_ret) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const u32 sectors_per_page = fs_info->sectors_per_page; - const unsigned int len = PAGE_SIZE - offset_in_page(search_start); - const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, - locked, search_start, len); - const unsigned int locked_bitmap_start = sectors_per_page * btrfs_bitmap_nr_locked; - const unsigned int locked_bitmap_end = locked_bitmap_start + sectors_per_page; - unsigned long flags; - int first_zero; - int first_set; - bool found = false; - - ASSERT(folio_test_locked(folio)); - spin_lock_irqsave(&subpage->lock, flags); - first_set = find_next_bit(subpage->bitmaps, locked_bitmap_end, start_bit); - if (first_set >= locked_bitmap_end) - goto out; - - found = true; - - *found_start_ret = folio_pos(folio) + - ((first_set - locked_bitmap_start) << fs_info->sectorsize_bits); - /* - * Since @first_set is ensured to be smaller than locked_bitmap_end - * here, @found_start_ret should be inside the folio. 
- */ - ASSERT(*found_start_ret < folio_pos(folio) + PAGE_SIZE); - - first_zero = find_next_zero_bit(subpage->bitmaps, locked_bitmap_end, first_set); - *found_len_ret = (first_zero - first_set) << fs_info->sectorsize_bits; -out: - spin_unlock_irqrestore(&subpage->lock, flags); - return found; -} - #define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ { \ const int sectors_per_page = fs_info->sectors_per_page; \ diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 4b85d91d0e18..d16fbddeda68 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -108,10 +108,6 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long bitmap); -bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 search_start, - u64 *found_start_ret, u32 *found_len_ret); - /* * Template for subpage related operations. * -- 2.50.1 From c96d0e3921419bd3e5d8a1f355970c8ae3047ef4 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 16 Sep 2024 08:12:40 +0930 Subject: [PATCH 13/16] btrfs: mark all dirty sectors as locked inside writepage_delalloc() Currently we only mark sectors as locked if there is a *NEW* delalloc range for it. But NEW delalloc range is not the same as dirty sectors we want to submit, e.g: 0 32K 64K 96K 128K | |////////||///////| |////| 120K For above 64K page size case, writepage_delalloc() for page 0 will find and lock the delalloc range [32K, 96K), which is beyond the page boundary. Then when writepage_delalloc() is called for the page 64K, since [64K, 96K) is already locked, only [120K, 128K) will be locked. This means, although range [64K, 96K) is dirty and will be submitted later by extent_writepage_io(), it will not be marked as locked. This is fine for now, as we call btrfs_folio_end_writer_lock_bitmap() to free every non-compressed sector, and compression is only allowed for full page range. But this is not safe for future sector perfect compression support, as this can lead to double folio unlock: Thread A | Thread B ---------------------------------------+-------------------------------- | submit_one_async_extent() | |- extent_clear_unlock_delalloc() extent_writepage() | |- btrfs_folio_end_writer_lock() |- btrfs_folio_end_writer_lock_bitmap()| |- btrfs_subpage_end_and_test_writer() | | | |- atomic_sub_and_test() | | | /* Now the atomic value is 0 */ |- if (atomic_read() == 0) | | |- folio_unlock() | |- folio_unlock() The root cause is the above range [64K, 96K) is dirtied and should also be locked but it isn't. So to make everything more consistent and prepare for the incoming sector perfect compression, mark all dirty sectors as locked. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 40297345b89d..89a7e85f2b38 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1171,6 +1171,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, u64 delalloc_end = page_end; u64 delalloc_to_write = 0; int ret = 0; + int bit; /* Save the dirty bitmap as our submission bitmap will be a subset of it. 
*/ if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { @@ -1180,6 +1181,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, bio_ctrl->submit_bitmap = 1; } + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + u64 start = page_start + (bit << fs_info->sectorsize_bits); + + btrfs_folio_set_writer_lock(fs_info, folio, start, fs_info->sectorsize); + } + /* Lock all (subpage) delalloc ranges inside the folio first. */ while (delalloc_start < page_end) { delalloc_end = page_end; @@ -1190,9 +1197,6 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, } set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, min(delalloc_end, page_end) + 1 - delalloc_start); - btrfs_folio_set_writer_lock(fs_info, folio, delalloc_start, - min(delalloc_end, page_end) + 1 - - delalloc_start); last_delalloc_end = delalloc_end; delalloc_start = delalloc_end + 1; } -- 2.50.1 From 1d2fbb7f1f9e33eb448c7d2e2ae883801c8c4a21 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 6 Sep 2024 12:22:56 +0930 Subject: [PATCH 14/16] btrfs: allow compression even if the range is not page aligned Previously for btrfs with sector size smaller than page size (subpage), we only allow compression if the range is fully page aligned. This is to work around the asynchronous submission of compressed range, which delayed the page unlock and writeback into a workqueue, furthermore asynchronous submission can lock multiple sector range across page boundary. Such asynchronous submission makes it very hard to co-operate with other regular writes. With the recent changes to the subpage folio unlock path, now asynchronous submission of compressed pages can co-operate with regular submission, so enable sector perfect compression if it's an experimental build. The ETA for moving this feature out of experimental is 6.15, and I hope all remaining corner cases can be exposed before that. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 41 +++++++---------------------------------- 1 file changed, 7 insertions(+), 34 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 686d39309410..355d83dd43c1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -832,32 +832,16 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, return 0; } /* - * Special check for subpage. + * Only enable sector perfect compression for experimental builds. * - * We lock the full page then run each delalloc range in the page, thus - * for the following case, we will hit some subpage specific corner case: + * This is a big feature change for subpage cases, and can hit + * different corner cases, so only limit this feature for + * experimental build for now. * - * 0 32K 64K - * | |///////| |///////| - * \- A \- B - * - * In above case, both range A and range B will try to unlock the full - * page [0, 64K), causing the one finished later will have page - * unlocked already, triggering various page lock requirement BUG_ON()s. - * - * So here we add an artificial limit that subpage compression can only - * if the range is fully page aligned. - * - * In theory we only need to ensure the first page is fully covered, but - * the tailing partial page will be locked until the full compression - * finishes, delaying the write of other range. - * - * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range - * first to prevent any submitted async extent to unlock the full page. 
- * By this, we can ensure for subpage case that only the last async_cow - * will unlock the full page. + * ETA for moving this out of experimental builds is 6.15. */ - if (fs_info->sectorsize < PAGE_SIZE) { + if (fs_info->sectorsize < PAGE_SIZE && + !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(end + 1)) return 0; @@ -1002,17 +986,6 @@ again: (start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed; - /* - * For subpage case, we require full page alignment for the sector - * aligned range. - * Thus we must also check against @actual_end, not just @end. - */ - if (blocksize < PAGE_SIZE) { - if (!PAGE_ALIGNED(start) || - !PAGE_ALIGNED(round_up(actual_end, blocksize))) - goto cleanup_and_bail_uncompressed; - } - total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); total_in = 0; -- 2.50.1 From 2e8b6bc0ab41ce41e6dfcc204b6cc01d5abbc952 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 24 Sep 2024 12:52:17 +0930 Subject: [PATCH 15/16] btrfs: avoid unnecessary device path update for the same device [PROBLEM] It is very common for udev to trigger device scan, and every time a mounted btrfs device got re-scan from different soft links, we will get some of unnecessary device path updates, this is especially common for LVM based storage: # lvs scratch1 test -wi-ao---- 10.00g scratch2 test -wi-a----- 10.00g scratch3 test -wi-a----- 10.00g scratch4 test -wi-a----- 10.00g scratch5 test -wi-a----- 10.00g test test -wi-a----- 10.00g # mkfs.btrfs -f /dev/test/scratch1 # mount /dev/test/scratch1 /mnt/btrfs # dmesg -c [ 205.705234] BTRFS: device fsid 7be2602f-9e35-4ecf-a6ff-9e91d2c182c9 devid 1 transid 6 /dev/mapper/test-scratch1 (253:4) scanned by mount (1154) [ 205.710864] BTRFS info (device dm-4): first mount of filesystem 7be2602f-9e35-4ecf-a6ff-9e91d2c182c9 [ 205.711923] BTRFS info (device dm-4): using crc32c (crc32c-intel) checksum algorithm [ 205.713856] BTRFS info (device dm-4): using free-space-tree [ 205.722324] BTRFS info (device dm-4): checking UUID tree So far so good, but even if we just touched any soft link of "dm-4", we will get quite some unnecessary device path updates. # touch /dev/mapper/test-scratch1 # dmesg -c [ 469.295796] BTRFS info: devid 1 device path /dev/mapper/test-scratch1 changed to /dev/dm-4 scanned by (udev-worker) (1221) [ 469.300494] BTRFS info: devid 1 device path /dev/dm-4 changed to /dev/mapper/test-scratch1 scanned by (udev-worker) (1221) Such device path rename is unnecessary and can lead to random path change due to the udev race. [CAUSE] Inside device_list_add(), we are using a very primitive way checking if the device has changed, strcmp(). Which can never handle links well, no matter if it's hard or soft links. So every different link of the same device will be treated as a different device, causing the unnecessary device path update. [FIX] Introduce a helper, is_same_device(), and use path_equal() to properly detect the same block device. So that the different soft links won't trigger the rename race. 
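The idea behind is_same_device() can be shown with plain libc: resolve both paths to their final target before comparing, instead of comparing the raw strings. An illustrative userspace sketch, with realpath() standing in for the kernel's kern_path() + path_equal() pair (not the code from this patch):

    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Return true if @old_path and @new_path name the same file once all
     * symlinks are resolved, e.g. /dev/mapper/test-scratch1 and /dev/dm-4. */
    static bool is_same_device(const char *old_path, const char *new_path)
    {
            char old_real[PATH_MAX], new_real[PATH_MAX];

            if (!realpath(old_path, old_real) || !realpath(new_path, new_real))
                    return false;
            return strcmp(old_real, new_real) == 0;
    }

    int main(int argc, char *argv[])
    {
            if (argc != 3)
                    return 1;
            printf("%s\n", is_same_device(argv[1], argv[2]) ?
                   "same device" : "different");
            return 0;
    }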
Reviewed-by: Filipe Manana Link: https://bugzilla.suse.com/show_bug.cgi?id=1230641 Reported-by: Fabian Vogt Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 920df7585b0d..5e75a4e3a5be 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -732,6 +732,42 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; } +static bool is_same_device(struct btrfs_device *device, const char *new_path) +{ + struct path old = { .mnt = NULL, .dentry = NULL }; + struct path new = { .mnt = NULL, .dentry = NULL }; + char *old_path = NULL; + bool is_same = false; + int ret; + + if (!device->name) + goto out; + + old_path = kzalloc(PATH_MAX, GFP_NOFS); + if (!old_path) + goto out; + + rcu_read_lock(); + ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX); + rcu_read_unlock(); + if (ret < 0) + goto out; + + ret = kern_path(old_path, LOOKUP_FOLLOW, &old); + if (ret) + goto out; + ret = kern_path(new_path, LOOKUP_FOLLOW, &new); + if (ret) + goto out; + if (path_equal(&old, &new)) + is_same = true; +out: + kfree(old_path); + path_put(&old); + path_put(&new); + return is_same; +} + /* * Add new device to list of registered devices * @@ -852,7 +888,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); - } else if (!device->name || strcmp(device->name->str, path)) { + } else if (!device->name || !is_same_device(device, path)) { /* * When FS is already mounted. * 1. If you are here and if the device->name is NULL that -- 2.50.1 From 7e06de7c83a746e58d4701e013182af133395188 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 24 Sep 2024 14:27:07 +0930 Subject: [PATCH 16/16] btrfs: canonicalize the device path before adding it [PROBLEM] Currently btrfs accepts any file path for its device, resulting some weird situation: # ./mount_by_fd /dev/test/scratch1 /mnt/btrfs/ The program has the following source code: #include #include #include int main(int argc, char *argv[]) { int fd = open(argv[1], O_RDWR); char path[256]; snprintf(path, sizeof(path), "/proc/self/fd/%d", fd); return mount(path, argv[2], "btrfs", 0, NULL); } Then we can have the following weird device path: BTRFS: device fsid 2378be81-fe12-46d2-a9e8-68cf08dd98d5 devid 1 transid 7 /proc/self/fd/3 (253:2) scanned by mount_by_fd (18440) Normally it's not a big deal, and later udev can trigger a device path rename. But if udev didn't trigger, the device path "/proc/self/fd/3" will show up in mtab. [CAUSE] For filename "/proc/self/fd/3", it means the opened file descriptor 3. In above case, it's exactly the device we want to open, aka points to "/dev/test/scratch1" which is another symlink pointing to "/dev/dm-2". Inside kernel we solve the mount source using LOOKUP_FOLLOW, which follows the symbolic link and grab the proper block device. But inside btrfs we also save the filename into btrfs_device::name, and utilize that member to report our mount source, which leads to the above situation. [FIX] Instead of unconditionally trust the path, check if the original file (not following the symbolic link) is inside "/dev/", if not, then manually lookup the path to its final destination, and use that as our device path. 
This allows us to still use symbolic links, like "/dev/mapper/test-scratch" from LVM2, which is required for fstests runs with LVM2 setup. And for really weird names, like the above case, we solve it to "/dev/dm-2" instead. Reviewed-by: Filipe Manana Link: https://bugzilla.suse.com/show_bug.cgi?id=1230641 Reported-by: Fabian Vogt Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 87 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5e75a4e3a5be..5895397364aa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -732,6 +732,78 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; } +/* + * We can have very weird soft links passed in. + * One example is "/proc/self/fd/", which can be a soft link to + * a block device. + * + * But it's never a good idea to use those weird names. + * Here we check if the path (not following symlinks) is a good one inside + * "/dev/". + */ +static bool is_good_dev_path(const char *dev_path) +{ + struct path path = { .mnt = NULL, .dentry = NULL }; + char *path_buf = NULL; + char *resolved_path; + bool is_good = false; + int ret; + + if (!dev_path) + goto out; + + path_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!path_buf) + goto out; + + /* + * Do not follow soft link, just check if the original path is inside + * "/dev/". + */ + ret = kern_path(dev_path, 0, &path); + if (ret) + goto out; + resolved_path = d_path(&path, path_buf, PATH_MAX); + if (IS_ERR(resolved_path)) + goto out; + if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) + goto out; + is_good = true; +out: + kfree(path_buf); + path_put(&path); + return is_good; +} + +static int get_canonical_dev_path(const char *dev_path, char *canonical) +{ + struct path path = { .mnt = NULL, .dentry = NULL }; + char *path_buf = NULL; + char *resolved_path; + int ret; + + if (!dev_path) { + ret = -EINVAL; + goto out; + } + + path_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!path_buf) { + ret = -ENOMEM; + goto out; + } + + ret = kern_path(dev_path, LOOKUP_FOLLOW, &path); + if (ret) + goto out; + resolved_path = d_path(&path, path_buf, PATH_MAX); + ret = strscpy(canonical, resolved_path, PATH_MAX); +out: + kfree(path_buf); + path_put(&path); + return ret; +} + static bool is_same_device(struct btrfs_device *device, const char *new_path) { struct path old = { .mnt = NULL, .dentry = NULL }; @@ -1419,12 +1491,23 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, bool new_device_added = false; struct btrfs_device *device = NULL; struct file *bdev_file; + char *canonical_path = NULL; u64 bytenr; dev_t devt; int ret; lockdep_assert_held(&uuid_mutex); + if (!is_good_dev_path(path)) { + canonical_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (canonical_path) { + ret = get_canonical_dev_path(path, canonical_path); + if (ret < 0) { + kfree(canonical_path); + canonical_path = NULL; + } + } + } /* * Avoid an exclusive open here, as the systemd-udev may initiate the * device scan which may race with the user's mount or mkfs command, @@ -1469,7 +1552,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, goto free_disk_super; } - device = device_list_add(path, disk_super, &new_device_added); + device = device_list_add(canonical_path ? 
: path, disk_super, + &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); @@ -1478,6 +1562,7 @@ free_disk_super: error_bdev_put: fput(bdev_file); + kfree(canonical_path); return device; } -- 2.50.1