From 0dfcb72d33c767bbe63f4a6872108515594154d9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 10 Oct 2024 11:36:51 -0400 Subject: [PATCH 01/16] coredump: add cond_resched() to dump_user_range The loop between elf_core_dump() and dump_user_range() can run for so long that the system shows softlockup messages, with side effects like workqueues and RCU getting stuck on the core dumping CPU. Add a cond_resched() in dump_user_range() to avoid that softlockup. Signed-off-by: Rik van Riel Link: https://lore.kernel.org/r/20241010113651.50cb0366@imladris.surriel.com Signed-off-by: Christian Brauner --- fs/coredump.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/coredump.c b/fs/coredump.c index 45737b43dda5..d48edb37bc35 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -951,6 +951,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, } else { dump_skip(cprm, PAGE_SIZE); } + cond_resched(); } dump_page_free(dump_page); return 1; -- 2.51.0 From 900bbaae67e980945dec74d36f8afe0de7556d5a Mon Sep 17 00:00:00 2001 From: Xuewen Yan Date: Fri, 26 Apr 2024 16:05:48 +0800 Subject: [PATCH 02/16] epoll: Add synchronous wakeup support for ep_poll_callback Now, the epoll only use wake_up() interface to wake up task. However, sometimes, there are epoll users which want to use the synchronous wakeup flag to hint the scheduler, such as Android binder driver. So add a wake_up_sync() define, and use the wake_up_sync() when the sync is true in ep_poll_callback(). Co-developed-by: Jing Xia Signed-off-by: Jing Xia Signed-off-by: Xuewen Yan Link: https://lore.kernel.org/r/20240426080548.8203-1-xuewen.yan@unisoc.com Tested-by: Brian Geffon Reviewed-by: Brian Geffon Reported-by: Benoit Lize Signed-off-by: Christian Brauner --- fs/eventpoll.c | 5 ++++- include/linux/wait.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 90fbab6b6f03..1a06e462b6ef 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1373,7 +1373,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v break; } } - wake_up(&ep->wq); + if (sync) + wake_up_sync(&ep->wq); + else + wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; diff --git a/include/linux/wait.h b/include/linux/wait.h index 8aa3372f21a0..2b322a9b88a2 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -221,6 +221,7 @@ void __wake_up_pollfree(struct wait_queue_head *wq_head); #define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL) #define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1) #define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0) +#define wake_up_sync(x) __wake_up_sync(x, TASK_NORMAL) #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL) #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) -- 2.51.0 From 99bdadbde9c418f29b78b7241732268dbc0a05cc Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 15 Oct 2024 22:21:54 +0200 Subject: [PATCH 03/16] acl: Realign struct posix_acl to save 8 bytes Reduce posix_acl's struct size by 8 bytes by realigning its members. Cc: Christian Brauner Reviewed-by: Jan Kara Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20241015202158.2376-1-thorsten.blum@linux.dev Signed-off-by: Christian Brauner --- include/linux/posix_acl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 0e65b3d634d9..2d6a4badd306 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -28,8 +28,8 @@ struct posix_acl_entry { struct posix_acl { refcount_t a_refcount; - struct rcu_head a_rcu; unsigned int a_count; + struct rcu_head a_rcu; struct posix_acl_entry a_entries[]; }; -- 2.51.0 From 8c6e03ffedc5463d1aa1ba89f6ceb082518a3520 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 18 Oct 2024 14:14:21 +0200 Subject: [PATCH 04/16] acl: Annotate struct posix_acl with __counted_by() Add the __counted_by compiler attribute to the flexible array member a_entries to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Use struct_size() to calculate the number of bytes to allocate for new and cloned acls and remove the local size variables. Change the posix_acl_alloc() function parameter count from int to unsigned int to match posix_acl's a_count data type. Add identifier names to the function definition to silence two checkpatch warnings. Reviewed-by: Jan Kara Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20241018121426.155247-2-thorsten.blum@linux.dev Cc: Nathan Chancellor Signed-off-by: Christian Brauner --- fs/posix_acl.c | 13 ++++++------- include/linux/posix_acl.h | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 6c66a37522d0..4050942ab52f 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -200,11 +200,11 @@ EXPORT_SYMBOL(posix_acl_init); * Allocate a new ACL with the specified number of entries. */ struct posix_acl * -posix_acl_alloc(int count, gfp_t flags) +posix_acl_alloc(unsigned int count, gfp_t flags) { - const size_t size = sizeof(struct posix_acl) + - count * sizeof(struct posix_acl_entry); - struct posix_acl *acl = kmalloc(size, flags); + struct posix_acl *acl; + + acl = kmalloc(struct_size(acl, a_entries, count), flags); if (acl) posix_acl_init(acl, count); return acl; @@ -220,9 +220,8 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags) struct posix_acl *clone = NULL; if (acl) { - int size = sizeof(struct posix_acl) + acl->a_count * - sizeof(struct posix_acl_entry); - clone = kmemdup(acl, size, flags); + clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count), + flags); if (clone) refcount_set(&clone->a_refcount, 1); } diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 2d6a4badd306..e2d47eb1a7f3 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -30,7 +30,7 @@ struct posix_acl { refcount_t a_refcount; unsigned int a_count; struct rcu_head a_rcu; - struct posix_acl_entry a_entries[]; + struct posix_acl_entry a_entries[] __counted_by(a_count); }; #define FOREACH_ACL_ENTRY(pa, acl, pe) \ @@ -62,7 +62,7 @@ posix_acl_release(struct posix_acl *acl) /* posix_acl.c */ extern void posix_acl_init(struct posix_acl *, int); -extern struct posix_acl *posix_acl_alloc(int, gfp_t); +extern struct posix_acl *posix_acl_alloc(unsigned int count, gfp_t flags); extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t); extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *); extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *); -- 2.51.0 From 30dac24e14b52e1787572d1d4e06eeabe8a63630 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 26 Sep 2024 16:01:21 +0200 Subject: [PATCH 05/16] fs/writeback: convert wbc_account_cgroup_owner to take a folio Most of the callers of wbc_account_cgroup_owner() are converting a folio to page before calling the function. wbc_account_cgroup_owner() is converting the page back to a folio to call mem_cgroup_css_from_folio(). Convert wbc_account_cgroup_owner() to take a folio instead of a page, and convert all callers to pass a folio directly except f2fs. Convert the page to folio for all the callers from f2fs as they were the only callers calling wbc_account_cgroup_owner() with a page. As f2fs is already in the process of converting to folios, these call sites might also soon be calling wbc_account_cgroup_owner() with a folio directly in the future. No functional changes. Only compile tested. Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20240926140121.203821-1-kernel@pankajraghav.com Acked-by: David Sterba Acked-by: Tejun Heo Signed-off-by: Christian Brauner --- Documentation/admin-guide/cgroup-v2.rst | 2 +- fs/btrfs/extent_io.c | 7 +++---- fs/btrfs/inode.c | 2 +- fs/buffer.c | 4 ++-- fs/ext4/page-io.c | 2 +- fs/f2fs/data.c | 9 ++++++--- fs/fs-writeback.c | 8 +++----- fs/iomap/buffered-io.c | 2 +- fs/mpage.c | 2 +- include/linux/writeback.h | 4 ++-- 10 files changed, 21 insertions(+), 21 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 69af2173555f..064012ea6f36 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2945,7 +2945,7 @@ following two functions. a queue (device) has been associated with the bio and before submission. - wbc_account_cgroup_owner(@wbc, @page, @bytes) + wbc_account_cgroup_owner(@wbc, @folio, @bytes) Should be called for each data segment being written out. While this function doesn't care exactly when it's called during the writeback session, it's the easiest and most diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 39c9677c47d5..4667d1e034e0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -785,7 +785,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, } if (bio_ctrl->wbc) - wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page, + wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len); size -= len; @@ -1707,7 +1707,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, ret = bio_add_folio(&bbio->bio, folio, eb->len, eb->start - folio_pos(folio)); ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len); + wbc_account_cgroup_owner(wbc, folio, eb->len); folio_unlock(folio); } else { int num_folios = num_extent_folios(eb); @@ -1721,8 +1721,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, folio_start_writeback(folio); ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio_page(folio, 0), - eb->folio_size); + wbc_account_cgroup_owner(wbc, folio, eb->folio_size); wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index edac499fd83d..eb64f04755c2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1729,7 +1729,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * need full accuracy. Just account the whole thing * against the first page. */ - wbc_account_cgroup_owner(wbc, &locked_folio->page, + wbc_account_cgroup_owner(wbc, locked_folio, cur_end - start); async_chunk[i].locked_folio = locked_folio; locked_folio = NULL; diff --git a/fs/buffer.c b/fs/buffer.c index 1fc9a50def0b..32bd0f4c4223 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2803,7 +2803,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_write_hint = write_hint; - __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); + bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh)); bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; @@ -2813,7 +2813,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, if (wbc) { wbc_init_bio(wbc, bio); - wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size); + wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size); } submit_bio(bio); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index ad5543866d21..b7b9261fec3b 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -421,7 +421,7 @@ submit_and_retry: io_submit_init_bio(io, bh); if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh))) goto submit_and_retry; - wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size); + wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size); io->io_next_block++; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 94f7b084f601..e3ce763cce18 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -711,7 +711,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) } if (fio->io_wbc && !is_read_io(fio->op)) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? __read_io_type(page) : WB_DATA_TYPE(fio->page, false)); @@ -911,7 +912,8 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); inc_page_count(fio->sbi, WB_DATA_TYPE(page, false)); @@ -1011,7 +1013,8 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d8bec3c1bb1f..2391b09f4ced 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -890,17 +890,16 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode); /** * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership * @wbc: writeback_control of the writeback in progress - * @page: page being written out + * @folio: folio being written out * @bytes: number of bytes being written out * - * @bytes from @page are about to written out during the writeback + * @bytes from @folio are about to written out during the writeback * controlled by @wbc. Keep the book for foreign inode detection. See * wbc_detach_inode(). */ -void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, +void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes) { - struct folio *folio; struct cgroup_subsys_state *css; int id; @@ -913,7 +912,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, if (!wbc->wb || wbc->no_cgroup_owner) return; - folio = page_folio(page); css = mem_cgroup_css_from_folio(folio); /* dead cgroups shouldn't contribute to inode ownership arbitration */ if (!(css->flags & CSS_ONLINE)) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 11ea747228ae..3d1fae7d3a64 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1833,7 +1833,7 @@ new_ioend: if (ifs) atomic_add(len, &ifs->write_bytes_pending); wpc->ioend->io_size += len; - wbc_account_cgroup_owner(wbc, &folio->page, len); + wbc_account_cgroup_owner(wbc, folio, len); return 0; } diff --git a/fs/mpage.c b/fs/mpage.c index b5b5ddf9d513..82aecf372743 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -606,7 +606,7 @@ alloc_new: * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. it will not write anything) */ - wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio)); + wbc_account_cgroup_owner(wbc, folio, folio_size(folio)); length = first_unmapped << blkbits; if (!bio_add_folio(bio, folio, length, 0)) { bio = mpage_bio_submit_write(bio); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d6db822e4bb3..641a057e0413 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -217,7 +217,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock); void wbc_detach_inode(struct writeback_control *wbc); -void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, +void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes); int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done); @@ -324,7 +324,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) } static inline void wbc_account_cgroup_owner(struct writeback_control *wbc, - struct page *page, size_t bytes) + struct folio *folio, size_t bytes) { } -- 2.51.0 From e017671f534dd3f568db9e47b0583e853d2da9b5 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Wed, 30 Oct 2024 03:55:10 +0000 Subject: [PATCH 06/16] initramfs: avoid filename buffer overrun The initramfs filename field is defined in Documentation/driver-api/early-userspace/buffer-format.rst as: 37 cpio_file := ALGN(4) + cpio_header + filename + "\0" + ALGN(4) + data ... 55 ============= ================== ========================= 56 Field name Field size Meaning 57 ============= ================== ========================= ... 70 c_namesize 8 bytes Length of filename, including final \0 When extracting an initramfs cpio archive, the kernel's do_name() path handler assumes a zero-terminated path at @collected, passing it directly to filp_open() / init_mkdir() / init_mknod(). If a specially crafted cpio entry carries a non-zero-terminated filename and is followed by uninitialized memory, then a file may be created with trailing characters that represent the uninitialized memory. The ability to create an initramfs entry would imply already having full control of the system, so the buffer overrun shouldn't be considered a security vulnerability. Append the output of the following bash script to an existing initramfs and observe any created /initramfs_test_fname_overrunAA* path. E.g. ./reproducer.sh | gzip >> /myinitramfs It's easiest to observe non-zero uninitialized memory when the output is gzipped, as it'll overflow the heap allocated @out_buf in __gunzip(), rather than the initrd_start+initrd_size block. ---- reproducer.sh ---- nilchar="A" # change to "\0" to properly zero terminate / pad magic="070701" ino=1 mode=$(( 0100777 )) uid=0 gid=0 nlink=1 mtime=1 filesize=0 devmajor=0 devminor=1 rdevmajor=0 rdevminor=0 csum=0 fname="initramfs_test_fname_overrun" namelen=$(( ${#fname} + 1 )) # plus one to account for terminator printf "%s%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%s" \ $magic $ino $mode $uid $gid $nlink $mtime $filesize \ $devmajor $devminor $rdevmajor $rdevminor $namelen $csum $fname termpadlen=$(( 1 + ((4 - ((110 + $namelen) & 3)) % 4) )) printf "%.s${nilchar}" $(seq 1 $termpadlen) ---- reproducer.sh ---- Symlink filename fields handled in do_symlink() won't overrun past the data segment, due to the explicit zero-termination of the symlink target. Fix filename buffer overrun by aborting the initramfs FSM if any cpio entry doesn't carry a zero-terminator at the expected (name_len - 1) offset. Fixes: 1da177e4c3f41 ("Linux-2.6.12-rc2") Signed-off-by: David Disseldorp Link: https://lore.kernel.org/r/20241030035509.20194-2-ddiss@suse.de Signed-off-by: Christian Brauner --- init/initramfs.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/init/initramfs.c b/init/initramfs.c index bc911e466d5b..b2f7583bb1f5 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -360,6 +360,15 @@ static int __init do_name(void) { state = SkipIt; next_state = Reset; + + /* name_len > 0 && name_len <= PATH_MAX checked in do_header */ + if (collected[name_len - 1] != '\0') { + pr_err("initramfs name without nulterm: %.*s\n", + (int)name_len, collected); + error("malformed archive"); + return 1; + } + if (strcmp(collected, "TRAILER!!!") == 0) { free_hash(); return 0; @@ -424,6 +433,12 @@ static int __init do_copy(void) static int __init do_symlink(void) { + if (collected[name_len - 1] != '\0') { + pr_err("initramfs symlink without nulterm: %.*s\n", + (int)name_len, collected); + error("malformed archive"); + return 1; + } collected[N_ALIGN(name_len) + body_len] = '\0'; clean_path(collected, 0); init_symlink(collected + N_ALIGN(name_len), collected); -- 2.51.0 From cb80d9074f2a56c8226657b01f19656584fc3ab5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 1 Nov 2024 13:52:25 +0100 Subject: [PATCH 07/16] fs: optimize acl_permission_check() generic_permission() turned out to be costlier than expected. The reason is that acl_permission_check() performs owner checks that involve costly pointer dereferences. There's already code that skips expensive group checks if the group and other permission bits are the same wrt to the mask that is checked. This logic can be extended to also shortcut permission checking in acl_permission_check(). If the inode doesn't have POSIX ACLs than ownership doesn't matter. If there are no missing UGO permissions the permission check can be shortcut. Acked-by: Al Viro Link: https://lore.kernel.org/all/CAHk-=wgXEoAOFRkDg+grxs+p1U+QjWXLixRGmYEfd=vG+OBuFw@mail.gmail.com Signed-off-by: Linus Torvalds Signed-off-by: Christian Brauner --- fs/namei.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/fs/namei.c b/fs/namei.c index 4a4a22a08ac2..05a8d544fb35 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -326,6 +326,25 @@ static int check_acl(struct mnt_idmap *idmap, return -EAGAIN; } +/* + * Very quick optimistic "we know we have no ACL's" check. + * + * Note that this is purely for ACL_TYPE_ACCESS, and purely + * for the "we have cached that there are no ACLs" case. + * + * If this returns true, we know there are no ACLs. But if + * it returns false, we might still not have ACLs (it could + * be the is_uncached_acl() case). + */ +static inline bool no_acl_inode(struct inode *inode) +{ +#ifdef CONFIG_FS_POSIX_ACL + return likely(!READ_ONCE(inode->i_acl)); +#else + return true; +#endif +} + /** * acl_permission_check - perform basic UNIX permission checking * @idmap: idmap of the mount the inode was found from @@ -348,6 +367,28 @@ static int acl_permission_check(struct mnt_idmap *idmap, unsigned int mode = inode->i_mode; vfsuid_t vfsuid; + /* + * Common cheap case: everybody has the requested + * rights, and there are no ACLs to check. No need + * to do any owner/group checks in that case. + * + * - 'mask&7' is the requested permission bit set + * - multiplying by 0111 spreads them out to all of ugo + * - '& ~mode' looks for missing inode permission bits + * - the '!' is for "no missing permissions" + * + * After that, we just need to check that there are no + * ACL's on the inode - do the 'IS_POSIXACL()' check last + * because it will dereference the ->i_sb pointer and we + * want to avoid that if at all possible. + */ + if (!((mask & 7) * 0111 & ~mode)) { + if (no_acl_inode(inode)) + return 0; + if (!IS_POSIXACL(inode)) + return 0; + } + /* Are we the owner? If so, ACL's don't matter */ vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { -- 2.51.0 From fdfa4c02e6dd6c67f5cef8d78c6204e1ff7e12ca Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 3 Nov 2024 13:17:09 +0100 Subject: [PATCH 08/16] freevxfs: Replace one-element array with flexible array member Replace the deprecated one-element array with a modern flexible array member in the struct vxfs_dirblk. Link: https://github.com/KSPP/linux/issues/79 Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20241103121707.102838-3-thorsten.blum@linux.dev Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/freevxfs/vxfs_dir.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h index fbcd603365ad..8c67627f2a3d 100644 --- a/fs/freevxfs/vxfs_dir.h +++ b/fs/freevxfs/vxfs_dir.h @@ -25,7 +25,7 @@ struct vxfs_dirblk { __fs16 d_free; /* free space in dirblock */ __fs16 d_nhash; /* no of hash chains */ - __fs16 d_hash[1]; /* hash chain */ + __fs16 d_hash[]; /* hash chain */ }; /* -- 2.51.0 From 1c82587cb57687de3f18ab4b98a8850c789bedcf Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 7 Nov 2024 08:41:09 -0300 Subject: [PATCH 09/16] hfsplus: don't query the device logical block size multiple times Devices block sizes may change. One of these cases is a loop device by using ioctl LOOP_SET_BLOCK_SIZE. While this may cause other issues like IO being rejected, in the case of hfsplus, it will allocate a block by using that size and potentially write out-of-bounds when hfsplus_read_wrapper calls hfsplus_submit_bio and the latter function reads a different io_size. Using a new min_io_size initally set to sb_min_blocksize works for the purposes of the original fix, since it will be set to the max between HFSPLUS_SECTOR_SIZE and the first seen logical block size. We still use the max between HFSPLUS_SECTOR_SIZE and min_io_size in case the latter is not initialized. Tested by mounting an hfsplus filesystem with loop block sizes 512, 1024 and 4096. The produced KASAN report before the fix looks like this: [ 419.944641] ================================================================== [ 419.945655] BUG: KASAN: slab-use-after-free in hfsplus_read_wrapper+0x659/0xa0a [ 419.946703] Read of size 2 at addr ffff88800721fc00 by task repro/10678 [ 419.947612] [ 419.947846] CPU: 0 UID: 0 PID: 10678 Comm: repro Not tainted 6.12.0-rc5-00008-gdf56e0f2f3ca #84 [ 419.949007] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 [ 419.950035] Call Trace: [ 419.950384] [ 419.950676] dump_stack_lvl+0x57/0x78 [ 419.951212] ? hfsplus_read_wrapper+0x659/0xa0a [ 419.951830] print_report+0x14c/0x49e [ 419.952361] ? __virt_addr_valid+0x267/0x278 [ 419.952979] ? kmem_cache_debug_flags+0xc/0x1d [ 419.953561] ? hfsplus_read_wrapper+0x659/0xa0a [ 419.954231] kasan_report+0x89/0xb0 [ 419.954748] ? hfsplus_read_wrapper+0x659/0xa0a [ 419.955367] hfsplus_read_wrapper+0x659/0xa0a [ 419.955948] ? __pfx_hfsplus_read_wrapper+0x10/0x10 [ 419.956618] ? do_raw_spin_unlock+0x59/0x1a9 [ 419.957214] ? _raw_spin_unlock+0x1a/0x2e [ 419.957772] hfsplus_fill_super+0x348/0x1590 [ 419.958355] ? hlock_class+0x4c/0x109 [ 419.958867] ? __pfx_hfsplus_fill_super+0x10/0x10 [ 419.959499] ? __pfx_string+0x10/0x10 [ 419.960006] ? lock_acquire+0x3e2/0x454 [ 419.960532] ? bdev_name.constprop.0+0xce/0x243 [ 419.961129] ? __pfx_bdev_name.constprop.0+0x10/0x10 [ 419.961799] ? pointer+0x3f0/0x62f [ 419.962277] ? __pfx_pointer+0x10/0x10 [ 419.962761] ? vsnprintf+0x6c4/0xfba [ 419.963178] ? __pfx_vsnprintf+0x10/0x10 [ 419.963621] ? setup_bdev_super+0x376/0x3b3 [ 419.964029] ? snprintf+0x9d/0xd2 [ 419.964344] ? __pfx_snprintf+0x10/0x10 [ 419.964675] ? lock_acquired+0x45c/0x5e9 [ 419.965016] ? set_blocksize+0x139/0x1c1 [ 419.965381] ? sb_set_blocksize+0x6d/0xae [ 419.965742] ? __pfx_hfsplus_fill_super+0x10/0x10 [ 419.966179] mount_bdev+0x12f/0x1bf [ 419.966512] ? __pfx_mount_bdev+0x10/0x10 [ 419.966886] ? vfs_parse_fs_string+0xce/0x111 [ 419.967293] ? __pfx_vfs_parse_fs_string+0x10/0x10 [ 419.967702] ? __pfx_hfsplus_mount+0x10/0x10 [ 419.968073] legacy_get_tree+0x104/0x178 [ 419.968414] vfs_get_tree+0x86/0x296 [ 419.968751] path_mount+0xba3/0xd0b [ 419.969157] ? __pfx_path_mount+0x10/0x10 [ 419.969594] ? kmem_cache_free+0x1e2/0x260 [ 419.970311] do_mount+0x99/0xe0 [ 419.970630] ? __pfx_do_mount+0x10/0x10 [ 419.971008] __do_sys_mount+0x199/0x1c9 [ 419.971397] do_syscall_64+0xd0/0x135 [ 419.971761] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 419.972233] RIP: 0033:0x7c3cb812972e [ 419.972564] Code: 48 8b 0d f5 46 0d 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c2 46 0d 00 f7 d8 64 89 01 48 [ 419.974371] RSP: 002b:00007ffe30632548 EFLAGS: 00000286 ORIG_RAX: 00000000000000a5 [ 419.975048] RAX: ffffffffffffffda RBX: 00007ffe306328d8 RCX: 00007c3cb812972e [ 419.975701] RDX: 0000000020000000 RSI: 0000000020000c80 RDI: 00007ffe306325d0 [ 419.976363] RBP: 00007ffe30632720 R08: 00007ffe30632610 R09: 0000000000000000 [ 419.977034] R10: 0000000000200008 R11: 0000000000000286 R12: 0000000000000000 [ 419.977713] R13: 00007ffe306328e8 R14: 00005a0eb298bc68 R15: 00007c3cb8356000 [ 419.978375] [ 419.978589] Fixes: 6596528e391a ("hfsplus: ensure bio requests are not smaller than the hardware sectors") Signed-off-by: Thadeu Lima de Souza Cascardo Link: https://lore.kernel.org/r/20241107114109.839253-1-cascardo@igalia.com Signed-off-by: Christian Brauner --- fs/hfsplus/hfsplus_fs.h | 3 ++- fs/hfsplus/wrapper.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 59ce81dca73f..5389918bbf29 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -156,6 +156,7 @@ struct hfsplus_sb_info { /* Runtime variables */ u32 blockoffset; + u32 min_io_size; sector_t part_start; sector_t sect_count; int fs_shift; @@ -307,7 +308,7 @@ struct hfsplus_readdir_data { */ static inline unsigned short hfsplus_min_io_size(struct super_block *sb) { - return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev), + return max_t(unsigned short, HFSPLUS_SB(sb)->min_io_size, HFSPLUS_SECTOR_SIZE); } diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index ce9346099c72..4b0fdc49d1fe 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -172,6 +172,8 @@ int hfsplus_read_wrapper(struct super_block *sb) if (!blocksize) goto out; + sbi->min_io_size = blocksize; + if (hfsplus_get_last_session(sb, &part_start, &part_size)) goto out; -- 2.51.0 From c4d7d90747f4e8b528c8cd0a2d9ac01dc4a9339e Mon Sep 17 00:00:00 2001 From: Mohammed Anees Date: Tue, 12 Nov 2024 17:08:34 +0530 Subject: [PATCH 10/16] fs:aio: Remove TODO comment suggesting hash or array usage in io_cancel() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The comment suggests a hash or array approach to store the active requests. Currently it iterates through all the active requests and when found deletes the requested request, in the linked list. However io_cancel() isn’t a frequently used operation, and optimizing it wouldn’t bring a substantial benefit to real users and the increased complexity of maintaining a hashtable for this would be significant and will slow down other operation. Therefore remove this TODO to avoid people spending time improving this. Signed-off-by: Mohammed Anees Link: https://lore.kernel.org/r/20241112113906.15825-1-pvmohammedanees2003@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/aio.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/aio.c b/fs/aio.c index e8920178b50f..72e3970f4225 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -2191,7 +2191,6 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - /* TODO: use a hash or array, this sucks. */ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { if (kiocb->ki_res.obj == obj) { ret = kiocb->ki_cancel(&kiocb->rw); -- 2.51.0 From 75ead69a717332efa70303fba85e1876793c74a9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Nov 2024 10:09:55 -0500 Subject: [PATCH 11/16] fs: don't let statmount return empty strings When one of the statmount_string() handlers doesn't emit anything to seq, the kernel currently sets the corresponding flag and emits an empty string. Given that statmount() returns a mask of accessible fields, just leave the bit unset in this case, and skip any NULL termination. If nothing was emitted to the seq, then the EOVERFLOW and EAGAIN cases aren't applicable and the function can just return immediately. Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20241111-statmount-v4-1-2eaf35d07a80@kernel.org Acked-by: Miklos Szeredi Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 9a3c251d033d..23187a414754 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5044,22 +5044,23 @@ static int statmount_string(struct kstatmount *s, u64 flag) size_t kbufsize; struct seq_file *seq = &s->seq; struct statmount *sm = &s->sm; + u32 start = seq->count; switch (flag) { case STATMOUNT_FS_TYPE: - sm->fs_type = seq->count; + sm->fs_type = start; ret = statmount_fs_type(s, seq); break; case STATMOUNT_MNT_ROOT: - sm->mnt_root = seq->count; + sm->mnt_root = start; ret = statmount_mnt_root(s, seq); break; case STATMOUNT_MNT_POINT: - sm->mnt_point = seq->count; + sm->mnt_point = start; ret = statmount_mnt_point(s, seq); break; case STATMOUNT_MNT_OPTS: - sm->mnt_opts = seq->count; + sm->mnt_opts = start; ret = statmount_mnt_opts(s, seq); break; default: @@ -5067,6 +5068,12 @@ static int statmount_string(struct kstatmount *s, u64 flag) return -EINVAL; } + /* + * If nothing was emitted, return to avoid setting the flag + * and terminating the buffer. + */ + if (seq->count == start) + return ret; if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize))) return -EOVERFLOW; if (kbufsize >= s->bufsize) -- 2.51.0 From ed9d95f691c29748f21bc019de9566b698fdfab7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Nov 2024 10:09:56 -0500 Subject: [PATCH 12/16] fs: add the ability for statmount() to report the fs_subtype /proc/self/mountinfo prints out the sb->s_subtype after the type. This is particularly useful for disambiguating FUSE mounts (at least when the userland driver bothers to set it). Add STATMOUNT_FS_SUBTYPE and claim one of the __spare2 fields to point to the offset into the str[] array. Reviewed-by: Jan Kara Reviewed-by: Ian Kent Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20241111-statmount-v4-2-2eaf35d07a80@kernel.org Acked-by: Miklos Szeredi Signed-off-by: Christian Brauner --- fs/namespace.c | 19 +++++++++++++++++-- include/uapi/linux/mount.h | 5 ++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 23187a414754..dbd89fffd919 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5004,6 +5004,14 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) return 0; } +static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq) +{ + struct super_block *sb = s->mnt->mnt_sb; + + if (sb->s_subtype) + seq_puts(seq, sb->s_subtype); +} + static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { s->sm.mask |= STATMOUNT_MNT_NS_ID; @@ -5040,7 +5048,7 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) static int statmount_string(struct kstatmount *s, u64 flag) { - int ret; + int ret = 0; size_t kbufsize; struct seq_file *seq = &s->seq; struct statmount *sm = &s->sm; @@ -5063,6 +5071,10 @@ static int statmount_string(struct kstatmount *s, u64 flag) sm->mnt_opts = start; ret = statmount_mnt_opts(s, seq); break; + case STATMOUNT_FS_SUBTYPE: + sm->fs_subtype = start; + statmount_fs_subtype(s, seq); + break; default: WARN_ON_ONCE(true); return -EINVAL; @@ -5208,6 +5220,9 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!err && s->mask & STATMOUNT_MNT_OPTS) err = statmount_string(s, STATMOUNT_MNT_OPTS); + if (!err && s->mask & STATMOUNT_FS_SUBTYPE) + err = statmount_string(s, STATMOUNT_FS_SUBTYPE); + if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); @@ -5229,7 +5244,7 @@ static inline bool retry_statmount(const long ret, size_t *seq_size) } #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ - STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS) + STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | STATMOUNT_FS_SUBTYPE) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 225bc366ffcb..2e939dddf9cb 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -173,7 +173,9 @@ struct statmount { __u32 mnt_root; /* [str] Root of mount relative to root of fs */ __u32 mnt_point; /* [str] Mountpoint relative to current root */ __u64 mnt_ns_id; /* ID of the mount namespace */ - __u64 __spare2[49]; + __u32 fs_subtype; /* [str] Subtype of fs_type (if any) */ + __u32 __spare1[1]; + __u64 __spare2[48]; char str[]; /* Variable size part containing strings */ }; @@ -207,6 +209,7 @@ struct mnt_id_req { #define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ #define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */ #define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */ +#define STATMOUNT_FS_SUBTYPE 0x00000100U /* Want/got fs_subtype */ /* * Special @mnt_id values that can be passed to listmount -- 2.51.0 From 4d7485cff59951c83aa2b6891b24d68b76d86f6f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Nov 2024 06:43:54 +0100 Subject: [PATCH 13/16] writeback: add a __releases annoation to wbc_attach_and_unlock_inode This shuts up a sparse lock context tracking warning. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241112054403.1470586-2-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d8bec3c1bb1f..3fb115ae44b1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -733,6 +733,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) */ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) + __releases(&inode->i_lock) { if (!inode_cgwb_enabled(inode)) { spin_unlock(&inode->i_lock); -- 2.51.0 From 8182a8b39aa227f5b99b8d4d18f296b82ce4b94c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Nov 2024 06:43:55 +0100 Subject: [PATCH 14/16] writeback: wbc_attach_fdatawrite_inode out of line This allows exporting this high-level interface only while keeping wbc_attach_and_unlock_inode private in fs-writeback.c and unexporting __inode_attach_wb. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241112054403.1470586-3-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 31 +++++++++++++++++++++++++++---- include/linux/writeback.h | 28 ++-------------------------- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3fb115ae44b1..77db1f10023e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -290,7 +290,6 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio) if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) wb_put(wb); } -EXPORT_SYMBOL_GPL(__inode_attach_wb); /** * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list @@ -731,8 +730,8 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) * writeback completion, wbc_detach_inode() should be called. This is used * to track the cgroup writeback context. */ -void wbc_attach_and_unlock_inode(struct writeback_control *wbc, - struct inode *inode) +static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) __releases(&inode->i_lock) { if (!inode_cgwb_enabled(inode)) { @@ -763,7 +762,24 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css))) inode_switch_wbs(inode, wbc->wb_id); } -EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); + +/** + * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite + * @wbc: writeback_control of interest + * @inode: target inode + * + * This function is to be used by __filemap_fdatawrite_range(), which is an + * alternative entry point into writeback code, and first ensures @inode is + * associated with a bdi_writeback and attaches it to @wbc. + */ +void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + inode_attach_wb(inode, NULL); + wbc_attach_and_unlock_inode(wbc, inode); +} +EXPORT_SYMBOL_GPL(wbc_attach_fdatawrite_inode); /** * wbc_detach_inode - disassociate wbc from inode and perform foreign detection @@ -1228,6 +1244,13 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, } } +static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock) +{ + spin_unlock(&inode->i_lock); +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d6db822e4bb3..aee3e1b4c50f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -213,9 +213,6 @@ static inline void wait_on_inode(struct inode *inode) #include void __inode_attach_wb(struct inode *inode, struct folio *folio); -void wbc_attach_and_unlock_inode(struct writeback_control *wbc, - struct inode *inode) - __releases(&inode->i_lock); void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes); @@ -254,22 +251,8 @@ static inline void inode_detach_wb(struct inode *inode) } } -/** - * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite - * @wbc: writeback_control of interest - * @inode: target inode - * - * This function is to be used by __filemap_fdatawrite_range(), which is an - * alternative entry point into writeback code, and first ensures @inode is - * associated with a bdi_writeback and attaches it to @wbc. - */ -static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, - struct inode *inode) -{ - spin_lock(&inode->i_lock); - inode_attach_wb(inode, NULL); - wbc_attach_and_unlock_inode(wbc, inode); -} +void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode); /** * wbc_init_bio - writeback specific initializtion of bio @@ -303,13 +286,6 @@ static inline void inode_detach_wb(struct inode *inode) { } -static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, - struct inode *inode) - __releases(&inode->i_lock) -{ - spin_unlock(&inode->i_lock); -} - static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode) { -- 2.51.0 From 44010543fc8bedad172aa5b6c43480e5d2124497 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Nov 2024 10:09:57 -0500 Subject: [PATCH 15/16] fs: add the ability for statmount() to report the sb_source /proc/self/mountinfo displays the source for the mount, but statmount() doesn't yet have a way to return it. Add a new STATMOUNT_SB_SOURCE flag, claim the 32-bit __spare1 field to hold the offset into the str[] array. Reviewed-by: Jan Kara Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20241111-statmount-v4-3-2eaf35d07a80@kernel.org Acked-by: Miklos Szeredi Signed-off-by: Christian Brauner --- fs/namespace.c | 36 +++++++++++++++++++++++++++++++++++- include/uapi/linux/mount.h | 3 ++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index dbd89fffd919..d32b5afa99dc 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5012,6 +5012,32 @@ static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq) seq_puts(seq, sb->s_subtype); } +static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) +{ + struct super_block *sb = s->mnt->mnt_sb; + struct mount *r = real_mount(s->mnt); + + if (sb->s_op->show_devname) { + size_t start = seq->count; + int ret; + + ret = sb->s_op->show_devname(seq, s->mnt->mnt_root); + if (ret) + return ret; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + /* Unescape the result */ + seq->buf[seq->count] = '\0'; + seq->count = start; + seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); + } else if (r->mnt_devname) { + seq_puts(seq, r->mnt_devname); + } + return 0; +} + static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { s->sm.mask |= STATMOUNT_MNT_NS_ID; @@ -5075,6 +5101,10 @@ static int statmount_string(struct kstatmount *s, u64 flag) sm->fs_subtype = start; statmount_fs_subtype(s, seq); break; + case STATMOUNT_SB_SOURCE: + sm->sb_source = start; + ret = statmount_sb_source(s, seq); + break; default: WARN_ON_ONCE(true); return -EINVAL; @@ -5223,6 +5253,9 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!err && s->mask & STATMOUNT_FS_SUBTYPE) err = statmount_string(s, STATMOUNT_FS_SUBTYPE); + if (!err && s->mask & STATMOUNT_SB_SOURCE) + err = statmount_string(s, STATMOUNT_SB_SOURCE); + if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); @@ -5244,7 +5277,8 @@ static inline bool retry_statmount(const long ret, size_t *seq_size) } #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ - STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | STATMOUNT_FS_SUBTYPE) + STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ + STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 2e939dddf9cb..2b49e9131d77 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -174,7 +174,7 @@ struct statmount { __u32 mnt_point; /* [str] Mountpoint relative to current root */ __u64 mnt_ns_id; /* ID of the mount namespace */ __u32 fs_subtype; /* [str] Subtype of fs_type (if any) */ - __u32 __spare1[1]; + __u32 sb_source; /* [str] Source string of the mount */ __u64 __spare2[48]; char str[]; /* Variable size part containing strings */ }; @@ -210,6 +210,7 @@ struct mnt_id_req { #define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */ #define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */ #define STATMOUNT_FS_SUBTYPE 0x00000100U /* Want/got fs_subtype */ +#define STATMOUNT_SB_SOURCE 0x00000200U /* Want/got sb_source */ /* * Special @mnt_id values that can be passed to listmount -- 2.51.0 From 2f4d4503e9e5ab765a7948f98bc5deef7850f607 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 12 Nov 2024 11:10:04 +0100 Subject: [PATCH 16/16] statmount: add flag to retrieve unescaped options Filesystem options can be retrieved with STATMOUNT_MNT_OPTS, which returns a string of comma separated options, where some characters are escaped using the \OOO notation. Add a new flag, STATMOUNT_OPT_ARRAY, which instead returns the raw option values separated with '\0' charaters. Since escaped charaters are rare, this inteface is preferable for non-libmount users which likley don't want to deal with option de-escaping. Example code: if (st->mask & STATMOUNT_OPT_ARRAY) { const char *opt = st->str + st->opt_array; for (unsigned int i = 0; i < st->opt_num; i++) { printf("opt_array[%i]: <%s>\n", i, opt); opt += strlen(opt) + 1; } } Example ouput: (1) mnt_opts: (2) opt_array[0]: opt_array[1]: opt_array[2]: opt_array[3]: opt_array[4]: opt_array[5]: Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20241112101006.30715-1-mszeredi@redhat.com Acked-by: Jeff Layton [brauner: tweak variable naming and parsing add example output] Signed-off-by: Christian Brauner --- fs/namespace.c | 47 +++++++++++++++++++++++++++++++++++++- include/uapi/linux/mount.h | 7 ++++-- 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index d32b5afa99dc..4f39c4aba85d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5072,6 +5072,43 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) return 0; } +static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq) +{ + struct vfsmount *mnt = s->mnt; + struct super_block *sb = mnt->mnt_sb; + size_t start = seq->count; + char *buf_start, *buf_end, *opt_start, *opt_end; + u32 count = 0; + int err; + + if (!sb->s_op->show_options) + return 0; + + buf_start = seq->buf + start; + err = sb->s_op->show_options(seq, mnt->mnt_root); + if (err) + return err; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + if (seq->count == start) + return 0; + + buf_end = seq->buf + seq->count; + *buf_end = '\0'; + for (opt_start = buf_start + 1; opt_start < buf_end; opt_start = opt_end + 1) { + opt_end = strchrnul(opt_start, ','); + *opt_end = '\0'; + buf_start += string_unescape(opt_start, buf_start, 0, UNESCAPE_OCTAL) + 1; + if (WARN_ON_ONCE(++count == 0)) + return -EOVERFLOW; + } + seq->count = buf_start - 1 - seq->buf; + s->sm.opt_num = count; + return 0; +} + static int statmount_string(struct kstatmount *s, u64 flag) { int ret = 0; @@ -5097,6 +5134,10 @@ static int statmount_string(struct kstatmount *s, u64 flag) sm->mnt_opts = start; ret = statmount_mnt_opts(s, seq); break; + case STATMOUNT_OPT_ARRAY: + sm->opt_array = start; + ret = statmount_opt_array(s, seq); + break; case STATMOUNT_FS_SUBTYPE: sm->fs_subtype = start; statmount_fs_subtype(s, seq); @@ -5250,6 +5291,9 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!err && s->mask & STATMOUNT_MNT_OPTS) err = statmount_string(s, STATMOUNT_MNT_OPTS); + if (!err && s->mask & STATMOUNT_OPT_ARRAY) + err = statmount_string(s, STATMOUNT_OPT_ARRAY); + if (!err && s->mask & STATMOUNT_FS_SUBTYPE) err = statmount_string(s, STATMOUNT_FS_SUBTYPE); @@ -5278,7 +5322,8 @@ static inline bool retry_statmount(const long ret, size_t *seq_size) #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ - STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE) + STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \ + STATMOUNT_OPT_ARRAY) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 2b49e9131d77..c0fda4604187 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -154,7 +154,7 @@ struct mount_attr { */ struct statmount { __u32 size; /* Total size, including strings */ - __u32 mnt_opts; /* [str] Mount options of the mount */ + __u32 mnt_opts; /* [str] Options (comma separated, escaped) */ __u64 mask; /* What results were written */ __u32 sb_dev_major; /* Device ID */ __u32 sb_dev_minor; @@ -175,7 +175,9 @@ struct statmount { __u64 mnt_ns_id; /* ID of the mount namespace */ __u32 fs_subtype; /* [str] Subtype of fs_type (if any) */ __u32 sb_source; /* [str] Source string of the mount */ - __u64 __spare2[48]; + __u32 opt_num; /* Number of fs options */ + __u32 opt_array; /* [str] Array of nul terminated fs options */ + __u64 __spare2[47]; char str[]; /* Variable size part containing strings */ }; @@ -211,6 +213,7 @@ struct mnt_id_req { #define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */ #define STATMOUNT_FS_SUBTYPE 0x00000100U /* Want/got fs_subtype */ #define STATMOUNT_SB_SOURCE 0x00000200U /* Want/got sb_source */ +#define STATMOUNT_OPT_ARRAY 0x00000400U /* Want/got opt_... */ /* * Special @mnt_id values that can be passed to listmount -- 2.51.0