From 40384c840ea1944d7c5a392e8975ed088ecf0b37 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Dec 2024 14:28:56 -0800 Subject: [PATCH 01/16] Linux 6.13-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index e34a97473fb6..93ab62cef244 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 -PATCHLEVEL = 12 +PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.50.1 From a48bdf80ce6938f8c1de6a56fed7c4f6f46904e9 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 16 Nov 2024 07:41:28 +0100 Subject: [PATCH 02/16] fs: delay sysctl_nr_open check in expand_files() Suppose a thread sharing the table started a resize, while sysctl_nr_open got lowered to a value which prohibits it. This is still going to go through with and without the patch, which is fine. Further suppose another thread shows up to do a matching expansion while resize_in_progress == true. It is going to error out since it performs the sysctl_nr_open check *before* finding out if there is an expansion in progress. But the aformentioned thread is going to succeded, so the error is spurious (and it would not happen if the thread showed up a little bit later). Checking the sysctl *after* we know there are no pending updates sorts it out. While here annotate the thing as unlikely. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241116064128.280870-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/file.c b/fs/file.c index fb1011cf6b4a..019fb9acf91b 100644 --- a/fs/file.c +++ b/fs/file.c @@ -278,10 +278,6 @@ repeat: if (nr < fdt->max_fds) return 0; - /* Can we expand? */ - if (nr >= sysctl_nr_open) - return -EMFILE; - if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); wait_event(files->resize_wait, !files->resize_in_progress); @@ -289,6 +285,10 @@ repeat: goto repeat; } + /* Can we expand? */ + if (unlikely(nr >= sysctl_nr_open)) + return -EMFILE; + /* All good, so we try */ files->resize_in_progress = true; error = expand_fdtable(files, nr); -- 2.50.1 From ea382199071931d19aac5f688b543e07360e2b64 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 20 Nov 2024 12:20:34 +0100 Subject: [PATCH 03/16] vfs: support caching symlink lengths in inodes When utilized it dodges strlen() in vfs_readlink(), giving about 1.5% speed up when issuing readlink on /initrd.img on ext4. Filesystems opt in by calling inode_set_cached_link() when creating an inode. The size is stored in a new union utilizing the same space as i_devices, thus avoiding growing the struct or taking up any more space. Churn-wise the current readlink_copy() helper is patched to accept the size instead of calculating it. 
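The idea is small enough to sketch outside the kernel. The following is only an illustration with made-up names, not the kernel code: the creator records the length once, so the read path can bound the copy without calling strlen() on every readlink().

  /*
   * Simplified userspace sketch of the idea (names are invented):
   * the creator of the link records its length once, so the read
   * path can bound the copy without a strlen() per readlink().
   */
  #include <stdio.h>
  #include <string.h>

  struct cached_link {
      const char *link;   /* NUL-terminated target */
      int linklen;        /* cached strlen(link) */
  };

  /* analogous to inode_set_cached_link(): store pointer and length */
  static void set_cached_link(struct cached_link *cl, const char *link)
  {
      cl->link = link;
      cl->linklen = (int)strlen(link);  /* paid once, at setup time */
  }

  /* analogous to readlink_copy(): no strlen() on the hot path */
  static int read_cached_link(const struct cached_link *cl, char *buf, int buflen)
  {
      int copylen = cl->linklen;

      if (copylen > buflen)
          copylen = buflen;             /* readlink() may truncate */
      memcpy(buf, cl->link, copylen);
      return copylen;
  }

  int main(void)
  {
      struct cached_link cl;
      char buf[64];
      int n;

      set_cached_link(&cl, "/boot/initrd.img-6.13");
      n = read_cached_link(&cl, buf, sizeof(buf));
      printf("%.*s (%d bytes)\n", n, buf, n);
      return 0;
  }
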
Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241120112037.822078-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/namei.c | 34 +++++++++++++++++++--------------- fs/proc/namespaces.c | 2 +- include/linux/fs.h | 15 +++++++++++++-- security/apparmor/apparmorfs.c | 2 +- 4 files changed, 34 insertions(+), 19 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 9d30c7aa9aa6..e56c29a22d26 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5272,19 +5272,16 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna getname(newname), 0); } -int readlink_copy(char __user *buffer, int buflen, const char *link) +int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen) { - int len = PTR_ERR(link); - if (IS_ERR(link)) - goto out; + int copylen; - len = strlen(link); - if (len > (unsigned) buflen) - len = buflen; - if (copy_to_user(buffer, link, len)) - len = -EFAULT; -out: - return len; + copylen = linklen; + if (unlikely(copylen > (unsigned) buflen)) + copylen = buflen; + if (copy_to_user(buffer, link, copylen)) + copylen = -EFAULT; + return copylen; } /** @@ -5304,6 +5301,9 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) const char *link; int res; + if (inode->i_opflags & IOP_CACHED_LINK) + return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen); + if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(inode->i_op->readlink)) return inode->i_op->readlink(dentry, buffer, buflen); @@ -5322,7 +5322,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) if (IS_ERR(link)) return PTR_ERR(link); } - res = readlink_copy(buffer, buflen, link); + res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } @@ -5391,10 +5391,14 @@ EXPORT_SYMBOL(page_put_link); int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { + const char *link; + int res; + DEFINE_DELAYED_CALL(done); - int res = readlink_copy(buffer, buflen, - page_get_link(dentry, d_inode(dentry), - &done)); + link = page_get_link(dentry, d_inode(dentry), &done); + res = PTR_ERR(link); + if (!IS_ERR(link)) + res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 8e159fc78c0a..c610224faf10 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -83,7 +83,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) - res = readlink_copy(buffer, buflen, name); + res = readlink_copy(buffer, buflen, name, strlen(name)); } put_task_struct(task); return res; diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..2cc98de5af43 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -626,6 +626,7 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 #define IOP_MGTIME 0x0020 +#define IOP_CACHED_LINK 0x0040 /* * Keep mostly read-only and often accessed (especially for @@ -723,7 +724,10 @@ struct inode { }; struct file_lock_context *i_flctx; struct address_space i_data; - struct list_head i_devices; + union { + struct list_head i_devices; + int i_linklen; + }; union { struct pipe_inode_info *i_pipe; struct cdev *i_cdev; @@ -749,6 +753,13 @@ struct inode { void *i_private; /* fs or device private pointer */ } __randomize_layout; 
+static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) +{ + inode->i_link = link; + inode->i_linklen = linklen; + inode->i_opflags |= IOP_CACHED_LINK; +} + /* * Get bit address from inode->i_state to use with wait_var_event() * infrastructre. @@ -3351,7 +3362,7 @@ extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) -extern int readlink_copy(char __user *, int, const char *); +extern int readlink_copy(char __user *, int, const char *, int); extern int page_readlink(struct dentry *, char __user *, int); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 2c0185ebc900..c07d150685d7 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -2612,7 +2612,7 @@ static int policy_readlink(struct dentry *dentry, char __user *buffer, res = snprintf(name, sizeof(name), "%s:[%lu]", AAFS_NAME, d_inode(dentry)->i_ino); if (res > 0 && res < sizeof(name)) - res = readlink_copy(buffer, buflen, name); + res = readlink_copy(buffer, buflen, name, strlen(name)); else res = -ENOENT; -- 2.50.1 From c7175957b28a69947dd1d36e8b19ac0d3c1a5d7d Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 27 Jul 2023 20:03:55 +0200 Subject: [PATCH 04/16] seqlock: annotate spinning as unlikely() in __read_seqcount_begin Annotation already used to be there, but got lost in 52ac39e5db5148f7 ("seqlock: seqcount_t: Implement all read APIs as statement expressions"). Does not look like it was intentional. Without it gcc 12 decides to compile the following in path_init: nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); into 2 cases of conditional jumps forward if the value is even, aka branch prediction miss by default in the common case on x86-64. With the patch jumps are only for odd values. 
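In C terms, the read-begin pattern being annotated looks roughly like the sketch below (a simplified stand-in using C11 atomics, not the kernel's macro): an odd sequence means a writer is active, which is the rare case, so the spin is the branch worth marking unlikely().

  /*
   * Simplified C11 sketch of the read-begin shape (not the kernel macro).
   * An odd sequence number means a writer is in progress - the rare case.
   */
  #include <stdatomic.h>

  #define unlikely(x)  __builtin_expect(!!(x), 0)  /* gcc/clang builtin */

  static _Atomic unsigned int seq;  /* even: idle, odd: write in progress */

  static inline void cpu_relax(void)
  {
  #if defined(__x86_64__) || defined(__i386__)
      __builtin_ia32_pause();
  #endif
  }

  unsigned int read_seqcount_begin_sketch(void)
  {
      unsigned int s;

      /* common case: sequence is even, no spin, fall straight through */
      while (unlikely((s = atomic_load_explicit(&seq, memory_order_acquire)) & 1))
          cpu_relax();
      return s;
  }

  int read_seqcount_retry_sketch(unsigned int start)
  {
      /* reader must retry if a writer ran (or is running) meanwhile */
      return atomic_load_explicit(&seq, memory_order_acquire) != start;
  }
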
before: [snip] mov 0x104fe96(%rip),%eax # 0xffffffff82409680 test $0x1,%al je 0xffffffff813b97fa pause mov 0x104fe8a(%rip),%eax # 0xffffffff82409680 test $0x1,%al jne 0xffffffff813b97ee mov %eax,0x48(%rbx) mov 0x104fdfd(%rip),%eax # 0xffffffff82409600 test $0x1,%al je 0xffffffff813b9813 pause mov 0x104fdf1(%rip),%eax # 0xffffffff82409600 test $0x1,%al jne 0xffffffff813b9807 [/snip] after: [snip] mov 0x104fec6(%rip),%eax # 0xffffffff82409680 test $0x1,%al jne 0xffffffff813b99af mov %eax,0x48(%rbx) mov 0x104fe35(%rip),%eax # 0xffffffff82409600 test $0x1,%al jne 0xffffffff813b999d [/snip] Interestingly .text gets slightly smaller (as reported by size(1)): before: 20702563 after: 20702429 Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20230727180355.813995-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/seqlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5298765d6ca4..eb20dcaa51b5 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -272,7 +272,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) ({ \ unsigned __seq; \ \ - while ((__seq = seqprop_sequence(s)) & 1) \ + while (unlikely((__seq = seqprop_sequence(s)) & 1)) \ cpu_relax(); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ -- 2.50.1 From bae80473f7b0b25772619e7692019b1549d4a82c Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 20 Nov 2024 12:20:35 +0100 Subject: [PATCH 05/16] ext4: use inode_set_cached_link() Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241120112037.822078-3-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/ext4/inode.c | 3 ++- fs/ext4/namei.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 89aade6f45f6..7c54ae5fcbd4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5006,10 +5006,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { - inode->i_link = (char *)ei->i_data; inode->i_op = &ext4_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); + inode_set_cached_link(inode, (char *)ei->i_data, + inode->i_size); } else { inode->i_op = &ext4_symlink_inode_operations; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index bcf2737078b8..536d56d15072 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3418,7 +3418,6 @@ retry: inode->i_op = &ext4_symlink_inode_operations; } else { inode->i_op = &ext4_fast_symlink_inode_operations; - inode->i_link = (char *)&EXT4_I(inode)->i_data; } } @@ -3434,6 +3433,9 @@ retry: disk_link.len); inode->i_size = disk_link.len - 1; EXT4_I(inode)->i_disksize = inode->i_size; + if (!IS_ENCRYPTED(inode)) + inode_set_cached_link(inode, (char *)&EXT4_I(inode)->i_data, + inode->i_size); } err = ext4_add_nondir(handle, dentry, &inode); if (handle) -- 2.50.1 From 135ec43eb29c68ed26e2d10f221d43f7d9139a8f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 20 Nov 2024 17:13:52 -0800 Subject: [PATCH 06/16] fiemap: use kernel-doc includes in fiemap docbook Add some kernel-doc notation to structs in fiemap header files then pull that into Documentation/filesystems/fiemap.rst instead of duplicating the header file structs in fiemap.rst. This helps to future-proof fiemap.rst against struct changes. 
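For context, the structures being documented are driven from userspace through the FS_IOC_FIEMAP ioctl; a minimal caller looks roughly like this (illustrative sketch only, error handling abbreviated):

  /* Minimal userspace FIEMAP caller, for context on the structures
   * documented here. Sketch only; error handling is abbreviated. */
  #include <stdio.h>
  #include <stdlib.h>
  #include <fcntl.h>
  #include <unistd.h>
  #include <sys/ioctl.h>
  #include <linux/fs.h>
  #include <linux/fiemap.h>

  int main(int argc, char **argv)
  {
      struct fiemap *fm;
      unsigned int i, count = 32;   /* how many extents we can receive */
      int fd = open(argc > 1 ? argv[1] : "/etc/hostname", O_RDONLY);

      if (fd < 0)
          return 1;

      fm = calloc(1, sizeof(*fm) + count * sizeof(struct fiemap_extent));
      if (!fm)
          return 1;
      fm->fm_start = 0;
      fm->fm_length = FIEMAP_MAX_OFFSET;  /* map the whole file */
      fm->fm_flags = FIEMAP_FLAG_SYNC;    /* flush dirty data first */
      fm->fm_extent_count = count;

      if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
          perror("FS_IOC_FIEMAP");
          return 1;
      }

      for (i = 0; i < fm->fm_mapped_extents; i++)
          printf("logical %llu physical %llu len %llu flags %#x\n",
                 (unsigned long long)fm->fm_extents[i].fe_logical,
                 (unsigned long long)fm->fm_extents[i].fe_physical,
                 (unsigned long long)fm->fm_extents[i].fe_length,
                 fm->fm_extents[i].fe_flags);
      free(fm);
      close(fd);
      return 0;
  }
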
Add missing flags documentation from header files into fiemap.rst for FIEMAP_FLAG_CACHE and FIEMAP_EXTENT_SHARED. Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20241121011352.201907-1-rdunlap@infradead.org Cc: Christoph Hellwig Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: Matthew Wilcox Signed-off-by: Christian Brauner --- Documentation/filesystems/fiemap.rst | 49 +++++++++------------------- include/linux/fiemap.h | 16 ++++++--- include/uapi/linux/fiemap.h | 47 ++++++++++++++++++-------- 3 files changed, 59 insertions(+), 53 deletions(-) diff --git a/Documentation/filesystems/fiemap.rst b/Documentation/filesystems/fiemap.rst index 93fc96f760aa..23b3ed229e49 100644 --- a/Documentation/filesystems/fiemap.rst +++ b/Documentation/filesystems/fiemap.rst @@ -12,21 +12,10 @@ returns a list of extents. Request Basics -------------- -A fiemap request is encoded within struct fiemap:: - - struct fiemap { - __u64 fm_start; /* logical offset (inclusive) at - * which to start mapping (in) */ - __u64 fm_length; /* logical length of mapping which - * userspace cares about (in) */ - __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ - __u32 fm_mapped_extents; /* number of extents that were - * mapped (out) */ - __u32 fm_extent_count; /* size of fm_extents array (in) */ - __u32 fm_reserved; - struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ - }; +A fiemap request is encoded within struct fiemap: +.. kernel-doc:: include/uapi/linux/fiemap.h + :identifiers: fiemap fm_start, and fm_length specify the logical range within the file which the process would like mappings for. Extents returned mirror @@ -60,6 +49,8 @@ FIEMAP_FLAG_XATTR If this flag is set, the extents returned will describe the inodes extended attribute lookup tree, instead of its data tree. +FIEMAP_FLAG_CACHE + This flag requests caching of the extents. Extent Mapping -------------- @@ -77,18 +68,10 @@ complete the requested range and will not have the FIEMAP_EXTENT_LAST flag set (see the next section on extent flags). Each extent is described by a single fiemap_extent structure as -returned in fm_extents:: - - struct fiemap_extent { - __u64 fe_logical; /* logical offset in bytes for the start of - * the extent */ - __u64 fe_physical; /* physical offset in bytes for the start - * of the extent */ - __u64 fe_length; /* length in bytes for the extent */ - __u64 fe_reserved64[2]; - __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ - __u32 fe_reserved[3]; - }; +returned in fm_extents: + +.. kernel-doc:: include/uapi/linux/fiemap.h + :identifiers: fiemap_extent All offsets and lengths are in bytes and mirror those on disk. It is valid for an extents logical offset to start before the request or its logical @@ -175,6 +158,8 @@ FIEMAP_EXTENT_MERGED userspace would be highly inefficient, the kernel will try to merge most adjacent blocks into 'extents'. +FIEMAP_EXTENT_SHARED + This flag is set to request that space be shared with other files. 
VFS -> File System Implementation --------------------------------- @@ -191,14 +176,10 @@ each discovered extent:: u64 len); ->fiemap is passed struct fiemap_extent_info which describes the -fiemap request:: - - struct fiemap_extent_info { - unsigned int fi_flags; /* Flags as passed from user */ - unsigned int fi_extents_mapped; /* Number of mapped extents */ - unsigned int fi_extents_max; /* Size of fiemap_extent array */ - struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */ - }; +fiemap request: + +.. kernel-doc:: include/linux/fiemap.h + :identifiers: fiemap_extent_info It is intended that the file system should not need to access any of this structure directly. Filesystem handlers should be tolerant to signals and return diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h index c50882f19235..966092ffa89a 100644 --- a/include/linux/fiemap.h +++ b/include/linux/fiemap.h @@ -5,12 +5,18 @@ #include #include +/** + * struct fiemap_extent_info - fiemap request to a filesystem + * @fi_flags: Flags as passed from user + * @fi_extents_mapped: Number of mapped extents + * @fi_extents_max: Size of fiemap_extent array + * @fi_extents_start: Start of fiemap_extent array + */ struct fiemap_extent_info { - unsigned int fi_flags; /* Flags as passed from user */ - unsigned int fi_extents_mapped; /* Number of mapped extents */ - unsigned int fi_extents_max; /* Size of fiemap_extent array */ - struct fiemap_extent __user *fi_extents_start; /* Start of - fiemap_extent array */ + unsigned int fi_flags; + unsigned int fi_extents_mapped; + unsigned int fi_extents_max; + struct fiemap_extent __user *fi_extents_start; }; int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/include/uapi/linux/fiemap.h b/include/uapi/linux/fiemap.h index 24ca0c00cae3..9d9e8ae32b41 100644 --- a/include/uapi/linux/fiemap.h +++ b/include/uapi/linux/fiemap.h @@ -14,37 +14,56 @@ #include +/** + * struct fiemap_extent - description of one fiemap extent + * @fe_logical: byte offset of the extent in the file + * @fe_physical: byte offset of extent on disk + * @fe_length: length in bytes for this extent + * @fe_flags: FIEMAP_EXTENT_* flags for this extent + */ struct fiemap_extent { - __u64 fe_logical; /* logical offset in bytes for the start of - * the extent from the beginning of the file */ - __u64 fe_physical; /* physical offset in bytes for the start - * of the extent from the beginning of the disk */ - __u64 fe_length; /* length in bytes for this extent */ + __u64 fe_logical; + __u64 fe_physical; + __u64 fe_length; + /* private: */ __u64 fe_reserved64[2]; - __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ + /* public: */ + __u32 fe_flags; + /* private: */ __u32 fe_reserved[3]; }; +/** + * struct fiemap - file extent mappings + * @fm_start: byte offset (inclusive) at which to start mapping (in) + * @fm_length: logical length of mapping which userspace wants (in) + * @fm_flags: FIEMAP_FLAG_* flags for request (in/out) + * @fm_mapped_extents: number of extents that were mapped (out) + * @fm_extent_count: size of fm_extents array (in) + * @fm_extents: array of mapped extents (out) + */ struct fiemap { - __u64 fm_start; /* logical offset (inclusive) at - * which to start mapping (in) */ - __u64 fm_length; /* logical length of mapping which - * userspace wants (in) */ - __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ - __u32 fm_mapped_extents;/* number of extents that were mapped (out) */ - __u32 fm_extent_count; /* size of fm_extents array (in) */ 
+ __u64 fm_start; + __u64 fm_length; + __u32 fm_flags; + __u32 fm_mapped_extents; + __u32 fm_extent_count; + /* private: */ __u32 fm_reserved; - struct fiemap_extent fm_extents[]; /* array of mapped extents (out) */ + /* public: */ + struct fiemap_extent fm_extents[]; }; #define FIEMAP_MAX_OFFSET (~0ULL) +/* flags used in fm_flags: */ #define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ #define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ #define FIEMAP_FLAG_CACHE 0x00000004 /* request caching of the extents */ #define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) +/* flags used in fe_flags: */ #define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */ #define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */ #define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending. -- 2.50.1 From 657e726e0cb9ba4f583ae7d226100bc43cc43a41 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 20 Nov 2024 12:20:36 +0100 Subject: [PATCH 07/16] tmpfs: use inode_set_cached_link() Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241120112037.822078-4-mjguzik@gmail.com Signed-off-by: Christian Brauner --- mm/shmem.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index ccb9629a0f70..7beba4c1be5a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3914,6 +3914,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, int len; struct inode *inode; struct folio *folio; + char *link; len = strlen(symname) + 1; if (len > PAGE_SIZE) @@ -3935,12 +3936,13 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { - inode->i_link = kmemdup(symname, len, GFP_KERNEL); - if (!inode->i_link) { + link = kmemdup(symname, len, GFP_KERNEL); + if (!link) { error = -ENOMEM; goto out_remove_offset; } inode->i_op = &shmem_short_symlink_operations; + inode_set_cached_link(inode, link, len - 1); } else { inode_nohighmem(inode); inode->i_mapping->a_ops = &shmem_aops; -- 2.50.1 From d727935cad9f6f52c8d184968f9720fdc966c669 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Sun, 24 Nov 2024 11:46:36 +0800 Subject: [PATCH 08/16] fs: fix proc_handler for sysctl_nr_open Use proc_douintvec_minmax() instead of proc_dointvec_minmax() to handle sysctl_nr_open, because its data type is unsigned int, not int. Fixes: 9b80a184eaad ("fs/file: more unsigned file descriptors") Signed-off-by: Jinliang Zheng Link: https://lore.kernel.org/r/20241124034636.325337-1-alexjlzheng@tencent.com Signed-off-by: Christian Brauner --- fs/file_table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/file_table.c b/fs/file_table.c index 976736be47cb..502b81f614d9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -128,7 +128,7 @@ static struct ctl_table fs_stat_sysctls[] = { .data = &sysctl_nr_open, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_douintvec_minmax, .extra1 = &sysctl_nr_open_min, .extra2 = &sysctl_nr_open_max, }, -- 2.50.1 From 1197867a5dc8924d83ce484b6fd361ca32423dac Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 17:54:41 +0000 Subject: [PATCH 09/16] watch_queue: Use page->private instead of page->index We are attempting to eliminate page->index, so use page->private instead. 
Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20241125175443.2911738-1-willy@infradead.org Signed-off-by: Christian Brauner --- kernel/watch_queue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 1895fbc32bcb..5267adeaa403 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -71,7 +71,7 @@ static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, bit /= WATCH_QUEUE_NOTE_SIZE; page = buf->page; - bit += page->index; + bit += page->private; set_bit(bit, wqueue->notes_bitmap); generic_pipe_buf_release(pipe, buf); @@ -278,7 +278,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) pages[i] = alloc_page(GFP_KERNEL); if (!pages[i]) goto error_p; - pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; + pages[i]->private = i * WATCH_QUEUE_NOTES_PER_PAGE; } bitmap = bitmap_alloc(nr_notes, GFP_KERNEL); -- 2.50.1 From 9b7da575f85962c44abe7dc245b0a58179ad2c45 Mon Sep 17 00:00:00 2001 From: shao mingyin Date: Wed, 23 Oct 2024 13:58:50 +0800 Subject: [PATCH 10/16] file: flush delayed work in delayed fput() The fput() of file rcS might not have completed causing issues when executing the file. rcS is opened in do_populate_rootfs before executed. At the end of do_populate_rootfs() flush_delayed_fput() is called. Now do_populate_rootfs() assumes that all fput()s caused by do_populate_rootfs() have completed. But flush_delayed_fput() can only ensure that fput() on the current delayed_fput_list has finished. Any file that has been removed from delayed_fput_list asynchronously in the meantime might not have completed causing the exec to fail. do_populate_rootfs delayed_fput_list delayed_fput execve fput() a fput() a->b fput() a->b->rcS __fput(a) fput() c fput() c->d __fput(b) flush_delayed_fput __fput(c) __fput(d) __fput(b) __fput(b) execve(rcS) Ensure that all delayed work is done by calling flush_delayed_work() in flush_delayed_fput() explicitly. Signed-off-by: Chen Lin Signed-off-by: Shao Mingyin Link: https://lore.kernel.org/r/20241023135850067m3w2R0UXESiVCYz_wdAoT@zte.com.cn Cc: Yang Yang Cc: Yang Tao Cc: Xu Xin [brauner: rewrite commit message] Signed-off-by: Christian Brauner --- fs/file_table.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index 502b81f614d9..a32171d2b83f 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -478,6 +478,8 @@ static void ____fput(struct callback_head *work) __fput(container_of(work, struct file, f_task_work)); } +static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); + /* * If kernel thread really needs to have the final fput() it has done * to complete, call this. The only user right now is the boot - we @@ -491,11 +493,10 @@ static void ____fput(struct callback_head *work) void flush_delayed_fput(void) { delayed_fput(NULL); + flush_delayed_work(&delayed_fput_work); } EXPORT_SYMBOL_GPL(flush_delayed_fput); -static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); - void fput(struct file *file) { if (file_ref_put(&file->f_ref)) { -- 2.50.1 From 3212a8f34021a16d13ace91d3ac5f451ef8d0103 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 30 Nov 2024 06:17:11 +0100 Subject: [PATCH 11/16] fs: use a consume fence in mnt_idmap() The routine is used in link_path_walk() for every path component. To my reading the entire point of the fence was to grab a fully populated mnt_idmap, but that's already going to happen with mere consume fence. 
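In C11 terms the reasoning is roughly the sketch below (illustration only; compilers today promote memory_order_consume to acquire, but it expresses the intent that ordering comes from the address dependency of the subsequent dereference):

  /*
   * Sketch of the publish/consume shape (illustrative only): the writer
   * fully initializes the object, then publishes the pointer with a
   * release store; a reader that merely dereferences the loaded pointer
   * is ordered by the address dependency, so no full acquire fence is
   * needed on weakly ordered CPUs.
   */
  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdio.h>

  struct idmap { int uid_shift; };

  static struct idmap the_map;
  static _Atomic(struct idmap *) published;

  static void *writer(void *arg)
  {
      (void)arg;
      the_map.uid_shift = 100000;                 /* fully populate first */
      atomic_store_explicit(&published, &the_map,
                            memory_order_release); /* then publish */
      return NULL;
  }

  static void *reader(void *arg)
  {
      struct idmap *m;

      (void)arg;
      /* consume is usually promoted to acquire by compilers, but it
       * expresses the intent: ordering via the data dependency. */
      while (!(m = atomic_load_explicit(&published, memory_order_consume)))
          ;
      printf("uid_shift=%d\n", m->uid_shift);     /* dependent load */
      return NULL;
  }

  int main(void)
  {
      pthread_t w, r;

      pthread_create(&r, NULL, reader, NULL);
      pthread_create(&w, NULL, writer, NULL);
      pthread_join(w, NULL);
      pthread_join(r, NULL);
      return 0;
  }
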
Eliminates an actual fence on arm64. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241130051712.1036527-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/mount.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mount.h b/include/linux/mount.h index c34c18b4e8f3..33f17b6e8732 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -76,7 +76,7 @@ struct vfsmount { static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt) { /* Pairs with smp_store_release() in do_idmap_mount(). */ - return smp_load_acquire(&mnt->mnt_idmap); + return READ_ONCE(mnt->mnt_idmap); } extern int mnt_want_write(struct vfsmount *mnt); -- 2.50.1 From 4db9f52fa9b81addc412330957bb7a657d2f1ffb Mon Sep 17 00:00:00 2001 From: Guo Weikang Date: Mon, 2 Dec 2024 16:11:45 +0800 Subject: [PATCH 12/16] fs: fc_log replace magic number 7 with ARRAY_SIZE() Replace the hardcoded value `7` in `put_fc_log()` with `ARRAY_SIZE`. This improves maintainability by ensuring the loop adapts to changes in the buffer size. Signed-off-by: Guo Weikang Link: https://lore.kernel.org/r/20241202081146.1031780-1-guoweikang.kernel@gmail.com Signed-off-by: Christian Brauner --- fs/fs_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fs_context.c b/fs/fs_context.c index 98589aae5208..582d33e81117 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -493,7 +493,7 @@ static void put_fc_log(struct fs_context *fc) if (log) { if (refcount_dec_and_test(&log->usage)) { fc->log.log = NULL; - for (i = 0; i <= 7; i++) + for (i = 0; i < ARRAY_SIZE(log->buffer) ; i++) if (log->need_free & (1 << i)) kfree(log->buffer[i]); kfree(log); -- 2.50.1 From 175c6a216dda4c88f7050b67e75a6cf331086c75 Mon Sep 17 00:00:00 2001 From: Zhu Jun Date: Wed, 4 Dec 2024 00:12:18 -0800 Subject: [PATCH 13/16] fs: Fix grammar and spelling in propagate_umount() Fix grammar and spelling in the propagate_umount() function. Signed-off-by: Zhu Jun Link: https://lore.kernel.org/r/20241204081218.12141-1-zhujun2@cmss.chinamobile.com Signed-off-by: Christian Brauner --- fs/pnode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/pnode.c b/fs/pnode.c index a799e0315cc9..ef048f008bdd 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -611,10 +611,10 @@ int propagate_umount(struct list_head *list) continue; } else if (child->mnt.mnt_flags & MNT_UMOUNT) { /* - * We have come accross an partially unmounted - * mount in list that has not been visited yet. - * Remember it has been visited and continue - * about our merry way. + * We have come across a partially unmounted + * mount in a list that has not been visited + * yet. Remember it has been visited and + * continue about our merry way. */ list_add_tail(&child->mnt_umounting, &visited); continue; -- 2.50.1 From ec052fae814d467d6aa7e591b4b24531b87e65ec Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 5 Dec 2024 16:47:43 +0100 Subject: [PATCH 14/16] fs: sort out a stale comment about races between fd alloc and dup2 It claims the issue is only relevant for shared descriptor tables which is of no concern for POSIX (but then is POSIX of concern to anyone today?), which I presume predates standarized threading. 
The comment also mentions the following systems: - OpenBSD installing a larval file -- they moved away from it, file is installed late and EBUSY is returned on conflict - FreeBSD returning EBADF -- reworked to install the file early like OpenBSD used to do - NetBSD "deadlocks in amusing ways" -- their solution looks Solaris-inspired (not a compliment) and I would not be particularly surprised if it indeed deadlocked, in amusing ways or otherwise I don't believe mentioning any of these adds anything and the statement about the issue not being POSIX-relevant is outdated. dup2 description in POSIX still does not mention the problem. Just shorten the comment and be done with it. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241205154743.1586584-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/file.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/fs/file.c b/fs/file.c index 019fb9acf91b..d498715ef415 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1230,17 +1230,9 @@ __releases(&files->file_lock) /* * We need to detect attempts to do dup2() over allocated but still - * not finished descriptor. NB: OpenBSD avoids that at the price of - * extra work in their equivalent of fget() - they insert struct - * file immediately after grabbing descriptor, mark it larval if - * more work (e.g. actual opening) is needed and make sure that - * fget() treats larval files as absent. Potentially interesting, - * but while extra work in fget() is trivial, locking implications - * and amount of surgery on open()-related paths in VFS are not. - * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" - * deadlocks in rather amusing ways, AFAICS. All of that is out of - * scope of POSIX or SUS, since neither considers shared descriptor - * tables and this condition does not arise without those. + * not finished descriptor. + * + * POSIX is silent on the issue, we return -EBUSY. */ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); -- 2.50.1 From af6505e5745b9f3a670de405b08b73573343c15c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:45 -0700 Subject: [PATCH 15/16] fs: add RWF_DONTCACHE iocb and FOP_DONTCACHE file_operations flag If a file system supports uncached buffered IO, it may set FOP_DONTCACHE and enable support for RWF_DONTCACHE. If RWF_DONTCACHE is attempted without the file system supporting it, it'll get errored with -EOPNOTSUPP. 
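From userspace the flag would be passed via preadv2()/pwritev2(), roughly as in the sketch below. The flag value is the one added by this patch and may not be present in installed uapi headers yet, hence the fallback define; the EOPNOTSUPP fallback mirrors the behaviour described above.

  /*
   * Sketch of requesting uncached buffered IO with the new flag
   * (value 0x00000080 as defined in this patch). Older uapi headers
   * will not have RWF_DONTCACHE yet, hence the fallback #define; the
   * kernel returns -EOPNOTSUPP when the filesystem lacks FOP_DONTCACHE.
   */
  #define _GNU_SOURCE
  #include <errno.h>
  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/uio.h>
  #include <unistd.h>

  #ifndef RWF_DONTCACHE
  #define RWF_DONTCACHE 0x00000080  /* from the uapi change in this patch */
  #endif

  int main(void)
  {
      char buf[] = "scratch data we do not want lingering in the page cache\n";
      struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) - 1 };
      int fd = open("scratch.tmp", O_WRONLY | O_CREAT | O_TRUNC, 0644);
      ssize_t n;

      if (fd < 0)
          return 1;

      n = pwritev2(fd, &iov, 1, 0, RWF_DONTCACHE);
      if (n < 0 && errno == EOPNOTSUPP)        /* fs lacks FOP_DONTCACHE */
          n = pwritev2(fd, &iov, 1, 0, 0);     /* plain buffered write */
      printf("wrote %zd bytes\n", n);
      close(fd);
      return 0;
  }
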
Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20241220154831.1086649-8-axboe@kernel.dk Signed-off-by: Christian Brauner --- include/linux/fs.h | 14 +++++++++++++- include/uapi/linux/fs.h | 6 +++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..6a838b5479a6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -322,6 +322,7 @@ struct readahead_control; #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC +#define IOCB_DONTCACHE (__force int) RWF_DONTCACHE /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -356,7 +357,8 @@ struct readahead_control; { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ - { IOCB_ATOMIC, "ATOMIC"}, \ + { IOCB_ATOMIC, "ATOMIC" }, \ + { IOCB_DONTCACHE, "DONTCACHE" }, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ @@ -2127,6 +2129,8 @@ struct file_operations { #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) +/* File system supports uncached read/write buffered IO */ +#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, @@ -3614,6 +3618,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) return -EOPNOTSUPP; } + if (flags & RWF_DONTCACHE) { + /* file system must support it */ + if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) + return -EOPNOTSUPP; + /* DAX mappings not supported */ + if (IS_DAX(ki->ki_filp->f_mapping->host)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 753971770733..56a4f93a08f4 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -332,9 +332,13 @@ typedef int __bitwise __kernel_rwf_t; /* Atomic Write */ #define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) +/* buffered IO that drops the cache after reading or writing data */ +#define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) + RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\ + RWF_DONTCACHE) #define PROCFS_IOCTL_MAGIC 'f' -- 2.50.1 From aaec5a95d59615523db03dd53c2052f0a87beea7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Jan 2025 15:07:15 +0100 Subject: [PATCH 16/16] pipe_read: don't wake up the writer if the pipe is still full wake_up(pipe->wr_wait) makes no sense if pipe_full() is still true after the reading, the writer sleeping in wait_event(wr_wait, pipe_writable()) will check the pipe_writable() == !pipe_full() condition and sleep again. Only wake the writer if we actually released a pipe buf, and the pipe was full before we did so. 
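The same rule in a generic bounded-buffer sketch (illustration only, not the pipe code): the consumer signals the producer only when it frees a slot from a queue that was full beforehand; draining a non-full queue wakes nobody.

  /*
   * Generic illustration of the wakeup rule: signal the producer only
   * when a slot is actually freed *and* the queue was full beforehand.
   * Call produce()/consume() from producer/consumer threads.
   */
  #include <pthread.h>
  #include <stdbool.h>

  #define CAP 16

  static int ring[CAP];
  static unsigned int head, tail;   /* producer adds at head */
  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t not_full = PTHREAD_COND_INITIALIZER;
  static pthread_cond_t not_empty = PTHREAD_COND_INITIALIZER;

  static bool full(void)  { return head - tail == CAP; }
  static bool empty(void) { return head == tail; }

  void produce(int v)
  {
      pthread_mutex_lock(&lock);
      while (full())
          pthread_cond_wait(&not_full, &lock);  /* writer sleeps here */
      ring[head++ % CAP] = v;
      pthread_cond_signal(&not_empty);
      pthread_mutex_unlock(&lock);
  }

  int consume(void)
  {
      bool wake_writer;
      int v;

      pthread_mutex_lock(&lock);
      while (empty())
          pthread_cond_wait(&not_empty, &lock);
      wake_writer = full();            /* full right before we free a slot? */
      v = ring[tail++ % CAP];
      if (wake_writer)                 /* otherwise the writer would just   */
          pthread_cond_signal(&not_full); /* recheck, see "full", and sleep */
      pthread_mutex_unlock(&lock);
      return v;
  }
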
Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/all/20241229135737.GA3293@redhat.com/ Link: https://lore.kernel.org/r/20250102140715.GA7091@redhat.com Reported-by: WangYuli Signed-off-by: Christian Brauner --- fs/pipe.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 12b22c2723b7..82fede0f2111 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -253,7 +253,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; - bool was_full, wake_next_reader = false; + bool wake_writer = false, wake_next_reader = false; ssize_t ret; /* Null read succeeds. */ @@ -264,14 +264,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) mutex_lock(&pipe->mutex); /* - * We only wake up writers if the pipe was full when we started - * reading in order to avoid unnecessary wakeups. + * We only wake up writers if the pipe was full when we started reading + * and it is no longer full after reading to avoid unnecessary wakeups. * * But when we do wake up writers, we do so using a sync wakeup * (WF_SYNC), because we want them to get going and generate more * data for us. */ - was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); for (;;) { /* Read ->head with a barrier vs post_one_notification() */ unsigned int head = smp_load_acquire(&pipe->head); @@ -340,8 +339,10 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) buf->len = 0; } - if (!buf->len) + if (!buf->len) { + wake_writer |= pipe_full(head, tail, pipe->max_usage); tail = pipe_update_tail(pipe, buf, tail); + } total_len -= chars; if (!total_len) break; /* common path: read succeeded */ @@ -377,7 +378,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) * _very_ unlikely case that the pipe was full, but we got * no data. */ - if (unlikely(was_full)) + if (unlikely(wake_writer)) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); @@ -390,15 +391,15 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - mutex_lock(&pipe->mutex); - was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); + wake_writer = false; wake_next_reader = true; + mutex_lock(&pipe->mutex); } if (pipe_empty(pipe->head, pipe->tail)) wake_next_reader = false; mutex_unlock(&pipe->mutex); - if (was_full) + if (wake_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); -- 2.50.1