From 2298abcbe11e9b553d03c0f1d084da786f7eff88 Mon Sep 17 00:00:00 2001 From: Long Li Date: Sat, 1 Mar 2025 14:48:36 +0800 Subject: [PATCH 01/16] sunrpc: fix race in cache cleanup causing stale nextcheck time When cache cleanup runs concurrently with cache entry removal, a race condition can occur that leads to incorrect nextcheck times. This can delay cache cleanup for the cache_detail by up to 1800 seconds: 1. cache_clean() sets nextcheck to current time plus 1800 seconds 2. While scanning a non-empty bucket, concurrent cache entry removal can empty that bucket 3. cache_clean() finds no cache entries in the now-empty bucket to update the nextcheck time 4. This may delay the next scan of the cache_detail by up to 1800 seconds even when it should be scanned earlier based on remaining entries Fix this by moving the hash_lock acquisition earlier in cache_clean(). This ensures bucket emptiness checks and nextcheck updates happen atomically, preventing the race between cleanup and entry removal. 
Signed-off-by: Long Li Signed-off-by: Chuck Lever --- net/sunrpc/cache.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index bbaa77d7bbc8..131090f31e6a 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -464,24 +464,21 @@ static int cache_clean(void) } } + spin_lock(&current_detail->hash_lock); + /* find a non-empty bucket in the table */ - while (current_detail && - current_index < current_detail->hash_size && + while (current_index < current_detail->hash_size && hlist_empty(&current_detail->hash_table[current_index])) current_index++; /* find a cleanable entry in the bucket and clean it, or set to next bucket */ - - if (current_detail && current_index < current_detail->hash_size) { + if (current_index < current_detail->hash_size) { struct cache_head *ch = NULL; struct cache_detail *d; struct hlist_head *head; struct hlist_node *tmp; - spin_lock(&current_detail->hash_lock); - /* Ok, now to clean this strand */ - head = &current_detail->hash_table[current_index]; hlist_for_each_entry_safe(ch, tmp, head, cache_list) { if (current_detail->nextcheck > ch->expiry_time) @@ -502,8 +499,10 @@ static int cache_clean(void) spin_unlock(&cache_list_lock); if (ch) sunrpc_end_cache_remove_entry(ch, d); - } else + } else { + spin_unlock(&current_detail->hash_lock); spin_unlock(&cache_list_lock); + } return rv; } -- 2.50.1 From ff12eb379554eea7932ad6caea55e3091701cce4 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Thu, 6 Mar 2025 14:50:06 +0530 Subject: [PATCH 02/16] NFSD: unregister filesystem in case genl_register_family() fails When rpc_status netlink support was added, the unregister_filesystem() call matching register_filesystem() was missed for the case where genl_register_family() fails. Correct this by adding a new label. 
Fixes: bd9d6a3efa97 ("NFSD: add rpc_status netlink support") Cc: stable@vger.kernel.org Signed-off-by: Maninder Singh Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index ac265d6fde35..d773481bcf10 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -2305,7 +2305,7 @@ static int __init init_nfsd(void) goto out_free_cld; retval = register_filesystem(&nfsd_fs_type); if (retval) - goto out_free_all; + goto out_free_nfsd4; retval = genl_register_family(&nfsd_nl_family); if (retval) goto out_free_all; @@ -2313,6 +2313,8 @@ static int __init init_nfsd(void) return 0; out_free_all: + unregister_filesystem(&nfsd_fs_type); +out_free_nfsd4: nfsd4_destroy_laundry_wq(); out_free_cld: unregister_cld_notifier(); -- 2.50.1 From f7fb730cac9aafda8b9813b55d04e28a9664d17c Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Thu, 6 Mar 2025 14:50:07 +0530 Subject: [PATCH 03/16] NFSD: fix race between nfsd registration and exports_proc As of now nfsd calls create_proc_exports_entry() at start of init_nfsd and cleanup by remove_proc_entry() at last of exit_nfsd. 
This causes a kernel oops if there is a race between the following two operations: (i) exportfs -r (ii) mount -t nfsd none /proc/fs/nfsd for 5.4 kernel ARM64: CPU 1: el1_irq+0xbc/0x180 arch_counter_get_cntvct+0x14/0x18 running_clock+0xc/0x18 preempt_count_add+0x88/0x110 prep_new_page+0xb0/0x220 get_page_from_freelist+0x2d8/0x1778 __alloc_pages_nodemask+0x15c/0xef0 __vmalloc_node_range+0x28c/0x478 __vmalloc_node_flags_caller+0x8c/0xb0 kvmalloc_node+0x88/0xe0 nfsd_init_net+0x6c/0x108 [nfsd] ops_init+0x44/0x170 register_pernet_operations+0x114/0x270 register_pernet_subsys+0x34/0x50 init_nfsd+0xa8/0x718 [nfsd] do_one_initcall+0x54/0x2e0 CPU 2 : Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010 PC is at : exports_net_open+0x50/0x68 [nfsd] Call trace: exports_net_open+0x50/0x68 [nfsd] exports_proc_open+0x2c/0x38 [nfsd] proc_reg_open+0xb8/0x198 do_dentry_open+0x1c4/0x418 vfs_open+0x38/0x48 path_openat+0x28c/0xf18 do_filp_open+0x70/0xe8 do_sys_open+0x154/0x248 Sometimes it crashes at exports_net_open() and sometimes at cache_seq_next_rcu(). The same happens on the latest 6.14 kernel as well: [ 0.000000] Linux version 6.14.0-rc5-next-20250304-dirty ... [ 285.455918] Unable to handle kernel paging request at virtual address 00001f4800001f48 ... [ 285.464902] pc : cache_seq_next_rcu+0x78/0xa4 ... 
[ 285.469695] Call trace: [ 285.470083] cache_seq_next_rcu+0x78/0xa4 (P) [ 285.470488] seq_read+0xe0/0x11c [ 285.470675] proc_reg_read+0x9c/0xf0 [ 285.470874] vfs_read+0xc4/0x2fc [ 285.471057] ksys_read+0x6c/0xf4 [ 285.471231] __arm64_sys_read+0x1c/0x28 [ 285.471428] invoke_syscall+0x44/0x100 [ 285.471633] el0_svc_common.constprop.0+0x40/0xe0 [ 285.471870] do_el0_svc_compat+0x1c/0x34 [ 285.472073] el0_svc_compat+0x2c/0x80 [ 285.472265] el0t_32_sync_handler+0x90/0x140 [ 285.472473] el0t_32_sync+0x19c/0x1a0 [ 285.472887] Code: f9400885 93407c23 937d7c27 11000421 (f86378a3) [ 285.473422] ---[ end trace 0000000000000000 ]--- It reproduced simply with below script: while [ 1 ] do /exportfs -r done & while [ 1 ] do insmod /nfsd.ko mount -t nfsd none /proc/fs/nfsd umount /proc/fs/nfsd rmmod nfsd done & So exporting interfaces to user space shall be done at last and cleanup at first place. With change there is no Kernel OOPs. Co-developed-by: Shubham Rana Signed-off-by: Shubham Rana Signed-off-by: Maninder Singh Reviewed-by: Jeff Layton Cc: stable@vger.kernel.org Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index d773481bcf10..f9763ced743d 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -2291,12 +2291,9 @@ static int __init init_nfsd(void) if (retval) goto out_free_pnfs; nfsd_lockd_init(); /* lockd->nfsd callbacks */ - retval = create_proc_exports_entry(); - if (retval) - goto out_free_lockd; retval = register_pernet_subsys(&nfsd_net_ops); if (retval < 0) - goto out_free_exports; + goto out_free_lockd; retval = register_cld_notifier(); if (retval) goto out_free_subsys; @@ -2307,12 +2304,17 @@ static int __init init_nfsd(void) if (retval) goto out_free_nfsd4; retval = genl_register_family(&nfsd_nl_family); + if (retval) + goto out_free_filesystem; + retval = create_proc_exports_entry(); if (retval) goto out_free_all; nfsd_localio_ops_init(); 
return 0; out_free_all: + genl_unregister_family(&nfsd_nl_family); +out_free_filesystem: unregister_filesystem(&nfsd_fs_type); out_free_nfsd4: nfsd4_destroy_laundry_wq(); @@ -2320,9 +2322,6 @@ out_free_cld: unregister_cld_notifier(); out_free_subsys: unregister_pernet_subsys(&nfsd_net_ops); -out_free_exports: - remove_proc_entry("fs/nfs/exports", NULL); - remove_proc_entry("fs/nfs", NULL); out_free_lockd: nfsd_lockd_shutdown(); nfsd_drc_slab_free(); @@ -2335,14 +2334,14 @@ out_free_slabs: static void __exit exit_nfsd(void) { + remove_proc_entry("fs/nfs/exports", NULL); + remove_proc_entry("fs/nfs", NULL); genl_unregister_family(&nfsd_nl_family); unregister_filesystem(&nfsd_fs_type); nfsd4_destroy_laundry_wq(); unregister_cld_notifier(); unregister_pernet_subsys(&nfsd_net_ops); nfsd_drc_slab_free(); - remove_proc_entry("fs/nfs/exports", NULL); - remove_proc_entry("fs/nfs", NULL); nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); -- 2.50.1 From 9fe5ea760e64f04412dbed51645a0dac7220d40a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 8 Mar 2025 15:14:37 -0500 Subject: [PATCH 04/16] NFSD: Add /sys/kernel/debug/nfsd Create a small sandbox under /sys/kernel/debug for experimental NFS server feature settings. There is no API/ABI compatibility guarantee for these settings. The only documentation for such settings, if any documentation exists, is in the kernel source code. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/Makefile | 1 + fs/nfsd/debugfs.c | 18 ++++++++++++++++++ fs/nfsd/nfsctl.c | 4 ++++ fs/nfsd/nfsd.h | 8 ++++++++ 4 files changed, 31 insertions(+) create mode 100644 fs/nfsd/debugfs.c diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 2f687619f65b..55744bb786c9 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -24,6 +24,7 @@ nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o nfsd-$(CONFIG_NFS_LOCALIO) += localio.o +nfsd-$(CONFIG_DEBUG_FS) += debugfs.o .PHONY: xdrgen diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c new file mode 100644 index 000000000000..e913268d9c2d --- /dev/null +++ b/fs/nfsd/debugfs.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include "nfsd.h" + +static struct dentry *nfsd_top_dir __read_mostly; + +void nfsd_debugfs_exit(void) +{ + debugfs_remove_recursive(nfsd_top_dir); + nfsd_top_dir = NULL; +} + +void nfsd_debugfs_init(void) +{ + nfsd_top_dir = debugfs_create_dir("nfsd", NULL); +} diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index f9763ced743d..3f3e9f6c4250 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -2281,6 +2281,8 @@ static int __init init_nfsd(void) { int retval; + nfsd_debugfs_init(); + retval = nfsd4_init_slabs(); if (retval) return retval; @@ -2329,6 +2331,7 @@ out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); + nfsd_debugfs_exit(); return retval; } @@ -2345,6 +2348,7 @@ static void __exit exit_nfsd(void) nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); + nfsd_debugfs_exit(); } MODULE_AUTHOR("Olaf Kirch "); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index e2997f0ffbc5..8a53ddab5df0 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -156,6 +156,14 @@ void nfsd_reset_versions(struct nfsd_net *nn); int nfsd_create_serv(struct net *net); 
void nfsd_destroy_serv(struct net *net); +#ifdef CONFIG_DEBUG_FS +void nfsd_debugfs_init(void); +void nfsd_debugfs_exit(void); +#else +static inline void nfsd_debugfs_init(void) {} +static inline void nfsd_debugfs_exit(void) {} +#endif + extern int nfsd_max_blksize; static inline int nfsd_v4client(struct svc_rqst *rq) -- 2.50.1 From 1218149037ee80f88aee4fa4c34b1de23df77fb1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 8 Mar 2025 15:14:38 -0500 Subject: [PATCH 05/16] NFSD: Add experimental setting to disable the use of splice read NFSD currently has two separate code paths for handling read requests. One uses page splicing; the other is a traditional read based on an iov iterator. Because most Linux file systems support splice read, the latter does not get nearly the same test experience as splice reads. To force the use of vectored reads for testing and benchmarking, introduce the ability to disable splice reads for all NFS READ operations. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/debugfs.c | 29 +++++++++++++++++++++++++++++ fs/nfsd/nfsd.h | 2 ++ fs/nfsd/vfs.c | 4 ++++ 3 files changed, 35 insertions(+) diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c index e913268d9c2d..84b0c8b559dc 100644 --- a/fs/nfsd/debugfs.c +++ b/fs/nfsd/debugfs.c @@ -6,6 +6,32 @@ static struct dentry *nfsd_top_dir __read_mostly; +/* + * /sys/kernel/debug/nfsd/disable-splice-read + * + * Contents: + * %0: NFS READ is allowed to use page splicing + * %1: NFS READ uses only iov iter read + * + * The default value of this setting is zero (page splicing is + * allowed). This setting takes immediate effect for all NFS + * versions, all exports, and in all NFSD net namespaces. + */ + +static int nfsd_dsr_get(void *data, u64 *val) +{ + *val = nfsd_disable_splice_read ? 1 : 0; + return 0; +} + +static int nfsd_dsr_set(void *data, u64 val) +{ + nfsd_disable_splice_read = (val > 0) ? 
true : false; + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n"); + void nfsd_debugfs_exit(void) { debugfs_remove_recursive(nfsd_top_dir); @@ -15,4 +41,7 @@ void nfsd_debugfs_exit(void) void nfsd_debugfs_init(void) { nfsd_top_dir = debugfs_create_dir("nfsd", NULL); + + debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO, + nfsd_top_dir, NULL, &nfsd_dsr_fops); } diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 8a53ddab5df0..232aee06223d 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -164,6 +164,8 @@ static inline void nfsd_debugfs_init(void) {} static inline void nfsd_debugfs_exit(void) {} #endif +extern bool nfsd_disable_splice_read __read_mostly; + extern int nfsd_max_blksize; static inline int nfsd_v4client(struct svc_rqst *rq) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9abdc4b75813..02827b0f0492 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -47,6 +47,8 @@ #define NFSDDBG_FACILITY NFSDDBG_FILEOP +bool nfsd_disable_splice_read __read_mostly; + /** * nfserrno - Map Linux errnos to NFS errnos * @errno: POSIX(-ish) error code to be mapped @@ -1236,6 +1238,8 @@ out_nfserr: */ bool nfsd_read_splice_ok(struct svc_rqst *rqstp) { + if (nfsd_disable_splice_read) + return false; switch (svc_auth_flavor(rqstp)) { case RPC_AUTH_GSS_KRB5I: case RPC_AUTH_GSS_KRB5P: -- 2.50.1 From c447d2ac987bb5e155ed817a61db29978e684339 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 2 Apr 2025 22:06:19 +0800 Subject: [PATCH 06/16] nfsd: remove redundant WARN_ON_ONCE in nfsd4_write It can be removed since svc_fill_write_vector already has the same WARN_ON_ONCE. 
Signed-off-by: Guoqing Jiang Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index c20f1abcb94f..f5a06912ff8c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1228,7 +1228,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, write->wr_how_written = write->wr_stable_how; nvecs = svc_fill_write_vector(rqstp, &write->wr_payload); - WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf, write->wr_offset, rqstp->rq_vec, nvecs, &cnt, -- 2.50.1 From 0813c5f01249dbc32ccbc68d27a24fde5bf2901c Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 21 Mar 2025 20:13:04 -0400 Subject: [PATCH 07/16] nfsd: fix access checking for NLM under XPRTSEC policies When an export policy with xprtsec policy is set with "tls" and/or "mtls", but an NFS client is doing a v3 xprtsec=tls mount, then NLM locking calls fail with an error because there is currently no support for NLM with TLS. Until such support is added, allow NLM calls under TLS-secured policy. 
Fixes: 4cc9b9f2bf4d ("nfsd: refine and rename NFSD_MAY_LOCK") Cc: stable@vger.kernel.org Signed-off-by: Olga Kornievskaia Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/export.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 0363720280d4..88ae410b4113 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1124,7 +1124,8 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp, test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) goto ok; } - goto denied; + if (!may_bypass_gss) + goto denied; ok: /* legacy gss-only clients are always OK: */ -- 2.50.1 From 1244f0b2c3cecd3f349a877006e67c9492b41807 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 28 Mar 2025 11:05:59 +1100 Subject: [PATCH 08/16] nfsd: nfsd4_spo_must_allow() must check this is a v4 compound request If the request being processed is not a v4 compound request, then examining the cstate can have undefined results. This patch adds a check that the rpc procedure being executed (rq_procinfo) is the NFSPROC4_COMPOUND procedure. 
Reported-by: Olga Kornievskaia Cc: stable@vger.kernel.org Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index f5a06912ff8c..77895e099673 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -3777,7 +3777,8 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow; u32 opiter; - if (!cstate->minorversion) + if (rqstp->rq_procinfo != &nfsd_version4.vs_proc[NFSPROC4_COMPOUND] || + cstate->minorversion == 0) return false; if (cstate->spo_must_allowed) -- 2.50.1 From 8c4aae5582cf7901655988809ad94a6f6f2bce63 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 9 Apr 2025 10:32:23 -0400 Subject: [PATCH 09/16] nfsd: add commit start/done tracepoints around nfsd_commit() Very useful for gauging how long the vfs_fsync_range() takes. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/trace.h | 2 ++ fs/nfsd/vfs.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index a7630e9f6577..0d49fc064f72 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -451,6 +451,8 @@ DEFINE_NFSD_IO_EVENT(write_start); DEFINE_NFSD_IO_EVENT(write_opened); DEFINE_NFSD_IO_EVENT(write_io_done); DEFINE_NFSD_IO_EVENT(write_done); +DEFINE_NFSD_IO_EVENT(commit_start); +DEFINE_NFSD_IO_EVENT(commit_done); DECLARE_EVENT_CLASS(nfsd_err_class, TP_PROTO(struct svc_rqst *rqstp, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 02827b0f0492..68f7d0094b06 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1343,6 +1343,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, loff_t start, end; struct nfsd_net *nn; + trace_nfsd_commit_start(rqstp, fhp, offset, count); + /* * Convert the client-provided (offset, count) range to a * (start, end) range. 
If the client-provided range falls @@ -1381,6 +1383,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, } else nfsd_copy_write_verifier(verf, nn); + trace_nfsd_commit_done(rqstp, fhp, offset, count); return err; } -- 2.50.1 From 18c64378ad85ef00e70f196793ee8901a8aa2fa1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Apr 2025 10:22:14 -0400 Subject: [PATCH 10/16] sunrpc: add info about xprt queue times to svc_xprt_dequeue tracepoint I've been looking at a problem where we see increased RPC timeouts in clients when the nfs_layout_flexfiles dataserver_timeo value is tuned very low (6s). This is necessary to ensure quick failover to a different mirror if a server goes down, but it causes a lot more major RPC timeouts. Ultimately, the problem is server-side, however. It sometimes doesn't respond to connection attempts. My theory is that the interrupt handler runs when a connection comes in, the xprt ends up being enqueued, but it takes a significant amount of time for the nfsd thread to pick it up. Currently, the svc_xprt_dequeue tracepoint displays "wakeup-us". This is the time between the wake_up() call, and the thread dequeueing the xprt. If no thread was woken, or the thread ended up picking up a different xprt than intended, then this value won't tell us how long the xprt was waiting. Add a new xpt_qtime field to struct svc_xprt and set it in svc_xprt_enqueue(). When the dequeue tracepoint fires, also store the time that the xprt sat on the queue in total. Display it as "qtime-us". 
Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_xprt.h | 1 + include/trace/events/sunrpc.h | 13 +++++++------ net/sunrpc/svc_xprt.c | 1 + 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 72be60952579..369a89aea186 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -53,6 +53,7 @@ struct svc_xprt { struct svc_xprt_class *xpt_class; const struct svc_xprt_ops *xpt_ops; struct kref xpt_ref; + ktime_t xpt_qtime; struct list_head xpt_list; struct lwq_node xpt_ready; unsigned long xpt_flags; diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 5d331383047b..67db3f2953d5 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -2040,19 +2040,20 @@ TRACE_EVENT(svc_xprt_dequeue, TP_STRUCT__entry( SVC_XPRT_ENDPOINT_FIELDS(rqst->rq_xprt) - __field(unsigned long, wakeup) + __field(unsigned long, qtime) ), TP_fast_assign( - SVC_XPRT_ENDPOINT_ASSIGNMENTS(rqst->rq_xprt); + ktime_t ktime = ktime_get(); - __entry->wakeup = ktime_to_us(ktime_sub(ktime_get(), - rqst->rq_qtime)); + SVC_XPRT_ENDPOINT_ASSIGNMENTS(rqst->rq_xprt); + __entry->wakeup = ktime_to_us(ktime_sub(ktime, rqst->rq_qtime)); + __entry->qtime = ktime_to_us(ktime_sub(ktime, rqst->rq_xprt->xpt_qtime)); ), - TP_printk(SVC_XPRT_ENDPOINT_FORMAT " wakeup-us=%lu", - SVC_XPRT_ENDPOINT_VARARGS, __entry->wakeup) + TP_printk(SVC_XPRT_ENDPOINT_FORMAT " wakeup-us=%lu qtime-us=%lu", + SVC_XPRT_ENDPOINT_VARARGS, __entry->wakeup, __entry->qtime) ); DECLARE_EVENT_CLASS(svc_xprt_event, diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index ae25405d8bd2..32018557797b 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -488,6 +488,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) pool = svc_pool_for_cpu(xprt->xpt_server); percpu_counter_inc(&pool->sp_sockets_queued); + xprt->xpt_qtime = ktime_get(); 
lwq_enqueue(&xprt->xpt_ready, &pool->sp_xprts); svc_pool_wake_idle_thread(pool); -- 2.50.1 From b099ee28f9b026b18c229488474df104be0758c2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 10 Apr 2025 10:10:12 -0400 Subject: [PATCH 11/16] MAINTAINERS: Update Neil Brown's email address Neil is planning retirement, and has asked me to replace his Suse email address with his personal email address. Both addresses currently route to the same mailbox. Signed-off-by: Chuck Lever --- .mailmap | 2 ++ MAINTAINERS | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 1c70e51c789d..437072d30175 100644 --- a/.mailmap +++ b/.mailmap @@ -543,6 +543,8 @@ Naveen N Rao Neeraj Upadhyay Neeraj Upadhyay Neil Armstrong +NeilBrown +NeilBrown Nguyen Anh Quynh Nicholas Piggin Nicholas Piggin diff --git a/MAINTAINERS b/MAINTAINERS index f21f1dabb5fe..9631c3f57788 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12908,7 +12908,7 @@ W: http://kernelnewbies.org/KernelJanitors KERNEL NFSD, SUNRPC, AND LOCKD SERVERS M: Chuck Lever M: Jeff Layton -R: Neil Brown +R: NeilBrown R: Olga Kornievskaia R: Dai Ngo R: Tom Talpey -- 2.50.1 From b31da62889e6d610114d81dc7a6edbcaa503fcf8 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Mon, 14 Apr 2025 22:38:52 +0800 Subject: [PATCH 12/16] nfsd: Initialize ssc before laundromat_work to prevent NULL dereference In nfs4_state_start_net(), laundromat_work may access nfsd_ssc through nfs4_laundromat -> nfsd4_ssc_expire_umount. If nfsd_ssc isn't initialized, this can cause NULL pointer dereference. Normally the delayed start of laundromat_work allows sufficient time for nfsd_ssc initialization to complete. However, when the kernel waits too long for userspace responses (e.g. in nfs4_state_start_net -> nfsd4_end_grace -> nfsd4_record_grace_done -> nfsd4_cld_grace_done -> cld_pipe_upcall -> __cld_pipe_upcall -> wait_for_completion path), the delayed work may start before nfsd_ssc initialization finishes. 
Fix this by moving nfsd_ssc initialization before starting laundromat_work. Fixes: f4e44b393389 ("NFSD: delay unmount source's export after inter-server copy completed.") Cc: stable@vger.kernel.org Reviewed-by: Jeff Layton Signed-off-by: Li Lingfeng Signed-off-by: Chuck Lever --- fs/nfsd/nfssvc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9b3d6cff0e1e..8ed143ef8b41 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -396,13 +396,13 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred) if (ret) goto out_filecache; +#ifdef CONFIG_NFSD_V4_2_INTER_SSC + nfsd4_ssc_init_umount_work(nn); +#endif ret = nfs4_state_start_net(net); if (ret) goto out_reply_cache; -#ifdef CONFIG_NFSD_V4_2_INTER_SSC - nfsd4_ssc_init_umount_work(nn); -#endif nn->nfsd_net_up = true; return 0; -- 2.50.1 From de08ffb79c8f404adc611ddb580bc74133b2c986 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 17 Apr 2025 14:54:36 -0400 Subject: [PATCH 13/16] sunrpc: allow SOMAXCONN backlogged TCP connections The connection backlog passed to listen() denotes the number of connections that are fully established, but that have not yet been accept()ed. If the amount goes above that level, new connection requests will be dropped on the floor until the value goes down. If all the knfsd threads are bogged down in (e.g.) disk I/O, new connection attempts can stall because of this. For the same rationale that Trond points out in the userland patch [1], ensure that svc_xprt sockets created by the kernel allow SOMAXCONN (4096) backlogged connections instead of the 64 that they do today. 
[1]: https://lore.kernel.org/linux-nfs/20240308180223.2965601-1-trond.myklebust@hammerspace.com/ Cc: Trond Myklebust Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/svcsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 72e5a01df3d3..60f2883268fa 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1542,7 +1542,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, if (protocol == IPPROTO_TCP) { sk_net_refcnt_upgrade(sock->sk); - if ((error = kernel_listen(sock, 64)) < 0) + if ((error = kernel_listen(sock, SOMAXCONN)) < 0) goto bummer; } -- 2.50.1 From 8ac6fcae5dc0e801f1c82a83f5ae2c0a4db19932 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 27 Apr 2025 12:39:59 -0400 Subject: [PATCH 14/16] svcrdma: Unregister the device if svc_rdma_accept() fails To handle device removal, svc_rdma_accept() requests removal notification for the underlying device when accepting a connection. However svc_rdma_free() is not invoked if svc_rdma_accept() fails. There needs to be a matching "unregister" in that case; otherwise the device cannot be removed. 
Fixes: c4de97f7c454 ("svcrdma: Handle device removal outside of the CM event handler") Cc: stable@vger.kernel.org Reviewed-by: Zhu Yanjun Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index aca8bdf65d72..5940a56023d1 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -575,6 +575,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) ib_destroy_qp(newxprt->sc_qp); rdma_destroy_id(newxprt->sc_cm_id); + rpcrdma_rn_unregister(dev, &newxprt->sc_rn); /* This call to put will destroy the transport */ svc_xprt_put(&newxprt->sc_xprt); return NULL; -- 2.50.1 From c2c90a8b2620626cdb3eb315c2fafcc19fe24ee6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Apr 2025 12:36:58 -0700 Subject: [PATCH 15/16] nfsd: use SHA-256 library API instead of crypto_shash API This user of SHA-256 does not support any other algorithm, so the crypto_shash abstraction provides no value. Just use the SHA-256 library API instead, which is much simpler and easier to use. 
Signed-off-by: Eric Biggers Reviewed-by: Scott Mayhew Signed-off-by: Chuck Lever --- fs/nfsd/Kconfig | 2 +- fs/nfsd/nfs4recover.c | 61 +++++++++---------------------------------- 2 files changed, 14 insertions(+), 49 deletions(-) diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 731a88f6313e..879e0b104d1c 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -77,8 +77,8 @@ config NFSD_V4 select FS_POSIX_ACL select RPCSEC_GSS_KRB5 select CRYPTO + select CRYPTO_LIB_SHA256 select CRYPTO_MD5 - select CRYPTO_SHA256 select GRACE_PERIOD select NFS_V4_2_SSC_HELPER if NFS_V4_2 help diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index c1d9bd07285f..a79823020062 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -33,6 +33,7 @@ */ #include +#include #include #include #include @@ -736,7 +737,6 @@ struct cld_net { spinlock_t cn_lock; struct list_head cn_list; unsigned int cn_xid; - struct crypto_shash *cn_tfm; #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING bool cn_has_legacy; #endif @@ -1062,8 +1062,6 @@ nfsd4_remove_cld_pipe(struct net *net) nfsd4_cld_unregister_net(net, cn->cn_pipe); rpc_destroy_pipe_data(cn->cn_pipe); - if (cn->cn_tfm) - crypto_free_shash(cn->cn_tfm); kfree(nn->cld_net); nn->cld_net = NULL; } @@ -1157,8 +1155,6 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; struct cld_msg_v2 *cmsg; - struct crypto_shash *tfm = cn->cn_tfm; - struct xdr_netobj cksum; char *principal = NULL; /* Don't upcall if it's already stored */ @@ -1181,22 +1177,9 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) else if (clp->cl_cred.cr_principal) principal = clp->cl_cred.cr_principal; if (principal) { - cksum.len = crypto_shash_digestsize(tfm); - cksum.data = kmalloc(cksum.len, GFP_KERNEL); - if (cksum.data == NULL) { - ret = -ENOMEM; - goto out; - } - ret = crypto_shash_tfm_digest(tfm, principal, strlen(principal), - cksum.data); - if (ret) { - kfree(cksum.data); - goto 
out; - } - cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len; - memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data, - cksum.data, cksum.len); - kfree(cksum.data); + sha256(principal, strlen(principal), + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data); + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = SHA256_DIGEST_SIZE; } else cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0; @@ -1206,7 +1189,6 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } -out: free_cld_upcall(cup); out_err: if (ret) @@ -1345,12 +1327,11 @@ found: static int nfsd4_cld_check_v2(struct nfs4_client *clp) { - struct nfs4_client_reclaim *crp; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING struct cld_net *cn = nn->cld_net; - int status; - struct crypto_shash *tfm = cn->cn_tfm; - struct xdr_netobj cksum; +#endif + struct nfs4_client_reclaim *crp; char *principal = NULL; /* did we already find that this client is stable? */ @@ -1366,6 +1347,7 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) if (cn->cn_has_legacy) { struct xdr_netobj name; char dname[HEXDIR_LEN]; + int status; status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) @@ -1388,28 +1370,18 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) return -ENOENT; found: if (crp->cr_princhash.len) { + u8 digest[SHA256_DIGEST_SIZE]; + if (clp->cl_cred.cr_raw_principal) principal = clp->cl_cred.cr_raw_principal; else if (clp->cl_cred.cr_principal) principal = clp->cl_cred.cr_principal; if (principal == NULL) return -ENOENT; - cksum.len = crypto_shash_digestsize(tfm); - cksum.data = kmalloc(cksum.len, GFP_KERNEL); - if (cksum.data == NULL) - return -ENOENT; - status = crypto_shash_tfm_digest(tfm, principal, - strlen(principal), cksum.data); - if (status) { - kfree(cksum.data); + sha256(principal, strlen(principal), digest); + if (memcmp(crp->cr_princhash.data, digest, + crp->cr_princhash.len)) return -ENOENT; - } - if (memcmp(crp->cr_princhash.data, 
cksum.data, - crp->cr_princhash.len)) { - kfree(cksum.data); - return -ENOENT; - } - kfree(cksum.data); } crp->cr_clp = clp; return 0; @@ -1589,7 +1561,6 @@ nfsd4_cld_tracking_init(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); bool running; int retries = 10; - struct crypto_shash *tfm; status = nfs4_cld_state_init(net); if (status) @@ -1614,12 +1585,6 @@ nfsd4_cld_tracking_init(struct net *net) status = -ETIMEDOUT; goto err_remove; } - tfm = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(tfm)) { - status = PTR_ERR(tfm); - goto err_remove; - } - nn->cld_net->cn_tfm = tfm; status = nfsd4_cld_get_version(nn); if (status == -EOPNOTSUPP) -- 2.50.1 From d6ca7d2643eebe09cf46840bdc7d68b6e07aba77 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 7 May 2025 10:45:15 -0400 Subject: [PATCH 16/16] NFSD: Implement FATTR4_CLONE_BLKSIZE attribute RFC 7862 states that if an NFS server implements a CLONE operation, it MUST also implement FATTR4_CLONE_BLKSIZE. NFSD implements CLONE, but does not implement FATTR4_CLONE_BLKSIZE. Note that in Section 12.2, RFC 7862 claims that FATTR4_CLONE_BLKSIZE is RECOMMENDED, not REQUIRED. Likely this is because a minor version is not permitted to add a REQUIRED attribute. Confusing. We assume this attribute reports a block size as a count of bytes, as RFC 7862 does not specify a unit. Reported-by: Roland Mainz Suggested-by: Christoph Hellwig Reviewed-by: Roland Mainz Cc: stable@vger.kernel.org # v6.7+ Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index e67420729ecd..9eb8e5704622 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3391,6 +3391,23 @@ static __be32 nfsd4_encode_fattr4_suppattr_exclcreat(struct xdr_stream *xdr, return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]); } +/* + * Copied from generic_remap_checks/generic_remap_file_range_prep. 
+ * + * These generic functions use the file system's s_blocksize, but + * individual file systems aren't required to use + * generic_remap_file_range_prep. Until there is a mechanism for + * determining a particular file system's (or file's) clone block + * size, this is the best NFSD can do. + */ +static __be32 nfsd4_encode_fattr4_clone_blksize(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct inode *inode = d_inode(args->dentry); + + return nfsd4_encode_uint32_t(xdr, inode->i_sb->s_blocksize); +} + #ifdef CONFIG_NFSD_V4_SECURITY_LABEL static __be32 nfsd4_encode_fattr4_sec_label(struct xdr_stream *xdr, const struct nfsd4_fattr_args *args) @@ -3545,7 +3562,7 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = { [FATTR4_MODE_SET_MASKED] = nfsd4_encode_fattr4__noop, [FATTR4_SUPPATTR_EXCLCREAT] = nfsd4_encode_fattr4_suppattr_exclcreat, [FATTR4_FS_CHARSET_CAP] = nfsd4_encode_fattr4__noop, - [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4__noop, + [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4_clone_blksize, [FATTR4_SPACE_FREED] = nfsd4_encode_fattr4__noop, [FATTR4_CHANGE_ATTR_TYPE] = nfsd4_encode_fattr4__noop, -- 2.50.1