From 99bcd91fabada0dbb1d5f0de44532d8008db93c6 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 8 May 2025 16:44:52 +0300 Subject: [PATCH 01/16] perf/x86/intel: Fix segfault with PEBS-via-PT with sample_freq Currently, using PEBS-via-PT with a sample frequency instead of a sample period, causes a segfault. For example: BUG: kernel NULL pointer dereference, address: 0000000000000195 ? __die_body.cold+0x19/0x27 ? page_fault_oops+0xca/0x290 ? exc_page_fault+0x7e/0x1b0 ? asm_exc_page_fault+0x26/0x30 ? intel_pmu_pebs_event_update_no_drain+0x40/0x60 ? intel_pmu_pebs_event_update_no_drain+0x32/0x60 intel_pmu_drain_pebs_icl+0x333/0x350 handle_pmi_common+0x272/0x3c0 intel_pmu_handle_irq+0x10a/0x2e0 perf_event_nmi_handler+0x2a/0x50 That happens because intel_pmu_pebs_event_update_no_drain() assumes all the pebs_enabled bits represent counter indexes, which is not always the case. In this particular case, bits 60 and 61 are set for PEBS-via-PT purposes. The behaviour of PEBS-via-PT with sample frequency is questionable because although a PMI is generated (PEBS_PMI_AFTER_EACH_RECORD), the period is not adjusted anyway. Putting that aside, fix intel_pmu_pebs_event_update_no_drain() by passing the mask of counter bits instead of 'size'. Note, prior to the Fixes commit, 'size' would be limited to the maximum counter index, so the issue was not hit. Fixes: 722e42e45c2f1 ("perf/x86: Support counter mask") Signed-off-by: Adrian Hunter Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Ian Rogers Cc: linux-perf-users@vger.kernel.org Link: https://lore.kernel.org/r/20250508134452.73960-1-adrian.hunter@intel.com --- arch/x86/events/intel/ds.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 9b20acc0e932..8d86e91bd5e5 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2465,8 +2465,9 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_ setup_pebs_fixed_sample_data); } -static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size) +static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, u64 mask) { + u64 pebs_enabled = cpuc->pebs_enabled & mask; struct perf_event *event; int bit; @@ -2477,7 +2478,7 @@ static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int * It needs to call intel_pmu_save_and_restart_reload() to * update the event->count for this case. */ - for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, size) { + for_each_set_bit(bit, (unsigned long *)&pebs_enabled, X86_PMC_IDX_MAX) { event = cpuc->events[bit]; if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) intel_pmu_save_and_restart_reload(event, 0); @@ -2512,7 +2513,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d } if (unlikely(base >= top)) { - intel_pmu_pebs_event_update_no_drain(cpuc, size); + intel_pmu_pebs_event_update_no_drain(cpuc, mask); return; } @@ -2626,7 +2627,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d (hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED); if (unlikely(base >= top)) { - intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX); + intel_pmu_pebs_event_update_no_drain(cpuc, mask); return; } -- 2.51.0 From dd24f87f65c957f30e605e44961d2fd53a44c780 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 May 2025 00:26:01 +0800 Subject: [PATCH 02/16] ublk: fix dead loop when canceling io command Commit: f40139fde527 ("ublk: fix race between io_uring_cmd_complete_in_task and ublk_cancel_cmd") adds a request state check in ublk_cancel_cmd(), and if the request is started, skips canceling this uring_cmd. However, the current uring_cmd may be in ACTIVE state, without block request coming to the uring command. Meantime, if the cached request in tag_set.tags[tag] has been delivered to ublk server and reycycled, then this uring_cmd can't be canceled. ublk requests are aborted in ublk char device release handler, which depends on canceling all ACTIVE uring_cmd. So it causes a dead loop. Fix this issue by not taking a stale request into account when canceling uring_cmd in ublk_cancel_cmd(). Reported-by: Shinichiro Kawasaki Closes: https://lore.kernel.org/linux-block/mruqwpf4tqenkbtgezv5oxwq7ngyq24jzeyqy4ixzvivatbbxv@4oh2wzz4e6qn/ Fixes: f40139fde527 ("ublk: fix race between io_uring_cmd_complete_in_task and ublk_cancel_cmd") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250515162601.77346-1-ming.lei@redhat.com [axboe: rewording of commit message] Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index f9032076bc06..dc104c025cd5 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1708,7 +1708,7 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, * that ublk_dispatch_req() is always called */ req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); - if (req && blk_mq_request_started(req)) + if (req && blk_mq_request_started(req) && req->tag == tag) return; spin_lock(&ubq->cancel_lock); -- 2.51.0 From 03680913744de17fa49e62b1d8f71bab42b0b721 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 16 May 2025 11:08:10 +0200 Subject: [PATCH 03/16] x86/mm: Remove duplicated word in warning message Commit bbeb69ce3013 ("x86/mm: Remove CONFIG_HIGHMEM64G support") introduces a new warning message MSG_HIGHMEM_TRIMMED, which accidentally introduces a duplicated 'for for' in the warning message. Remove this duplicated word. This was noticed while reviewing for references to obsolete kernel build config options. Fixes: bbeb69ce3013 ("x86/mm: Remove CONFIG_HIGHMEM64G support") Signed-off-by: Lukas Bulwahn Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dave Hansen Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: kernel-janitors@vger.kernel.org Link: https://lore.kernel.org/r/20250516090810.556623-1-lukas.bulwahn@redhat.com --- arch/x86/mm/init_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ad662cc4605c..148eba50265a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -565,7 +565,7 @@ static void __init lowmem_pfn_init(void) "only %luMB highmem pages available, ignoring highmem size of %luMB!\n" #define MSG_HIGHMEM_TRIMMED \ - "Warning: only 4GB will be used. Support for for CONFIG_HIGHMEM64G was removed!\n" + "Warning: only 4GB will be used. Support for CONFIG_HIGHMEM64G was removed!\n" /* * We have more RAM than fits into lowmem - we try to put it into * highmem, also taking the highmem=x boot parameter into account: -- 2.51.0 From 6d6d7f91cc8c111d40416ac9240a3bb9396c5235 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 10 May 2025 10:50:13 -0400 Subject: [PATCH 04/16] NFSv4/pnfs: Reset the layout state after a layoutreturn If there are still layout segments in the layout plh_return_lsegs list after a layout return, we should be resetting the state to ensure they eventually get returned as well. Fixes: 68f744797edd ("pNFS: Do not free layout segments that are marked for return") Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 10fdd065a61c..fc7c5fb10198 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -745,6 +745,14 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return remaining; } +static void pnfs_reset_return_info(struct pnfs_layout_hdr *lo) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &lo->plh_return_segs, pls_list) + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); +} + static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, struct list_head *free_me, @@ -1292,6 +1300,7 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); pnfs_free_returned_lsegs(lo, &freeme, range, seq); pnfs_set_layout_stateid(lo, stateid, NULL, true); + pnfs_reset_return_info(lo); } else pnfs_mark_layout_stateid_invalid(lo, &freeme); out_unlock: -- 2.51.0 From 28511504f3ac73ebf45cbbe0dafeca1026e9a8f3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 10 May 2025 11:05:36 -0400 Subject: [PATCH 05/16] NFS/pnfs: Fix the error path in pnfs_layoutreturn_retry_later_locked() If there isn't a valid layout, or the layout stateid has changed, the cleanup after a layout return should clear out the old data. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index fc7c5fb10198..3adb7d0dbec7 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1254,21 +1254,15 @@ static void pnfs_clear_layoutcommit(struct inode *inode, static void pnfs_layoutreturn_retry_later_locked(struct pnfs_layout_hdr *lo, const nfs4_stateid *arg_stateid, - const struct pnfs_layout_range *range) + const struct pnfs_layout_range *range, + struct list_head *freeme) { - const struct pnfs_layout_segment *lseg; - u32 seq = be32_to_cpu(arg_stateid->seqid); - if (pnfs_layout_is_valid(lo) && - nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) { - list_for_each_entry(lseg, &lo->plh_return_segs, pls_list) { - if (pnfs_seqid_is_newer(lseg->pls_seq, seq) || - !pnfs_should_free_range(&lseg->pls_range, range)) - continue; - pnfs_set_plh_return_info(lo, range->iomode, seq); - break; - } - } + nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) + pnfs_reset_return_info(lo); + else + pnfs_mark_layout_stateid_invalid(lo, freeme); + pnfs_clear_layoutreturn_waitbit(lo); } void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo, @@ -1276,11 +1270,12 @@ void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo, const struct pnfs_layout_range *range) { struct inode *inode = lo->plh_inode; + LIST_HEAD(freeme); spin_lock(&inode->i_lock); - pnfs_layoutreturn_retry_later_locked(lo, arg_stateid, range); - pnfs_clear_layoutreturn_waitbit(lo); + pnfs_layoutreturn_retry_later_locked(lo, arg_stateid, range, &freeme); spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&freeme); } void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, @@ -1716,6 +1711,7 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct inode *inode = args->inode; const nfs4_stateid *res_stateid = NULL; struct nfs4_xdr_opaque_data *ld_private = args->ld_private; + LIST_HEAD(freeme); switch (ret) { case -NFS4ERR_BADSESSION: @@ -1724,9 +1720,9 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args, case -NFS4ERR_NOMATCHING_LAYOUT: spin_lock(&inode->i_lock); pnfs_layoutreturn_retry_later_locked(lo, &args->stateid, - &args->range); - pnfs_clear_layoutreturn_waitbit(lo); + &args->range, &freeme); spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&freeme); break; case 0: if (res->lrs_present) -- 2.51.0 From dcd21b609d4abc7303f8683bce4f35d78d7d6830 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 27 Apr 2025 18:21:06 -0400 Subject: [PATCH 06/16] NFS: Avoid flushing data while holding directory locks in nfs_rename() The Linux client assumes that all filehandles are non-volatile for renames within the same directory (otherwise sillyrename cannot work). However, the existence of the Linux 'subtree_check' export option has meant that nfs_rename() has always assumed it needs to flush writes before attempting to rename. Since NFSv4 does allow the client to query whether or not the server exhibits this behaviour, and since knfsd does actually set the appropriate flag when 'subtree_check' is enabled on an export, it should be OK to optimise away the write flushing behaviour in the cases where it is clearly not needed. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton --- fs/nfs/client.c | 2 ++ fs/nfs/dir.c | 15 ++++++++++++++- include/linux/nfs_fs_sb.h | 12 +++++++++--- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 2115c1189c2d..6d63b958c4bb 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1105,6 +1105,8 @@ struct nfs_server *nfs_create_server(struct fs_context *fc) if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) server->namelen = NFS2_MAXNAMLEN; } + /* Linux 'subtree_check' borkenness mandates this setting */ + server->fh_expire_type = NFS_FH_VOL_RENAME; if (!(fattr->valid & NFS_ATTR_FATTR)) { error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh, diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index bd23fc736b39..d0e0b435a843 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2676,6 +2676,18 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) unblock_revalidate(new_dentry); } +static bool nfs_rename_is_unsafe_cross_dir(struct dentry *old_dentry, + struct dentry *new_dentry) +{ + struct nfs_server *server = NFS_SB(old_dentry->d_sb); + + if (old_dentry->d_parent != new_dentry->d_parent) + return false; + if (server->fh_expire_type & NFS_FH_RENAME_UNSAFE) + return !(server->fh_expire_type & NFS_FH_NOEXPIRE_WITH_OPEN); + return true; +} + /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a @@ -2763,7 +2775,8 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, } - if (S_ISREG(old_inode->i_mode)) + if (S_ISREG(old_inode->i_mode) && + nfs_rename_is_unsafe_cross_dir(old_dentry, new_dentry)) nfs_sync_inode(old_inode); task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, must_unblock ? nfs_unblock_rename : NULL); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 71319637a84e..ee03f3cef30c 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -213,6 +213,15 @@ struct nfs_server { char *fscache_uniq; /* Uniquifier (or NULL) */ #endif + /* The following #defines numerically match the NFSv4 equivalents */ +#define NFS_FH_NOEXPIRE_WITH_OPEN (0x1) +#define NFS_FH_VOLATILE_ANY (0x2) +#define NFS_FH_VOL_MIGRATION (0x4) +#define NFS_FH_VOL_RENAME (0x8) +#define NFS_FH_RENAME_UNSAFE (NFS_FH_VOLATILE_ANY | NFS_FH_VOL_RENAME) + u32 fh_expire_type; /* V4 bitmask representing file + handle volatility type for + this filesystem */ u32 pnfs_blksize; /* layout_blksize attr */ #if IS_ENABLED(CONFIG_NFS_V4) u32 attr_bitmask[3];/* V4 bitmask representing the set @@ -236,9 +245,6 @@ struct nfs_server { u32 acl_bitmask; /* V4 bitmask representing the ACEs that are supported on this filesystem */ - u32 fh_expire_type; /* V4 bitmask representing file - handle volatility type for - this filesystem */ struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ struct rpc_wait_queue roc_rpcwaitq; void *pnfs_ld_data; /* per mount point data */ -- 2.51.0 From a5806cd506af5a7c19bcd596e4708b5c464bfd21 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 18 May 2025 13:57:29 -0700 Subject: [PATCH 07/16] Linux 6.15-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0eb9e6c49b32..a9edd0303653 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 15 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.51.0 From 08d6ee6d8a10aef958c2af16bb121070290ed589 Mon Sep 17 00:00:00 2001 From: Nikhil Jha Date: Wed, 19 Mar 2025 13:02:39 -0400 Subject: [PATCH 08/16] sunrpc: implement rfc2203 rpcsec_gss seqnum cache This implements a sequence number cache of the last three (right now hardcoded) sent sequence numbers for a given XID, as suggested by the RFC. From RFC2203 5.3.3.1: "Note that the sequence number algorithm requires that the client increment the sequence number even if it is retrying a request with the same RPC transaction identifier. It is not infrequent for clients to get into a situation where they send two or more attempts and a slow server sends the reply for the first attempt. With RPCSEC_GSS, each request and reply will have a unique sequence number. If the client wishes to improve turn around time on the RPC call, it can cache the RPCSEC_GSS sequence number of each request it sends. Then when it receives a response with a matching RPC transaction identifier, it can compute the checksum of each sequence number in the cache to try to match the checksum in the reply's verifier." Signed-off-by: Nikhil Jha Acked-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 17 +++++++++- include/trace/events/rpcgss.h | 4 +-- include/trace/events/sunrpc.h | 2 +- net/sunrpc/auth_gss/auth_gss.c | 59 +++++++++++++++++++++------------- net/sunrpc/xprt.c | 3 +- 5 files changed, 57 insertions(+), 28 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 81b952649d35..f46d1fb8f71a 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -30,6 +30,8 @@ #define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) #define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) +#define RPC_GSS_SEQNO_ARRAY_SIZE 3U + enum rpc_display_format_t { RPC_DISPLAY_ADDR = 0, RPC_DISPLAY_PORT, @@ -66,7 +68,8 @@ struct rpc_rqst { struct rpc_cred * rq_cred; /* Bound cred */ __be32 rq_xid; /* request XID */ int rq_cong; /* has incremented xprt->cong */ - u32 rq_seqno; /* gss seq no. used on req. */ + u32 rq_seqnos[RPC_GSS_SEQNO_ARRAY_SIZE]; /* past gss req seq nos. */ + unsigned int rq_seqno_count; /* number of entries in rq_seqnos */ int rq_enc_pages_num; struct page **rq_enc_pages; /* scratch pages for use by gss privacy code */ @@ -119,6 +122,18 @@ struct rpc_rqst { #define rq_svec rq_snd_buf.head #define rq_slen rq_snd_buf.len +static inline int xprt_rqst_add_seqno(struct rpc_rqst *req, u32 seqno) +{ + if (likely(req->rq_seqno_count < RPC_GSS_SEQNO_ARRAY_SIZE)) + req->rq_seqno_count++; + + /* Shift array to make room for the newest element at the beginning */ + memmove(&req->rq_seqnos[1], &req->rq_seqnos[0], + (RPC_GSS_SEQNO_ARRAY_SIZE - 1) * sizeof(req->rq_seqnos[0])); + req->rq_seqnos[0] = seqno; + return 0; +} + /* RPC transport layer security policies */ enum xprtsec_policies { RPC_XPRTSEC_NONE = 0, diff --git a/include/trace/events/rpcgss.h b/include/trace/events/rpcgss.h index b0b6300a0cab..8aeae06cf434 100644 --- a/include/trace/events/rpcgss.h +++ b/include/trace/events/rpcgss.h @@ -409,7 +409,7 @@ TRACE_EVENT(rpcgss_seqno, __entry->task_id = task->tk_pid; __entry->client_id = task->tk_client->cl_clid; __entry->xid = be32_to_cpu(rqst->rq_xid); - __entry->seqno = rqst->rq_seqno; + __entry->seqno = *rqst->rq_seqnos; ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER " xid=0x%08x seqno=%u", @@ -440,7 +440,7 @@ TRACE_EVENT(rpcgss_need_reencode, __entry->client_id = task->tk_client->cl_clid; __entry->xid = be32_to_cpu(task->tk_rqstp->rq_xid); __entry->seq_xmit = seq_xmit; - __entry->seqno = task->tk_rqstp->rq_seqno; + __entry->seqno = *task->tk_rqstp->rq_seqnos; __entry->ret = ret; ), diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 5d331383047b..751af7b024f9 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1100,7 +1100,7 @@ TRACE_EVENT(xprt_transmit, __entry->client_id = rqst->rq_task->tk_client ? rqst->rq_task->tk_client->cl_clid : -1; __entry->xid = be32_to_cpu(rqst->rq_xid); - __entry->seqno = rqst->rq_seqno; + __entry->seqno = *rqst->rq_seqnos; __entry->status = status; ), diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 369310909fc9..0fa244f16876 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1545,6 +1545,7 @@ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr) struct kvec iov; struct xdr_buf verf_buf; int status; + u32 seqno; /* Credential */ @@ -1556,15 +1557,16 @@ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr) cred_len = p++; spin_lock(&ctx->gc_seq_lock); - req->rq_seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ; + seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ; + xprt_rqst_add_seqno(req, seqno); spin_unlock(&ctx->gc_seq_lock); - if (req->rq_seqno == MAXSEQ) + if (*req->rq_seqnos == MAXSEQ) goto expired; trace_rpcgss_seqno(task); *p++ = cpu_to_be32(RPC_GSS_VERSION); *p++ = cpu_to_be32(ctx->gc_proc); - *p++ = cpu_to_be32(req->rq_seqno); + *p++ = cpu_to_be32(*req->rq_seqnos); *p++ = cpu_to_be32(gss_cred->gc_service); p = xdr_encode_netobj(p, &ctx->gc_wire_ctx); *cred_len = cpu_to_be32((p - (cred_len + 1)) << 2); @@ -1678,17 +1680,31 @@ gss_refresh_null(struct rpc_task *task) return 0; } +static u32 +gss_validate_seqno_mic(struct gss_cl_ctx *ctx, u32 seqno, __be32 *seq, __be32 *p, u32 len) +{ + struct kvec iov; + struct xdr_buf verf_buf; + struct xdr_netobj mic; + + *seq = cpu_to_be32(seqno); + iov.iov_base = seq; + iov.iov_len = 4; + xdr_buf_from_iov(&iov, &verf_buf); + mic.data = (u8 *)p; + mic.len = len; + return gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); +} + static int gss_validate(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); __be32 *p, *seq = NULL; - struct kvec iov; - struct xdr_buf verf_buf; - struct xdr_netobj mic; u32 len, maj_stat; int status; + int i = 1; /* don't recheck the first item */ p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) @@ -1705,13 +1721,10 @@ gss_validate(struct rpc_task *task, struct xdr_stream *xdr) seq = kmalloc(4, GFP_KERNEL); if (!seq) goto validate_failed; - *seq = cpu_to_be32(task->tk_rqstp->rq_seqno); - iov.iov_base = seq; - iov.iov_len = 4; - xdr_buf_from_iov(&iov, &verf_buf); - mic.data = (u8 *)p; - mic.len = len; - maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); + maj_stat = gss_validate_seqno_mic(ctx, task->tk_rqstp->rq_seqnos[0], seq, p, len); + /* RFC 2203 5.3.3.1 - compute the checksum of each sequence number in the cache */ + while (unlikely(maj_stat == GSS_S_BAD_SIG && i < task->tk_rqstp->rq_seqno_count)) + maj_stat = gss_validate_seqno_mic(ctx, task->tk_rqstp->rq_seqnos[i], seq, p, len); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat) @@ -1750,7 +1763,7 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, if (!p) goto wrap_failed; integ_len = p++; - *p = cpu_to_be32(rqstp->rq_seqno); + *p = cpu_to_be32(*rqstp->rq_seqnos); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; @@ -1847,7 +1860,7 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, if (!p) goto wrap_failed; opaque_len = p++; - *p = cpu_to_be32(rqstp->rq_seqno); + *p = cpu_to_be32(*rqstp->rq_seqnos); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; @@ -2001,7 +2014,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, offset = rcv_buf->len - xdr_stream_remaining(xdr); if (xdr_stream_decode_u32(xdr, &seqno)) goto unwrap_failed; - if (seqno != rqstp->rq_seqno) + if (seqno != *rqstp->rq_seqnos) goto bad_seqno; if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len)) goto unwrap_failed; @@ -2045,7 +2058,7 @@ unwrap_failed: trace_rpcgss_unwrap_failed(task); goto out; bad_seqno: - trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, seqno); + trace_rpcgss_bad_seqno(task, *rqstp->rq_seqnos, seqno); goto out; bad_mic: trace_rpcgss_verify_mic(task, maj_stat); @@ -2077,7 +2090,7 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred, if (maj_stat != GSS_S_COMPLETE) goto bad_unwrap; /* gss_unwrap decrypted the sequence number */ - if (be32_to_cpup(p++) != rqstp->rq_seqno) + if (be32_to_cpup(p++) != *rqstp->rq_seqnos) goto bad_seqno; /* gss_unwrap redacts the opaque blob from the head iovec. @@ -2093,7 +2106,7 @@ unwrap_failed: trace_rpcgss_unwrap_failed(task); return -EIO; bad_seqno: - trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(--p)); + trace_rpcgss_bad_seqno(task, *rqstp->rq_seqnos, be32_to_cpup(--p)); return -EIO; bad_unwrap: trace_rpcgss_unwrap(task, maj_stat); @@ -2118,14 +2131,14 @@ gss_xmit_need_reencode(struct rpc_task *task) if (!ctx) goto out; - if (gss_seq_is_newer(req->rq_seqno, READ_ONCE(ctx->gc_seq))) + if (gss_seq_is_newer(*req->rq_seqnos, READ_ONCE(ctx->gc_seq))) goto out_ctx; seq_xmit = READ_ONCE(ctx->gc_seq_xmit); - while (gss_seq_is_newer(req->rq_seqno, seq_xmit)) { + while (gss_seq_is_newer(*req->rq_seqnos, seq_xmit)) { u32 tmp = seq_xmit; - seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, req->rq_seqno); + seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, *req->rq_seqnos); if (seq_xmit == tmp) { ret = false; goto out_ctx; @@ -2134,7 +2147,7 @@ gss_xmit_need_reencode(struct rpc_task *task) win = ctx->gc_win; if (win > 0) - ret = !gss_seq_is_newer(req->rq_seqno, seq_xmit - win); + ret = !gss_seq_is_newer(*req->rq_seqnos, seq_xmit - win); out_ctx: gss_put_ctx(ctx); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 0eab15465511..d5e0cdcad9e0 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1365,7 +1365,7 @@ xprt_request_enqueue_transmit(struct rpc_task *task) INIT_LIST_HEAD(&req->rq_xmit2); goto out; } - } else if (!req->rq_seqno) { + } else if (req->rq_seqno_count == 0) { list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { if (pos->rq_task->tk_owner != task->tk_owner) continue; @@ -1898,6 +1898,7 @@ xprt_request_init(struct rpc_task *task) req->rq_snd_buf.bvec = NULL; req->rq_rcv_buf.bvec = NULL; req->rq_release_snd_buf = NULL; + req->rq_seqno_count = 0; xprt_init_majortimeo(task, req, task->tk_client->cl_timeout); trace_xprt_reserve(req); -- 2.51.0 From fadc0f3bb2de8c570ced6d9c1f97222213d93140 Mon Sep 17 00:00:00 2001 From: Nikhil Jha Date: Wed, 19 Mar 2025 13:02:40 -0400 Subject: [PATCH 09/16] sunrpc: don't immediately retransmit on seqno miss RFC2203 requires that retransmitted messages use a new gss sequence number, but the same XID. This means that if the server is just slow (e.x. overloaded), the client might receive a response using an older seqno than the one it has recorded. Currently, Linux's client immediately retransmits in this case. However, this leads to a lot of wasted retransmits until the server eventually responds faster than the client can resend. Client -> SEQ 1 -> Server Client -> SEQ 2 -> Server Client <- SEQ 1 <- Server (misses, expecting seqno = 2) Client -> SEQ 3 -> Server (immediate retransmission on miss) Client <- SEQ 2 <- Server (misses, expecting seqno = 3) Client -> SEQ 4 -> Server (immediate retransmission on miss) ... and so on ... This commit makes it so that we ignore messages with bad checksums due to seqnum mismatch, and rely on the usual timeout behavior for retransmission instead of doing so immediately. Signed-off-by: Nikhil Jha Acked-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 6f75862d9782..21426c3049d3 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2771,8 +2771,13 @@ out_verifier: case -EPROTONOSUPPORT: goto out_err; case -EACCES: - /* Re-encode with a fresh cred */ - fallthrough; + /* possible RPCSEC_GSS out-of-sequence event (RFC2203), + * reset recv state and keep waiting, don't retransmit + */ + task->tk_rqstp->rq_reply_bytes_recvd = 0; + task->tk_status = xprt_request_enqueue_receive(task); + task->tk_action = call_transmit_status; + return -EBADMSG; default: goto out_garbage; } -- 2.51.0 From e5296637a322b840d772f88f4d58b4bc72b29058 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 8 Apr 2025 09:43:15 -0400 Subject: [PATCH 10/16] nfs: add a refcount tracker for struct net as held by the nfs_client These are long-held references to the netns, so make sure the refcount tracker is aware of them. Signed-off-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 4 ++-- include/linux/nfs_fs_sb.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 6d63b958c4bb..745f3c42f140 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -180,7 +180,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_proto = cl_init->proto; clp->cl_nconnect = cl_init->nconnect; clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1; - clp->cl_net = get_net(cl_init->net); + clp->cl_net = get_net_track(cl_init->net, &clp->cl_ns_tracker, GFP_KERNEL); #if IS_ENABLED(CONFIG_NFS_LOCALIO) seqlock_init(&clp->cl_boot_lock); @@ -250,7 +250,7 @@ void nfs_free_client(struct nfs_client *clp) if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); - put_net(clp->cl_net); + put_net_track(clp->cl_net, &clp->cl_ns_tracker); put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp->cl_acceptor); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index ee03f3cef30c..e02f4c4e9cc4 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -125,6 +125,7 @@ struct nfs_client { */ char cl_ipaddr[48]; struct net *cl_net; + netns_tracker cl_ns_tracker; struct list_head pending_cb_stateids; struct rcu_head rcu; -- 2.51.0 From 6e9a2f8dbe93c8004c2af2c0158888628b7ca034 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Wed, 16 Apr 2025 11:23:38 -0400 Subject: [PATCH 11/16] NFSv4: xattr handlers should check for absent nfs filehandles The nfs inodes for referral anchors that have not yet been followed have their filehandles zeroed out. Attempting to call getxattr() on one of these will cause the nfs client to send a GETATTR to the nfs server with the preceding PUTFH sans filehandle. The server will reply NFS4ERR_NOFILEHANDLE, leading to -EIO being returned to the application. For example: $ strace -e trace=getxattr getfattr -n system.nfs4_acl /mnt/t/ref getxattr("/mnt/t/ref", "system.nfs4_acl", NULL, 0) = -1 EIO (Input/output error) /mnt/t/ref: system.nfs4_acl: Input/output error +++ exited with 1 +++ Have the xattr handlers return -ENODATA instead. Signed-off-by: Scott Mayhew Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b1d2122bd5a7..f837ed7ad90c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6211,6 +6211,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen, struct nfs_server *server = NFS_SERVER(inode); int ret; + if (unlikely(NFS_FH(inode)->size == 0)) + return -ENODATA; if (!nfs4_server_supports_acls(server, type)) return -EOPNOTSUPP; ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); @@ -6285,6 +6287,9 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, { struct nfs4_exception exception = { }; int err; + + if (unlikely(NFS_FH(inode)->size == 0)) + return -ENODATA; do { err = __nfs4_proc_set_acl(inode, buf, buflen, type); trace_nfs4_set_acl(inode, err); -- 2.51.0 From 4c10fa44bc5f700e2ea21de2fbae520ba21f19d9 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Wed, 23 Apr 2025 15:22:50 +0200 Subject: [PATCH 12/16] fs/nfs/read: fix double-unlock bug in nfs_return_empty_folio() Sometimes, when a file was read while it was being truncated by another NFS client, the kernel could deadlock because folio_unlock() was called twice, and the second call would XOR back the `PG_locked` flag. Most of the time (depending on the timing of the truncation), nobody notices the problem because folio_unlock() gets called three times, which flips `PG_locked` back off: 1. vfs_read, nfs_read_folio, ... nfs_read_add_folio, nfs_return_empty_folio 2. vfs_read, nfs_read_folio, ... netfs_read_collection, netfs_unlock_abandoned_read_pages 3. vfs_read, ... nfs_do_read_folio, nfs_read_add_folio, nfs_return_empty_folio The problem is that nfs_read_add_folio() is not supposed to unlock the folio if fscache is enabled, and a nfs_netfs_folio_unlock() check is missing in nfs_return_empty_folio(). Rarely this leads to a warning in netfs_read_collection(): ------------[ cut here ]------------ R=0000031c: folio 10 is not locked WARNING: CPU: 0 PID: 29 at fs/netfs/read_collect.c:133 netfs_read_collection+0x7c0/0xf00 [...] Workqueue: events_unbound netfs_read_collection_worker RIP: 0010:netfs_read_collection+0x7c0/0xf00 [...] Call Trace: netfs_read_collection_worker+0x67/0x80 process_one_work+0x12e/0x2c0 worker_thread+0x295/0x3a0 Most of the time, however, processes just get stuck forever in folio_wait_bit_common(), waiting for `PG_locked` to disappear, which never happens because nobody is really holding the folio lock. Fixes: 000dbe0bec05 ("NFS: Convert buffered read paths to use netfs when fscache is enabled") Cc: stable@vger.kernel.org Signed-off-by: Max Kellermann Reviewed-by: Dave Wysochanski Signed-off-by: Anna Schumaker --- fs/nfs/read.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 81bd1b9aba17..3c1fa320b3f1 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -56,7 +56,8 @@ static int nfs_return_empty_folio(struct folio *folio) { folio_zero_segment(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); - folio_unlock(folio); + if (nfs_netfs_folio_unlock(folio)) + folio_unlock(folio); return 0; } -- 2.51.0 From d2e1d783f2c619cf9d8242b67a0ab0d2915f2920 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 11 Apr 2025 15:47:32 -0400 Subject: [PATCH 13/16] NFS: Add support for fallocate(FALLOC_FL_ZERO_RANGE) This implements a suggestion from Trond that we can mimic FALLOC_FL_ZERO_RANGE by sending a compound that first does a DEALLOCATE to punch a hole in a file, and then an ALLOCATE to fill the hole with zeroes. There might technically be a race here, but once the DEALLOCATE finishes any reads from the region would return zeroes anyway, so I don't expect it to cause problems. Signed-off-by: Anna Schumaker --- fs/nfs/nfs42.h | 1 + fs/nfs/nfs42proc.c | 29 ++++++++++++++++-- fs/nfs/nfs42xdr.c | 64 +++++++++++++++++++++++++++++++++++++++ fs/nfs/nfs4file.c | 10 +++++- fs/nfs/nfs4proc.c | 1 + fs/nfs/nfs4xdr.c | 1 + include/linux/nfs4.h | 1 + include/linux/nfs_fs_sb.h | 1 + 8 files changed, 105 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 0282d93c8bcc..aafd15a4afce 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -21,6 +21,7 @@ int nfs42_proc_allocate(struct file *, loff_t, loff_t); ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t, struct nl4_server *, nfs4_stateid *, bool); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); +int nfs42_proc_zero_range(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, struct nfs42_layoutstat_data *); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 5cf52ece96ac..01c01f45358b 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -146,7 +146,8 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == -EOPNOTSUPP) - NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; + NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE | + NFS_CAP_ZERO_RANGE); inode_unlock(inode); return err; @@ -169,7 +170,31 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (err == 0) truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) - NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; + NFS_SERVER(inode)->caps &= ~(NFS_CAP_DEALLOCATE | + NFS_CAP_ZERO_RANGE); + + inode_unlock(inode); + return err; +} + +int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE], + }; + struct inode *inode = file_inode(filep); + int err; + + if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE)) + return -EOPNOTSUPP; + + inode_lock(inode); + + err = nfs42_proc_fallocate(&msg, filep, offset, len); + if (err == 0) + truncate_pagecache_range(inode, offset, (offset + len) -1); + if (err == -EOPNOTSUPP) + NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE; inode_unlock(inode); return err; diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index b1b663468249..4cc915d5741d 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -174,6 +174,18 @@ decode_putfh_maxsz + \ decode_deallocate_maxsz + \ decode_getattr_maxsz) +#define NFS4_enc_zero_range_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_deallocate_maxsz + \ + encode_allocate_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_zero_range_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_deallocate_maxsz + \ + decode_allocate_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_read_plus_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -648,6 +660,27 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode ZERO_RANGE request + */ +static void nfs4_xdr_enc_zero_range(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_falloc_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->falloc_fh, &hdr); + encode_deallocate(xdr, args, &hdr); + encode_allocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); + encode_nops(&hdr); +} + /* * Encode READ_PLUS request */ @@ -1510,6 +1543,37 @@ out: return status; } +/* + * Decode ZERO_RANGE request + */ +static int nfs4_xdr_dec_zero_range(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_falloc_res *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_deallocate(xdr, res); + if (status) + goto out; + status = decode_allocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); +out: + return status; +} + /* * Decode READ_PLUS request */ diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 1cd9652f3c28..5e9d66f3466c 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -225,8 +225,14 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if ((mode != 0) && (mode != (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE))) + switch (mode) { + case 0: + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: + case FALLOC_FL_ZERO_RANGE: + break; + default: return -EOPNOTSUPP; + } ret = inode_newsize_ok(inode, offset + len); if (ret < 0) @@ -234,6 +240,8 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (mode & FALLOC_FL_PUNCH_HOLE) return nfs42_proc_deallocate(filep, offset, len); + else if (mode & FALLOC_FL_ZERO_RANGE) + return nfs42_proc_zero_range(filep, offset ,len); return nfs42_proc_allocate(filep, offset, len); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f837ed7ad90c..a7c4194ed2a0 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -10822,6 +10822,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_OFFLOAD_CANCEL | NFS_CAP_COPY_NOTIFY | NFS_CAP_DEALLOCATE + | NFS_CAP_ZERO_RANGE | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS | NFS_CAP_CLONE diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 55bef5fbfa47..318afde38057 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -7711,6 +7711,7 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC42(LISTXATTRS, enc_listxattrs, dec_listxattrs), PROC42(REMOVEXATTR, enc_removexattr, dec_removexattr), PROC42(READ_PLUS, enc_read_plus, dec_read_plus), + PROC42(ZERO_RANGE, enc_zero_range, dec_zero_range), }; static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)]; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index d8cad844870a..07635f87a3fd 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -678,6 +678,7 @@ enum { NFSPROC4_CLNT_SEEK, NFSPROC4_CLNT_ALLOCATE, NFSPROC4_CLNT_DEALLOCATE, + NFSPROC4_CLNT_ZERO_RANGE, NFSPROC4_CLNT_LAYOUTSTATS, NFSPROC4_CLNT_CLONE, NFSPROC4_CLNT_COPY, diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index e02f4c4e9cc4..63141320c2a8 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -304,6 +304,7 @@ struct nfs_server { #define NFS_CAP_CASE_PRESERVING (1U << 7) #define NFS_CAP_REBOOT_LAYOUTRETURN (1U << 8) #define NFS_CAP_OFFLOAD_STATUS (1U << 9) +#define NFS_CAP_ZERO_RANGE (1U << 10) #define NFS_CAP_OPEN_XOR (1U << 12) #define NFS_CAP_DELEGTIME (1U << 13) #define NFS_CAP_POSIX_LOCK (1U << 14) -- 2.51.0 From aba41e90aadeca8d4656f90639aa5f91ce564f1c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 25 Apr 2025 15:49:19 +0300 Subject: [PATCH 14/16] NFSv4.2: fix setattr caching of TIME_[MODIFY|ACCESS]_SET when timestamps are delegated nfs_setattr will flush all pending writes before updating a file time attributes. However when the client holds delegated timestamps, it can update its timestamps locally as it is the authority for the file times attributes. The client will later set the file attributes by adding a setattr to the delegreturn compound updating the server time attributes. Fix nfs_setattr to avoid flushing pending writes when the file time attributes are delegated and the mtime/atime are set to a fixed timestamp (ATTR_[MODIFY|ACCESS]_SET. Also, when sending the setattr procedure over the wire, we need to clear the correct attribute bits from the bitmask. I was able to measure a noticable speedup when measuring untar performance. Test: $ time tar xzf ~/dir.tgz Baseline: 1m13.072s Patched: 0m49.038s Which is more than 30% latency improvement. Signed-off-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- fs/nfs/inode.c | 49 +++++++++++++++++++++++++++++++++++++++++++---- fs/nfs/nfs4proc.c | 8 ++++---- 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 119e447758b9..6472b95bfd88 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -633,6 +633,34 @@ nfs_fattr_fixup_delegated(struct inode *inode, struct nfs_fattr *fattr) } } +static void nfs_set_timestamps_to_ts(struct inode *inode, struct iattr *attr) +{ + unsigned int cache_flags = 0; + + if (attr->ia_valid & ATTR_MTIME_SET) { + struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 mtime = inode_get_mtime(inode); + struct timespec64 now; + int updated = 0; + + now = inode_set_ctime_current(inode); + if (!timespec64_equal(&now, &ctime)) + updated |= S_CTIME; + + inode_set_mtime_to_ts(inode, attr->ia_mtime); + if (!timespec64_equal(&now, &mtime)) + updated |= S_MTIME; + + inode_maybe_inc_iversion(inode, updated); + cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + } + if (attr->ia_valid & ATTR_ATIME_SET) { + inode_set_atime_to_ts(inode, attr->ia_atime); + cache_flags |= NFS_INO_INVALID_ATIME; + } + NFS_I(inode)->cache_validity &= ~cache_flags; +} + static void nfs_update_timestamps(struct inode *inode, unsigned int ia_valid) { enum file_time_flags time_flags = 0; @@ -701,14 +729,27 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) { spin_lock(&inode->i_lock); - nfs_update_timestamps(inode, attr->ia_valid); + if (attr->ia_valid & ATTR_MTIME_SET) { + nfs_set_timestamps_to_ts(inode, attr); + attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET| + ATTR_ATIME|ATTR_ATIME_SET); + } else { + nfs_update_timestamps(inode, attr->ia_valid); + attr->ia_valid &= ~(ATTR_MTIME|ATTR_ATIME); + } spin_unlock(&inode->i_lock); - attr->ia_valid &= ~(ATTR_MTIME | ATTR_ATIME); } else if (nfs_have_delegated_atime(inode) && attr->ia_valid & ATTR_ATIME && !(attr->ia_valid & ATTR_MTIME)) { - nfs_update_delegated_atime(inode); - attr->ia_valid &= ~ATTR_ATIME; + if (attr->ia_valid & ATTR_ATIME_SET) { + spin_lock(&inode->i_lock); + nfs_set_timestamps_to_ts(inode, attr); + spin_unlock(&inode->i_lock); + attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET); + } else { + nfs_update_delegated_atime(inode); + attr->ia_valid &= ~ATTR_ATIME; + } } /* Optimization: if the end result is no change, don't RPC */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a7c4194ed2a0..87f9f6fb0214 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -325,14 +325,14 @@ static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src, if (nfs_have_delegated_mtime(inode)) { if (!(cache_validity & NFS_INO_INVALID_ATIME)) - dst[1] &= ~FATTR4_WORD1_TIME_ACCESS; + dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET); if (!(cache_validity & NFS_INO_INVALID_MTIME)) - dst[1] &= ~FATTR4_WORD1_TIME_MODIFY; + dst[1] &= ~(FATTR4_WORD1_TIME_MODIFY|FATTR4_WORD1_TIME_MODIFY_SET); if (!(cache_validity & NFS_INO_INVALID_CTIME)) - dst[1] &= ~FATTR4_WORD1_TIME_METADATA; + dst[1] &= ~(FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY_SET); } else if (nfs_have_delegated_atime(inode)) { if (!(cache_validity & NFS_INO_INVALID_ATIME)) - dst[1] &= ~FATTR4_WORD1_TIME_ACCESS; + dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET); } } -- 2.51.0 From 243fea134633ba3d64aceb4c16129c59541ea2c6 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 25 Apr 2025 14:09:21 -0400 Subject: [PATCH 15/16] NFSv4.2: fix listxattr to return selinux security label Currently, when NFS is queried for all the labels present on the file via a command example "getfattr -d -m . /mnt/testfile", it does not return the security label. Yet when asked specifically for the label (getfattr -n security.selinux) it will be returned. Include the security label when all attributes are queried. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 87f9f6fb0214..8feb9abfe152 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -10858,7 +10858,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) { - ssize_t error, error2, error3; + ssize_t error, error2, error3, error4; size_t left = size; error = generic_listxattr(dentry, list, left); @@ -10881,8 +10881,16 @@ static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, left); if (error3 < 0) return error3; + if (list) { + list += error3; + left -= error3; + } + + error4 = security_inode_listsecurity(d_inode(dentry), list, left); + if (error4 < 0) + return error4; - error += error2 + error3; + error += error2 + error3 + error4; if (size && error > size) return -ERANGE; return error; -- 2.51.0 From 4d4832ed13ff505fe0371544b4773e79be2bb964 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Wed, 30 Apr 2025 07:12:29 -0400 Subject: [PATCH 16/16] NFSv4: Don't check for OPEN feature support in v4.1 fattr4_open_arguments is a v4.2 recommended attribute, so we shouldn't be sending it to v4.1 servers. Fixes: cb78f9b7d0c0 ("nfs: fix the fetch of FATTR4_OPEN_ARGUMENTS") Signed-off-by: Scott Mayhew Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Cc: stable@vger.kernel.org # 6.11+ Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8feb9abfe152..d9c7035d5771 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3976,8 +3976,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f FATTR4_WORD0_CASE_INSENSITIVE | FATTR4_WORD0_CASE_PRESERVING; if (minorversion) - bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT | - FATTR4_WORD2_OPEN_ARGUMENTS; + bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT; + if (minorversion > 1) + bitmask[2] |= FATTR4_WORD2_OPEN_ARGUMENTS; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { -- 2.51.0