From 288d7224db0c2a85bda4e2227fad3f6eb89e2874 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 1 Oct 2024 16:33:44 -0400 Subject: [PATCH 01/16] NFS: Implement get_nfs_version() This is a pair for put_nfs_version(), and is used for incrementing the reference count on the nfs version module. I also updated the callers I could find who had this hardcoded up until now. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 10 ++++++++-- fs/nfs/fs_context.c | 4 ++-- fs/nfs/namespace.c | 2 +- fs/nfs/nfs.h | 1 + 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 4c94fe419c40..550ca934c9cf 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -100,12 +100,18 @@ struct nfs_subversion *find_nfs_version(unsigned int version) if (!nfs) return ERR_PTR(-EPROTONOSUPPORT); - if (!try_module_get(nfs->owner)) + if (!get_nfs_version(nfs)) return ERR_PTR(-EAGAIN); return nfs; } +int get_nfs_version(struct nfs_subversion *nfs) +{ + return try_module_get(nfs->owner); +} +EXPORT_SYMBOL_GPL(get_nfs_version); + void put_nfs_version(struct nfs_subversion *nfs) { module_put(nfs->owner); @@ -149,7 +155,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_minorversion = cl_init->minorversion; clp->cl_nfs_mod = cl_init->nfs_mod; - if (!try_module_get(clp->cl_nfs_mod->owner)) + if (!get_nfs_version(clp->cl_nfs_mod)) goto error_dealloc; clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index d553daa4c09c..b069385eea17 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -1541,7 +1541,7 @@ static int nfs_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) } nfs_copy_fh(ctx->mntfh, src->mntfh); - __module_get(ctx->nfs_mod->owner); + get_nfs_version(ctx->nfs_mod); ctx->client_address = NULL; ctx->mount_server.hostname = NULL; ctx->nfs_server.export_path = NULL; @@ -1633,7 +1633,7 @@ static int nfs_init_fs_context(struct fs_context *fc) } ctx->nfs_mod = nfss->nfs_client->cl_nfs_mod; - __module_get(ctx->nfs_mod->owner); + get_nfs_version(ctx->nfs_mod); } else { /* defaults */ ctx->timeo = NFS_UNSPEC_TIMEO; diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index e7494cdd957e..2d53574da605 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -182,7 +182,7 @@ struct vfsmount *nfs_d_automount(struct path *path) ctx->version = client->rpc_ops->version; ctx->minorversion = client->cl_minorversion; ctx->nfs_mod = client->cl_nfs_mod; - __module_get(ctx->nfs_mod->owner); + get_nfs_version(ctx->nfs_mod); ret = client->rpc_ops->submount(fc, server); if (ret < 0) { diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h index a30bf8ef79d7..8a5f51be013a 100644 --- a/fs/nfs/nfs.h +++ b/fs/nfs/nfs.h @@ -22,6 +22,7 @@ struct nfs_subversion { }; struct nfs_subversion *find_nfs_version(unsigned int); +int get_nfs_version(struct nfs_subversion *); void put_nfs_version(struct nfs_subversion *); void register_nfs_version(struct nfs_subversion *); void unregister_nfs_version(struct nfs_subversion *); -- 2.51.0 From fb4e525da1c12d1f7aeff94797385937fd89f40b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 3 Oct 2024 15:35:01 -0400 Subject: [PATCH 02/16] nfs/localio: remove redundant suid/sgid handling nfs_writeback_done() will take care of suid/sgid corner case. Signed-off-by: Mike Snitzer Signed-off-by: Trond Myklebust --- fs/nfs/localio.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 8f0ce82a677e..8d27a55209fc 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -521,12 +521,7 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) } if (status < 0) nfs_reset_boot_verifier(inode); - else if (nfs_should_remove_suid(inode)) { - /* Deal with the suid/sgid bit corner case */ - spin_lock(&inode->i_lock); - nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); - spin_unlock(&inode->i_lock); - } + nfs_local_pgio_done(hdr, status); } -- 2.51.0 From 894f5c5593cdb57841318597a800ad1d3cb45a52 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 3 Oct 2024 15:35:02 -0400 Subject: [PATCH 03/16] nfs/localio: eliminate unnecessary kref in nfs_local_fsync_ctx nfs_local_commit() doesn't need async cleanup of nfs_local_fsync_ctx, so there is no need to use a kref. Signed-off-by: Mike Snitzer Signed-off-by: Trond Myklebust --- fs/nfs/localio.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 8d27a55209fc..153e00e4dd90 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -42,7 +42,6 @@ struct nfs_local_fsync_ctx { struct nfsd_file *localio; struct nfs_commit_data *data; struct work_struct work; - struct kref kref; struct completion *done; }; static void nfs_local_fsync_work(struct work_struct *work); @@ -683,30 +682,17 @@ nfs_local_fsync_ctx_alloc(struct nfs_commit_data *data, ctx->localio = localio; ctx->data = data; INIT_WORK(&ctx->work, nfs_local_fsync_work); - kref_init(&ctx->kref); ctx->done = NULL; } return ctx; } -static void -nfs_local_fsync_ctx_kref_free(struct kref *kref) -{ - kfree(container_of(kref, struct nfs_local_fsync_ctx, kref)); -} - -static void -nfs_local_fsync_ctx_put(struct nfs_local_fsync_ctx *ctx) -{ - kref_put(&ctx->kref, nfs_local_fsync_ctx_kref_free); -} - static void nfs_local_fsync_ctx_free(struct nfs_local_fsync_ctx *ctx) { nfs_local_release_commit_data(ctx->localio, ctx->data, ctx->data->task.tk_ops); - nfs_local_fsync_ctx_put(ctx); + kfree(ctx); } static void @@ -739,7 +725,7 @@ int nfs_local_commit(struct nfsd_file *localio, } nfs_local_init_commit(data, call_ops); - kref_get(&ctx->kref); + if (how & FLUSH_SYNC) { DECLARE_COMPLETION_ONSTACK(done); ctx->done = &done; @@ -747,6 +733,6 @@ int nfs_local_commit(struct nfsd_file *localio, wait_for_completion(&done); } else queue_work(nfsiod_workqueue, &ctx->work); - nfs_local_fsync_ctx_put(ctx); + return 0; } -- 2.51.0 From 0978e5b85fc0867f53c5f4e5b7d2a5536a623e16 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 3 Oct 2024 15:35:03 -0400 Subject: [PATCH 04/16] nfs/localio: remove extra indirect nfs_to call to check {read,write}_iter Push the read_iter and write_iter availability checks down to nfs_do_local_read and nfs_do_local_write respectively. This eliminates a redundant nfs_to->nfsd_file_file() call. Signed-off-by: Mike Snitzer Signed-off-by: Trond Myklebust --- fs/nfs/localio.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 153e00e4dd90..23f6d2a372dd 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -273,7 +273,7 @@ nfs_local_iocb_free(struct nfs_local_kiocb *iocb) static struct nfs_local_kiocb * nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, - struct nfsd_file *localio, gfp_t flags) + struct file *file, gfp_t flags) { struct nfs_local_kiocb *iocb; @@ -286,9 +286,8 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, kfree(iocb); return NULL; } - init_sync_kiocb(&iocb->kiocb, nfs_to->nfsd_file_file(localio)); + init_sync_kiocb(&iocb->kiocb, file); iocb->kiocb.ki_pos = hdr->args.offset; - iocb->localio = localio; iocb->hdr = hdr; iocb->kiocb.ki_flags &= ~IOCB_APPEND; return iocb; @@ -389,13 +388,19 @@ nfs_do_local_read(struct nfs_pgio_header *hdr, const struct rpc_call_ops *call_ops) { struct nfs_local_kiocb *iocb; + struct file *file = nfs_to->nfsd_file_file(localio); + + /* Don't support filesystems without read_iter */ + if (!file->f_op->read_iter) + return -EAGAIN; dprintk("%s: vfs_read count=%u pos=%llu\n", __func__, hdr->args.count, hdr->args.offset); - iocb = nfs_local_iocb_alloc(hdr, localio, GFP_KERNEL); + iocb = nfs_local_iocb_alloc(hdr, file, GFP_KERNEL); if (iocb == NULL) return -ENOMEM; + iocb->localio = localio; nfs_local_pgio_init(hdr, call_ops); hdr->res.eof = false; @@ -558,14 +563,20 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, const struct rpc_call_ops *call_ops) { struct nfs_local_kiocb *iocb; + struct file *file = nfs_to->nfsd_file_file(localio); + + /* Don't support filesystems without write_iter */ + if (!file->f_op->write_iter) + return -EAGAIN; dprintk("%s: vfs_write count=%u pos=%llu %s\n", __func__, hdr->args.count, hdr->args.offset, (hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable"); - iocb = nfs_local_iocb_alloc(hdr, localio, GFP_NOIO); + iocb = nfs_local_iocb_alloc(hdr, file, GFP_NOIO); if (iocb == NULL) return -ENOMEM; + iocb->localio = localio; switch (hdr->args.stable) { default: @@ -591,16 +602,9 @@ int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio, const struct rpc_call_ops *call_ops) { int status = 0; - struct file *filp = nfs_to->nfsd_file_file(localio); if (!hdr->args.count) return 0; - /* Don't support filesystems without read_iter/write_iter */ - if (!filp->f_op->read_iter || !filp->f_op->write_iter) { - nfs_local_disable(clp); - status = -EAGAIN; - goto out; - } switch (hdr->rw_mode) { case FMODE_READ: @@ -614,8 +618,10 @@ int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio, hdr->rw_mode); status = -EINVAL; } -out: + if (status != 0) { + if (status == -EAGAIN) + nfs_local_disable(clp); nfs_to_nfsd_file_put_local(localio); hdr->task.tk_status = status; nfs_local_hdr_release(hdr, call_ops); -- 2.51.0 From 79a66e1465561f7baa3ff3daf79800fc241afeeb Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 3 Oct 2024 15:35:04 -0400 Subject: [PATCH 05/16] nfs/localio: eliminate need for nfs_local_fsync_work forward declaration Move nfs_local_fsync_ctx_alloc() after nfs_local_fsync_work(). Signed-off-by: Mike Snitzer Signed-off-by: Trond Myklebust --- fs/nfs/localio.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 23f6d2a372dd..81bb908ab890 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -44,7 +44,6 @@ struct nfs_local_fsync_ctx { struct work_struct work; struct completion *done; }; -static void nfs_local_fsync_work(struct work_struct *work); static bool localio_enabled __read_mostly = true; module_param(localio_enabled, bool, 0644); @@ -678,21 +677,6 @@ nfs_local_release_commit_data(struct nfsd_file *localio, call_ops->rpc_release(data); } -static struct nfs_local_fsync_ctx * -nfs_local_fsync_ctx_alloc(struct nfs_commit_data *data, - struct nfsd_file *localio, gfp_t flags) -{ - struct nfs_local_fsync_ctx *ctx = kmalloc(sizeof(*ctx), flags); - - if (ctx != NULL) { - ctx->localio = localio; - ctx->data = data; - INIT_WORK(&ctx->work, nfs_local_fsync_work); - ctx->done = NULL; - } - return ctx; -} - static void nfs_local_fsync_ctx_free(struct nfs_local_fsync_ctx *ctx) { @@ -717,6 +701,21 @@ nfs_local_fsync_work(struct work_struct *work) nfs_local_fsync_ctx_free(ctx); } +static struct nfs_local_fsync_ctx * +nfs_local_fsync_ctx_alloc(struct nfs_commit_data *data, + struct nfsd_file *localio, gfp_t flags) +{ + struct nfs_local_fsync_ctx *ctx = kmalloc(sizeof(*ctx), flags); + + if (ctx != NULL) { + ctx->localio = localio; + ctx->data = data; + INIT_WORK(&ctx->work, nfs_local_fsync_work); + ctx->done = NULL; + } + return ctx; +} + int nfs_local_commit(struct nfsd_file *localio, struct nfs_commit_data *data, const struct rpc_call_ops *call_ops, int how) -- 2.51.0 From e8e26a0b09f5783be471b5ffb1e31822b1272c1d Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 15 Oct 2024 22:27:31 +0200 Subject: [PATCH 06/16] nfs: Annotate struct pnfs_commit_array with __counted_by() Add the __counted_by compiler attribute to the flexible array member buckets to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Compile-tested only. Signed-off-by: Thorsten Blum Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 12d8e47bc5a3..559273a0f16d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1336,7 +1336,7 @@ struct pnfs_commit_array { struct rcu_head rcu; refcount_t refcount; unsigned int nbuckets; - struct pnfs_commit_bucket buckets[]; + struct pnfs_commit_bucket buckets[] __counted_by(nbuckets); }; struct pnfs_ds_commit_info { -- 2.51.0 From 93970b6a143bd9f184ebab37c5862a204fb5690e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 29 Oct 2024 15:21:43 -0400 Subject: [PATCH 07/16] sunrpc: remove newlines from tracepoints Tracepoint strings don't require newlines (and in fact, they are undesirable). Signed-off-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/sunrpc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 5e8495216689..b13dc275ef4a 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -719,7 +719,7 @@ TRACE_EVENT(rpc_xdr_overflow, ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER - " %sv%d %s requested=%zu p=%p end=%p xdr=[%p,%zu]/%u/[%p,%zu]/%u\n", + " %sv%d %s requested=%zu p=%p end=%p xdr=[%p,%zu]/%u/[%p,%zu]/%u", __entry->task_id, __entry->client_id, __get_str(progname), __entry->version, __get_str(procedure), __entry->requested, __entry->p, __entry->end, @@ -777,7 +777,7 @@ TRACE_EVENT(rpc_xdr_alignment, ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER - " %sv%d %s offset=%zu copied=%u xdr=[%p,%zu]/%u/[%p,%zu]/%u\n", + " %sv%d %s offset=%zu copied=%u xdr=[%p,%zu]/%u/[%p,%zu]/%u", __entry->task_id, __entry->client_id, __get_str(progname), __entry->version, __get_str(procedure), __entry->offset, __entry->copied, -- 2.51.0 From 675d4566e599bab1a225d418bbf7a53100367978 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Mon, 4 Nov 2024 17:11:27 -0500 Subject: [PATCH 08/16] SUNRPC: Fix a hang in TLS sock_close if sk_write_pending We've observed an NFS server shrink the TCP window and then reset the TCP connection as part of a HA failover. When the connection has TLS, often the NFS client will hang indefinitely in this stack: wait_woken+0x70/0x80 wait_on_pending_writer+0xe4/0x110 [tls] tls_sk_proto_close+0x368/0x3a0 [tls] inet_release+0x54/0xb0 __sock_release+0x48/0xc8 sock_close+0x20/0x38 __fput+0xe0/0x2f0 __fput_sync+0x58/0x70 xs_reset_transport+0xe8/0x1f8 [sunrpc] xs_tcp_shutdown+0xa4/0x190 [sunrpc] xprt_autoclose+0x68/0x170 [sunrpc] process_one_work+0x180/0x420 worker_thread+0x258/0x368 kthread+0x104/0x118 ret_from_fork+0x10/0x20 This hang prevents the client from closing the socket and reconnecting to the server. Because xs_nospace() elevates sk_write_pending, and sk_sndtimeo is MAX_SCHEDULE_TIMEOUT, tls_sk_proto_close is never able to complete its wait for pending writes to the socket. For this case where we are resetting the transport anyway, we don't expect the socket to ever have write space, so fix this by simply clearing the sock's sndtimeo under the sock's lock. Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 1326fbf45a34..d587c261d999 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1278,6 +1278,7 @@ static void xs_reset_transport(struct sock_xprt *transport) transport->file = NULL; sk->sk_user_data = NULL; + sk->sk_sndtimeo = 0; xs_restore_old_callbacks(transport, sk); xprt_clear_connected(xprt); -- 2.51.0 From c968fd23c68e9929ab6cad4faffc8ea603e98e5d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 8 Nov 2024 11:41:21 -0500 Subject: [PATCH 09/16] NFSv4.0: Fix the wake up of the next waiter in nfs_release_seqid() There is no need to wake up another waiter on the seqid list unless the seqid being removed is at the head of the list, and so is relinquishing control of the sequence counter to the next entry. Reviewed-by: Yang Erkun Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index dafd61186557..9a9f60a2291b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1083,14 +1083,12 @@ void nfs_release_seqid(struct nfs_seqid *seqid) return; sequence = seqid->sequence; spin_lock(&sequence->lock); - list_del_init(&seqid->list); - if (!list_empty(&sequence->list)) { - struct nfs_seqid *next; - - next = list_first_entry(&sequence->list, - struct nfs_seqid, list); + if (list_is_first(&seqid->list, &sequence->list) && + !list_is_singular(&sequence->list)) { + struct nfs_seqid *next = list_next_entry(seqid, list); rpc_wake_up_queued_task(&sequence->wait, next->task); } + list_del_init(&seqid->list); spin_unlock(&sequence->lock); } -- 2.51.0 From 2fdb05dc0931250574f0cb0ebeb5ed8e20f4a889 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 8 Nov 2024 12:13:31 -0500 Subject: [PATCH 10/16] NFSv4.0: Fix a use-after-free problem in the asynchronous open() Yang Erkun reports that when two threads are opening files at the same time, and are forced to abort before a reply is seen, then the call to nfs_release_seqid() in nfs4_opendata_free() can result in a use-after-free of the pointer to the defunct rpc task of the other thread. The fix is to ensure that if the RPC call is aborted before the call to nfs_wait_on_sequence() is complete, then we must call nfs_release_seqid() in nfs4_open_release() before the rpc_task is freed. Reported-by: Yang Erkun Fixes: 24ac23ab88df ("NFSv4: Convert open() into an asynchronous RPC call") Reviewed-by: Yang Erkun Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9d40319e063d..405f17e6e0b4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2603,12 +2603,14 @@ static void nfs4_open_release(void *calldata) struct nfs4_opendata *data = calldata; struct nfs4_state *state = NULL; + /* In case of error, no cleanup! */ + if (data->rpc_status != 0 || !data->rpc_done) { + nfs_release_seqid(data->o_arg.seqid); + goto out_free; + } /* If this request hasn't been cancelled, do nothing */ if (!data->cancelled) goto out_free; - /* In case of error, no cleanup! */ - if (data->rpc_status != 0 || !data->rpc_done) - goto out_free; /* In case we need an open_confirm, no cleanup! */ if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) goto out_free; -- 2.51.0 From 650703bc4ed3edf841e851c99ab8e7ba9e5262a3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 8 Nov 2024 18:39:44 -0500 Subject: [PATCH 11/16] nfs/localio: must clear res.replen in nfs_local_read_done Otherwise memory corruption can occur due to NFSv3 LOCALIO reads leaving garbage in res.replen: - nfs3_read_done() copies that into server->read_hdrsize; from there nfs3_proc_read_setup() copies it to args.replen in new requests. - nfs3_xdr_enc_read3args() passes that to rpc_prepare_reply_pages() which includes it in hdrsize for xdr_init_pages, so that rq_rcv_buf contains a ridiculous len. - This is copied to rq_private_buf and xs_read_stream_request() eventually passes the kvec to sock_recvmsg() which receives incoming data into entirely the wrong place. This is easily reproduced with NFSv3 LOCALIO that is servicing reads when it is made to pivot back to using normal RPC. This switch back to using normal NFSv3 with RPC can occur for a few reasons but this issue was exposed with a test that stops and then restarts the NFSv3 server while LOCALIO is performing heavy read IO. Fixes: 70ba381e1a43 ("nfs: add LOCALIO support") Reported-by: Mike Snitzer Signed-off-by: NeilBrown Co-developed-by: Mike Snitzer Signed-off-by: Mike Snitzer Signed-off-by: Trond Myklebust --- fs/nfs/localio.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 81bb908ab890..4b8618cf114c 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -351,6 +351,12 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) nfs_local_pgio_done(hdr, status); + /* + * Must clear replen otherwise NFSv3 data corruption will occur + * if/when switching from LOCALIO back to using normal RPC. + */ + hdr->res.replen = 0; + if (hdr->res.count != hdr->args.count || hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp))) hdr->res.eof = true; -- 2.51.0 From 8f52caf9d231e77412766b48e5630a647e5ef774 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Nov 2024 21:01:43 -0500 Subject: [PATCH 12/16] Revert "fs: nfs: fix missing refcnt by replacing folio_set_private by folio_attach_private" This reverts commit 03e02b94171b1985dd0aa184296fe94425b855a3. As was pointed out during code review, there is no need to use folio_attach_private()/folio_detach_private() when a refcount to the folio is already carried by the struct nfs_page. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ead2dc55952d..2da00987d9ed 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -772,7 +772,8 @@ static void nfs_inode_add_request(struct nfs_page *req) nfs_lock_request(req); spin_lock(&mapping->i_private_lock); set_bit(PG_MAPPED, &req->wb_flags); - folio_attach_private(folio, req); + folio_set_private(folio); + folio->private = req; spin_unlock(&mapping->i_private_lock); atomic_long_inc(&nfsi->nrequests); /* this a head request for a page group - mark it as having an @@ -796,7 +797,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) spin_lock(&mapping->i_private_lock); if (likely(folio)) { - folio_detach_private(folio); + folio->private = NULL; + folio_clear_private(folio); clear_bit(PG_MAPPED, &req->wb_head->wb_flags); } spin_unlock(&mapping->i_private_lock); -- 2.51.0 From 66f9dac9077c9c063552e465212abeb8f97d28a7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Nov 2024 21:09:21 -0500 Subject: [PATCH 13/16] Revert "nfs: don't reuse partially completed requests in nfs_lock_and_join_requests" This reverts commit b571cfcb9dcac187c6d967987792d37cb0688610. This patch appears to assume that if one request is complete, then the others will complete too before unlocking. That is not a valid assumption, since other requests could hit a non-fatal error or a short write that would cause them not to complete. Reported-by: Igor Raits Link: https://bugzilla.kernel.org/show_bug.cgi?id=219508 Fixes: b571cfcb9dca ("nfs: don't reuse partially completed requests in nfs_lock_and_join_requests") Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2da00987d9ed..50fa539611f5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -144,6 +144,31 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc) kref_put(&ioc->refcount, nfs_io_completion_release); } +static void +nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode) +{ + if (!test_and_set_bit(PG_INODE_REF, &req->wb_flags)) { + kref_get(&req->wb_kref); + atomic_long_inc(&NFS_I(inode)->nrequests); + } +} + +static int +nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) +{ + int ret; + + if (!test_bit(PG_REMOVE, &req->wb_flags)) + return 0; + ret = nfs_page_group_lock(req); + if (ret) + return ret; + if (test_and_clear_bit(PG_REMOVE, &req->wb_flags)) + nfs_page_set_inode_ref(req, inode); + nfs_page_group_unlock(req); + return 0; +} + /** * nfs_folio_find_head_request - find head request associated with a folio * @folio: pointer to folio @@ -540,7 +565,6 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) struct inode *inode = folio->mapping->host; struct nfs_page *head, *subreq; struct nfs_commit_info cinfo; - bool removed; int ret; /* @@ -565,18 +589,18 @@ retry: goto retry; } - ret = nfs_page_group_lock(head); + ret = nfs_cancel_remove_inode(head, inode); if (ret < 0) goto out_unlock; - removed = test_bit(PG_REMOVE, &head->wb_flags); + ret = nfs_page_group_lock(head); + if (ret < 0) + goto out_unlock; /* lock each request in the page group */ for (subreq = head->wb_this_page; subreq != head; subreq = subreq->wb_this_page) { - if (test_bit(PG_REMOVE, &subreq->wb_flags)) - removed = true; ret = nfs_page_group_lock_subreq(head, subreq); if (ret < 0) goto out_unlock; @@ -584,21 +608,6 @@ retry: nfs_page_group_unlock(head); - /* - * If PG_REMOVE is set on any request, I/O on that request has - * completed, but some requests were still under I/O at the time - * we locked the head request. - * - * In that case the above wait for all requests means that all I/O - * has now finished, and we can restart from a clean slate. Let the - * old requests go away and start from scratch instead. - */ - if (removed) { - nfs_unroll_locks(head, head); - nfs_unlock_and_release_request(head); - goto retry; - } - nfs_init_cinfo_from_inode(&cinfo, inode); nfs_join_page_group(head, &cinfo, inode); return head; -- 2.51.0 From 52cb7f8f177878b4f22397b9c4d2c8f743766be3 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Thu, 14 Nov 2024 12:53:03 +0800 Subject: [PATCH 14/16] nfs: ignore SB_RDONLY when mounting nfs When exporting only one file system with fsid=0 on the server side, the client alternately uses the ro/rw mount options to perform the mount operation, and a new vfsmount is generated each time. It can be reproduced as follows: [root@localhost ~]# mount /dev/sda /mnt2 [root@localhost ~]# echo "/mnt2 *(rw,no_root_squash,fsid=0)" >/etc/exports [root@localhost ~]# systemctl restart nfs-server [root@localhost ~]# mount -t nfs -o ro,vers=4 127.0.0.1:/ /mnt/sdaa [root@localhost ~]# mount -t nfs -o rw,vers=4 127.0.0.1:/ /mnt/sdaa [root@localhost ~]# mount -t nfs -o ro,vers=4 127.0.0.1:/ /mnt/sdaa [root@localhost ~]# mount -t nfs -o rw,vers=4 127.0.0.1:/ /mnt/sdaa [root@localhost ~]# mount | grep nfs4 127.0.0.1:/ on /mnt/sdaa type nfs4 (ro,relatime,vers=4.2,rsize=1048576,... 127.0.0.1:/ on /mnt/sdaa type nfs4 (rw,relatime,vers=4.2,rsize=1048576,... 127.0.0.1:/ on /mnt/sdaa type nfs4 (ro,relatime,vers=4.2,rsize=1048576,... 127.0.0.1:/ on /mnt/sdaa type nfs4 (rw,relatime,vers=4.2,rsize=1048576,... [root@localhost ~]# We expected that after mounting with the ro option, using the rw option to mount again would return EBUSY, but the actual situation was not the case. As shown above, when mounting for the first time, a superblock with the ro flag will be generated, and at the same time, in do_new_mount_fc --> do_add_mount, it detects that the superblock corresponding to the current target directory is inconsistent with the currently generated one (path->mnt->mnt_sb != newmnt->mnt.mnt_sb), and a new vfsmount will be generated. When mounting with the rw option for the second time, since no matching superblock can be found in the fs_supers list, a new superblock with the rw flag will be generated again. The superblock in use (ro) is different from the newly generated superblock (rw), and a new vfsmount will be generated again. When mounting with the ro option for the third time, the superblock (ro) is found in fs_supers, the superblock in use (rw) is different from the found superblock (ro), and a new vfsmount will be generated again. We can switch between ro/rw through remount, and only one superblock needs to be generated, thus avoiding the problem of repeated generation of vfsmount caused by switching superblocks. Furthermore, This can also resolve the issue described in the link. Fixes: 275a5d24bf56 ("NFS: Error when mounting the same filesystem with different options") Link: https://lore.kernel.org/all/20240604112636.236517-3-lilingfeng@huaweicloud.com/ Signed-off-by: Li Lingfeng Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 430733e3eff2..6bcc4b0e00ab 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -12,7 +12,7 @@ #include #include -#define NFS_SB_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) +#define NFS_SB_MASK (SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) extern const struct export_operations nfs_export_ops; -- 2.51.0 From 4db9ad82a6c823094da27de4825af693a3475d51 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Fri, 15 Nov 2024 17:38:04 +0800 Subject: [PATCH 15/16] sunrpc: clear XPRT_SOCK_UPD_TIMEOUT when reset transport Since transport->sock has been set to NULL during reset transport, XPRT_SOCK_UPD_TIMEOUT also needs to be cleared. Otherwise, the xs_tcp_set_socket_timeouts() may be triggered in xs_tcp_send_request() to dereference the transport->sock that has been set to NULL. Fixes: 7196dbb02ea0 ("SUNRPC: Allow changing of the TCP timeout parameters on the fly") Signed-off-by: Li Lingfeng Signed-off-by: Liu Jian Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index d587c261d999..31bc046e2850 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1198,6 +1198,7 @@ static void xs_sock_reset_state_flags(struct rpc_xprt *xprt) clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state); clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state); clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state); + clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state); } static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr) -- 2.51.0 From d7bdd849ef1b681da03ac05ca0957b2cbe2d24b6 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 15 Nov 2024 08:59:36 -0500 Subject: [PATCH 16/16] SUNRPC: timeout and cancel TLS handshake with -ETIMEDOUT We've noticed a situation where an unstable TCP connection can cause the TLS handshake to timeout waiting for userspace to complete it. When this happens, we don't want to return from xs_tls_handshake_sync() with zero, as this will cause the upper xprt to be set CONNECTED, and subsequent attempts to transmit will be returned with -EPIPE. The sunrpc machine does not recover from this situation and will spin attempting to transmit. The return value of tls_handshake_cancel() can be used to detect a race with completion: * tls_handshake_cancel - cancel a pending handshake * Return values: * %true - Uncompleted handshake request was canceled * %false - Handshake request already completed or not found If true, we do not want the upper xprt to be connected, so return -ETIMEDOUT. If false, its possible the handshake request was lost and that may be the reason for our timeout. Again we do not want the upper xprt to be connected, so return -ETIMEDOUT. Ensure that we alway return an error from xs_tls_handshake_sync() if we call tls_handshake_cancel(). Signed-off-by: Benjamin Coddington Reviewed-by: Chuck Lever Fixes: 75eb6af7acdf ("SUNRPC: Add a TCP-with-TLS RPC transport class") Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 31bc046e2850..fecc8b8fa266 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2616,11 +2616,10 @@ static int xs_tls_handshake_sync(struct rpc_xprt *lower_xprt, struct xprtsec_par rc = wait_for_completion_interruptible_timeout(&lower_transport->handshake_done, XS_TLS_HANDSHAKE_TO); if (rc <= 0) { - if (!tls_handshake_cancel(sk)) { - if (rc == 0) - rc = -ETIMEDOUT; - goto out_put_xprt; - } + tls_handshake_cancel(sk); + if (rc == 0) + rc = -ETIMEDOUT; + goto out_put_xprt; } rc = lower_transport->xprt_err; -- 2.51.0