From 04c8abae1b7b2abeb638a3d5d5950fa2a031c244 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Thu, 29 Aug 2024 11:20:49 -0700 Subject: [PATCH 01/16] dcache: keep dentry_hashtable or d_hash_shift even when not used The runtime constant feature removes all the users of these variables, allowing the compiler to optimize them away. It's quite difficult to extract their values from the kernel text, and the memory saved by removing them is tiny, and it was never the point of this optimization. Since the dentry_hashtable is a core data structure, it's valuable for debugging tools to be able to read it easily. For instance, scripts built on drgn, like the dentrycache script[1], rely on it to be able to perform diagnostics on the contents of the dcache. Annotate it as used, so the compiler doesn't discard it. Link: https://github.com/oracle-samples/drgn-tools/blob/3afc56146f54d09dfd1f6d3c1b7436eda7e638be/drgn_tools/dentry.py#L325-L355 [1] Fixes: e3c92e81711d ("runtime constants: add x86 architecture support") Signed-off-by: Stephen Brennan Signed-off-by: Linus Torvalds --- fs/dcache.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 3d8daaecb6d1..6386b9b625dd 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -96,11 +96,16 @@ EXPORT_SYMBOL(dotdot_name); * * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. + * + * Marking the variables "used" ensures that the compiler doesn't + * optimize them away completely on architectures with runtime + * constant infrastructure, this allows debuggers to see their + * values. But updating these values has no effect on those arches. */ -static unsigned int d_hash_shift __ro_after_init; +static unsigned int d_hash_shift __ro_after_init __used; -static struct hlist_bl_head *dentry_hashtable __ro_after_init; +static struct hlist_bl_head *dentry_hashtable __ro_after_init __used; static inline struct hlist_bl_head *d_hash(unsigned long hashlen) { -- 2.51.0 From 1c47c0d6014c832ad8e2ba04fc2c5b7070d999f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 Aug 2024 09:42:33 -0600 Subject: [PATCH 02/16] io_uring/rsrc: ensure compat iovecs are copied correctly For buffer registration (or updates), a userspace iovec is copied in and updated. If the application is within a compat syscall, then the iovec type is compat_iovec rather than iovec. However, the type used in __io_sqe_buffers_update() and io_sqe_buffers_register() is always struct iovec, and hence the source is incremented by the size of a non-compat iovec in the loop. This misses every other iovec in the source, and will run into garbage half way through the copies and return -EFAULT to the application. Maintain the source address separately and assign to our user vec pointer, so that copies always happen from the right source address. 
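To make the stride mismatch concrete, here is a minimal userspace sketch; the struct layouts below are simplified stand-ins rather than the kernel definitions, and this is not the io_uring code itself. With the native 16-byte iovec stride, entry i of a compat array is read from the offset where compat entry 2*i actually lives, so every other source entry is skipped.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct native_iovec {		/* 16 bytes on a 64-bit kernel */
	void	*iov_base;
	size_t	 iov_len;
};

struct compat_iovec32 {		/* 8 bytes: 32-bit pointer and length */
	uint32_t iov_base;
	uint32_t iov_len;
};

int main(void)
{
	uintptr_t base = 0x1000;	/* pretend user address of the iovec array */

	for (int i = 0; i < 4; i++) {
		uintptr_t wrong = base + i * sizeof(struct native_iovec);
		uintptr_t right = base + i * sizeof(struct compat_iovec32);

		/* The wrong stride lands on compat entry 2*i, skipping the rest. */
		printf("entry %d: native-stride addr 0x%lx, compat-stride addr 0x%lx\n",
		       i, (unsigned long)wrong, (unsigned long)right);
	}
	return 0;
}

This is why the patch advances the saved user address by sizeof(struct compat_iovec) or sizeof(struct iovec) depending on ctx->compat, instead of indexing a struct iovec array.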
While in there, correct a bad placement of __user which triggered the following sparse warning prior to this fix: io_uring/rsrc.c:981:33: warning: cast removes address space '__user' of expression io_uring/rsrc.c:981:30: warning: incorrect type in assignment (different address spaces) io_uring/rsrc.c:981:30: expected struct iovec const [noderef] __user *uvec io_uring/rsrc.c:981:30: got struct iovec *[noderef] __user Fixes: f4eaf8eda89e ("io_uring/rsrc: Drop io_copy_iov in favor of iovec API") Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index a860516bf448..453867add7ca 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -394,10 +394,11 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned int nr_args) { - struct iovec __user *uvec = u64_to_user_ptr(up->data); u64 __user *tags = u64_to_user_ptr(up->tags); struct iovec fast_iov, *iov; struct page *last_hpage = NULL; + struct iovec __user *uvec; + u64 user_data = up->data; __u32 done; int i, err; @@ -410,7 +411,8 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu; u64 tag = 0; - iov = iovec_from_user(&uvec[done], 1, 1, &fast_iov, ctx->compat); + uvec = u64_to_user_ptr(user_data); + iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); if (IS_ERR(iov)) { err = PTR_ERR(iov); break; @@ -443,6 +445,10 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, ctx->user_bufs[i] = imu; *io_get_tag_slot(ctx->buf_data, i) = tag; + if (ctx->compat) + user_data += sizeof(struct compat_iovec); + else + user_data += sizeof(struct iovec); } return done ? done : err; } @@ -949,7 +955,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, struct page *last_hpage = NULL; struct io_rsrc_data *data; struct iovec fast_iov, *iov = &fast_iov; - const struct iovec __user *uvec = (struct iovec * __user) arg; + const struct iovec __user *uvec; int i, ret; BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); @@ -972,7 +978,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { if (arg) { - iov = iovec_from_user(&uvec[i], 1, 1, &fast_iov, ctx->compat); + uvec = (struct iovec __user *) arg; + iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); if (IS_ERR(iov)) { ret = PTR_ERR(iov); break; @@ -980,6 +987,10 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, ret = io_buffer_validate(iov); if (ret) break; + if (ctx->compat) + arg += sizeof(struct compat_iovec); + else + arg += sizeof(struct iovec); } if (!iov->iov_base && *io_get_tag_slot(data, i)) { -- 2.51.0 From 40927f3d0972bf86357a32a5749be71a551241b6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 29 Aug 2024 09:06:28 +1000 Subject: [PATCH 03/16] nfsd: fix nfsd4_deleg_getattr_conflict in presence of third party lease It is not safe to dereference fl->c.flc_owner without first confirming fl->fl_lmops is the expected manager. nfsd4_deleg_getattr_conflict() tests fl_lmops but largely ignores the result and assumes that flc_owner is an nfs4_delegation anyway. This is wrong. With this patch we restore the "!= &nfsd_lease_mng_ops" case to behave as it did before the change mentioned below. This is the same as the current code, but without any reference to a possible delegation. 
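The rule the fix restores can be illustrated with a self-contained userspace analogue; the names below are stand-ins for fl_lmops/flc_owner and are not the nfsd code: only interpret the owner pointer once the lock manager ops confirm the lease is really ours.

#include <stdio.h>

struct lock_manager_ops_like { const char *name; };

static const struct lock_manager_ops_like nfsd_ops_like  = { "nfsd" };
static const struct lock_manager_ops_like other_ops_like = { "other" };

struct delegation_like { int stateid; };

struct file_lock_like {
	const struct lock_manager_ops_like *lmops;
	void *owner;		/* meaning depends entirely on lmops */
};

/* Only treat ->owner as a delegation when the ops say we set the lease up. */
static struct delegation_like *owner_as_delegation(const struct file_lock_like *fl)
{
	if (fl->lmops != &nfsd_ops_like)
		return NULL;	/* third-party lease: never dereference owner */
	return fl->owner;
}

int main(void)
{
	struct delegation_like dp = { .stateid = 42 };
	struct file_lock_like ours   = { &nfsd_ops_like,  &dp };
	struct file_lock_like theirs = { &other_ops_like, (void *)0x1 };

	printf("ours -> %p, theirs -> %p\n",
	       (void *)owner_as_delegation(&ours),
	       (void *)owner_as_delegation(&theirs));
	return 0;
}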
Fixes: c5967721e106 ("NFSD: handle GETATTR conflict with write delegation") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 07f2496850c4..a366fb1c1b9b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -8859,7 +8859,15 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, */ if (type == F_RDLCK) break; - goto break_lease; + + nfsd_stats_wdeleg_getattr_inc(nn); + spin_unlock(&ctx->flc_lock); + + status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); + if (status != nfserr_jukebox || + !nfsd_wait_for_delegreturn(rqstp, inode)) + return status; + return 0; } if (type == F_WRLCK) { struct nfs4_delegation *dp = fl->c.flc_owner; @@ -8868,7 +8876,6 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, spin_unlock(&ctx->flc_lock); return 0; } -break_lease: nfsd_stats_wdeleg_getattr_inc(nn); dp = fl->c.flc_owner; refcount_inc(&dp->dl_stid.sc_count); -- 2.51.0 From f274495aea7b15225b3d83837121b22ef96e560c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 30 Aug 2024 10:45:54 -0600 Subject: [PATCH 04/16] io_uring/kbuf: return correct iovec count from classic buffer peek io_provided_buffers_select() returns 0 to indicate success, but it should be returning 1 to indicate that 1 vec was mapped. This causes peeking to fail with classic provided buffers, and while that's not a use case that anyone should use, it should still work correctly. The end result is that no buffer will be selected, and hence a completion with '0' as the result will be posted, without a buffer attached. Fixes: 35c8711c8fc4 ("io_uring/kbuf: add helpers for getting/peeking multiple buffers") Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 1af2bd56af44..bdfa30b38321 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -129,7 +129,7 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len, iov[0].iov_base = buf; iov[0].iov_len = *len; - return 0; + return 1; } static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br, -- 2.51.0 From 150b572a7c1df30f5d32d87ad96675200cca7b80 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 26 Aug 2024 16:27:39 -0400 Subject: [PATCH 05/16] MAINTAINERS: PCI: Add NXP PCI controller mailing list imx@lists.linux.dev Add imx mailing list imx@lists.linux.dev for PCI controller of NXP chips (Layerscape and iMX). 
Link: https://lore.kernel.org/r/20240826202740.970015-1-Frank.Li@nxp.com Signed-off-by: Frank Li Signed-off-by: Bjorn Helgaas Acked-by: Richard Zhu --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 3fb27f41515d..1b7a6a8073bb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17412,6 +17412,7 @@ M: Roy Zang L: linuxppc-dev@lists.ozlabs.org L: linux-pci@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) +L: imx@lists.linux.dev S: Maintained F: drivers/pci/controller/dwc/*layerscape* @@ -17438,6 +17439,7 @@ M: Richard Zhu M: Lucas Stach L: linux-pci@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) +L: imx@lists.linux.dev S: Maintained F: Documentation/devicetree/bindings/pci/fsl,imx6q-pcie-common.yaml F: Documentation/devicetree/bindings/pci/fsl,imx6q-pcie-ep.yaml -- 2.51.0 From d8b762070c3fde224f8b9ea3cf59bc41a5a3eb57 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 23 Aug 2024 13:55:00 +0200 Subject: [PATCH 06/16] power: sequencing: qcom-wcn: set the wlan-enable GPIO to output Commit a9aaf1ff88a8 ("power: sequencing: request the WLAN enable GPIO as-is") broke WLAN on boards on which the wlan-enable GPIO enabling the wifi module isn't in output mode by default. We need to set direction to output while retaining the value that was already set to keep the ath module on if it's already started. Fixes: a9aaf1ff88a8 ("power: sequencing: request the WLAN enable GPIO as-is") Link: https://lore.kernel.org/r/20240823115500.37280-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- drivers/power/sequencing/pwrseq-qcom-wcn.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/power/sequencing/pwrseq-qcom-wcn.c b/drivers/power/sequencing/pwrseq-qcom-wcn.c index d786cbf1b2cd..700879474abf 100644 --- a/drivers/power/sequencing/pwrseq-qcom-wcn.c +++ b/drivers/power/sequencing/pwrseq-qcom-wcn.c @@ -288,6 +288,13 @@ static int pwrseq_qcom_wcn_probe(struct platform_device *pdev) return dev_err_probe(dev, PTR_ERR(ctx->wlan_gpio), "Failed to get the WLAN enable GPIO\n"); + /* + * Set direction to output but keep the current value in order to not + * disable the WLAN module accidentally if it's already powered on. + */ + gpiod_direction_output(ctx->wlan_gpio, + gpiod_get_value_cansleep(ctx->wlan_gpio)); + ctx->clk = devm_clk_get_optional(dev, NULL); if (IS_ERR(ctx->clk)) return dev_err_probe(dev, PTR_ERR(ctx->clk), -- 2.51.0 From e3e6940940910c2287fe962bdf72015efd4fee81 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 31 Aug 2024 17:44:51 -0400 Subject: [PATCH 07/16] bcachefs: Revert lockless buffered IO path We had a report of data corruption on nixos when building installer images. https://github.com/NixOS/nixpkgs/pull/321055#issuecomment-2184131334 It seems that writes are being dropped, but only when issued by QEMU, and possibly only in snapshot mode. It's undetermined if it's write calls are being dropped or dirty folios. Further testing, via minimizing the original patch to just the change that skips the inode lock on non appends/truncates, reveals that it really is just not taking the inode lock that causes the corruption: it has nothing to do with the other logic changes for preserving write atomicity in corner cases. It's also kernel config dependent: it doesn't reproduce with the minimal kernel config that ktest uses, but it does reproduce with nixos's distro config. 
Bisecting the kernel config initially pointed the finger at page migration or compaction, but it appears that was erroneous; we haven't yet determined what kernel config option actually triggers it. Sadly it appears this will have to be reverted since we're getting too close to release and my plate is full, but we'd _really_ like to fully debug it. My suspicion is that this patch is exposing a preexisting bug - the inode lock actually covers very little in IO paths, and we have a different lock (the pagecache add lock) that guards against races with truncate here. Fixes: 7e64c86cdc6c ("bcachefs: Buffered write path now can avoid the inode lock") Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 1 - fs/bcachefs/fs-io-buffered.c | 149 ++++++++++------------------------- 2 files changed, 40 insertions(+), 110 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index ab5a7adece10..742dcdd3e5d7 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -257,7 +257,6 @@ x(BCH_ERR_nopromote, nopromote_in_flight) \ x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) \ - x(0, need_inode_lock) \ x(0, invalid_snapshot_node) \ x(0, option_needs_open_fs) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 184d03851676..ec8c427bf588 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -802,8 +802,7 @@ static noinline void folios_trunc(folios *fs, struct folio **fi) static int __bch2_buffered_write(struct bch_inode_info *inode, struct address_space *mapping, struct iov_iter *iter, - loff_t pos, unsigned len, - bool inode_locked) + loff_t pos, unsigned len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation res; @@ -827,15 +826,6 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, BUG_ON(!fs.nr); - /* - * If we're not using the inode lock, we need to lock all the folios for - * atomiticity of writes vs. other writes: - */ - if (!inode_locked && folio_end_pos(darray_last(fs)) < end) { - ret = -BCH_ERR_need_inode_lock; - goto out; - } - f = darray_first(fs); if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ret = bch2_read_single_folio(f, mapping); @@ -932,10 +922,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, end = pos + copied; spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) { - BUG_ON(!inode_locked); + if (end > inode->v.i_size) i_size_write(&inode->v, end); - } spin_unlock(&inode->v.i_lock); f_pos = pos; @@ -979,68 +967,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); - loff_t pos; - bool inode_locked = false; - ssize_t written = 0, written2 = 0, ret = 0; - - /* - * We don't take the inode lock unless i_size will be changing. Folio - * locks provide exclusion with other writes, and the pagecache add lock - * provides exclusion with truncate and hole punching. - * - * There is one nasty corner case where atomicity would be broken - * without great care: when copying data from userspace to the page - * cache, we do that with faults disable - a page fault would recurse - * back into the filesystem, taking filesystem locks again, and - * deadlock; so it's done with faults disabled, and we fault in the user - * buffer when we aren't holding locks.
- * - * If we do part of the write, but we then race and in the userspace - * buffer have been evicted and are no longer resident, then we have to - * drop our folio locks to re-fault them in, breaking write atomicity. - * - * To fix this, we restart the write from the start, if we weren't - * holding the inode lock. - * - * There is another wrinkle after that; if we restart the write from the - * start, and then get an unrecoverable error, we _cannot_ claim to - * userspace that we did not write data we actually did - so we must - * track (written2) the most we ever wrote. - */ - - if ((iocb->ki_flags & IOCB_APPEND) || - (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) { - inode_lock(&inode->v); - inode_locked = true; - } - - ret = generic_write_checks(iocb, iter); - if (ret <= 0) - goto unlock; - - ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0); - if (ret) { - if (!inode_locked) { - inode_lock(&inode->v); - inode_locked = true; - ret = file_remove_privs_flags(file, 0); - } - if (ret) - goto unlock; - } - - ret = file_update_time(file); - if (ret) - goto unlock; - - pos = iocb->ki_pos; + loff_t pos = iocb->ki_pos; + ssize_t written = 0; + int ret = 0; bch2_pagecache_add_get(inode); - if (!inode_locked && - (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) - goto get_inode_lock; - do { unsigned offset = pos & (PAGE_SIZE - 1); unsigned bytes = iov_iter_count(iter); @@ -1065,17 +997,12 @@ again: } } - if (unlikely(bytes != iov_iter_count(iter) && !inode_locked)) - goto get_inode_lock; - if (unlikely(fatal_signal_pending(current))) { ret = -EINTR; break; } - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked); - if (ret == -BCH_ERR_need_inode_lock) - goto get_inode_lock; + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); if (unlikely(ret < 0)) break; @@ -1096,46 +1023,50 @@ again: } pos += ret; written += ret; - written2 = max(written, written2); - - if (ret != bytes && !inode_locked) - goto get_inode_lock; ret = 0; balance_dirty_pages_ratelimited(mapping); - - if (0) { -get_inode_lock: - bch2_pagecache_add_put(inode); - inode_lock(&inode->v); - inode_locked = true; - bch2_pagecache_add_get(inode); - - iov_iter_revert(iter, written); - pos -= written; - written = 0; - ret = 0; - } } while (iov_iter_count(iter)); - bch2_pagecache_add_put(inode); -unlock: - if (inode_locked) - inode_unlock(&inode->v); - iocb->ki_pos += written; + bch2_pagecache_add_put(inode); - ret = max(written, written2) ?: ret; - if (ret > 0) - ret = generic_write_sync(iocb, ret); - return ret; + return written ? written : ret; } -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter) +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) { - ssize_t ret = iocb->ki_flags & IOCB_DIRECT - ? 
bch2_direct_write(iocb, iter) - : bch2_buffered_write(iocb, iter); + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = bch2_direct_write(iocb, from); + goto out; + } + + inode_lock(&inode->v); + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + + ret = file_remove_privs(file); + if (ret) + goto unlock; + + ret = file_update_time(file); + if (ret) + goto unlock; + + ret = bch2_buffered_write(iocb, from); + if (likely(ret > 0)) + iocb->ki_pos += ret; +unlock: + inode_unlock(&inode->v); + if (ret > 0) + ret = generic_write_sync(iocb, ret); +out: return bch2_err_class(ret); } -- 2.51.0 From 3d3020c461936009dc58702e267ff67b0076cbf2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2024 11:47:32 -0400 Subject: [PATCH 08/16] bcachefs: Mark more errors as autofix errors that are known to always be safe to fix should be autofix: this should be most errors even at this point, but that will need some thorough review. note that errors are still logged in the superblock, so we'll still know that they happened. Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index d3a498617303..f0c14702f9e6 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -23,7 +23,7 @@ enum bch_fsck_flags { x(jset_past_bucket_end, 9, 0) \ x(jset_seq_blacklisted, 10, 0) \ x(journal_entries_missing, 11, 0) \ - x(journal_entry_replicas_not_marked, 12, 0) \ + x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \ x(journal_entry_past_jset_end, 13, 0) \ x(journal_entry_replicas_data_mismatch, 14, 0) \ x(journal_entry_bkey_u64s_0, 15, 0) \ @@ -288,10 +288,10 @@ enum bch_fsck_flags { x(invalid_btree_id, 274, 0) \ x(alloc_key_io_time_bad, 275, 0) \ x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ - x(accounting_key_junk_at_end, 277, 0) \ - x(accounting_key_replicas_nr_devs_0, 278, 0) \ - x(accounting_key_replicas_nr_required_bad, 279, 0) \ - x(accounting_key_replicas_devs_unsorted, 280, 0) \ + x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \ + x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \ + x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ + x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, -- 2.51.0 From 431c1646e1f86b949fa3685efc50b660a364c2b6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Sep 2024 19:46:02 +1200 Subject: [PATCH 09/16] Linux 6.11-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7b60eb103c5d..d57cfc6896b8 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 11 SUBLEVEL = 0 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.51.0 From 8e6e2ffa6569a205f1805cbaeca143b556581da6 Mon Sep 17 00:00:00 2001 From: Youzhong Yang Date: Wed, 10 Jul 2024 10:40:35 -0400 Subject: [PATCH 10/16] nfsd: add list_head nf_gc to struct nfsd_file nfsd_file_put() in one thread can race with another thread doing garbage collection (running nfsd_file_gc() -> list_lru_walk() -> nfsd_file_lru_cb()): * In nfsd_file_put(), nf->nf_ref is 1, so it tries to do nfsd_file_lru_add(). 
* nfsd_file_lru_add() returns true (with NFSD_FILE_REFERENCED bit set) * garbage collector kicks in, nfsd_file_lru_cb() clears REFERENCED bit and returns LRU_ROTATE. * garbage collector kicks in again, nfsd_file_lru_cb() now decrements nf->nf_ref to 0, runs nfsd_file_unhash(), removes it from the LRU and adds to the dispose list [list_lru_isolate_move(lru, &nf->nf_lru, head)] * nfsd_file_put() detects NFSD_FILE_HASHED bit is cleared, so it tries to remove the 'nf' from the LRU [if (!nfsd_file_lru_remove(nf))]. The 'nf' has been added to the 'dispose' list by nfsd_file_lru_cb(), so nfsd_file_lru_remove(nf) simply treats it as part of the LRU and removes it, which leads to its removal from the 'dispose' list. * At this moment, 'nf' is unhashed with its nf_ref being 0, and not on the LRU. nfsd_file_put() continues its execution [if (refcount_dec_and_test(&nf->nf_ref))], as nf->nf_ref is already 0, nf->nf_ref is set to REFCOUNT_SATURATED, and the 'nf' gets no chance of being freed. nfsd_file_put() can also race with nfsd_file_cond_queue(): * In nfsd_file_put(), nf->nf_ref is 1, so it tries to do nfsd_file_lru_add(). * nfsd_file_lru_add() sets REFERENCED bit and returns true. * Some userland application runs 'exportfs -f' or something like that, which triggers __nfsd_file_cache_purge() -> nfsd_file_cond_queue(). * In nfsd_file_cond_queue(), it runs [if (!nfsd_file_unhash(nf))], unhash is done successfully. * nfsd_file_cond_queue() runs [if (!nfsd_file_get(nf))], now nf->nf_ref goes to 2. * nfsd_file_cond_queue() runs [if (nfsd_file_lru_remove(nf))], it succeeds. * nfsd_file_cond_queue() runs [if (refcount_sub_and_test(decrement, &nf->nf_ref))] (with "decrement" being 2), so the nf->nf_ref goes to 0, the 'nf' is added to the dispose list [list_add(&nf->nf_lru, dispose)] * nfsd_file_put() detects NFSD_FILE_HASHED bit is cleared, so it tries to remove the 'nf' from the LRU [if (!nfsd_file_lru_remove(nf))], although the 'nf' is not in the LRU, but it is linked in the 'dispose' list, nfsd_file_lru_remove() simply treats it as part of the LRU and removes it. This leads to its removal from the 'dispose' list! * Now nf->ref is 0, unhashed. nfsd_file_put() continues its execution and set nf->nf_ref to REFCOUNT_SATURATED. As shown in the above analysis, using nf_lru for both the LRU list and dispose list can cause the leaks. This patch adds a new list_head nf_gc in struct nfsd_file, and uses it for the dispose list. This does not fix the nfsd_file leaking issue completely. 
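A minimal userspace sketch of the structural fix, with stand-in list helpers approximating <linux/list.h> (assumptions, not the kernel implementation): giving each file object separate links for the LRU and the dispose list means removal from one list can no longer unlink it from the other.

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void init_list_head(struct list_head *h) { h->next = h->prev = h; }

static void list_add_head(struct list_head *n, struct list_head *h)
{
	n->next = h->next;
	n->prev = h;
	h->next->prev = n;
	h->next = n;
}

static void list_del_init(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	init_list_head(n);
}

static int list_empty(const struct list_head *h) { return h->next == h; }

struct nfsd_file_like {
	struct list_head nf_lru;	/* LRU membership only */
	struct list_head nf_gc;		/* dispose-list membership only */
	int id;
};

int main(void)
{
	struct list_head lru, dispose;
	struct nfsd_file_like nf = { .id = 1 };

	init_list_head(&lru);
	init_list_head(&dispose);
	init_list_head(&nf.nf_lru);
	init_list_head(&nf.nf_gc);

	list_add_head(&nf.nf_lru, &lru);	/* on the LRU */
	list_add_head(&nf.nf_gc, &dispose);	/* queued for disposal */

	/* Taking it off the LRU no longer disturbs the dispose list. */
	list_del_init(&nf.nf_lru);
	printf("dispose list still holds the file: %s\n",
	       list_empty(&dispose) ? "no" : "yes");
	return 0;
}

With a single shared nf_lru link, the final list_del_init() in the races above silently emptied the dispose list as well, which is exactly how the entries leaked.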
Signed-off-by: Youzhong Yang Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 18 ++++++++++-------- fs/nfsd/filecache.h | 1 + 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index f4704f5d4086..01d76f6bc80a 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -216,6 +216,7 @@ nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need, return NULL; INIT_LIST_HEAD(&nf->nf_lru); + INIT_LIST_HEAD(&nf->nf_gc); nf->nf_birthtime = ktime_get(); nf->nf_file = NULL; nf->nf_cred = get_current_cred(); @@ -393,8 +394,8 @@ nfsd_file_dispose_list(struct list_head *dispose) struct nfsd_file *nf; while (!list_empty(dispose)) { - nf = list_first_entry(dispose, struct nfsd_file, nf_lru); - list_del_init(&nf->nf_lru); + nf = list_first_entry(dispose, struct nfsd_file, nf_gc); + list_del_init(&nf->nf_gc); nfsd_file_free(nf); } } @@ -411,12 +412,12 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose) { while(!list_empty(dispose)) { struct nfsd_file *nf = list_first_entry(dispose, - struct nfsd_file, nf_lru); + struct nfsd_file, nf_gc); struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id); struct nfsd_fcache_disposal *l = nn->fcache_disposal; spin_lock(&l->lock); - list_move_tail(&nf->nf_lru, &l->freeme); + list_move_tail(&nf->nf_gc, &l->freeme); spin_unlock(&l->lock); svc_wake_up(nn->nfsd_serv); } @@ -503,7 +504,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, /* Refcount went to zero. Unhash it and queue it to the dispose list */ nfsd_file_unhash(nf); - list_lru_isolate_move(lru, &nf->nf_lru, head); + list_lru_isolate(lru, &nf->nf_lru); + list_add(&nf->nf_gc, head); this_cpu_inc(nfsd_file_evictions); trace_nfsd_file_gc_disposed(nf); return LRU_REMOVED; @@ -578,7 +580,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose) /* If refcount goes to 0, then put on the dispose list */ if (refcount_sub_and_test(decrement, &nf->nf_ref)) { - list_add(&nf->nf_lru, dispose); + list_add(&nf->nf_gc, dispose); trace_nfsd_file_closing(nf); } } @@ -654,8 +656,8 @@ nfsd_file_close_inode_sync(struct inode *inode) nfsd_file_queue_for_close(inode, &dispose); while (!list_empty(&dispose)) { - nf = list_first_entry(&dispose, struct nfsd_file, nf_lru); - list_del_init(&nf->nf_lru); + nf = list_first_entry(&dispose, struct nfsd_file, nf_gc); + list_del_init(&nf->nf_gc); nfsd_file_free(nf); } } diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index c61884def906..3fbec24eea6c 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -44,6 +44,7 @@ struct nfsd_file { struct nfsd_file_mark *nf_mark; struct list_head nf_lru; + struct list_head nf_gc; struct rcu_head nf_rcu; ktime_t nf_birthtime; }; -- 2.51.0 From 81a95c2b1d605743220f28db04b8da13a65c4059 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 11 Jul 2024 15:11:13 -0400 Subject: [PATCH 11/16] nfsd: remove unneeded EEXIST error check in nfsd_do_file_acquire Given that we do the search and insertion while holding the i_lock, I don't think it's possible for us to get EEXIST here. Remove this case. 
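A userspace sketch of the underlying argument (hypothetical table and names, not the nfsd code): when the lookup and the insertion share one critical section, no other thread can insert the same key in between, so an "already exists" outcome from the insert step is unreachable.

#include <pthread.h>
#include <stdio.h>

#define TABLE_SIZE 8

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static int table[TABLE_SIZE];		/* 0 means "slot empty" */

/* Search and insert under one lock: a concurrent caller can never slip the
 * same key in between the two steps. */
static int lookup_or_insert(int key)
{
	int slot = -1;

	pthread_mutex_lock(&table_lock);
	for (int i = 0; i < TABLE_SIZE; i++) {
		if (table[i] == key) {		/* found an existing entry */
			slot = i;
			break;
		}
		if (table[i] == 0 && slot < 0)	/* remember first free slot */
			slot = i;
	}
	if (slot >= 0 && table[slot] != key)
		table[slot] = key;		/* insert; cannot race with the lookup */
	pthread_mutex_unlock(&table_lock);
	return slot;
}

int main(void)
{
	printf("first insert of 42:  slot %d\n", lookup_or_insert(42));
	printf("second lookup of 42: slot %d\n", lookup_or_insert(42));
	return 0;
}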
Fixes: c6593366c0bf ("nfsd: don't kill nfsd_files because of lease break error") Signed-off-by: Jeff Layton Tested-by: Youzhong Yang Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 01d76f6bc80a..9d7bb65b0746 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1037,8 +1037,6 @@ retry: if (likely(ret == 0)) goto open_file; - if (ret == -EEXIST) - goto retry; trace_nfsd_file_insert_err(rqstp, inode, may_flags, ret); status = nfserr_jukebox; goto construction_err; -- 2.51.0 From 8a7926176378460e0d91e02b03f0ff20a8709a60 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Jul 2024 09:05:32 -0400 Subject: [PATCH 12/16] nfsd: fix refcount leak when file is unhashed after being found If we wait_for_construction and find that the file is no longer hashed, and we're going to retry the open, the old nfsd_file reference is currently leaked. Put the reference before retrying. Fixes: c6593366c0bf ("nfsd: don't kill nfsd_files because of lease break error") Signed-off-by: Jeff Layton Tested-by: Youzhong Yang Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 9d7bb65b0746..6af4e6027227 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1051,6 +1051,7 @@ wait_for_construction: status = nfserr_jukebox; goto construction_err; } + nfsd_file_put(nf); open_retry = false; fh_put(fhp); goto retry; -- 2.51.0 From 700bb4ff912f954345286e065ff145753a1d5bbe Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Jul 2024 09:05:33 -0400 Subject: [PATCH 13/16] nfsd: count nfsd_file allocations We already count the frees (via nfsd_file_releases). Count the allocations as well. Also switch the direct call to nfsd_file_slab_free in nfsd_file_do_acquire to nfsd_file_free, so that the allocs and releases match up. 
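A rough userspace sketch of the per-CPU counter pattern this follows; the fixed-size array and NR_CPUS_LIKE stand in for DEFINE_PER_CPU()/this_cpu_inc()/for_each_possible_cpu(), so this is an approximation rather than the kernel API. Increments stay cheap and contention-free on the hot path, and the slots are only summed when the stats file is read.

#include <stdio.h>

#define NR_CPUS_LIKE 4

static unsigned long nfsd_file_allocations[NR_CPUS_LIKE];

/* Hot path: bump only this CPU's slot (this_cpu_inc() analogue). */
static void count_allocation(int cpu)
{
	nfsd_file_allocations[cpu]++;
}

/* Stats path: sum every possible CPU's slot (for_each_possible_cpu() analogue). */
static unsigned long total_allocations(void)
{
	unsigned long sum = 0;

	for (int i = 0; i < NR_CPUS_LIKE; i++)
		sum += nfsd_file_allocations[i];
	return sum;
}

int main(void)
{
	count_allocation(0);
	count_allocation(2);
	count_allocation(2);
	printf("allocations: %lu\n", total_allocations());
	return 0;
}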
Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 6af4e6027227..93e2ffa5eae6 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -56,6 +56,7 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); +static DEFINE_PER_CPU(unsigned long, nfsd_file_allocations); static DEFINE_PER_CPU(unsigned long, nfsd_file_releases); static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age); static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions); @@ -215,6 +216,7 @@ nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need, if (unlikely(!nf)) return NULL; + this_cpu_inc(nfsd_file_allocations); INIT_LIST_HEAD(&nf->nf_lru); INIT_LIST_HEAD(&nf->nf_gc); nf->nf_birthtime = ktime_get(); @@ -911,6 +913,7 @@ nfsd_file_cache_shutdown(void) for_each_possible_cpu(i) { per_cpu(nfsd_file_cache_hits, i) = 0; per_cpu(nfsd_file_acquisitions, i) = 0; + per_cpu(nfsd_file_allocations, i) = 0; per_cpu(nfsd_file_releases, i) = 0; per_cpu(nfsd_file_total_age, i) = 0; per_cpu(nfsd_file_evictions, i) = 0; @@ -1026,7 +1029,7 @@ retry: if (unlikely(nf)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); - nfsd_file_slab_free(&new->nf_rcu); + nfsd_file_free(new); goto wait_for_construction; } nf = new; @@ -1200,7 +1203,7 @@ nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp, */ int nfsd_file_cache_stats_show(struct seq_file *m, void *v) { - unsigned long releases = 0, evictions = 0; + unsigned long allocations = 0, releases = 0, evictions = 0; unsigned long hits = 0, acquisitions = 0; unsigned int i, count = 0, buckets = 0; unsigned long lru = 0, total_age = 0; @@ -1225,6 +1228,7 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v) for_each_possible_cpu(i) { hits += per_cpu(nfsd_file_cache_hits, i); acquisitions += per_cpu(nfsd_file_acquisitions, i); + allocations += per_cpu(nfsd_file_allocations, i); releases += per_cpu(nfsd_file_releases, i); total_age += per_cpu(nfsd_file_total_age, i); evictions += per_cpu(nfsd_file_evictions, i); @@ -1235,6 +1239,7 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "lru entries: %lu\n", lru); seq_printf(m, "cache hits: %lu\n", hits); seq_printf(m, "acquisitions: %lu\n", acquisitions); + seq_printf(m, "allocations: %lu\n", allocations); seq_printf(m, "releases: %lu\n", releases); seq_printf(m, "evictions: %lu\n", evictions); if (releases) -- 2.51.0 From 4b84551a35e36bbed48850dc870191a13f0841fd Mon Sep 17 00:00:00 2001 From: Youzhong Yang Date: Thu, 11 Jul 2024 11:51:33 -0400 Subject: [PATCH 14/16] nfsd: use system_unbound_wq for nfsd_file_gc_worker() After many rounds of changes in filecache.c, the fix by commit ce7df055(NFSD: Make the file_delayed_close workqueue UNBOUND) is gone, now we are getting syslog messages like these: [ 1618.186688] workqueue: nfsd_file_gc_worker [nfsd] hogged CPU for >13333us 4 times, consider switching to WQ_UNBOUND [ 1638.661616] workqueue: nfsd_file_gc_worker [nfsd] hogged CPU for >13333us 8 times, consider switching to WQ_UNBOUND [ 1665.284542] workqueue: nfsd_file_gc_worker [nfsd] hogged CPU for >13333us 16 times, consider switching to WQ_UNBOUND [ 1759.491342] workqueue: nfsd_file_gc_worker [nfsd] hogged CPU for >13333us 32 times, consider switching to WQ_UNBOUND [ 3013.012308] workqueue: nfsd_file_gc_worker [nfsd] hogged CPU for >13333us 64 times, consider switching to 
WQ_UNBOUND [ 3154.172827] workqueue: nfsd_file_gc_worker [nfsd] hogged CPU for >13333us 128 times, consider switching to WQ_UNBOUND [ 3422.461924] workqueue: nfsd_file_gc_worker [nfsd] hogged CPU for >13333us 256 times, consider switching to WQ_UNBOUND [ 3963.152054] workqueue: nfsd_file_gc_worker [nfsd] hogged CPU for >13333us 512 times, consider switching to WQ_UNBOUND Consider use system_unbound_wq instead of system_wq for nfsd_file_gc_worker(). Signed-off-by: Youzhong Yang Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 93e2ffa5eae6..9e9d246f993c 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -112,7 +112,7 @@ static void nfsd_file_schedule_laundrette(void) { if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags)) - queue_delayed_work(system_wq, &nfsd_filecache_laundrette, + queue_delayed_work(system_unbound_wq, &nfsd_filecache_laundrette, NFSD_LAUNDRETTE_DELAY); } -- 2.51.0 From cef48236dfe55fa266d505e8a497963a7bc5ef2a Mon Sep 17 00:00:00 2001 From: Chen Hanxiao Date: Thu, 18 Jul 2024 15:06:16 +0800 Subject: [PATCH 15/16] NFS: trace: show TIMEDOUT instead of 0x6e __nfs_revalidate_inode may return ETIMEDOUT. print symbol of ETIMEDOUT in nfs trace: before: cat-5191 [005] 119.331127: nfs_revalidate_inode_exit: error=-110 (0x6e) after: cat-1738 [004] 44.365509: nfs_revalidate_inode_exit: error=-110 (TIMEDOUT) Signed-off-by: Chen Hanxiao Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/trace/misc/nfs.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/trace/misc/nfs.h b/include/trace/misc/nfs.h index 7b221d51133a..c82233e950ac 100644 --- a/include/trace/misc/nfs.h +++ b/include/trace/misc/nfs.h @@ -51,6 +51,7 @@ TRACE_DEFINE_ENUM(NFSERR_JUKEBOX); { NFSERR_IO, "IO" }, \ { NFSERR_NXIO, "NXIO" }, \ { ECHILD, "CHILD" }, \ + { ETIMEDOUT, "TIMEDOUT" }, \ { NFSERR_EAGAIN, "AGAIN" }, \ { NFSERR_ACCES, "ACCES" }, \ { NFSERR_EXIST, "EXIST" }, \ -- 2.51.0 From 8cf9a01edc216b16b5839eb793ac544d2c97ce97 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 11 Sep 2024 15:42:57 -0400 Subject: [PATCH 16/16] fs: Introduce FOP_ASYNC_LOCK Some lock managers (NLM, kNFSD) fastidiously avoid blocking their kernel threads while servicing blocking locks. If a filesystem supports asynchronous lock requests those lock managers can use notifications to quickly inform clients they have acquired a file lock. Historically, only posix_lock_file() was capable of supporting asynchronous locks so the check for support was simply file_operations->lock(), but with recent changes in DLM, both GFS2 and OCFS2 also support asynchronous locks and have started signalling their support with EXPORT_OP_ASYNC_LOCK. We recently noticed that those changes dropped the checks for whether a filesystem simply defaults to posix_lock_file(), so async lock notifications have not been attempted for NLM and NFSv4.1+ for most filesystems. While trying to fix this it has become clear that testing both the export flag combined with testing ->lock() creates quite a layering mess. It seems appropriate to signal support with a fop_flag. Add FOP_ASYNC_LOCK so that filesystems with ->lock() can signal their capability to handle lock requests asynchronously. Add a helper for lock managers to properly test that support. 
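A self-contained userspace analogue of the new helper; the struct and function names below are stand-ins, and only the flag value and the test itself mirror the patch. A lock manager can offer asynchronous blocking-lock callbacks when the filesystem either has no ->lock at all, and therefore falls back to posix_lock_file(), or explicitly advertises the async-lock flag.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int fop_flags_t;
#define FOP_ASYNC_LOCK_BIT ((fop_flags_t)(1u << 5))

struct file_operations_like {
	int (*lock)(void *filp);	/* non-NULL when the fs implements ->lock */
	fop_flags_t fop_flags;
};

/* Same test as the new locks_can_async_lock() helper: async callbacks are
 * fine when the fs has no ->lock (it defaults to posix_lock_file(), which
 * has always supported them) or when it opts in with the async-lock flag. */
static bool can_async_lock(const struct file_operations_like *fops)
{
	return !fops->lock || (fops->fop_flags & FOP_ASYNC_LOCK_BIT);
}

static int dlm_backed_lock(void *filp) { (void)filp; return 0; }

int main(void)
{
	struct file_operations_like no_lock  = { NULL, 0 };
	struct file_operations_like opted_in = { dlm_backed_lock, FOP_ASYNC_LOCK_BIT };
	struct file_operations_like legacy   = { dlm_backed_lock, 0 };

	printf("no ->lock:           %d\n", can_async_lock(&no_lock));
	printf("->lock + async flag: %d\n", can_async_lock(&opted_in));
	printf("->lock, no flag:     %d\n", can_async_lock(&legacy));
	return 0;
}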
Signed-off-by: Benjamin Coddington Link: https://lore.kernel.org/r/3330d5a324abe2ce9c1dafe89cacdc6db41945d1.1726083391.git.bcodding@redhat.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/filelock.h | 5 +++++ include/linux/fs.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/include/linux/filelock.h b/include/linux/filelock.h index daee999d05f3..58c1120a8253 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -180,6 +180,11 @@ static inline void locks_wake_up(struct file_lock *fl) wake_up(&fl->c.flc_wait); } +static inline bool locks_can_async_lock(const struct file_operations *fops) +{ + return !fops->lock || fops->fop_flags & FOP_ASYNC_LOCK; +} + /* fs/locks.c */ void locks_free_lock_context(struct inode *inode); void locks_free_lock(struct file_lock *fl); diff --git a/include/linux/fs.h b/include/linux/fs.h index 6ca11e241a24..78221ae589d9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2074,6 +2074,8 @@ struct file_operations { #define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3)) /* Contains huge pages */ #define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4)) +/* Supports asynchronous lock callbacks */ +#define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 5)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, -- 2.51.0