From 2bb447540e71ee530388750c38e1b2c8ea08b4b7 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Fri, 24 Jan 2025 18:31:02 +0000 Subject: [PATCH 01/16] vfio/nvgrace-gpu: Add GB200 SKU to the devid table NVIDIA is productizing the new Grace Blackwell superchip SKU bearing device ID 0x2941. Add the SKU devid to nvgrace_gpu_vfio_pci_table. CC: Alex Williamson Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20250124183102.3976-5-ankita@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index 655a624134cc..e5ac39c4cc6b 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -991,6 +991,8 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) }, /* GH200 SKU */ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) }, + /* GB200 SKU */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) }, {} }; -- 2.51.0 From 4a9bfb9b6850fec0685447aed280533cf980de70 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:04 +0100 Subject: [PATCH 02/16] fuse: {io-uring} Handle teardown of ring entries On teardown struct file_operations::uring_cmd requests need to be completed by calling io_uring_cmd_done(). Not completing all ring entries would result in busy io-uring tasks giving warning messages in intervals and unreleased struct file. Additionally the fuse connection and with that the ring can only get released when all io-uring commands are completed. Completion is done with ring entries that are a) in waiting state for new fuse requests - io_uring_cmd_done is needed b) already in userspace - io_uring_cmd_done through teardown is not needed, the request can just get released. 
If the fuse server is still active and commits such a ring entry, fuse_uring_cmd() already checks if the connection is active and then completes the io-uring command itself with -ENOTCONN. I.e. special handling is not needed. This scheme is basically represented by the ring entry states FRRS_AVAILABLE and FRRS_USERSPACE. Entries in state: - FRRS_INIT: No action needed, do not contribute to ring->queue_refs yet - All other states: Are currently processed by other tasks, async teardown is needed and it has to wait for the two states above. It could also be solved without an async teardown task, but would require additional if conditions in hot code paths. Also in my personal opinion the code looks cleaner with async teardown. Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 9 ++ fs/fuse/dev_uring.c | 207 ++++++++++++++++++++++++++++++++++++++++++ fs/fuse/dev_uring_i.h | 51 +++++++++++ 3 files changed, 267 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index aa33eba51c51..1c21e491e891 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -6,6 +6,7 @@ See the file COPYING. 
*/ +#include "dev_uring_i.h" #include "fuse_i.h" #include "fuse_dev_i.h" @@ -2291,6 +2292,12 @@ void fuse_abort_conn(struct fuse_conn *fc) spin_unlock(&fc->lock); fuse_dev_end_requests(&to_end); + + /* + * fc->lock must not be taken to avoid conflicts with io-uring + * locks + */ + fuse_uring_abort(fc); } else { spin_unlock(&fc->lock); } @@ -2302,6 +2309,8 @@ void fuse_wait_aborted(struct fuse_conn *fc) /* matches implicit memory barrier in fuse_drop_waiting() */ smp_mb(); wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0); + + fuse_uring_wait_stopped_queues(fc); } int fuse_dev_release(struct inode *inode, struct file *file) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 1030c1720990..1161b9aa5e11 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -35,6 +35,37 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, fuse_request_end(req); } +/* Abort all list queued request on the given ring queue */ +static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) +{ + struct fuse_req *req; + LIST_HEAD(req_list); + + spin_lock(&queue->lock); + list_for_each_entry(req, &queue->fuse_req_queue, list) + clear_bit(FR_PENDING, &req->flags); + list_splice_init(&queue->fuse_req_queue, &req_list); + spin_unlock(&queue->lock); + + /* must not hold queue lock to avoid order issues with fi->lock */ + fuse_dev_end_requests(&req_list); +} + +void fuse_uring_abort_end_requests(struct fuse_ring *ring) +{ + int qid; + struct fuse_ring_queue *queue; + + for (qid = 0; qid < ring->nr_queues; qid++) { + queue = READ_ONCE(ring->queues[qid]); + if (!queue) + continue; + + queue->stopped = true; + fuse_uring_abort_end_queue_requests(queue); + } +} + void fuse_uring_destruct(struct fuse_conn *fc) { struct fuse_ring *ring = fc->ring; @@ -94,10 +125,13 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) goto out_err; } + init_waitqueue_head(&ring->stop_waitq); + fc->ring = ring; ring->nr_queues = 
nr_queues; ring->fc = fc; ring->max_payload_sz = max_payload_size; + atomic_set(&ring->queue_refs, 0); spin_unlock(&fc->lock); return ring; @@ -154,6 +188,175 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, return queue; } +static void fuse_uring_stop_fuse_req_end(struct fuse_req *req) +{ + clear_bit(FR_SENT, &req->flags); + req->out.h.error = -ECONNABORTED; + fuse_request_end(req); +} + +/* + * Release a request/entry on connection tear down + */ +static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) +{ + struct fuse_req *req; + struct io_uring_cmd *cmd; + + struct fuse_ring_queue *queue = ent->queue; + + spin_lock(&queue->lock); + cmd = ent->cmd; + ent->cmd = NULL; + req = ent->fuse_req; + ent->fuse_req = NULL; + if (req) { + /* remove entry from queue->fpq->processing */ + list_del_init(&req->list); + } + spin_unlock(&queue->lock); + + if (cmd) + io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED); + + if (req) + fuse_uring_stop_fuse_req_end(req); + + list_del_init(&ent->list); + kfree(ent); +} + +static void fuse_uring_stop_list_entries(struct list_head *head, + struct fuse_ring_queue *queue, + enum fuse_ring_req_state exp_state) +{ + struct fuse_ring *ring = queue->ring; + struct fuse_ring_ent *ent, *next; + ssize_t queue_refs = SSIZE_MAX; + LIST_HEAD(to_teardown); + + spin_lock(&queue->lock); + list_for_each_entry_safe(ent, next, head, list) { + if (ent->state != exp_state) { + pr_warn("entry teardown qid=%d state=%d expected=%d", + queue->qid, ent->state, exp_state); + continue; + } + + list_move(&ent->list, &to_teardown); + } + spin_unlock(&queue->lock); + + /* no queue lock to avoid lock order issues */ + list_for_each_entry_safe(ent, next, &to_teardown, list) { + fuse_uring_entry_teardown(ent); + queue_refs = atomic_dec_return(&ring->queue_refs); + WARN_ON_ONCE(queue_refs < 0); + } +} + +static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) +{ + 
fuse_uring_stop_list_entries(&queue->ent_in_userspace, queue, + FRRS_USERSPACE); + fuse_uring_stop_list_entries(&queue->ent_avail_queue, queue, + FRRS_AVAILABLE); +} + +/* + * Log state debug info + */ +static void fuse_uring_log_ent_state(struct fuse_ring *ring) +{ + int qid; + struct fuse_ring_ent *ent; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = ring->queues[qid]; + + if (!queue) + continue; + + spin_lock(&queue->lock); + /* + * Log entries from the intermediate queue, the other queues + * should be empty + */ + list_for_each_entry(ent, &queue->ent_w_req_queue, list) { + pr_info(" ent-req-queue ring=%p qid=%d ent=%p state=%d\n", + ring, qid, ent, ent->state); + } + list_for_each_entry(ent, &queue->ent_commit_queue, list) { + pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", + ring, qid, ent, ent->state); + } + spin_unlock(&queue->lock); + } + ring->stop_debug_log = 1; +} + +static void fuse_uring_async_stop_queues(struct work_struct *work) +{ + int qid; + struct fuse_ring *ring = + container_of(work, struct fuse_ring, async_teardown_work.work); + + /* XXX code dup */ + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); + + if (!queue) + continue; + + fuse_uring_teardown_entries(queue); + } + + /* + * Some ring entries might be in the middle of IO operations, + * i.e. 
in process to get handled by file_operations::uring_cmd + * or on the way to userspace - we could handle that with conditions in + * run time code, but easier/cleaner to have an async tear down handler + * If there are still queue references left + */ + if (atomic_read(&ring->queue_refs) > 0) { + if (time_after(jiffies, + ring->teardown_time + FUSE_URING_TEARDOWN_TIMEOUT)) + fuse_uring_log_ent_state(ring); + + schedule_delayed_work(&ring->async_teardown_work, + FUSE_URING_TEARDOWN_INTERVAL); + } else { + wake_up_all(&ring->stop_waitq); + } +} + +/* + * Stop the ring queues + */ +void fuse_uring_stop_queues(struct fuse_ring *ring) +{ + int qid; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); + + if (!queue) + continue; + + fuse_uring_teardown_entries(queue); + } + + if (atomic_read(&ring->queue_refs) > 0) { + ring->teardown_time = jiffies; + INIT_DELAYED_WORK(&ring->async_teardown_work, + fuse_uring_async_stop_queues); + schedule_delayed_work(&ring->async_teardown_work, + FUSE_URING_TEARDOWN_INTERVAL); + } else { + wake_up_all(&ring->stop_waitq); + } +} + /* * Checks for errors and stores it into the request */ @@ -525,6 +728,9 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, return err; fpq = &queue->fpq; + if (!READ_ONCE(fc->connected) || READ_ONCE(queue->stopped)) + return err; + spin_lock(&queue->lock); /* Find a request based on the unique ID of the fuse request * This should get revised, as it needs a hash calculation and list @@ -652,6 +858,7 @@ fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, ent->headers = iov[0].iov_base; ent->payload = iov[1].iov_base; + atomic_inc(&ring->queue_refs); return ent; } diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 44bf237f0d5a..a4316e118cbd 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -11,6 +11,9 @@ #ifdef CONFIG_FUSE_IO_URING +#define FUSE_URING_TEARDOWN_TIMEOUT (5 * HZ) +#define 
FUSE_URING_TEARDOWN_INTERVAL (HZ/20) + enum fuse_ring_req_state { FRRS_INVALID = 0, @@ -80,6 +83,8 @@ struct fuse_ring_queue { struct list_head fuse_req_queue; struct fuse_pqueue fpq; + + bool stopped; }; /** @@ -97,12 +102,51 @@ struct fuse_ring { size_t max_payload_sz; struct fuse_ring_queue **queues; + + /* + * Log ring entry states on stop when entries cannot be released + */ + unsigned int stop_debug_log : 1; + + wait_queue_head_t stop_waitq; + + /* async tear down */ + struct delayed_work async_teardown_work; + + /* log */ + unsigned long teardown_time; + + atomic_t queue_refs; }; bool fuse_uring_enabled(void); void fuse_uring_destruct(struct fuse_conn *fc); +void fuse_uring_stop_queues(struct fuse_ring *ring); +void fuse_uring_abort_end_requests(struct fuse_ring *ring); int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); +static inline void fuse_uring_abort(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + + if (ring == NULL) + return; + + if (atomic_read(&ring->queue_refs) > 0) { + fuse_uring_abort_end_requests(ring); + fuse_uring_stop_queues(ring); + } +} + +static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + + if (ring) + wait_event(ring->stop_waitq, + atomic_read(&ring->queue_refs) == 0); +} + #else /* CONFIG_FUSE_IO_URING */ struct fuse_ring; @@ -120,6 +164,13 @@ static inline bool fuse_uring_enabled(void) return false; } +static inline void fuse_uring_abort(struct fuse_conn *fc) +{ +} + +static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) +{ +} #endif /* CONFIG_FUSE_IO_URING */ #endif /* _FS_FUSE_DEV_URING_I_H */ -- 2.51.0 From ba74ba571189668697a8d8da906ad6d44762ebc6 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:05 +0100 Subject: [PATCH 03/16] fuse: {io-uring} Make fuse_dev_queue_{interrupt,forget} non-static These functions are also needed by fuse-over-io-uring. 
Signed-off-by: Bernd Schubert Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 5 +++-- fs/fuse/fuse_dev_i.h | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1c21e491e891..ecf2f805f456 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -237,7 +237,8 @@ __releases(fiq->lock) spin_unlock(&fiq->lock); } -static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *forget) +void fuse_dev_queue_forget(struct fuse_iqueue *fiq, + struct fuse_forget_link *forget) { spin_lock(&fiq->lock); if (fiq->connected) { @@ -250,7 +251,7 @@ static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_li } } -static void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) +void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->lock); if (list_empty(&req->intr_entry)) { diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index b64ab84cbc0d..3b2bfe1248d3 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -16,6 +16,8 @@ struct fuse_arg; struct fuse_args; struct fuse_pqueue; struct fuse_req; +struct fuse_iqueue; +struct fuse_forget_link; struct fuse_copy_state { int write; @@ -56,6 +58,9 @@ int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs, int zeroing); int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, unsigned int nbytes); +void fuse_dev_queue_forget(struct fuse_iqueue *fiq, + struct fuse_forget_link *forget); +void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req); #endif -- 2.51.0 From c2c9af9a0b13261c36909036057a116f2edb5e1a Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:06 +0100 Subject: [PATCH 04/16] fuse: Allow to queue fg requests through io-uring This prepares queueing and sending foreground requests through io-uring. 
Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev_uring.c | 180 ++++++++++++++++++++++++++++++++++++++++++ fs/fuse/dev_uring_i.h | 8 ++ 2 files changed, 188 insertions(+) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 1161b9aa5e11..728000434589 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -24,6 +24,29 @@ bool fuse_uring_enabled(void) return enable_uring; } +struct fuse_uring_pdu { + struct fuse_ring_ent *ent; +}; + +static const struct fuse_iqueue_ops fuse_io_uring_ops; + +static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd, + struct fuse_ring_ent *ring_ent) +{ + struct fuse_uring_pdu *pdu = + io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); + + pdu->ent = ring_ent; +} + +static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) +{ + struct fuse_uring_pdu *pdu = + io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); + + return pdu->ent; +} + static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, int error) { @@ -776,6 +799,31 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, return 0; } +static bool is_ring_ready(struct fuse_ring *ring, int current_qid) +{ + int qid; + struct fuse_ring_queue *queue; + bool ready = true; + + for (qid = 0; qid < ring->nr_queues && ready; qid++) { + if (current_qid == qid) + continue; + + queue = ring->queues[qid]; + if (!queue) { + ready = false; + break; + } + + spin_lock(&queue->lock); + if (list_empty(&queue->ent_avail_queue)) + ready = false; + spin_unlock(&queue->lock); + } + + return ready; +} + /* * fuse_uring_req_fetch command handling */ @@ -784,11 +832,23 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent, unsigned int issue_flags) { struct fuse_ring_queue *queue = ent->queue; + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + struct fuse_iqueue *fiq = &fc->iq; spin_lock(&queue->lock); 
ent->cmd = cmd; fuse_uring_ent_avail(ent, queue); spin_unlock(&queue->lock); + + if (!ring->ready) { + bool ready = is_ring_ready(ring, queue->qid); + + if (ready) { + WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); + WRITE_ONCE(ring->ready, true); + } + } } /* @@ -972,3 +1032,123 @@ int __maybe_unused fuse_uring_cmd(struct io_uring_cmd *cmd, return -EIOCBQUEUED; } + +static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, + ssize_t ret, unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + + spin_lock(&queue->lock); + ent->state = FRRS_USERSPACE; + list_move(&ent->list, &queue->ent_in_userspace); + ent->cmd = NULL; + spin_unlock(&queue->lock); + + io_uring_cmd_done(cmd, ret, 0, issue_flags); +} + +/* + * This prepares and sends the ring request in fuse-uring task context. + * User buffers are not mapped yet - the application does not have permission + * to write to it - this has to be executed in ring task context. + */ +static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); + struct fuse_ring_queue *queue = ent->queue; + int err; + + if (!(issue_flags & IO_URING_F_TASK_DEAD)) { + err = fuse_uring_prepare_send(ent, ent->fuse_req); + if (err) { + fuse_uring_next_fuse_req(ent, queue, issue_flags); + return; + } + } else { + err = -ECANCELED; + } + + fuse_uring_send(ent, cmd, err, issue_flags); +} + +static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) +{ + unsigned int qid; + struct fuse_ring_queue *queue; + + qid = task_cpu(current); + + if (WARN_ONCE(qid >= ring->nr_queues, + "Core number (%u) exceeds nr queues (%zu)\n", qid, + ring->nr_queues)) + qid = 0; + + queue = ring->queues[qid]; + WARN_ONCE(!queue, "Missing queue for qid %d\n", qid); + + return queue; +} + +static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent) +{ + struct io_uring_cmd *cmd = ent->cmd; + + uring_cmd_set_ring_ent(cmd, 
ent); + io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); +} + +/* queue a fuse request and send it if a ring entry is available */ +void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) +{ + struct fuse_conn *fc = req->fm->fc; + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + struct fuse_ring_ent *ent = NULL; + int err; + + err = -EINVAL; + queue = fuse_uring_task_to_queue(ring); + if (!queue) + goto err; + + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique(fiq); + + spin_lock(&queue->lock); + err = -ENOTCONN; + if (unlikely(queue->stopped)) + goto err_unlock; + + ent = list_first_entry_or_null(&queue->ent_avail_queue, + struct fuse_ring_ent, list); + if (ent) + fuse_uring_add_req_to_ring_ent(ent, req); + else + list_add_tail(&req->list, &queue->fuse_req_queue); + spin_unlock(&queue->lock); + + if (ent) + fuse_uring_dispatch_ent(ent); + + return; + +err_unlock: + spin_unlock(&queue->lock); +err: + req->out.h.error = err; + clear_bit(FR_PENDING, &req->flags); + fuse_request_end(req); +} + +static const struct fuse_iqueue_ops fuse_io_uring_ops = { + /* should be send over io-uring as enhancement */ + .send_forget = fuse_dev_queue_forget, + + /* + * could be send over io-uring, but interrupts should be rare, + * no need to make the code complex + */ + .send_interrupt = fuse_dev_queue_interrupt, + .send_req = fuse_uring_queue_fuse_req, +}; diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index a4316e118cbd..0517a6eafc91 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -117,6 +117,8 @@ struct fuse_ring { unsigned long teardown_time; atomic_t queue_refs; + + bool ready; }; bool fuse_uring_enabled(void); @@ -124,6 +126,7 @@ void fuse_uring_destruct(struct fuse_conn *fc); void fuse_uring_stop_queues(struct fuse_ring *ring); void fuse_uring_abort_end_requests(struct fuse_ring *ring); int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); +void 
fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req); static inline void fuse_uring_abort(struct fuse_conn *fc) { @@ -147,6 +150,11 @@ static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) atomic_read(&ring->queue_refs) == 0); } +static inline bool fuse_uring_ready(struct fuse_conn *fc) +{ + return fc->ring && fc->ring->ready; +} + #else /* CONFIG_FUSE_IO_URING */ struct fuse_ring; -- 2.51.0 From 857b0263f30eebe13ab4b6a65156a0d6c8fc2210 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:07 +0100 Subject: [PATCH 05/16] fuse: Allow to queue bg requests through io-uring This prepares queueing and sending background requests through io-uring. Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 26 +++++++++++- fs/fuse/dev_uring.c | 99 +++++++++++++++++++++++++++++++++++++++++++ fs/fuse/dev_uring_i.h | 12 ++++++ 3 files changed, 136 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ecf2f805f456..1b593b23f7b8 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -568,7 +568,25 @@ ssize_t __fuse_simple_request(struct mnt_idmap *idmap, return ret; } -static bool fuse_request_queue_background(struct fuse_req *req) +#ifdef CONFIG_FUSE_IO_URING +static bool fuse_request_queue_background_uring(struct fuse_conn *fc, + struct fuse_req *req) +{ + struct fuse_iqueue *fiq = &fc->iq; + + req->in.h.unique = fuse_get_unique(fiq); + req->in.h.len = sizeof(struct fuse_in_header) + + fuse_len_args(req->args->in_numargs, + (struct fuse_arg *) req->args->in_args); + + return fuse_uring_queue_bq_req(req); +} +#endif + +/* + * @return true if queued + */ +static int fuse_request_queue_background(struct fuse_req *req) { struct fuse_mount *fm = req->fm; struct fuse_conn *fc = fm->fc; @@ -580,6 +598,12 @@ static bool fuse_request_queue_background(struct fuse_req *req) atomic_inc(&fc->num_waiting); } 
__set_bit(FR_ISREPLY, &req->flags); + +#ifdef CONFIG_FUSE_IO_URING + if (fuse_uring_ready(fc)) + return fuse_request_queue_background_uring(fc, req); +#endif + spin_lock(&fc->bg_lock); if (likely(fc->connected)) { fc->num_background++; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 728000434589..27bc103c17c8 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -47,10 +47,53 @@ static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) return pdu->ent; } +static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) +{ + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + + lockdep_assert_held(&queue->lock); + lockdep_assert_held(&fc->bg_lock); + + /* + * Allow one bg request per queue, ignoring global fc limits. + * This prevents a single queue from consuming all resources and + * eliminates the need for remote queue wake-ups when global + * limits are met but this queue has no more waiting requests. + */ + while ((fc->active_background < fc->max_background || + !queue->active_background) && + (!list_empty(&queue->fuse_req_bg_queue))) { + struct fuse_req *req; + + req = list_first_entry(&queue->fuse_req_bg_queue, + struct fuse_req, list); + fc->active_background++; + queue->active_background++; + + list_move_tail(&req->list, &queue->fuse_req_queue); + } +} + static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, int error) { + struct fuse_ring_queue *queue = ent->queue; + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + + lockdep_assert_not_held(&queue->lock); + spin_lock(&queue->lock); ent->fuse_req = NULL; + if (test_bit(FR_BACKGROUND, &req->flags)) { + queue->active_background--; + spin_lock(&fc->bg_lock); + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + } + + spin_unlock(&queue->lock); + if (error) req->out.h.error = error; @@ -78,6 +121,7 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) { int qid; struct fuse_ring_queue 
*queue; + struct fuse_conn *fc = ring->fc; for (qid = 0; qid < ring->nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); @@ -85,6 +129,13 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) continue; queue->stopped = true; + + WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); + spin_lock(&queue->lock); + spin_lock(&fc->bg_lock); + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + spin_unlock(&queue->lock); fuse_uring_abort_end_queue_requests(queue); } } @@ -190,6 +241,7 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, INIT_LIST_HEAD(&queue->ent_w_req_queue); INIT_LIST_HEAD(&queue->ent_in_userspace); INIT_LIST_HEAD(&queue->fuse_req_queue); + INIT_LIST_HEAD(&queue->fuse_req_bg_queue); queue->fpq.processing = pq; fuse_pqueue_init(&queue->fpq); @@ -1141,6 +1193,53 @@ err: fuse_request_end(req); } +bool fuse_uring_queue_bq_req(struct fuse_req *req) +{ + struct fuse_conn *fc = req->fm->fc; + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + struct fuse_ring_ent *ent = NULL; + + queue = fuse_uring_task_to_queue(ring); + if (!queue) + return false; + + spin_lock(&queue->lock); + if (unlikely(queue->stopped)) { + spin_unlock(&queue->lock); + return false; + } + + list_add_tail(&req->list, &queue->fuse_req_bg_queue); + + ent = list_first_entry_or_null(&queue->ent_avail_queue, + struct fuse_ring_ent, list); + spin_lock(&fc->bg_lock); + fc->num_background++; + if (fc->num_background == fc->max_background) + fc->blocked = 1; + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + + /* + * Due to bg_queue flush limits there might be other bg requests + * in the queue that need to be handled first. Or no further req + * might be available. 
+ */ + req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req, + list); + if (ent && req) { + fuse_uring_add_req_to_ring_ent(ent, req); + spin_unlock(&queue->lock); + + fuse_uring_dispatch_ent(ent); + } else { + spin_unlock(&queue->lock); + } + + return true; +} + static const struct fuse_iqueue_ops fuse_io_uring_ops = { /* should be send over io-uring as enhancement */ .send_forget = fuse_dev_queue_forget, diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 0517a6eafc91..0182be61778b 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -82,8 +82,13 @@ struct fuse_ring_queue { /* fuse requests waiting for an entry slot */ struct list_head fuse_req_queue; + /* background fuse requests */ + struct list_head fuse_req_bg_queue; + struct fuse_pqueue fpq; + unsigned int active_background; + bool stopped; }; @@ -127,6 +132,7 @@ void fuse_uring_stop_queues(struct fuse_ring *ring); void fuse_uring_abort_end_requests(struct fuse_ring *ring); int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req); +bool fuse_uring_queue_bq_req(struct fuse_req *req); static inline void fuse_uring_abort(struct fuse_conn *fc) { @@ -179,6 +185,12 @@ static inline void fuse_uring_abort(struct fuse_conn *fc) static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) { } + +static inline bool fuse_uring_ready(struct fuse_conn *fc) +{ + return false; +} + #endif /* CONFIG_FUSE_IO_URING */ #endif /* _FS_FUSE_DEV_URING_I_H */ -- 2.51.0 From b6236c8407cba5d7a108facb1bcfab24994d3814 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:08 +0100 Subject: [PATCH 06/16] fuse: {io-uring} Prevent mount point hang on fuse-server termination When the fuse-server terminates while the fuse-client or kernel still has queued URING_CMDs, these commands retain references to the struct file used by the fuse connection. 
This prevents fuse_dev_release() from being invoked, resulting in a hung mount point. This patch addresses the issue by making queued URING_CMDs cancelable, allowing fuse_dev_release() to proceed as expected and preventing the mount point from hanging. Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev_uring.c | 69 +++++++++++++++++++++++++++++++++++++++++-- fs/fuse/dev_uring_i.h | 9 ++++++ 2 files changed, 75 insertions(+), 3 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 27bc103c17c8..fa0451176385 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -150,6 +150,7 @@ void fuse_uring_destruct(struct fuse_conn *fc) for (qid = 0; qid < ring->nr_queues; qid++) { struct fuse_ring_queue *queue = ring->queues[qid]; + struct fuse_ring_ent *ent, *next; if (!queue) continue; @@ -159,6 +160,12 @@ void fuse_uring_destruct(struct fuse_conn *fc) WARN_ON(!list_empty(&queue->ent_commit_queue)); WARN_ON(!list_empty(&queue->ent_in_userspace)); + list_for_each_entry_safe(ent, next, &queue->ent_released, + list) { + list_del_init(&ent->list); + kfree(ent); + } + kfree(queue->fpq.processing); kfree(queue); ring->queues[qid] = NULL; @@ -242,6 +249,7 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, INIT_LIST_HEAD(&queue->ent_in_userspace); INIT_LIST_HEAD(&queue->fuse_req_queue); INIT_LIST_HEAD(&queue->fuse_req_bg_queue); + INIT_LIST_HEAD(&queue->ent_released); queue->fpq.processing = pq; fuse_pqueue_init(&queue->fpq); @@ -289,6 +297,15 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) /* remove entry from queue->fpq->processing */ list_del_init(&req->list); } + + /* + * The entry must not be freed immediately, due to access of direct + * pointer access of entries through IO_URING_F_CANCEL - there is a risk + * of race between daemon termination (which triggers IO_URING_F_CANCEL + * and accesses entries without 
checking the list state first + */ + list_move(&ent->list, &queue->ent_released); + ent->state = FRRS_RELEASED; spin_unlock(&queue->lock); if (cmd) @@ -296,9 +313,6 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) if (req) fuse_uring_stop_fuse_req_end(req); - - list_del_init(&ent->list); - kfree(ent); } static void fuse_uring_stop_list_entries(struct list_head *head, @@ -318,6 +332,7 @@ static void fuse_uring_stop_list_entries(struct list_head *head, continue; } + ent->state = FRRS_TEARDOWN; list_move(&ent->list, &to_teardown); } spin_unlock(&queue->lock); @@ -432,6 +447,46 @@ void fuse_uring_stop_queues(struct fuse_ring *ring) } } +/* + * Handle IO_URING_F_CANCEL, typically should come on daemon termination. + * + * Releasing the last entry should trigger fuse_dev_release() if + * the daemon was terminated + */ +static void fuse_uring_cancel(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); + struct fuse_ring_queue *queue; + bool need_cmd_done = false; + + /* + * direct access on ent - it must not be destructed as long as + * IO_URING_F_CANCEL might come up + */ + queue = ent->queue; + spin_lock(&queue->lock); + if (ent->state == FRRS_AVAILABLE) { + ent->state = FRRS_USERSPACE; + list_move(&ent->list, &queue->ent_in_userspace); + need_cmd_done = true; + ent->cmd = NULL; + } + spin_unlock(&queue->lock); + + if (need_cmd_done) { + /* no queue lock to avoid lock order issues */ + io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); + } +} + +static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags, + struct fuse_ring_ent *ring_ent) +{ + uring_cmd_set_ring_ent(cmd, ring_ent); + io_uring_cmd_mark_cancelable(cmd, issue_flags); +} + /* * Checks for errors and stores it into the request */ @@ -839,6 +894,7 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, spin_unlock(&queue->lock); /* without the queue lock, as other locks are taken */ + 
fuse_uring_prepare_cancel(cmd, issue_flags, ent); fuse_uring_commit(ent, req, issue_flags); /* @@ -888,6 +944,8 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent, struct fuse_conn *fc = ring->fc; struct fuse_iqueue *fiq = &fc->iq; + fuse_uring_prepare_cancel(cmd, issue_flags, ent); + spin_lock(&queue->lock); ent->cmd = cmd; fuse_uring_ent_avail(ent, queue); @@ -1038,6 +1096,11 @@ int __maybe_unused fuse_uring_cmd(struct io_uring_cmd *cmd, return -EOPNOTSUPP; } + if ((unlikely(issue_flags & IO_URING_F_CANCEL))) { + fuse_uring_cancel(cmd, issue_flags); + return 0; + } + /* This extra SQE size holds struct fuse_uring_cmd_req */ if (!(issue_flags & IO_URING_F_SQE128)) return -EINVAL; diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 0182be61778b..2102b3d0c1ae 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -28,6 +28,12 @@ enum fuse_ring_req_state { /* The ring entry is in or on the way to user space */ FRRS_USERSPACE, + + /* The ring entry is in teardown */ + FRRS_TEARDOWN, + + /* The ring entry is released, but not freed yet */ + FRRS_RELEASED, }; /** A fuse ring entry, part of the ring queue */ @@ -79,6 +85,9 @@ struct fuse_ring_queue { /* entries in userspace */ struct list_head ent_in_userspace; + /* entries that are released */ + struct list_head ent_released; + /* fuse requests waiting for an entry slot */ struct list_head fuse_req_queue; -- 2.51.0 From 3393ff964e0fa5def66570c54a4612bf9df06b76 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:09 +0100 Subject: [PATCH 07/16] fuse: block request allocation until io-uring init is complete Avoid races and block request allocation until io-uring queues are ready. 
This is especially important for background requests, as bg request completion might cause lock order inversion of the typical queue->lock and then fc->bg_lock fuse_request_end spin_lock(&fc->bg_lock); flush_bg_queue fuse_send_one fuse_uring_queue_fuse_req spin_lock(&queue->lock); Signed-off-by: Bernd Schubert Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 3 ++- fs/fuse/dev_uring.c | 3 +++ fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 2 ++ 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1b593b23f7b8..f002e8a096f9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -76,7 +76,8 @@ void fuse_set_initialized(struct fuse_conn *fc) static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) { - return !fc->initialized || (for_background && fc->blocked); + return !fc->initialized || (for_background && fc->blocked) || + (fc->io_uring && !fuse_uring_ready(fc)); } static void fuse_drop_waiting(struct fuse_conn *fc) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index fa0451176385..ea197ccd4c51 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -957,6 +957,7 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent, if (ready) { WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); WRITE_ONCE(ring->ready, true); + wake_up_all(&fc->blocked_waitq); } } } @@ -1130,6 +1131,8 @@ int __maybe_unused fuse_uring_cmd(struct io_uring_cmd *cmd, if (err) { pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n", err); + fc->io_uring = 0; + wake_up_all(&fc->blocked_waitq); return err; } break; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ba6901c1bc2d..fee96fe7887b 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -867,6 +867,9 @@ struct fuse_conn { /* Use pages instead of pointer for kernel I/O */ unsigned int use_pages_for_kvec_io:1; + /* Use io_uring for communication */ + unsigned int io_uring; + /** Maximum stack depth for passthrough backing files */ int
max_stack_depth; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 328797b9aac9..e9db2cb8c150 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1390,6 +1390,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, else ok = false; } + if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled()) + fc->io_uring = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; -- 2.51.0 From 786412a73e7ee5b00ef3437bbf2f3a250759b2ae Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:29:10 +0100 Subject: [PATCH 08/16] fuse: enable fuse-over-io-uring All required parts are handled now, fuse-io-uring can be enabled. Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 3 +++ fs/fuse/dev_uring.c | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f002e8a096f9..5b5f789b37eb 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2493,6 +2493,9 @@ const struct file_operations fuse_dev_operations = { .fasync = fuse_dev_fasync, .unlocked_ioctl = fuse_dev_ioctl, .compat_ioctl = compat_ptr_ioctl, +#ifdef CONFIG_FUSE_IO_URING + .uring_cmd = fuse_uring_cmd, +#endif }; EXPORT_SYMBOL_GPL(fuse_dev_operations); diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index ea197ccd4c51..3bdc75518e5b 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1084,8 +1084,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, * Entry function from io_uring to handle the given passthrough command * (op code IORING_OP_URING_CMD) */ -int __maybe_unused fuse_uring_cmd(struct io_uring_cmd *cmd, - unsigned int issue_flags) +int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct fuse_dev *fud; struct fuse_conn *fc; -- 2.51.0 From 2d4fde59fd502a65c1698b61ad4d0f10a9ab665a Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Thu, 23 Jan 2025 17:55:32 +0100 Subject: [PATCH 
09/16] fuse: prevent disabling io-uring on active connections The enable_uring module parameter allows administrators to enable/disable io-uring support for FUSE at runtime. However, disabling io-uring while connections already have it enabled can lead to an inconsistent state. Fix this by keeping io-uring enabled on connections that were already using it, even if the module parameter is later disabled. This ensures active FUSE mounts continue to function correctly. Signed-off-by: Bernd Schubert Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/fuse/dev_uring.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 3bdc75518e5b..ebd2931b4f2a 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1091,11 +1091,6 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) u32 cmd_op = cmd->cmd_op; int err; - if (!enable_uring) { - pr_info_ratelimited("fuse-io-uring is disabled\n"); - return -EOPNOTSUPP; - } - if ((unlikely(issue_flags & IO_URING_F_CANCEL))) { fuse_uring_cancel(cmd, issue_flags); return 0; @@ -1112,6 +1107,12 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) } fc = fud->fc; + /* Once a connection has io-uring enabled on it, it can't be disabled */ + if (!enable_uring && !fc->io_uring) { + pr_info_ratelimited("fuse-io-uring is disabled\n"); + return -EOPNOTSUPP; + } + if (fc->aborted) return -ECONNABORTED; if (!fc->connected) -- 2.51.0 From 1751f872cc97f992ed5c4c72c55588db1f0021e1 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 28 Jan 2025 13:48:37 +0100 Subject: [PATCH 10/16] treewide: const qualify ctl_tables where applicable Add the const qualifier to all the ctl_tables in the tree except for watchdog_hardlockup_sysctl, memory_allocation_profiling_sysctls, loadpin_sysctl_table and the ones calling register_net_sysctl (./net, drivers/infiniband dirs).
These are special cases as they use a registration function with a non-const qualified ctl_table argument or modify the arrays before passing them on to the registration function. Constifying ctl_table structs will prevent the modification of proc_handler function pointers as the arrays would reside in .rodata. This is made possible after commit 78eb4ea25cd5 ("sysctl: treewide: constify the ctl_table argument of proc_handlers") constified all the proc_handlers. Created this by running an spatch followed by a sed command: Spatch: virtual patch @ depends on !(file in "net") disable optional_qualifier @ identifier table_name != { watchdog_hardlockup_sysctl, iwcm_ctl_table, ucma_ctl_table, memory_allocation_profiling_sysctls, loadpin_sysctl_table }; @@ + const struct ctl_table table_name [] = { ... }; sed: sed --in-place \ -e "s/struct ctl_table .table = &uts_kern/const struct ctl_table *table = \&uts_kern/" \ kernel/utsname_sysctl.c Reviewed-by: Song Liu Acked-by: Steven Rostedt (Google) # for kernel/trace/ Reviewed-by: Martin K. Petersen # SCSI Reviewed-by: Darrick J. 
Wong # xfs Acked-by: Jani Nikula Acked-by: Corey Minyard Acked-by: Wei Liu Acked-by: Thomas Gleixner Reviewed-by: Bill O'Donnell Acked-by: Baoquan He Acked-by: Ashutosh Dixit Acked-by: Anna Schumaker Signed-off-by: Joel Granados --- arch/arm/kernel/isa.c | 2 +- arch/arm64/kernel/fpsimd.c | 4 ++-- arch/arm64/kernel/process.c | 2 +- arch/powerpc/kernel/idle.c | 2 +- arch/powerpc/platforms/pseries/mobility.c | 2 +- arch/riscv/kernel/process.c | 2 +- arch/riscv/kernel/vector.c | 2 +- arch/s390/appldata/appldata_base.c | 2 +- arch/s390/kernel/debug.c | 2 +- arch/s390/kernel/hiperdispatch.c | 2 +- arch/s390/kernel/topology.c | 2 +- arch/s390/mm/cmm.c | 2 +- arch/s390/mm/pgalloc.c | 2 +- arch/x86/entry/vdso/vdso32-setup.c | 2 +- arch/x86/kernel/cpu/bus_lock.c | 2 +- crypto/fips.c | 2 +- drivers/base/firmware_loader/fallback_table.c | 2 +- drivers/cdrom/cdrom.c | 2 +- drivers/char/hpet.c | 2 +- drivers/char/ipmi/ipmi_poweroff.c | 2 +- drivers/char/random.c | 2 +- drivers/gpu/drm/i915/i915_perf.c | 2 +- drivers/gpu/drm/xe/xe_observation.c | 2 +- drivers/hv/hv_common.c | 2 +- drivers/md/md.c | 2 +- drivers/misc/sgi-xp/xpc_main.c | 4 ++-- drivers/perf/arm_pmuv3.c | 2 +- drivers/perf/riscv_pmu_sbi.c | 2 +- drivers/scsi/scsi_sysctl.c | 2 +- drivers/scsi/sg.c | 2 +- drivers/tty/tty_io.c | 2 +- drivers/xen/balloon.c | 2 +- fs/aio.c | 2 +- fs/cachefiles/error_inject.c | 2 +- fs/coda/sysctl.c | 2 +- fs/coredump.c | 2 +- fs/dcache.c | 2 +- fs/devpts/inode.c | 2 +- fs/eventpoll.c | 2 +- fs/exec.c | 2 +- fs/file_table.c | 2 +- fs/fuse/sysctl.c | 2 +- fs/inode.c | 2 +- fs/lockd/svc.c | 2 +- fs/locks.c | 2 +- fs/namei.c | 2 +- fs/namespace.c | 2 +- fs/nfs/nfs4sysctl.c | 2 +- fs/nfs/sysctl.c | 2 +- fs/notify/dnotify/dnotify.c | 2 +- fs/notify/fanotify/fanotify_user.c | 2 +- fs/notify/inotify/inotify_user.c | 2 +- fs/ocfs2/stackglue.c | 2 +- fs/pipe.c | 2 +- fs/quota/dquot.c | 2 +- fs/sysctls.c | 2 +- fs/userfaultfd.c | 2 +- fs/verity/init.c | 2 +- fs/xfs/xfs_sysctl.c | 2 +- 
init/do_mounts_initrd.c | 2 +- io_uring/io_uring.c | 2 +- ipc/ipc_sysctl.c | 2 +- ipc/mq_sysctl.c | 2 +- kernel/acct.c | 2 +- kernel/bpf/syscall.c | 2 +- kernel/delayacct.c | 2 +- kernel/exit.c | 2 +- kernel/hung_task.c | 2 +- kernel/kexec_core.c | 2 +- kernel/kprobes.c | 2 +- kernel/latencytop.c | 2 +- kernel/locking/lockdep.c | 2 +- kernel/panic.c | 2 +- kernel/pid.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/pid_sysctl.h | 2 +- kernel/printk/sysctl.c | 2 +- kernel/reboot.c | 2 +- kernel/sched/autogroup.c | 2 +- kernel/sched/core.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/rt.c | 2 +- kernel/sched/topology.c | 2 +- kernel/seccomp.c | 2 +- kernel/signal.c | 2 +- kernel/stackleak.c | 2 +- kernel/sysctl-test.c | 6 +++--- kernel/sysctl.c | 4 ++-- kernel/time/timer.c | 2 +- kernel/trace/ftrace.c | 2 +- kernel/trace/trace_events_user.c | 2 +- kernel/umh.c | 2 +- kernel/utsname_sysctl.c | 4 ++-- kernel/watchdog.c | 2 +- lib/test_sysctl.c | 6 +++--- mm/compaction.c | 2 +- mm/hugetlb.c | 2 +- mm/hugetlb_vmemmap.c | 2 +- mm/memory-failure.c | 2 +- mm/oom_kill.c | 2 +- mm/page-writeback.c | 2 +- mm/page_alloc.c | 2 +- security/apparmor/lsm.c | 2 +- security/keys/sysctl.c | 2 +- security/yama/yama_lsm.c | 2 +- 106 files changed, 114 insertions(+), 114 deletions(-) diff --git a/arch/arm/kernel/isa.c b/arch/arm/kernel/isa.c index 905b1b191546..db8be609fab2 100644 --- a/arch/arm/kernel/isa.c +++ b/arch/arm/kernel/isa.c @@ -16,7 +16,7 @@ static unsigned int isa_membase, isa_portbase, isa_portshift; -static struct ctl_table ctl_isa_vars[] = { +static const struct ctl_table ctl_isa_vars[] = { { .procname = "membase", .data = &isa_membase, diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 8c4c1a2186cc..2b601d88762d 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -562,7 +562,7 @@ static int vec_proc_do_default_vl(const struct ctl_table *table, int write, return 0; } -static struct ctl_table 
sve_default_vl_table[] = { +static const struct ctl_table sve_default_vl_table[] = { { .procname = "sve_default_vector_length", .mode = 0644, @@ -585,7 +585,7 @@ static int __init sve_sysctl_init(void) { return 0; } #endif /* ! (CONFIG_ARM64_SVE && CONFIG_SYSCTL) */ #if defined(CONFIG_ARM64_SME) && defined(CONFIG_SYSCTL) -static struct ctl_table sme_default_vl_table[] = { +static const struct ctl_table sme_default_vl_table[] = { { .procname = "sme_default_vector_length", .mode = 0644, diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 2968a33bb3bc..42faebb7b712 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -859,7 +859,7 @@ long get_tagged_addr_ctrl(struct task_struct *task) * disable it for tasks that already opted in to the relaxed ABI. */ -static struct ctl_table tagged_addr_sysctl_table[] = { +static const struct ctl_table tagged_addr_sysctl_table[] = { { .procname = "tagged_addr_disabled", .mode = 0644, diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 30b56c67fa61..e527cd3ef128 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -97,7 +97,7 @@ void power4_idle(void) /* * Register the sysctl to set/clear powersave_nap. 
*/ -static struct ctl_table powersave_nap_ctl_table[] = { +static const struct ctl_table powersave_nap_ctl_table[] = { { .procname = "powersave-nap", .data = &powersave_nap, diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 1798f0f14d58..62bd8e2d5d4c 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -53,7 +53,7 @@ struct update_props_workarea { static unsigned int nmi_wd_lpm_factor = 200; #ifdef CONFIG_SYSCTL -static struct ctl_table nmi_wd_lpm_factor_ctl_table[] = { +static const struct ctl_table nmi_wd_lpm_factor_ctl_table[] = { { .procname = "nmi_wd_lpm_factor", .data = &nmi_wd_lpm_factor, diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 58b6482c2bf6..7891294abf49 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -364,7 +364,7 @@ static bool try_to_set_pmm(unsigned long value) * disable it for tasks that already opted in to the relaxed ABI. 
*/ -static struct ctl_table tagged_addr_sysctl_table[] = { +static const struct ctl_table tagged_addr_sysctl_table[] = { { .procname = "tagged_addr_disabled", .mode = 0644, diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c index 821818886fab..d022b028ac3f 100644 --- a/arch/riscv/kernel/vector.c +++ b/arch/riscv/kernel/vector.c @@ -287,7 +287,7 @@ long riscv_v_vstate_ctrl_set_current(unsigned long arg) #ifdef CONFIG_SYSCTL -static struct ctl_table riscv_v_default_vstate_table[] = { +static const struct ctl_table riscv_v_default_vstate_table[] = { { .procname = "riscv_v_default_allow", .data = &riscv_v_implicit_uacc, diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index 91a30e017d65..dd7ba7587dd5 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -52,7 +52,7 @@ static int appldata_interval_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table_header *appldata_sysctl_header; -static struct ctl_table appldata_table[] = { +static const struct ctl_table appldata_table[] = { { .procname = "timer", .mode = S_IRUGO | S_IWUSR, diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index ba6b7329a10e..ce038e9205f7 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -1122,7 +1122,7 @@ static int s390dbf_procactive(const struct ctl_table *table, int write, return 0; } -static struct ctl_table s390dbf_table[] = { +static const struct ctl_table s390dbf_table[] = { { .procname = "debug_stoppable", .data = &debug_stoppable, diff --git a/arch/s390/kernel/hiperdispatch.c b/arch/s390/kernel/hiperdispatch.c index 2a99a216ab62..7857a7e8e56c 100644 --- a/arch/s390/kernel/hiperdispatch.c +++ b/arch/s390/kernel/hiperdispatch.c @@ -292,7 +292,7 @@ static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write, return 0; } -static struct ctl_table hiperdispatch_ctl_table[] = { +static const 
struct ctl_table hiperdispatch_ctl_table[] = { { .procname = "hiperdispatch", .mode = 0644, diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 3808f942a433..211cc8382e4a 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -667,7 +667,7 @@ static int polarization_ctl_handler(const struct ctl_table *ctl, int write, return set_polarization(polarization); } -static struct ctl_table topology_ctl_table[] = { +static const struct ctl_table topology_ctl_table[] = { { .procname = "topology", .mode = 0644, diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 7bf0f691827b..39f44b6256e0 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -332,7 +332,7 @@ static int cmm_timeout_handler(const struct ctl_table *ctl, int write, return 0; } -static struct ctl_table cmm_table[] = { +static const struct ctl_table cmm_table[] = { { .procname = "cmm_pages", .mode = 0644, diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index d33f55b7ee98..cd2fef79ad2c 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -21,7 +21,7 @@ int page_table_allocate_pgste = 0; EXPORT_SYMBOL(page_table_allocate_pgste); -static struct ctl_table page_table_sysctl[] = { +static const struct ctl_table page_table_sysctl[] = { { .procname = "allocate_pgste", .data = &page_table_allocate_pgste, diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 76e4e74f35b5..f6d2d8aba643 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -57,7 +57,7 @@ __setup_param("vdso=", vdso_setup, vdso32_setup, 0); /* Register vsyscall32 into the ABI table */ #include -static struct ctl_table abi_table2[] = { +static const struct ctl_table abi_table2[] = { { .procname = "vsyscall32", .data = &vdso32_enabled, diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c index 704e9241b964..6cba85c79d42 100644 --- a/arch/x86/kernel/cpu/bus_lock.c +++ 
b/arch/x86/kernel/cpu/bus_lock.c @@ -49,7 +49,7 @@ static unsigned int sysctl_sld_mitigate = 1; static DEFINE_SEMAPHORE(buslock_sem, 1); #ifdef CONFIG_PROC_SYSCTL -static struct ctl_table sld_sysctls[] = { +static const struct ctl_table sld_sysctls[] = { { .procname = "split_lock_mitigate", .data = &sysctl_sld_mitigate, diff --git a/crypto/fips.c b/crypto/fips.c index a58e7750f532..2fa3a9ee61a1 100644 --- a/crypto/fips.c +++ b/crypto/fips.c @@ -41,7 +41,7 @@ __setup("fips=", fips_enable); static char fips_name[] = FIPS_MODULE_NAME; static char fips_version[] = FIPS_MODULE_VERSION; -static struct ctl_table crypto_sysctl_table[] = { +static const struct ctl_table crypto_sysctl_table[] = { { .procname = "fips_enabled", .data = &fips_enabled, diff --git a/drivers/base/firmware_loader/fallback_table.c b/drivers/base/firmware_loader/fallback_table.c index ddb70e29eb42..c8afc501a8a4 100644 --- a/drivers/base/firmware_loader/fallback_table.c +++ b/drivers/base/firmware_loader/fallback_table.c @@ -25,7 +25,7 @@ struct firmware_fallback_config fw_fallback_config = { EXPORT_SYMBOL_NS_GPL(fw_fallback_config, "FIRMWARE_LOADER_PRIVATE"); #ifdef CONFIG_SYSCTL -static struct ctl_table firmware_config_table[] = { +static const struct ctl_table firmware_config_table[] = { { .procname = "force_sysfs_fallback", .data = &fw_fallback_config.force_sysfs_fallback, diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 51745ed1bbab..b163e043c687 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -3612,7 +3612,7 @@ static int cdrom_sysctl_handler(const struct ctl_table *ctl, int write, } /* Place files in /proc/sys/dev/cdrom */ -static struct ctl_table cdrom_table[] = { +static const struct ctl_table cdrom_table[] = { { .procname = "info", .data = &cdrom_sysctl_settings.info, diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 48fe96ab4649..e110857824fc 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -724,7 +724,7 @@ static int 
hpet_is_known(struct hpet_data *hdp) return 0; } -static struct ctl_table hpet_table[] = { +static const struct ctl_table hpet_table[] = { { .procname = "max-user-freq", .data = &hpet_max_freq, diff --git a/drivers/char/ipmi/ipmi_poweroff.c b/drivers/char/ipmi/ipmi_poweroff.c index 05f17e3e6207..e63c316d8aaa 100644 --- a/drivers/char/ipmi/ipmi_poweroff.c +++ b/drivers/char/ipmi/ipmi_poweroff.c @@ -650,7 +650,7 @@ static struct ipmi_smi_watcher smi_watcher = { #ifdef CONFIG_PROC_FS #include -static struct ctl_table ipmi_table[] = { +static const struct ctl_table ipmi_table[] = { { .procname = "poweroff_powercycle", .data = &poweroff_powercycle, .maxlen = sizeof(poweroff_powercycle), diff --git a/drivers/char/random.c b/drivers/char/random.c index 23ee76bbb4aa..2581186fa61b 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1665,7 +1665,7 @@ static int proc_do_rointvec(const struct ctl_table *table, int write, void *buf, return write ? 0 : proc_dointvec(table, 0, buf, lenp, ppos); } -static struct ctl_table random_table[] = { +static const struct ctl_table random_table[] = { { .procname = "poolsize", .data = &sysctl_poolsize, diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 2406cda75b7b..5384d1bb4923 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -4802,7 +4802,7 @@ err_unlock: return ret; } -static struct ctl_table oa_table[] = { +static const struct ctl_table oa_table[] = { { .procname = "perf_stream_paranoid", .data = &i915_perf_stream_paranoid, diff --git a/drivers/gpu/drm/xe/xe_observation.c b/drivers/gpu/drm/xe/xe_observation.c index 8ec1b84cbb9e..57cf01efc07f 100644 --- a/drivers/gpu/drm/xe/xe_observation.c +++ b/drivers/gpu/drm/xe/xe_observation.c @@ -56,7 +56,7 @@ int xe_observation_ioctl(struct drm_device *dev, void *data, struct drm_file *fi } } -static struct ctl_table observation_ctl_table[] = { +static const struct ctl_table observation_ctl_table[] = { { 
.procname = "observation_paranoid", .data = &xe_observation_paranoid, diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index af5d1dc451f6..f2e6f55d6ca6 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -141,7 +141,7 @@ static int sysctl_record_panic_msg = 1; * sysctl option to allow the user to control whether kmsg data should be * reported to Hyper-V on panic. */ -static struct ctl_table hv_ctl_table[] = { +static const struct ctl_table hv_ctl_table[] = { { .procname = "hyperv_record_panic_msg", .data = &sysctl_record_panic_msg, diff --git a/drivers/md/md.c b/drivers/md/md.c index 866015b681af..22f7bd3b94d5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -294,7 +294,7 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) static struct ctl_table_header *raid_table_header; -static struct ctl_table raid_table[] = { +static const struct ctl_table raid_table[] = { { .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c index 61b66e318488..7a3c34306de9 100644 --- a/drivers/misc/sgi-xp/xpc_main.c +++ b/drivers/misc/sgi-xp/xpc_main.c @@ -93,7 +93,7 @@ int xpc_disengage_timelimit = XPC_DISENGAGE_DEFAULT_TIMELIMIT; static int xpc_disengage_min_timelimit; /* = 0 */ static int xpc_disengage_max_timelimit = 120; -static struct ctl_table xpc_sys_xpc_hb[] = { +static const struct ctl_table xpc_sys_xpc_hb[] = { { .procname = "hb_interval", .data = &xpc_hb_interval, @@ -111,7 +111,7 @@ static struct ctl_table xpc_sys_xpc_hb[] = { .extra1 = &xpc_hb_check_min_interval, .extra2 = &xpc_hb_check_max_interval}, }; -static struct ctl_table xpc_sys_xpc[] = { +static const struct ctl_table xpc_sys_xpc[] = { { .procname = "disengage_timelimit", .data = &xpc_disengage_timelimit, diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index b5cc11abc962..0e360feb3432 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ 
-1279,7 +1279,7 @@ static int armv8pmu_proc_user_access_handler(const struct ctl_table *table, int return 0; } -static struct ctl_table armv8_pmu_sysctl_table[] = { +static const struct ctl_table armv8_pmu_sysctl_table[] = { { .procname = "perf_user_access", .data = &sysctl_perf_user_access, diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 194c153e5d71..698de8ddf895 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -1317,7 +1317,7 @@ static int riscv_pmu_proc_user_access_handler(const struct ctl_table *table, return 0; } -static struct ctl_table sbi_pmu_sysctl_table[] = { +static const struct ctl_table sbi_pmu_sysctl_table[] = { { .procname = "perf_user_access", .data = &sysctl_perf_user_access, diff --git a/drivers/scsi/scsi_sysctl.c b/drivers/scsi/scsi_sysctl.c index 093774d77534..be4aef0f4f99 100644 --- a/drivers/scsi/scsi_sysctl.c +++ b/drivers/scsi/scsi_sysctl.c @@ -12,7 +12,7 @@ #include "scsi_priv.h" -static struct ctl_table scsi_table[] = { +static const struct ctl_table scsi_table[] = { { .procname = "logging_level", .data = &scsi_logging_level, .maxlen = sizeof(scsi_logging_level), diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 94127868bedf..effb7e768165 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1639,7 +1639,7 @@ MODULE_PARM_DESC(allow_dio, "allow direct I/O (default: 0 (disallow))"); #ifdef CONFIG_SYSCTL #include -static struct ctl_table sg_sysctls[] = { +static const struct ctl_table sg_sysctls[] = { { .procname = "sg-big-buff", .data = &sg_big_buff, diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 50c0c23ae678..449dbd216460 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -3617,7 +3617,7 @@ void console_sysfs_notify(void) sysfs_notify(&consdev->kobj, NULL, "active"); } -static struct ctl_table tty_table[] = { +static const struct ctl_table tty_table[] = { { .procname = "legacy_tiocsti", .data = &tty_legacy_tiocsti, diff --git 
a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 528395133b4f..163f7f1d70f1 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -84,7 +84,7 @@ module_param(balloon_boot_timeout, uint, 0444); #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG static int xen_hotplug_unpopulated; -static struct ctl_table balloon_table[] = { +static const struct ctl_table balloon_table[] = { { .procname = "hotplug_unpopulated", .data = &xen_hotplug_unpopulated, diff --git a/fs/aio.c b/fs/aio.c index 50671640b588..7b976b564cfc 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -224,7 +224,7 @@ static unsigned long aio_nr; /* current system wide number of aio requests */ static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ /*----end sysctl variables---*/ #ifdef CONFIG_SYSCTL -static struct ctl_table aio_sysctls[] = { +static const struct ctl_table aio_sysctls[] = { { .procname = "aio-nr", .data = &aio_nr, diff --git a/fs/cachefiles/error_inject.c b/fs/cachefiles/error_inject.c index 1715d5ca2b2d..e341ade47dd8 100644 --- a/fs/cachefiles/error_inject.c +++ b/fs/cachefiles/error_inject.c @@ -11,7 +11,7 @@ unsigned int cachefiles_error_injection_state; static struct ctl_table_header *cachefiles_sysctl; -static struct ctl_table cachefiles_sysctls[] = { +static const struct ctl_table cachefiles_sysctls[] = { { .procname = "error_injection", .data = &cachefiles_error_injection_state, diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index 9f2d5743e2c8..0df46f09b6cc 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c @@ -14,7 +14,7 @@ static struct ctl_table_header *fs_table_header; -static struct ctl_table coda_table[] = { +static const struct ctl_table coda_table[] = { { .procname = "timeout", .data = &coda_timeout, diff --git a/fs/coredump.c b/fs/coredump.c index d48edb37bc35..591700e1b2ce 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -995,7 +995,7 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write, static const unsigned int 
core_file_note_size_min = CORE_FILE_NOTE_SIZE_DEFAULT; static const unsigned int core_file_note_size_max = CORE_FILE_NOTE_SIZE_MAX; -static struct ctl_table coredump_sysctls[] = { +static const struct ctl_table coredump_sysctls[] = { { .procname = "core_uses_pid", .data = &core_uses_pid, diff --git a/fs/dcache.c b/fs/dcache.c index 1a01d7a6a7a9..1cd929f17eec 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -192,7 +192,7 @@ static int proc_nr_dentry(const struct ctl_table *table, int write, void *buffer return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } -static struct ctl_table fs_dcache_sysctls[] = { +static const struct ctl_table fs_dcache_sysctls[] = { { .procname = "dentry-state", .data = &dentry_stat, diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index b20e565b9c5e..1096ff8562fa 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -45,7 +45,7 @@ static int pty_limit_min; static int pty_limit_max = INT_MAX; static atomic_t pty_count = ATOMIC_INIT(0); -static struct ctl_table pty_table[] = { +static const struct ctl_table pty_table[] = { { .procname = "max", .maxlen = sizeof(int), diff --git a/fs/eventpoll.c b/fs/eventpoll.c index f9898e60dd8b..7c0980db77b3 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -318,7 +318,7 @@ static void unlist_file(struct epitems_head *head) static long long_zero; static long long_max = LONG_MAX; -static struct ctl_table epoll_table[] = { +static const struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, diff --git a/fs/exec.c b/fs/exec.c index a49839174472..506cd411f4ac 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2159,7 +2159,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ return error; } -static struct ctl_table fs_exec_sysctls[] = { +static const struct ctl_table fs_exec_sysctls[] = { { .procname = "suid_dumpable", .data = &suid_dumpable, diff --git a/fs/file_table.c b/fs/file_table.c index a32171d2b83f..7f7c378c6e31 100644 
--- a/fs/file_table.c +++ b/fs/file_table.c @@ -106,7 +106,7 @@ static int proc_nr_files(const struct ctl_table *table, int write, void *buffer, return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } -static struct ctl_table fs_stat_sysctls[] = { +static const struct ctl_table fs_stat_sysctls[] = { { .procname = "file-nr", .data = &files_stat, diff --git a/fs/fuse/sysctl.c b/fs/fuse/sysctl.c index b272bb333005..63fb1e5bee30 100644 --- a/fs/fuse/sysctl.c +++ b/fs/fuse/sysctl.c @@ -13,7 +13,7 @@ static struct ctl_table_header *fuse_table_header; /* Bound by fuse_init_out max_pages, which is a u16 */ static unsigned int sysctl_fuse_max_pages_limit = 65535; -static struct ctl_table fuse_sysctl_table[] = { +static const struct ctl_table fuse_sysctl_table[] = { { .procname = "max_pages_limit", .data = &fuse_max_pages_limit, diff --git a/fs/inode.c b/fs/inode.c index 6b4c77268fc0..5587aabdaa5e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -184,7 +184,7 @@ static int proc_nr_inodes(const struct ctl_table *table, int write, void *buffer return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } -static struct ctl_table inodes_sysctls[] = { +static const struct ctl_table inodes_sysctls[] = { { .procname = "inode-nr", .data = &inodes_stat, diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 7ded57ec3a60..2c8eedc6c2cc 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -412,7 +412,7 @@ EXPORT_SYMBOL_GPL(lockd_down); * Sysctl parameters (same as module parameters, different interface). 
*/ -static struct ctl_table nlm_sysctls[] = { +static const struct ctl_table nlm_sysctls[] = { { .procname = "nlm_grace_period", .data = &nlm_grace_period, diff --git a/fs/locks.c b/fs/locks.c index 25afc8d9c9d1..1619cddfa7a4 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -97,7 +97,7 @@ static int leases_enable = 1; static int lease_break_time = 45; #ifdef CONFIG_SYSCTL -static struct ctl_table locks_sysctls[] = { +static const struct ctl_table locks_sysctls[] = { { .procname = "leases-enable", .data = &leases_enable, diff --git a/fs/namei.c b/fs/namei.c index e56c29a22d26..8c82afddd2ad 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1099,7 +1099,7 @@ static int sysctl_protected_fifos __read_mostly; static int sysctl_protected_regular __read_mostly; #ifdef CONFIG_SYSCTL -static struct ctl_table namei_sysctls[] = { +static const struct ctl_table namei_sysctls[] = { { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, diff --git a/fs/namespace.c b/fs/namespace.c index 4013fbac354a..a3ed3f2980cb 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5985,7 +5985,7 @@ const struct proc_ns_operations mntns_operations = { }; #ifdef CONFIG_SYSCTL -static struct ctl_table fs_namespace_sysctls[] = { +static const struct ctl_table fs_namespace_sysctls[] = { { .procname = "mount-max", .data = &sysctl_mount_max, diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 886a7c4c60b3..d1a92d8f8ba4 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -17,7 +17,7 @@ static const int nfs_set_port_min; static const int nfs_set_port_max = 65535; static struct ctl_table_header *nfs4_callback_sysctl_table; -static struct ctl_table nfs4_cb_sysctls[] = { +static const struct ctl_table nfs4_cb_sysctls[] = { { .procname = "nfs_callback_tcpport", .data = &nfs_callback_set_tcpport, diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index e645be1a3381..f579df0e8d67 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -14,7 +14,7 @@ static struct ctl_table_header 
*nfs_callback_sysctl_table; -static struct ctl_table nfs_cb_sysctls[] = { +static const struct ctl_table nfs_cb_sysctls[] = { { .procname = "nfs_mountpoint_timeout", .data = &nfs_mountpoint_expiry_timeout, diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 6004dfdfdf0f..c4cdaf5fa7ed 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -20,7 +20,7 @@ static int dir_notify_enable __read_mostly = 1; #ifdef CONFIG_SYSCTL -static struct ctl_table dnotify_sysctls[] = { +static const struct ctl_table dnotify_sysctls[] = { { .procname = "dir-notify-enable", .data = &dir_notify_enable, diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 6ff94e312232..ba3e2d09eb44 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -58,7 +58,7 @@ static int fanotify_max_queued_events __read_mostly; static long ft_zero = 0; static long ft_int_max = INT_MAX; -static struct ctl_table fanotify_table[] = { +static const struct ctl_table fanotify_table[] = { { .procname = "max_user_groups", .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index e0c48956608a..b372fb2c56bd 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -58,7 +58,7 @@ struct kmem_cache *inotify_inode_mark_cachep __ro_after_init; static long it_zero = 0; static long it_int_max = INT_MAX; -static struct ctl_table inotify_table[] = { +static const struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 20aa37b67cfb..ddd761cf44c8 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -650,7 +650,7 @@ error: * and easier to preserve the name. 
*/ -static struct ctl_table ocfs2_nm_table[] = { +static const struct ctl_table ocfs2_nm_table[] = { { .procname = "hb_ctl_path", .data = ocfs2_hb_ctl_path, diff --git a/fs/pipe.c b/fs/pipe.c index 82fede0f2111..94b59045ab44 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1478,7 +1478,7 @@ static int proc_dopipe_max_size(const struct ctl_table *table, int write, do_proc_dopipe_max_size_conv, NULL); } -static struct ctl_table fs_pipe_sysctls[] = { +static const struct ctl_table fs_pipe_sysctls[] = { { .procname = "pipe-max-size", .data = &pipe_max_size, diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index f9578918cfb2..825c5c2e0962 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2926,7 +2926,7 @@ static int do_proc_dqstats(const struct ctl_table *table, int write, return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } -static struct ctl_table fs_dqstats_table[] = { +static const struct ctl_table fs_dqstats_table[] = { { .procname = "lookups", .data = &dqstats.stat[DQST_LOOKUPS], diff --git a/fs/sysctls.c b/fs/sysctls.c index 8dbde9a802fa..ad429dffeb4b 100644 --- a/fs/sysctls.c +++ b/fs/sysctls.c @@ -7,7 +7,7 @@ #include #include -static struct ctl_table fs_shared_sysctls[] = { +static const struct ctl_table fs_shared_sysctls[] = { { .procname = "overflowuid", .data = &fs_overflowuid, diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 7c0bd0b55f88..97c4d71115d8 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -36,7 +36,7 @@ static int sysctl_unprivileged_userfaultfd __read_mostly; #ifdef CONFIG_SYSCTL -static struct ctl_table vm_userfaultfd_table[] = { +static const struct ctl_table vm_userfaultfd_table[] = { { .procname = "unprivileged_userfaultfd", .data = &sysctl_unprivileged_userfaultfd, diff --git a/fs/verity/init.c b/fs/verity/init.c index f440f0e61e3e..6e8d33b50240 100644 --- a/fs/verity/init.c +++ b/fs/verity/init.c @@ -10,7 +10,7 @@ #include #ifdef CONFIG_SYSCTL -static struct ctl_table fsverity_sysctl_table[] = { +static const 
struct ctl_table fsverity_sysctl_table[] = { #ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES { .procname = "require_signatures", diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index c84df23b494d..751dc74a3067 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -66,7 +66,7 @@ xfs_deprecated_dointvec_minmax( return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); } -static struct ctl_table xfs_table[] = { +static const struct ctl_table xfs_table[] = { { .procname = "irix_sgid_inherit", .data = &xfs_params.sgid_inherit.val, diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index f86ef92a6c46..f6867bad0d78 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -21,7 +21,7 @@ phys_addr_t phys_initrd_start __initdata; unsigned long phys_initrd_size __initdata; #ifdef CONFIG_SYSCTL -static struct ctl_table kern_do_mounts_initrd_table[] = { +static const struct ctl_table kern_do_mounts_initrd_table[] = { { .procname = "real-root-dev", .data = &real_root_dev, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7bfbc7c22367..5a0f8a5041d6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -157,7 +157,7 @@ static int __read_mostly sysctl_io_uring_disabled; static int __read_mostly sysctl_io_uring_group = -1; #ifdef CONFIG_SYSCTL -static struct ctl_table kernel_io_uring_disabled_table[] = { +static const struct ctl_table kernel_io_uring_disabled_table[] = { { .procname = "io_uring_disabled", .data = &sysctl_io_uring_disabled, diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 54318e0b4557..15b17e86e198 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -73,7 +73,7 @@ int ipc_mni = IPCMNI; int ipc_mni_shift = IPCMNI_SHIFT; int ipc_min_cycle = RADIX_TREE_MAP_SIZE; -static struct ctl_table ipc_sysctls[] = { +static const struct ctl_table ipc_sysctls[] = { { .procname = "shmmax", .data = &init_ipc_ns.shm_ctlmax, diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index b70dc2ff22d8..0dd12e1c9f53 100644 --- 
a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -20,7 +20,7 @@ static int msg_max_limit_max = HARD_MSGMAX; static int msg_maxsize_limit_min = MIN_MSGSIZEMAX; static int msg_maxsize_limit_max = HARD_MSGSIZEMAX; -static struct ctl_table mq_sysctls[] = { +static const struct ctl_table mq_sysctls[] = { { .procname = "queues_max", .data = &init_ipc_ns.mq_queues_max, diff --git a/kernel/acct.c b/kernel/acct.c index 179848ad33e9..31222e8cd534 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -76,7 +76,7 @@ static int acct_parm[3] = {4, 2, 30}; #define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */ #ifdef CONFIG_SYSCTL -static struct ctl_table kern_acct_table[] = { +static const struct ctl_table kern_acct_table[] = { { .procname = "acct", .data = &acct_parm, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0daf098e3207..c420edbfb7c8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6128,7 +6128,7 @@ static int bpf_unpriv_handler(const struct ctl_table *table, int write, return ret; } -static struct ctl_table bpf_syscall_table[] = { +static const struct ctl_table bpf_syscall_table[] = { { .procname = "unprivileged_bpf_disabled", .data = &sysctl_unprivileged_bpf_disabled, diff --git a/kernel/delayacct.c b/kernel/delayacct.c index b238eb8c6573..eb63a021ac04 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -64,7 +64,7 @@ static int sysctl_delayacct(const struct ctl_table *table, int write, void *buff return err; } -static struct ctl_table kern_delayacct_table[] = { +static const struct ctl_table kern_delayacct_table[] = { { .procname = "task_delayacct", .data = NULL, diff --git a/kernel/exit.c b/kernel/exit.c index 1dcddfe537ee..3485e5fc499e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -85,7 +85,7 @@ static unsigned int oops_limit = 10000; #ifdef CONFIG_SYSCTL -static struct ctl_table kern_exit_table[] = { +static const struct ctl_table kern_exit_table[] = { { .procname = "oops_limit", .data = &oops_limit, diff --git 
a/kernel/hung_task.c b/kernel/hung_task.c index 953169893a95..04efa7a6e69b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -274,7 +274,7 @@ static int proc_dohung_task_timeout_secs(const struct ctl_table *table, int writ * and hung_task_check_interval_secs */ static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ); -static struct ctl_table hung_task_sysctls[] = { +static const struct ctl_table hung_task_sysctls[] = { #ifdef CONFIG_SMP { .procname = "hung_task_all_cpu_backtrace", diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index b424a5c6ae87..c0bdc1686154 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -925,7 +925,7 @@ static int kexec_limit_handler(const struct ctl_table *table, int write, return proc_dointvec(&tmp, write, buffer, lenp, ppos); } -static struct ctl_table kexec_core_sysctls[] = { +static const struct ctl_table kexec_core_sysctls[] = { { .procname = "kexec_load_disabled", .data = &kexec_load_disabled, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 030569210670..88aeac84e4c0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -946,7 +946,7 @@ static int proc_kprobes_optimization_handler(const struct ctl_table *table, return ret; } -static struct ctl_table kprobe_sysctls[] = { +static const struct ctl_table kprobe_sysctls[] = { { .procname = "kprobes-optimization", .data = &sysctl_kprobes_optimization, diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 77ee3ea8a573..d4281d1e13a6 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -77,7 +77,7 @@ static int sysctl_latencytop(const struct ctl_table *table, int write, void *buf return err; } -static struct ctl_table latencytop_sysctl[] = { +static const struct ctl_table latencytop_sysctl[] = { { .procname = "latencytop", .data = &latencytop_enabled, diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 29acd238dad7..4470680f0226 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -79,7 +79,7 
@@ module_param(lock_stat, int, 0644); #endif #ifdef CONFIG_SYSCTL -static struct ctl_table kern_lockdep_table[] = { +static const struct ctl_table kern_lockdep_table[] = { #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", diff --git a/kernel/panic.c b/kernel/panic.c index fbc59b3b64d0..d8635d5cecb2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -84,7 +84,7 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); #ifdef CONFIG_SYSCTL -static struct ctl_table kern_panic_table[] = { +static const struct ctl_table kern_panic_table[] = { #ifdef CONFIG_SMP { .procname = "oops_all_cpu_backtrace", diff --git a/kernel/pid.c b/kernel/pid.c index 3a10a7b6fcf8..924084713be8 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -695,7 +695,7 @@ static struct ctl_table_root pid_table_root = { .set_ownership = pid_table_root_set_ownership, }; -static struct ctl_table pid_table[] = { +static const struct ctl_table pid_table[] = { { .procname = "pid_max", .data = &init_pid_ns.pid_max, diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index f1ffa032fc32..8f6cfec87555 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -303,7 +303,7 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write, return ret; } -static struct ctl_table pid_ns_ctl_table[] = { +static const struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", .maxlen = sizeof(int), diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h index 18ecaef6be41..5d8f981de7c5 100644 --- a/kernel/pid_sysctl.h +++ b/kernel/pid_sysctl.h @@ -31,7 +31,7 @@ static int pid_mfd_noexec_dointvec_minmax(const struct ctl_table *table, return err; } -static struct ctl_table pid_ns_ctl_table_vm[] = { +static const struct ctl_table pid_ns_ctl_table_vm[] = { { .procname = "memfd_noexec", .data = &init_pid_ns.memfd_noexec_scope, diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c index f5072dc85f7a..da77f3f5c1fe 100644 --- a/kernel/printk/sysctl.c 
+++ b/kernel/printk/sysctl.c @@ -20,7 +20,7 @@ static int proc_dointvec_minmax_sysadmin(const struct ctl_table *table, int writ return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } -static struct ctl_table printk_sysctls[] = { +static const struct ctl_table printk_sysctls[] = { { .procname = "printk", .data = &console_loglevel, diff --git a/kernel/reboot.c b/kernel/reboot.c index a701000bab34..b5a8569e5d81 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -1287,7 +1287,7 @@ static struct attribute *reboot_attrs[] = { }; #ifdef CONFIG_SYSCTL -static struct ctl_table kern_reboot_table[] = { +static const struct ctl_table kern_reboot_table[] = { { .procname = "poweroff_cmd", .data = &poweroff_cmd, diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index db68a964e34e..83d46b9b8ec8 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -9,7 +9,7 @@ static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; #ifdef CONFIG_SYSCTL -static struct ctl_table sched_autogroup_sysctls[] = { +static const struct ctl_table sched_autogroup_sysctls[] = { { .procname = "sched_autogroup_enabled", .data = &sysctl_sched_autogroup_enabled, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9142a0394d46..165c90ba64ea 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4654,7 +4654,7 @@ static int sysctl_schedstats(const struct ctl_table *table, int write, void *buf #endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_SYSCTL -static struct ctl_table sched_core_sysctls[] = { +static const struct ctl_table sched_core_sysctls[] = { #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 62192ac79c30..38e4537790af 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -26,7 +26,7 @@ static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ #ifdef 
CONFIG_SYSCTL -static struct ctl_table sched_dl_sysctls[] = { +static const struct ctl_table sched_dl_sysctls[] = { { .procname = "sched_deadline_period_max_us", .data = &sysctl_sched_dl_period_max, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1e78caa21436..ce2e94ccad0c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -133,7 +133,7 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #endif #ifdef CONFIG_SYSCTL -static struct ctl_table sched_fair_sysctls[] = { +static const struct ctl_table sched_fair_sysctls[] = { #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index bd66a46b06ac..4b8e33c615b1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -26,7 +26,7 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff size_t *lenp, loff_t *ppos); static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -static struct ctl_table sched_rt_sysctls[] = { +static const struct ctl_table sched_rt_sysctls[] = { { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index da33ec9e94ab..c49aea8c1025 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -312,7 +312,7 @@ static int sched_energy_aware_handler(const struct ctl_table *table, int write, return ret; } -static struct ctl_table sched_energy_aware_sysctls[] = { +static const struct ctl_table sched_energy_aware_sysctls[] = { { .procname = "sched_energy_aware", .data = &sysctl_sched_energy_aware, diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 385d48293a5f..f59381c4a2ff 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -2450,7 +2450,7 @@ static int seccomp_actions_logged_handler(const struct ctl_table *ro_table, int return ret; } -static struct ctl_table seccomp_sysctl_table[] = { +static const struct ctl_table 
seccomp_sysctl_table[] = { { .procname = "actions_avail", .data = (void *) &seccomp_actions_avail, diff --git a/kernel/signal.c b/kernel/signal.c index a2afd54303f0..875e97f6205a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4950,7 +4950,7 @@ static inline void siginfo_buildtime_checks(void) } #if defined(CONFIG_SYSCTL) -static struct ctl_table signal_debug_table[] = { +static const struct ctl_table signal_debug_table[] = { #ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { .procname = "exception-trace", diff --git a/kernel/stackleak.c b/kernel/stackleak.c index 0f4804f28c61..bb65321761b4 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -45,7 +45,7 @@ static int stack_erasing_sysctl(const struct ctl_table *table, int write, str_enabled_disabled(state)); return ret; } -static struct ctl_table stackleak_sysctls[] = { +static const struct ctl_table stackleak_sysctls[] = { { .procname = "stack_erasing", .data = NULL, diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c index 3ac98bb7fb82..eb2842bd0557 100644 --- a/kernel/sysctl-test.c +++ b/kernel/sysctl-test.c @@ -374,7 +374,7 @@ static void sysctl_test_register_sysctl_sz_invalid_extra_value( struct kunit *test) { unsigned char data = 0; - struct ctl_table table_foo[] = { + const struct ctl_table table_foo[] = { { .procname = "foo", .data = &data, @@ -386,7 +386,7 @@ static void sysctl_test_register_sysctl_sz_invalid_extra_value( }, }; - struct ctl_table table_bar[] = { + const struct ctl_table table_bar[] = { { .procname = "bar", .data = &data, @@ -398,7 +398,7 @@ static void sysctl_test_register_sysctl_sz_invalid_extra_value( }, }; - struct ctl_table table_qux[] = { + const struct ctl_table table_qux[] = { { .procname = "qux", .data = &data, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7ae7a4136855..cb57da499ebb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1609,7 +1609,7 @@ int proc_do_static_key(const struct ctl_table *table, int write, return ret; } -static struct ctl_table 
kern_table[] = { +static const struct ctl_table kern_table[] = { { .procname = "panic", .data = &panic_timeout, @@ -2021,7 +2021,7 @@ static struct ctl_table kern_table[] = { #endif }; -static struct ctl_table vm_table[] = { +static const struct ctl_table vm_table[] = { { .procname = "overcommit_memory", .data = &sysctl_overcommit_memory, diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 40706cb36920..c8f776dc6ee0 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -301,7 +301,7 @@ static int timer_migration_handler(const struct ctl_table *table, int write, return ret; } -static struct ctl_table timer_sysctl[] = { +static const struct ctl_table timer_sysctl[] = { { .procname = "timer_migration", .data = &sysctl_timer_migration, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f79eb9386c7f..728ecda6e8d4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -8780,7 +8780,7 @@ ftrace_enable_sysctl(const struct ctl_table *table, int write, return 0; } -static struct ctl_table ftrace_sysctls[] = { +static const struct ctl_table ftrace_sysctls[] = { { .procname = "ftrace_enabled", .data = &ftrace_enabled, diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 17bcad8f79de..97325fbd6283 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -2899,7 +2899,7 @@ static int set_max_user_events_sysctl(const struct ctl_table *table, int write, return ret; } -static struct ctl_table user_event_sysctls[] = { +static const struct ctl_table user_event_sysctls[] = { { .procname = "user_events_max", .data = &max_user_events, diff --git a/kernel/umh.c b/kernel/umh.c index be9234270777..b4da45a3a7cf 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -544,7 +544,7 @@ static int proc_cap_handler(const struct ctl_table *table, int write, return 0; } -static struct ctl_table usermodehelper_table[] = { +static const struct ctl_table usermodehelper_table[] = { { .procname = "bset", 
.data = &usermodehelper_bset, diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 7282f61a8650..bfbaaecb1dd4 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -75,7 +75,7 @@ static DEFINE_CTL_TABLE_POLL(hostname_poll); static DEFINE_CTL_TABLE_POLL(domainname_poll); // Note: update 'enum uts_proc' to match any changes to this table -static struct ctl_table uts_kern_table[] = { +static const struct ctl_table uts_kern_table[] = { { .procname = "arch", .data = init_uts_ns.name.machine, @@ -129,7 +129,7 @@ static struct ctl_table uts_kern_table[] = { */ void uts_proc_notify(enum uts_proc proc) { - struct ctl_table *table = &uts_kern_table[proc]; + const struct ctl_table *table = &uts_kern_table[proc]; proc_sys_poll_notify(table->poll); } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 177abb7d0d4e..b2da7de39d06 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -1094,7 +1094,7 @@ static int proc_watchdog_cpumask(const struct ctl_table *table, int write, static const int sixty = 60; -static struct ctl_table watchdog_sysctls[] = { +static const struct ctl_table watchdog_sysctls[] = { { .procname = "watchdog", .data = &watchdog_user_enabled, diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c index b6696fa1d426..4249e0cc8aaf 100644 --- a/lib/test_sysctl.c +++ b/lib/test_sysctl.c @@ -71,7 +71,7 @@ static struct test_sysctl_data test_data = { }; /* These are all under /proc/sys/debug/test_sysctl/ */ -static struct ctl_table test_table[] = { +static const struct ctl_table test_table[] = { { .procname = "int_0001", .data = &test_data.int_0001, @@ -177,7 +177,7 @@ static int test_sysctl_setup_node_tests(void) } /* Used to test that unregister actually removes the directory */ -static struct ctl_table test_table_unregister[] = { +static const struct ctl_table test_table_unregister[] = { { .procname = "unregister_error", .data = &test_data.int_0001, @@ -220,7 +220,7 @@ static int test_sysctl_run_register_mount_point(void) return 
0; } -static struct ctl_table test_table_empty[] = { }; +static const struct ctl_table test_table_empty[] = { }; static int test_sysctl_run_register_empty(void) { diff --git a/mm/compaction.c b/mm/compaction.c index 73e80b2fb22e..bcc0df0066dc 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -3272,7 +3272,7 @@ static int proc_dointvec_minmax_warn_RT_change(const struct ctl_table *table, return ret; } -static struct ctl_table vm_compaction[] = { +static const struct ctl_table vm_compaction[] = { { .procname = "compact_memory", .data = &sysctl_compact_memory, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 87761b042ed0..3b25b69aa94f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4867,7 +4867,7 @@ out: return ret; } -static struct ctl_table hugetlb_table[] = { +static const struct ctl_table hugetlb_table[] = { { .procname = "nr_hugepages", .data = NULL, diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 57b7f591eee8..7735972add01 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -693,7 +693,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l free_vmemmap_page_list(&vmemmap_pages); } -static struct ctl_table hugetlb_vmemmap_sysctls[] = { +static const struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", .data = &vmemmap_optimize_enabled, diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a7b8ccd29b6f..995a15eb67e2 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -124,7 +124,7 @@ const struct attribute_group memory_failure_attr_group = { .attrs = memory_failure_attr, }; -static struct ctl_table memory_failure_table[] = { +static const struct ctl_table memory_failure_table[] = { { .procname = "memory_failure_early_kill", .data = &sysctl_memory_failure_early_kill, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 044ebab2c941..1cf121ad7085 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -705,7 +705,7 @@ static void queue_oom_reaper(struct 
task_struct *tsk) } #ifdef CONFIG_SYSCTL -static struct ctl_table vm_oom_kill_table[] = { +static const struct ctl_table vm_oom_kill_table[] = { { .procname = "panic_on_oom", .data = &sysctl_panic_on_oom, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4f5970723cf2..eb55ece39c56 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2298,7 +2298,7 @@ static int page_writeback_cpu_online(unsigned int cpu) /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; -static struct ctl_table vm_page_writeback_sysctls[] = { +static const struct ctl_table vm_page_writeback_sysctls[] = { { .procname = "dirty_background_ratio", .data = &dirty_background_ratio, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e469c7ef9a4..579789600a3c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6166,7 +6166,7 @@ out: return ret; } -static struct ctl_table page_alloc_sysctl_table[] = { +static const struct ctl_table page_alloc_sysctl_table[] = { { .procname = "min_free_kbytes", .data = &min_free_kbytes, diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 1edc12862a7d..9b6c2f157f83 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -2038,7 +2038,7 @@ static int apparmor_dointvec(const struct ctl_table *table, int write, return proc_dointvec(table, write, buffer, lenp, ppos); } -static struct ctl_table apparmor_sysctl_table[] = { +static const struct ctl_table apparmor_sysctl_table[] = { #ifdef CONFIG_USER_NS { .procname = "unprivileged_userns_apparmor_policy", diff --git a/security/keys/sysctl.c b/security/keys/sysctl.c index 91f000eef3ad..cde08c478f32 100644 --- a/security/keys/sysctl.c +++ b/security/keys/sysctl.c @@ -9,7 +9,7 @@ #include #include "internal.h" -static struct ctl_table key_sysctls[] = { +static const struct ctl_table key_sysctls[] = { { .procname = "maxkeys", .data = &key_quota_maxkeys, diff --git a/security/yama/yama_lsm.c 
b/security/yama/yama_lsm.c index 1a2d02fee09b..1971710620c1 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -452,7 +452,7 @@ static int yama_dointvec_minmax(const struct ctl_table *table, int write, static int max_scope = YAMA_SCOPE_NO_ATTACH; -static struct ctl_table yama_sysctl_table[] = { +static const struct ctl_table yama_sysctl_table[] = { { .procname = "ptrace_scope", .data = &ptrace_scope, -- 2.51.0 From f6ab7384d554ba80ff4793259d75535874b366f5 Mon Sep 17 00:00:00 2001 From: Luo Yifan Date: Tue, 28 Jan 2025 23:27:01 +0900 Subject: [PATCH 11/16] tools/bootconfig: Fix the wrong format specifier Use '%u' instead of '%d' for unsigned int. Link: https://lore.kernel.org/all/20241105011048.201629-1-luoyifan@cmss.chinamobile.com/ Fixes: 973780011106 ("tools/bootconfig: Suppress non-error messages") Signed-off-by: Luo Yifan Signed-off-by: Masami Hiramatsu (Google) --- tools/bootconfig/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index 156b62a163c5..8a48cc2536f5 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -226,7 +226,7 @@ static int load_xbc_from_initrd(int fd, char **buf) /* Wrong Checksum */ rcsum = xbc_calc_checksum(*buf, size); if (csum != rcsum) { - pr_err("checksum error: %d != %d\n", csum, rcsum); + pr_err("checksum error: %u != %u\n", csum, rcsum); return -EINVAL; } @@ -395,7 +395,7 @@ static int apply_xbc(const char *path, const char *xbc_path) xbc_get_info(&ret, NULL); printf("\tNumber of nodes: %d\n", ret); printf("\tSize: %u bytes\n", (unsigned int)size); - printf("\tChecksum: %d\n", (unsigned int)csum); + printf("\tChecksum: %u\n", (unsigned int)csum); /* TODO: Check the options by schema */ xbc_exit(); -- 2.51.0 From 1105ab42a84bc11c62597005f78ccad2434fbd66 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 27 Jan 2025 12:43:37 +0100 Subject: [PATCH 12/16] x86/sev: Disable jump tables in SEV startup code When 
retpolines and IBT are both disabled, the compiler is free to use jump tables to optimize switch statements. However, these are emitted by Clang as absolute references into .rodata: jmp *-0x7dfffe90(,%r9,8) R_X86_64_32S .rodata+0x170 Given that this code will execute before that address in .rodata has even been mapped, it is guaranteed to crash a SEV-SNP guest in a way that is difficult to diagnose. So disable jump tables when building this code. It would be better if we could attach this annotation to the __head macro but this appears to be impossible. Reported-by: Linus Torvalds Tested-by: Linus Torvalds Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250127114334.1045857-6-ardb+git@google.com --- arch/x86/coco/sev/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile index 08de37559307..dcb06dc8b5ae 100644 --- a/arch/x86/coco/sev/Makefile +++ b/arch/x86/coco/sev/Makefile @@ -2,6 +2,10 @@ obj-y += core.o +# jump tables are emitted using absolute references in non-PIC code +# so they cannot be used in the early SEV startup code +CFLAGS_core.o += -fno-jump-tables + ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_core.o = -pg endif -- 2.51.0 From 27560b371ab82c1894d048aef0d113acb093f67f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 29 Jan 2025 07:37:57 +0100 Subject: [PATCH 13/16] fs: pack struct kstat better Move the change_cookie and subvol up to avoid two 4-byte holes. 
Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- include/linux/stat.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/stat.h b/include/linux/stat.h index 9d8382e23a9c..be7496a6a0dd 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -50,11 +50,11 @@ struct kstat { struct timespec64 btime; /* File creation time */ u64 blocks; u64 mnt_id; + u64 change_cookie; + u64 subvol; u32 dio_mem_align; u32 dio_offset_align; u32 dio_read_offset_align; - u64 change_cookie; - u64 subvol; u32 atomic_write_unit_min; u32 atomic_write_unit_max; u32 atomic_write_segments_max; -- 2.51.0 From b0430f39de089920e3aab3f4a9c35c35110bdbea Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 23 Jan 2025 13:29:03 -0800 Subject: [PATCH 14/16] lib/crc: simplify the kconfig options for CRC implementations Make the following simplifications to the kconfig options for choosing CRC implementations for CRC32 and CRC_T10DIF: 1. Make the option to disable the arch-optimized code be visible only when CONFIG_EXPERT=y. 2. Make a single option control the inclusion of the arch-optimized code for all enabled CRC variants. 3. Make CRC32_SARWATE (a.k.a. slice-by-1 or byte-by-byte) be the only generic CRC32 implementation. The result is there is now just one option, CRC_OPTIMIZATIONS, which is default y and can be disabled only when CONFIG_EXPERT=y. Rationale: 1. Enabling the arch-optimized code is nearly always the right choice. However, people trying to build the tiniest kernel possible would find some use in disabling it. Anything we add to CRC32 is de facto unconditional, given that CRC32 gets selected by something in nearly all kernels. And unfortunately enabling the arch CRC code does not eliminate the need to build the generic CRC code into the kernel too, due to CPU feature dependencies. 
The size of the arch CRC code will also increase slightly over time as more CRC variants get added and more implementations targeting different instruction set extensions get added. Thus, it seems worthwhile to still provide an option to disable it, but it should be considered an expert-level tweak. 2. Considering the use case described in (1), there doesn't seem to be sufficient value in making the arch-optimized CRC code be independently configurable for different CRC variants. Note also that multiple variants were already grouped together, e.g. CONFIG_CRC32 actually enables three different variants of CRC32. 3. The bit-by-bit implementation is uselessly slow, whereas slice-by-n for n=4 and n=8 use tables that are inconveniently large: 4096 bytes and 8192 bytes respectively, compared to 1024 bytes for n=1. Higher n gives higher instruction-level parallelism, so higher n easily wins on traditional microbenchmarks on most CPUs. However, the larger tables, which are accessed randomly, can be harmful in real-world situations where the dcache may be cold or useful data may need to be evicted from the dcache. Meanwhile, today most architectures have much faster CRC32 implementations using dedicated CRC32 instructions or carryless multiplication instructions anyway, which make the generic code obsolete in most cases especially on long messages. Another reason for going with n=1 is that this is already what is used by all the other CRC variants in the kernel. CRC32 was unique in having support for larger tables. But as per the above this can be considered an outdated optimization. The standardization on slice-by-1 a.k.a. CRC32_SARWATE makes much of the code in lib/crc32.c unused. A later patch will clean that up. Link: https://lore.kernel.org/r/20250123212904.118683-2-ebiggers@kernel.org Reviewed-by: Ard Biesheuvel Reviewed-by: Martin K. 
Petersen Signed-off-by: Eric Biggers --- lib/Kconfig | 116 +++++++--------------------------------------------- 1 file changed, 14 insertions(+), 102 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index a78d22c6507f..e08b26e8e03f 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -164,34 +164,9 @@ config CRC_T10DIF config ARCH_HAS_CRC_T10DIF bool -choice - prompt "CRC-T10DIF implementation" - depends on CRC_T10DIF - default CRC_T10DIF_IMPL_ARCH if ARCH_HAS_CRC_T10DIF - default CRC_T10DIF_IMPL_GENERIC if !ARCH_HAS_CRC_T10DIF - help - This option allows you to override the default choice of CRC-T10DIF - implementation. - -config CRC_T10DIF_IMPL_ARCH - bool "Architecture-optimized" if ARCH_HAS_CRC_T10DIF - help - Use the optimized implementation of CRC-T10DIF for the selected - architecture. It is recommended to keep this enabled, as it can - greatly improve CRC-T10DIF performance. - -config CRC_T10DIF_IMPL_GENERIC - bool "Generic implementation" - help - Use the generic table-based implementation of CRC-T10DIF. Selecting - this will reduce code size slightly but can greatly reduce CRC-T10DIF - performance. - -endchoice - config CRC_T10DIF_ARCH tristate - default CRC_T10DIF if CRC_T10DIF_IMPL_ARCH + default CRC_T10DIF if ARCH_HAS_CRC_T10DIF && CRC_OPTIMIZATIONS config CRC64_ROCKSOFT tristate "CRC calculation for the Rocksoft model CRC64" @@ -214,6 +189,7 @@ config CRC32 tristate "CRC32/CRC32c functions" default y select BITREVERSE + select CRC32_SARWATE help This option is provided for the case where no in-kernel-tree modules require CRC32/CRC32c functions, but a module built outside @@ -223,87 +199,12 @@ config CRC32 config ARCH_HAS_CRC32 bool -choice - prompt "CRC32 implementation" - depends on CRC32 - default CRC32_IMPL_ARCH_PLUS_SLICEBY8 if ARCH_HAS_CRC32 - default CRC32_IMPL_SLICEBY8 if !ARCH_HAS_CRC32 - help - This option allows you to override the default choice of CRC32 - implementation. Choose the default unless you know that you need one - of the others. 
- -config CRC32_IMPL_ARCH_PLUS_SLICEBY8 - bool "Arch-optimized, with fallback to slice-by-8" if ARCH_HAS_CRC32 - help - Use architecture-optimized implementation of CRC32. Fall back to - slice-by-8 in cases where the arch-optimized implementation cannot be - used, e.g. if the CPU lacks support for the needed instructions. - - This is the default when an arch-optimized implementation exists. - -config CRC32_IMPL_ARCH_PLUS_SLICEBY1 - bool "Arch-optimized, with fallback to slice-by-1" if ARCH_HAS_CRC32 - help - Use architecture-optimized implementation of CRC32, but fall back to - slice-by-1 instead of slice-by-8 in order to reduce the binary size. - -config CRC32_IMPL_SLICEBY8 - bool "Slice by 8 bytes" - help - Calculate checksum 8 bytes at a time with a clever slicing algorithm. - This is much slower than the architecture-optimized implementation of - CRC32 (if the selected arch has one), but it is portable and is the - fastest implementation when no arch-optimized implementation is - available. It uses an 8KiB lookup table. Most modern processors have - enough cache to hold this table without thrashing the cache. - -config CRC32_IMPL_SLICEBY4 - bool "Slice by 4 bytes" - help - Calculate checksum 4 bytes at a time with a clever slicing algorithm. - This is a bit slower than slice by 8, but has a smaller 4KiB lookup - table. - - Only choose this option if you know what you are doing. - -config CRC32_IMPL_SLICEBY1 - bool "Slice by 1 byte (Sarwate's algorithm)" - help - Calculate checksum a byte at a time using Sarwate's algorithm. This - is not particularly fast, but has a small 1KiB lookup table. - - Only choose this option if you know what you are doing. - -config CRC32_IMPL_BIT - bool "Classic Algorithm (one bit at a time)" - help - Calculate checksum one bit at a time. This is VERY slow, but has - no lookup table. This is provided as a debugging option. - - Only choose this option if you are debugging crc32. 
- -endchoice - config CRC32_ARCH tristate - default CRC32 if CRC32_IMPL_ARCH_PLUS_SLICEBY8 || CRC32_IMPL_ARCH_PLUS_SLICEBY1 - -config CRC32_SLICEBY8 - bool - default y if CRC32_IMPL_SLICEBY8 || CRC32_IMPL_ARCH_PLUS_SLICEBY8 - -config CRC32_SLICEBY4 - bool - default y if CRC32_IMPL_SLICEBY4 + default CRC32 if ARCH_HAS_CRC32 && CRC_OPTIMIZATIONS config CRC32_SARWATE bool - default y if CRC32_IMPL_SLICEBY1 || CRC32_IMPL_ARCH_PLUS_SLICEBY1 - -config CRC32_BIT - bool - default y if CRC32_IMPL_BIT config CRC64 tristate "CRC64 functions" @@ -343,6 +244,17 @@ config CRC8 when they need to do cyclic redundancy check according CRC8 algorithm. Module will be called crc8. +config CRC_OPTIMIZATIONS + bool "Enable optimized CRC implementations" if EXPERT + default y + help + Disabling this option reduces code size slightly by disabling the + architecture-optimized implementations of any CRC variants that are + enabled. CRC checksumming performance may get much slower. + + Keep this enabled unless you're really trying to minimize the size of + the kernel. + config XXHASH tristate -- 2.51.0 From 5e3c1c48fac3793c173567df735890d4e29cbb64 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 23 Jan 2025 13:29:04 -0800 Subject: [PATCH 15/16] lib/crc32: remove other generic implementations Now that we've standardized on the byte-by-byte implementation of CRC32 as the only generic implementation (see previous commit for the rationale), remove the code for the other implementations. Tested with crc_kunit. Link: https://lore.kernel.org/r/20250123212904.118683-3-ebiggers@kernel.org Reviewed-by: Ard Biesheuvel Reviewed-by: Martin K. 
Petersen Signed-off-by: Eric Biggers --- lib/Kconfig | 4 - lib/crc32.c | 225 ++----------------------------------------- lib/crc32defs.h | 59 ------------ lib/gen_crc32table.c | 113 ++++++---------------- 4 files changed, 40 insertions(+), 361 deletions(-) delete mode 100644 lib/crc32defs.h diff --git a/lib/Kconfig b/lib/Kconfig index e08b26e8e03f..dccb61b7d698 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -189,7 +189,6 @@ config CRC32 tristate "CRC32/CRC32c functions" default y select BITREVERSE - select CRC32_SARWATE help This option is provided for the case where no in-kernel-tree modules require CRC32/CRC32c functions, but a module built outside @@ -203,9 +202,6 @@ config CRC32_ARCH tristate default CRC32 if ARCH_HAS_CRC32 && CRC_OPTIMIZATIONS -config CRC32_SARWATE - bool - config CRC64 tristate "CRC64 functions" help diff --git a/lib/crc32.c b/lib/crc32.c index 47151624332e..ede6131f66fc 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -30,20 +30,6 @@ #include #include #include -#include -#include "crc32defs.h" - -#if CRC_LE_BITS > 8 -# define tole(x) ((__force u32) cpu_to_le32(x)) -#else -# define tole(x) (x) -#endif - -#if CRC_BE_BITS > 8 -# define tobe(x) ((__force u32) cpu_to_be32(x)) -#else -# define tobe(x) (x) -#endif #include "crc32table.h" @@ -51,157 +37,20 @@ MODULE_AUTHOR("Matt Domsch "); MODULE_DESCRIPTION("Various CRC32 calculations"); MODULE_LICENSE("GPL"); -#if CRC_LE_BITS > 8 || CRC_BE_BITS > 8 - -/* implements slicing-by-4 or slicing-by-8 algorithm */ -static inline u32 __pure -crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) -{ -# ifdef __LITTLE_ENDIAN -# define DO_CRC(x) crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8) -# define DO_CRC4 (t3[(q) & 255] ^ t2[(q >> 8) & 255] ^ \ - t1[(q >> 16) & 255] ^ t0[(q >> 24) & 255]) -# define DO_CRC8 (t7[(q) & 255] ^ t6[(q >> 8) & 255] ^ \ - t5[(q >> 16) & 255] ^ t4[(q >> 24) & 255]) -# else -# define DO_CRC(x) crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8) -# define DO_CRC4 
(t0[(q) & 255] ^ t1[(q >> 8) & 255] ^ \ - t2[(q >> 16) & 255] ^ t3[(q >> 24) & 255]) -# define DO_CRC8 (t4[(q) & 255] ^ t5[(q >> 8) & 255] ^ \ - t6[(q >> 16) & 255] ^ t7[(q >> 24) & 255]) -# endif - const u32 *b; - size_t rem_len; -# ifdef CONFIG_X86 - size_t i; -# endif - const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3]; -# if CRC_LE_BITS != 32 - const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7]; -# endif - u32 q; - - /* Align it */ - if (unlikely((long)buf & 3 && len)) { - do { - DO_CRC(*buf++); - } while ((--len) && ((long)buf)&3); - } - -# if CRC_LE_BITS == 32 - rem_len = len & 3; - len = len >> 2; -# else - rem_len = len & 7; - len = len >> 3; -# endif - - b = (const u32 *)buf; -# ifdef CONFIG_X86 - --b; - for (i = 0; i < len; i++) { -# else - for (--b; len; --len) { -# endif - q = crc ^ *++b; /* use pre increment for speed */ -# if CRC_LE_BITS == 32 - crc = DO_CRC4; -# else - crc = DO_CRC8; - q = *++b; - crc ^= DO_CRC4; -# endif - } - len = rem_len; - /* And the last few bytes */ - if (len) { - u8 *p = (u8 *)(b + 1) - 1; -# ifdef CONFIG_X86 - for (i = 0; i < len; i++) - DO_CRC(*++p); /* use pre increment for speed */ -# else - do { - DO_CRC(*++p); /* use pre increment for speed */ - } while (--len); -# endif - } - return crc; -#undef DO_CRC -#undef DO_CRC4 -#undef DO_CRC8 -} -#endif - - -/** - * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II - * CRC32/CRC32C - * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for other - * uses, or the previous crc32/crc32c value if computing incrementally. 
- * @p: pointer to buffer over which CRC32/CRC32C is run - * @len: length of buffer @p - * @tab: little-endian Ethernet table - * @polynomial: CRC32/CRC32c LE polynomial - */ -static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p, - size_t len, const u32 (*tab)[256], - u32 polynomial) +u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len) { -#if CRC_LE_BITS == 1 - int i; - while (len--) { - crc ^= *p++; - for (i = 0; i < 8; i++) - crc = (crc >> 1) ^ ((crc & 1) ? polynomial : 0); - } -# elif CRC_LE_BITS == 2 - while (len--) { - crc ^= *p++; - crc = (crc >> 2) ^ tab[0][crc & 3]; - crc = (crc >> 2) ^ tab[0][crc & 3]; - crc = (crc >> 2) ^ tab[0][crc & 3]; - crc = (crc >> 2) ^ tab[0][crc & 3]; - } -# elif CRC_LE_BITS == 4 - while (len--) { - crc ^= *p++; - crc = (crc >> 4) ^ tab[0][crc & 15]; - crc = (crc >> 4) ^ tab[0][crc & 15]; - } -# elif CRC_LE_BITS == 8 - /* aka Sarwate algorithm */ - while (len--) { - crc ^= *p++; - crc = (crc >> 8) ^ tab[0][crc & 255]; - } -# else - crc = (__force u32) __cpu_to_le32(crc); - crc = crc32_body(crc, p, len, tab); - crc = __le32_to_cpu((__force __le32)crc); -#endif + while (len--) + crc = (crc >> 8) ^ crc32table_le[(crc & 255) ^ *p++]; return crc; } +EXPORT_SYMBOL(crc32_le_base); -#if CRC_LE_BITS == 1 -u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len) -{ - return crc32_le_generic(crc, p, len, NULL, CRC32_POLY_LE); -} -u32 __pure crc32c_le_base(u32 crc, const u8 *p, size_t len) -{ - return crc32_le_generic(crc, p, len, NULL, CRC32C_POLY_LE); -} -#else -u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len) -{ - return crc32_le_generic(crc, p, len, crc32table_le, CRC32_POLY_LE); -} u32 __pure crc32c_le_base(u32 crc, const u8 *p, size_t len) { - return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE); + while (len--) + crc = (crc >> 8) ^ crc32ctable_le[(crc & 255) ^ *p++]; + return crc; } -#endif -EXPORT_SYMBOL(crc32_le_base); EXPORT_SYMBOL(crc32c_le_base); /* @@ -277,64 +126,10 @@ u32 
__attribute_const__ __crc32c_le_shift(u32 crc, size_t len) EXPORT_SYMBOL(crc32_le_shift); EXPORT_SYMBOL(__crc32c_le_shift); -/** - * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 - * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for - * other uses, or the previous crc32 value if computing incrementally. - * @p: pointer to buffer over which CRC32 is run - * @len: length of buffer @p - * @tab: big-endian Ethernet table - * @polynomial: CRC32 BE polynomial - */ -static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p, - size_t len, const u32 (*tab)[256], - u32 polynomial) -{ -#if CRC_BE_BITS == 1 - int i; - while (len--) { - crc ^= *p++ << 24; - for (i = 0; i < 8; i++) - crc = - (crc << 1) ^ ((crc & 0x80000000) ? polynomial : - 0); - } -# elif CRC_BE_BITS == 2 - while (len--) { - crc ^= *p++ << 24; - crc = (crc << 2) ^ tab[0][crc >> 30]; - crc = (crc << 2) ^ tab[0][crc >> 30]; - crc = (crc << 2) ^ tab[0][crc >> 30]; - crc = (crc << 2) ^ tab[0][crc >> 30]; - } -# elif CRC_BE_BITS == 4 - while (len--) { - crc ^= *p++ << 24; - crc = (crc << 4) ^ tab[0][crc >> 28]; - crc = (crc << 4) ^ tab[0][crc >> 28]; - } -# elif CRC_BE_BITS == 8 - while (len--) { - crc ^= *p++ << 24; - crc = (crc << 8) ^ tab[0][crc >> 24]; - } -# else - crc = (__force u32) __cpu_to_be32(crc); - crc = crc32_body(crc, p, len, tab); - crc = __be32_to_cpu((__force __be32)crc); -# endif - return crc; -} - -#if CRC_BE_BITS == 1 -u32 __pure crc32_be_base(u32 crc, const u8 *p, size_t len) -{ - return crc32_be_generic(crc, p, len, NULL, CRC32_POLY_BE); -} -#else u32 __pure crc32_be_base(u32 crc, const u8 *p, size_t len) { - return crc32_be_generic(crc, p, len, crc32table_be, CRC32_POLY_BE); + while (len--) + crc = (crc << 8) ^ crc32table_be[(crc >> 24) ^ *p++]; + return crc; } -#endif EXPORT_SYMBOL(crc32_be_base); diff --git a/lib/crc32defs.h b/lib/crc32defs.h deleted file mode 100644 index 0c8fb5923e7e..000000000000 --- a/lib/crc32defs.h +++ 
/dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -/* Try to choose an implementation variant via Kconfig */ -#ifdef CONFIG_CRC32_SLICEBY8 -# define CRC_LE_BITS 64 -# define CRC_BE_BITS 64 -#endif -#ifdef CONFIG_CRC32_SLICEBY4 -# define CRC_LE_BITS 32 -# define CRC_BE_BITS 32 -#endif -#ifdef CONFIG_CRC32_SARWATE -# define CRC_LE_BITS 8 -# define CRC_BE_BITS 8 -#endif -#ifdef CONFIG_CRC32_BIT -# define CRC_LE_BITS 1 -# define CRC_BE_BITS 1 -#endif - -/* - * How many bits at a time to use. Valid values are 1, 2, 4, 8, 32 and 64. - * For less performance-sensitive, use 4 or 8 to save table size. - * For larger systems choose same as CPU architecture as default. - * This works well on X86_64, SPARC64 systems. This may require some - * elaboration after experiments with other architectures. - */ -#ifndef CRC_LE_BITS -# ifdef CONFIG_64BIT -# define CRC_LE_BITS 64 -# else -# define CRC_LE_BITS 32 -# endif -#endif -#ifndef CRC_BE_BITS -# ifdef CONFIG_64BIT -# define CRC_BE_BITS 64 -# else -# define CRC_BE_BITS 32 -# endif -#endif - -/* - * Little-endian CRC computation. Used with serial bit streams sent - * lsbit-first. Be sure to use cpu_to_le32() to append the computed CRC. - */ -#if CRC_LE_BITS > 64 || CRC_LE_BITS < 1 || CRC_LE_BITS == 16 || \ - CRC_LE_BITS & CRC_LE_BITS-1 -# error "CRC_LE_BITS must be one of {1, 2, 4, 8, 32, 64}" -#endif - -/* - * Big-endian CRC computation. Used with serial bit streams sent - * msbit-first. Be sure to use cpu_to_be32() to append the computed CRC. 
- */ -#if CRC_BE_BITS > 64 || CRC_BE_BITS < 1 || CRC_BE_BITS == 16 || \ - CRC_BE_BITS & CRC_BE_BITS-1 -# error "CRC_BE_BITS must be one of {1, 2, 4, 8, 32, 64}" -#endif diff --git a/lib/gen_crc32table.c b/lib/gen_crc32table.c index f755b997b967..6d03425b849e 100644 --- a/lib/gen_crc32table.c +++ b/lib/gen_crc32table.c @@ -2,30 +2,11 @@ #include #include "../include/linux/crc32poly.h" #include "../include/generated/autoconf.h" -#include "crc32defs.h" #include -#define ENTRIES_PER_LINE 4 - -#if CRC_LE_BITS > 8 -# define LE_TABLE_ROWS (CRC_LE_BITS/8) -# define LE_TABLE_SIZE 256 -#else -# define LE_TABLE_ROWS 1 -# define LE_TABLE_SIZE (1 << CRC_LE_BITS) -#endif - -#if CRC_BE_BITS > 8 -# define BE_TABLE_ROWS (CRC_BE_BITS/8) -# define BE_TABLE_SIZE 256 -#else -# define BE_TABLE_ROWS 1 -# define BE_TABLE_SIZE (1 << CRC_BE_BITS) -#endif - -static uint32_t crc32table_le[LE_TABLE_ROWS][256]; -static uint32_t crc32table_be[BE_TABLE_ROWS][256]; -static uint32_t crc32ctable_le[LE_TABLE_ROWS][256]; +static uint32_t crc32table_le[256]; +static uint32_t crc32table_be[256]; +static uint32_t crc32ctable_le[256]; /** * crc32init_le() - allocate and initialize LE table data @@ -34,25 +15,17 @@ static uint32_t crc32ctable_le[LE_TABLE_ROWS][256]; * fact that crctable[i^j] = crctable[i] ^ crctable[j]. * */ -static void crc32init_le_generic(const uint32_t polynomial, - uint32_t (*tab)[256]) +static void crc32init_le_generic(const uint32_t polynomial, uint32_t tab[256]) { unsigned i, j; uint32_t crc = 1; - tab[0][0] = 0; + tab[0] = 0; - for (i = LE_TABLE_SIZE >> 1; i; i >>= 1) { + for (i = 128; i; i >>= 1) { crc = (crc >> 1) ^ ((crc & 1) ? 
polynomial : 0); - for (j = 0; j < LE_TABLE_SIZE; j += 2 * i) - tab[0][i + j] = crc ^ tab[0][j]; - } - for (i = 0; i < LE_TABLE_SIZE; i++) { - crc = tab[0][i]; - for (j = 1; j < LE_TABLE_ROWS; j++) { - crc = tab[0][crc & 0xff] ^ (crc >> 8); - tab[j][i] = crc; - } + for (j = 0; j < 256; j += 2 * i) + tab[i + j] = crc ^ tab[j]; } } @@ -74,34 +47,22 @@ static void crc32init_be(void) unsigned i, j; uint32_t crc = 0x80000000; - crc32table_be[0][0] = 0; + crc32table_be[0] = 0; - for (i = 1; i < BE_TABLE_SIZE; i <<= 1) { + for (i = 1; i < 256; i <<= 1) { crc = (crc << 1) ^ ((crc & 0x80000000) ? CRC32_POLY_BE : 0); for (j = 0; j < i; j++) - crc32table_be[0][i + j] = crc ^ crc32table_be[0][j]; - } - for (i = 0; i < BE_TABLE_SIZE; i++) { - crc = crc32table_be[0][i]; - for (j = 1; j < BE_TABLE_ROWS; j++) { - crc = crc32table_be[0][(crc >> 24) & 0xff] ^ (crc << 8); - crc32table_be[j][i] = crc; - } + crc32table_be[i + j] = crc ^ crc32table_be[j]; } } -static void output_table(uint32_t (*table)[256], int rows, int len, char *trans) +static void output_table(const uint32_t table[256]) { - int i, j; - - for (j = 0 ; j < rows; j++) { - printf("{"); - for (i = 0; i < len - 1; i++) { - if (i % ENTRIES_PER_LINE == 0) - printf("\n"); - printf("%s(0x%8.8xL), ", trans, table[j][i]); - } - printf("%s(0x%8.8xL)},\n", trans, table[j][len - 1]); + int i; + + for (i = 0; i < 256; i += 4) { + printf("\t0x%08x, 0x%08x, 0x%08x, 0x%08x,\n", + table[i], table[i + 1], table[i + 2], table[i + 3]); } } @@ -109,34 +70,20 @@ int main(int argc, char** argv) { printf("/* this file is generated - do not edit */\n\n"); - if (CRC_LE_BITS > 1) { - crc32init_le(); - printf("static const u32 ____cacheline_aligned " - "crc32table_le[%d][%d] = {", - LE_TABLE_ROWS, LE_TABLE_SIZE); - output_table(crc32table_le, LE_TABLE_ROWS, - LE_TABLE_SIZE, "tole"); - printf("};\n"); - } + crc32init_le(); + printf("static const u32 ____cacheline_aligned crc32table_le[256] = {\n"); + output_table(crc32table_le); + printf("};\n"); 
- if (CRC_BE_BITS > 1) { - crc32init_be(); - printf("static const u32 ____cacheline_aligned " - "crc32table_be[%d][%d] = {", - BE_TABLE_ROWS, BE_TABLE_SIZE); - output_table(crc32table_be, LE_TABLE_ROWS, - BE_TABLE_SIZE, "tobe"); - printf("};\n"); - } - if (CRC_LE_BITS > 1) { - crc32cinit_le(); - printf("static const u32 ____cacheline_aligned " - "crc32ctable_le[%d][%d] = {", - LE_TABLE_ROWS, LE_TABLE_SIZE); - output_table(crc32ctable_le, LE_TABLE_ROWS, - LE_TABLE_SIZE, "tole"); - printf("};\n"); - } + crc32init_be(); + printf("static const u32 ____cacheline_aligned crc32table_be[256] = {\n"); + output_table(crc32table_be); + printf("};\n"); + + crc32cinit_le(); + printf("static const u32 ____cacheline_aligned crc32ctable_le[256] = {\n"); + output_table(crc32ctable_le); + printf("};\n"); return 0; } -- 2.51.0 From 35fcac7a7c25cc04f730b9570c737f31295fa92d Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 29 Jan 2025 20:06:52 +0800 Subject: [PATCH 16/16] audit: Initialize lsmctx to avoid memory allocation error When audit is enabled in a kernel build, and there are no LSMs active that support LSM labeling, it is possible that local variable lsmctx in the AUDIT_SIGNAL_INFO handler in audit_receive_msg() could be used before it is properly initialized. Then kmalloc() will try to allocate a large amount of memory with the uninitialized length. This patch corrects this problem by initializing the lsmctx to a safe value when it is declared, which avoids errors like: WARNING: CPU: 2 PID: 443 at mm/page_alloc.c:4727 __alloc_pages_noprof ... ra: 9000000003059644 ___kmalloc_large_node+0x84/0x1e0 ERA: 900000000304d588 __alloc_pages_noprof+0x4c8/0x1040 CRMD: 000000b0 (PLV0 -IE -DA +PG DACF=CC DACM=CC -WE) PRMD: 00000004 (PPLV0 +PIE -PWE) EUEN: 00000007 (+FPE +SXE +ASXE -BTE) ECFG: 00071c1d (LIE=0,2-4,10-12 VS=7) ESTAT: 000c0000 [BRK] (IS= ECode=12 EsubCode=0) PRID: 0014c010 (Loongson-64bit, Loongson-3A5000) CPU: 2 UID: 0 PID: 443 Comm: auditd Not tainted 6.13.0-rc1+ #1899 ... 
Call Trace: [<9000000002def6a8>] show_stack+0x30/0x148 [<9000000002debf58>] dump_stack_lvl+0x68/0xa0 [<9000000002e0fe18>] __warn+0x80/0x108 [<900000000407486c>] report_bug+0x154/0x268 [<90000000040ad468>] do_bp+0x2a8/0x320 [<9000000002dedda0>] handle_bp+0x120/0x1c0 [<900000000304d588>] __alloc_pages_noprof+0x4c8/0x1040 [<9000000003059640>] ___kmalloc_large_node+0x80/0x1e0 [<9000000003061504>] __kmalloc_noprof+0x2c4/0x380 [<9000000002f0f7ac>] audit_receive_msg+0x764/0x1530 [<9000000002f1065c>] audit_receive+0xe4/0x1c0 [<9000000003e5abe8>] netlink_unicast+0x340/0x450 [<9000000003e5ae9c>] netlink_sendmsg+0x1a4/0x4a0 [<9000000003d9ffd0>] __sock_sendmsg+0x48/0x58 [<9000000003da32f0>] __sys_sendto+0x100/0x170 [<9000000003da3374>] sys_sendto+0x14/0x28 [<90000000040ad574>] do_syscall+0x94/0x138 [<9000000002ded318>] handle_syscall+0xb8/0x158 Fixes: 6fba89813ccf333d ("lsm: ensure the correct LSM context releaser") Signed-off-by: Huacai Chen [PM: resolved excessive line length in the backtrace] Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/audit.c b/kernel/audit.c index 13d0144efaa3..5f5bf85bcc90 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1221,7 +1221,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; struct audit_sig_info *sig_data; - struct lsm_context lsmctx; + struct lsm_context lsmctx = { NULL, 0, 0 }; err = audit_netlink_ok(skb, msg_type); if (err) -- 2.51.0