From 0af2f6be1b4281385b618cb86ad946eded089ac8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 6 Apr 2025 13:11:33 -0700 Subject: [PATCH 01/16] Linux 6.15-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index e55726a71d95..38689a0c3605 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 -PATCHLEVEL = 14 +PATCHLEVEL = 15 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.51.0 From 76d2d75ddc034e0ee7d14f9023cb6ebd6c59278d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 3 Apr 2025 16:09:01 +0200 Subject: [PATCH 02/16] selftests/pidfd: adapt to recent changes Adapt to changes in commit 9133607de37a ("exit: fix the usage of delay_group_leader->exit_code in do_notify_parent() and pidfs_exit()"). Even if the thread-group leader exited early and succesfully it's exit status will only be reported once the whole thread-group has exited and it will share the exit code of the thread-group. So if the thread-group was SIGKILLed the thread-group leader will also be reported as having been SIGKILLed. Link: https://lore.kernel.org/r/20250403-work-pidfd-fixes-v1-1-a123b6ed6716@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- tools/testing/selftests/pidfd/pidfd_info_test.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c index 1758a1b0457b..accfd6bdc539 100644 --- a/tools/testing/selftests/pidfd/pidfd_info_test.c +++ b/tools/testing/selftests/pidfd/pidfd_info_test.c @@ -362,9 +362,9 @@ TEST_F(pidfd_info, thread_group) ASSERT_EQ(ioctl(pidfd_leader, PIDFD_GET_INFO, &info), 0); ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); - /* The thread-group leader exited successfully. Only the specific thread was SIGKILLed. */ - ASSERT_TRUE(WIFEXITED(info.exit_code)); - ASSERT_EQ(WEXITSTATUS(info.exit_code), 0); + /* Even though the thread-group exited successfully it will still report the group exit code. */ + ASSERT_TRUE(WIFSIGNALED(info.exit_code)); + ASSERT_EQ(WTERMSIG(info.exit_code), SIGKILL); /* * Retrieve exit information for the thread-group leader via the @@ -375,9 +375,9 @@ TEST_F(pidfd_info, thread_group) ASSERT_FALSE(!!(info2.mask & PIDFD_INFO_CREDS)); ASSERT_TRUE(!!(info2.mask & PIDFD_INFO_EXIT)); - /* The thread-group leader exited successfully. Only the specific thread was SIGKILLed. */ - ASSERT_TRUE(WIFEXITED(info2.exit_code)); - ASSERT_EQ(WEXITSTATUS(info2.exit_code), 0); + /* Even though the thread-group exited successfully it will still report the group exit code. */ + ASSERT_TRUE(WIFSIGNALED(info2.exit_code)); + ASSERT_EQ(WTERMSIG(info2.exit_code), SIGKILL); /* Retrieve exit information for the thread. */ info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; -- 2.51.0 From 1b090949c9989a35c74aa2cd7fee6670b79019cd Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 3 Apr 2025 16:09:02 +0200 Subject: [PATCH 03/16] pidfd: remove unneeded NULL check from pidfd_prepare() None of the caller actually pass a NULL pid in there. Link: https://lore.kernel.org/r/20250403-work-pidfd-fixes-v1-2-a123b6ed6716@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index c4b26cd8998b..182ec2e9087d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2110,7 +2110,7 @@ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { bool thread = flags & PIDFD_THREAD; - if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) + if (!pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) return -EINVAL; return __pidfd_prepare(pid, flags, ret); -- 2.51.0 From 8cf4b738f6d84fdd8d7ff1e8d0e2298ded3e4153 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 3 Apr 2025 16:09:03 +0200 Subject: [PATCH 04/16] pidfd: improve uapi when task isn't found We currently report EINVAL whenever a struct pid has no tasked attached anymore thereby conflating two concepts: (1) The task has already been reaped. (2) The caller requested a pidfd for a thread-group leader but the pid actually references a struct pid that isn't used as a thread-group leader. This is causing issues for non-threaded workloads as in [1]. This patch tries to allow userspace to distinguish between (1) and (2). This is racy of course but that shouldn't matter. Link: https://github.com/systemd/systemd/pull/36982 [1] Link: https://lore.kernel.org/r/20250403-work-pidfd-fixes-v1-3-a123b6ed6716@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- kernel/fork.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 182ec2e9087d..4a2080b968c8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2108,10 +2108,27 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { - bool thread = flags & PIDFD_THREAD; + int err = 0; - if (!pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) - return -EINVAL; + if (!(flags & PIDFD_THREAD)) { + /* + * If this is struct pid isn't used as a thread-group + * leader pid but the caller requested to create a + * thread-group leader pidfd then report ENOENT to the + * caller as a hint. + */ + if (!pid_has_task(pid, PIDTYPE_TGID)) + err = -ENOENT; + } + + /* + * If this wasn't a thread-group leader struct pid or the task + * got reaped in the meantime report -ESRCH to userspace. + */ + if (!pid_has_task(pid, PIDTYPE_PID)) + err = -ESRCH; + if (err) + return err; return __pidfd_prepare(pid, flags, ret); } -- 2.51.0 From 4fc3f73c16dae0211f31a963eedfb921f8366f57 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 3 Apr 2025 16:09:04 +0200 Subject: [PATCH 05/16] selftest/pidfd: add test for thread-group leader pidfd open for thread Verify that we report ENOENT when userspace tries to create a thread-group leader pidfd for a thread pidfd that isn't a thread-group leader. Link: https://lore.kernel.org/r/20250403-work-pidfd-fixes-v1-4-a123b6ed6716@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- tools/testing/selftests/pidfd/pidfd_info_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c index accfd6bdc539..a0eb6e81eaa2 100644 --- a/tools/testing/selftests/pidfd/pidfd_info_test.c +++ b/tools/testing/selftests/pidfd/pidfd_info_test.c @@ -299,6 +299,7 @@ TEST_F(pidfd_info, thread_group) /* Opening a thread as a thread-group leader must fail. */ pidfd_thread = sys_pidfd_open(pid_thread, 0); ASSERT_LT(pidfd_thread, 0); + ASSERT_EQ(errno, ENOENT); /* Opening a thread as a PIDFD_THREAD must succeed. */ pidfd_thread = sys_pidfd_open(pid_thread, PIDFD_THREAD); -- 2.51.0 From 35c9701ea717dc548f1fab5bfa286be98c1bade8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 11 Apr 2025 15:22:44 +0200 Subject: [PATCH 06/16] exit: move wake_up_all() pidfd waiters into __unhash_process() Move the pidfd notification out of __change_pid() and into __unhash_process(). The only valid call to __change_pid() with a NULL argument and PIDTYPE_PID is from __unhash_process(). This is a lot more obvious than calling it from __change_pid(). Link: https://lore.kernel.org/20250411-work-pidfs-enoent-v2-1-60b2d3bb545f@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- kernel/exit.c | 5 +++++ kernel/pid.c | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 1b51dc099f1e..abcd93ce4e18 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -133,8 +133,13 @@ struct release_task_post { static void __unhash_process(struct release_task_post *post, struct task_struct *p, bool group_dead) { + struct pid *pid = task_pid(p); + nr_threads--; + detach_pid(post->pids, p, PIDTYPE_PID); + wake_up_all(&pid->wait_pidfd); + if (group_dead) { detach_pid(post->pids, p, PIDTYPE_TGID); detach_pid(post->pids, p, PIDTYPE_PGID); diff --git a/kernel/pid.c b/kernel/pid.c index 4ac2ce46817f..26f1e136f017 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -359,11 +359,6 @@ static void __change_pid(struct pid **pids, struct task_struct *task, hlist_del_rcu(&task->pid_links[type]); *pid_ptr = new; - if (type == PIDTYPE_PID) { - WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID)); - wake_up_all(&pid->wait_pidfd); - } - for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (pid_has_task(pid, tmp)) return; -- 2.51.0 From 17f1b08acf50c0bfb02e21623e53e7e575612b67 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 11 Apr 2025 15:22:45 +0200 Subject: [PATCH 07/16] pidfs: ensure consistent ENOENT/ESRCH reporting In a prior patch series we tried to cleanly differentiate between: (1) The task has already been reaped. (2) The caller requested a pidfd for a thread-group leader but the pid actually references a struct pid that isn't used as a thread-group leader. as this was causing issues for non-threaded workloads. But there's cases where the current simple logic is wrong. Specifically, if the pid was a leader pid and the check races with __unhash_process(). Stabilize this by using the pidfd waitqueue lock. Link: https://lore.kernel.org/20250411-work-pidfs-enoent-v2-2-60b2d3bb545f@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- kernel/fork.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 4a2080b968c8..f7403e1fb0d4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2108,28 +2108,26 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { - int err = 0; - - if (!(flags & PIDFD_THREAD)) { + /* + * While holding the pidfd waitqueue lock removing the task + * linkage for the thread-group leader pid (PIDTYPE_TGID) isn't + * possible. Thus, if there's still task linkage for PIDTYPE_PID + * not having thread-group leader linkage for the pid means it + * wasn't a thread-group leader in the first place. + */ + scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) { + /* Task has already been reaped. */ + if (!pid_has_task(pid, PIDTYPE_PID)) + return -ESRCH; /* - * If this is struct pid isn't used as a thread-group - * leader pid but the caller requested to create a - * thread-group leader pidfd then report ENOENT to the - * caller as a hint. + * If this struct pid isn't used as a thread-group + * leader but the caller requested to create a + * thread-group leader pidfd then report ENOENT. */ - if (!pid_has_task(pid, PIDTYPE_TGID)) - err = -ENOENT; + if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID)) + return -ENOENT; } - /* - * If this wasn't a thread-group leader struct pid or the task - * got reaped in the meantime report -ESRCH to userspace. - */ - if (!pid_has_task(pid, PIDTYPE_PID)) - err = -ESRCH; - if (err) - return err; - return __pidfd_prepare(pid, flags, ret); } -- 2.51.0 From 0a36bad01731e71568bdd365764d38b6bd576ab0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 11 Apr 2025 14:18:57 +0200 Subject: [PATCH 08/16] release_task: kill the no longer needed get/put_pid(thread_pid) After the commit 7903f907a2260 ("pid: perform free_pid() calls outside of tasklist_lock") __unhash_process() -> detach_pid() no longer calls free_pid(), proc_flush_pid() can just use p->thread_pid without the now pointless get_pid() + put_pid(). Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/20250411121857.GA10550@redhat.com Reviewed-by: Mateusz Guzik Signed-off-by: Christian Brauner --- kernel/exit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index abcd93ce4e18..c33ecde016de 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -258,7 +258,8 @@ repeat: pidfs_exit(p); cgroup_release(p); - thread_pid = get_pid(p->thread_pid); + /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */ + thread_pid = task_pid(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); @@ -287,8 +288,8 @@ repeat: } write_unlock_irq(&tasklist_lock); + /* @thread_pid can't go away until free_pids() below */ proc_flush_pid(thread_pid); - put_pid(thread_pid); add_device_randomness(&p->se.sum_exec_runtime, sizeof(p->se.sum_exec_runtime)); free_pids(post.pids); -- 2.51.0 From b590c928cca7bdc7fd580d52e42bfdc3ac5eeacb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 16 Apr 2025 22:05:40 +0200 Subject: [PATCH 09/16] net, pidfd: report EINVAL for ESRCH dbus-broker relies -EINVAL being returned to indicate ESRCH in [1]. This causes issues for some workloads as reported in [2]. Paper over it until this is fixed in userspace. Link: https://lore.kernel.org/20250416-gegriffen-tiefbau-70cfecb80ac8@brauner Link: https://github.com/bus1/dbus-broker/blob/5d34d91b138fc802a016aa68c093eb81ea31139c/src/util/sockopt.c#L241 [1] Link: https://lore.kernel.org/20250415223454.GA1852104@ax162 [2] Tested-by: Nathan Chancellor Signed-off-by: Christian Brauner --- net/core/sock.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/core/sock.c b/net/core/sock.c index f67a3c5b0988..b969d2210656 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1893,8 +1893,16 @@ int sk_getsockopt(struct sock *sk, int level, int optname, pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); put_pid(peer_pid); - if (pidfd < 0) + if (pidfd < 0) { + /* + * dbus-broker relies on -EINVAL being returned + * to indicate ESRCH. Paper over it until this + * is fixed in userspace. + */ + if (pidfd == -ESRCH) + pidfd = -EINVAL; return pidfd; + } if (copy_to_sockptr(optval, &pidfd, len) || copy_to_sockptr(optlen, &len, sizeof(int))) { -- 2.51.0 From 477058411c45f225ddfbb4769e35a9a5a95cb826 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 25 Apr 2025 10:11:30 +0200 Subject: [PATCH 10/16] pidfs: register pid in pidfs Add simple helpers that allow a struct pid to be pinned via a pidfs dentry/inode. If no pidfs dentry exists a new one will be allocated for it. A reference is taken by pidfs on @pid. The reference must be released via pidfs_put_pid(). This will allow AF_UNIX sockets to allocate a dentry for the peer credentials pid at the time they are recorded where we know the task is still alive. When the task gets reaped its exit status is guaranteed to be recorded and a pidfd can be handed out for the reaped task. Link: https://lore.kernel.org/20250425-work-pidfs-net-v2-1-450a19461e75@kernel.org Reviewed-by: Oleg Nesterov Reviewed-by: David Rheinsberg Signed-off-by: Christian Brauner --- fs/pidfs.c | 59 +++++++++++++++++++++++++++++++++++++++++++ include/linux/pidfs.h | 3 +++ 2 files changed, 62 insertions(+) diff --git a/fs/pidfs.c b/fs/pidfs.c index d64a4cbeb0da..ec5f9ad09bb8 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -896,6 +896,65 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) return pidfd_file; } +/** + * pidfs_register_pid - register a struct pid in pidfs + * @pid: pid to pin + * + * Register a struct pid in pidfs. Needs to be paired with + * pidfs_put_pid() to not risk leaking the pidfs dentry and inode. + * + * Return: On success zero, on error a negative error code is returned. + */ +int pidfs_register_pid(struct pid *pid) +{ + struct path path __free(path_put) = {}; + int ret; + + might_sleep(); + + if (!pid) + return 0; + + ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); + if (unlikely(ret)) + return ret; + /* Keep the dentry and only put the reference to the mount. */ + path.dentry = NULL; + return 0; +} + +/** + * pidfs_get_pid - pin a struct pid through pidfs + * @pid: pid to pin + * + * Similar to pidfs_register_pid() but only valid if the caller knows + * there's a reference to the @pid through a dentry already that can't + * go away. + */ +void pidfs_get_pid(struct pid *pid) +{ + if (!pid) + return; + WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed)); +} + +/** + * pidfs_put_pid - drop a pidfs reference + * @pid: pid to drop + * + * Drop a reference to @pid via pidfs. This is only safe if the + * reference has been taken via pidfs_get_pid(). + */ +void pidfs_put_pid(struct pid *pid) +{ + might_sleep(); + + if (!pid) + return; + VFS_WARN_ON_ONCE(!pid->stashed); + dput(pid->stashed); +} + static void pidfs_inode_init_once(void *data) { struct pidfs_inode *pi = data; diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 05e6f8f4a026..2676890c4d0d 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -8,5 +8,8 @@ void pidfs_add_pid(struct pid *pid); void pidfs_remove_pid(struct pid *pid); void pidfs_exit(struct task_struct *tsk); extern const struct dentry_operations pidfs_dentry_operations; +int pidfs_register_pid(struct pid *pid); +void pidfs_get_pid(struct pid *pid); +void pidfs_put_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ -- 2.51.0 From fd0a109a0f6b7524543d17520da92a44a9f5343c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 25 Apr 2025 10:11:31 +0200 Subject: [PATCH 11/16] net, pidfs: prepare for handing out pidfds for reaped sk->sk_peer_pid SO_PEERPIDFD currently doesn't support handing out pidfds if the sk->sk_peer_pid thread-group leader has already been reaped. In this case it currently returns EINVAL. Userspace still wants to get a pidfd for a reaped process to have a stable handle it can pass on. This is especially useful now that it is possible to retrieve exit information through a pidfd via the PIDFD_GET_INFO ioctl()'s PIDFD_INFO_EXIT flag. Another summary has been provided by David in [1]: > A pidfd can outlive the task it refers to, and thus user-space must > already be prepared that the task underlying a pidfd is gone at the time > they get their hands on the pidfd. For instance, resolving the pidfd to > a PID via the fdinfo must be prepared to read `-1`. > > Despite user-space knowing that a pidfd might be stale, several kernel > APIs currently add another layer that checks for this. In particular, > SO_PEERPIDFD returns `EINVAL` if the peer-task was already reaped, > but returns a stale pidfd if the task is reaped immediately after the > respective alive-check. > > This has the unfortunate effect that user-space now has two ways to > check for the exact same scenario: A syscall might return > EINVAL/ESRCH/... *or* the pidfd might be stale, even though there is no > particular reason to distinguish both cases. This also propagates > through user-space APIs, which pass on pidfds. They must be prepared to > pass on `-1` *or* the pidfd, because there is no guaranteed way to get a > stale pidfd from the kernel. > Userspace must already deal with a pidfd referring to a reaped task as > the task may exit and get reaped at any time will there are still many > pidfds referring to it. In order to allow handing out reaped pidfd SO_PEERPIDFD needs to ensure that PIDFD_INFO_EXIT information is available whenever a pidfd for a reaped task is created by PIDFD_INFO_EXIT. The uapi promises that reaped pidfds are only handed out if it is guaranteed that the caller sees the exit information: TEST_F(pidfd_info, success_reaped) { struct pidfd_info info = { .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT, }; /* * Process has already been reaped and PIDFD_INFO_EXIT been set. * Verify that we can retrieve the exit status of the process. */ ASSERT_EQ(ioctl(self->child_pidfd4, PIDFD_GET_INFO, &info), 0); ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); ASSERT_TRUE(WIFEXITED(info.exit_code)); ASSERT_EQ(WEXITSTATUS(info.exit_code), 0); } To hand out pidfds for reaped processes we thus allocate a pidfs entry for the relevant sk->sk_peer_pid at the time the sk->sk_peer_pid is stashed and drop it when the socket is destroyed. This guarantees that exit information will always be recorded for the sk->sk_peer_pid task and we can hand out pidfds for reaped processes. Link: https://lore.kernel.org/lkml/20230807085203.819772-1-david@readahead.eu [1] Link: https://lore.kernel.org/20250425-work-pidfs-net-v2-2-450a19461e75@kernel.org Reviewed-by: David Rheinsberg Reviewed-by: Kuniyuki Iwashima Signed-off-by: Christian Brauner --- net/unix/af_unix.c | 85 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 74 insertions(+), 11 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f78a2492826f..472f8aa9ea15 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -100,6 +100,7 @@ #include #include #include +#include #include #include #include @@ -643,6 +644,9 @@ static void unix_sock_destructor(struct sock *sk) return; } + if (sk->sk_peer_pid) + pidfs_put_pid(sk->sk_peer_pid); + if (u->addr) unix_release_addr(u->addr); @@ -734,13 +738,48 @@ static void unix_release_sock(struct sock *sk, int embrion) unix_gc(); /* Garbage collect fds */ } -static void init_peercred(struct sock *sk) +struct unix_peercred { + struct pid *peer_pid; + const struct cred *peer_cred; +}; + +static inline int prepare_peercred(struct unix_peercred *peercred) { - sk->sk_peer_pid = get_pid(task_tgid(current)); - sk->sk_peer_cred = get_current_cred(); + struct pid *pid; + int err; + + pid = task_tgid(current); + err = pidfs_register_pid(pid); + if (likely(!err)) { + peercred->peer_pid = get_pid(pid); + peercred->peer_cred = get_current_cred(); + } + return err; } -static void update_peercred(struct sock *sk) +static void drop_peercred(struct unix_peercred *peercred) +{ + const struct cred *cred = NULL; + struct pid *pid = NULL; + + might_sleep(); + + swap(peercred->peer_pid, pid); + swap(peercred->peer_cred, cred); + + pidfs_put_pid(pid); + put_pid(pid); + put_cred(cred); +} + +static inline void init_peercred(struct sock *sk, + const struct unix_peercred *peercred) +{ + sk->sk_peer_pid = peercred->peer_pid; + sk->sk_peer_cred = peercred->peer_cred; +} + +static void update_peercred(struct sock *sk, struct unix_peercred *peercred) { const struct cred *old_cred; struct pid *old_pid; @@ -748,11 +787,11 @@ static void update_peercred(struct sock *sk) spin_lock(&sk->sk_peer_lock); old_pid = sk->sk_peer_pid; old_cred = sk->sk_peer_cred; - init_peercred(sk); + init_peercred(sk, peercred); spin_unlock(&sk->sk_peer_lock); - put_pid(old_pid); - put_cred(old_cred); + peercred->peer_pid = old_pid; + peercred->peer_cred = old_cred; } static void copy_peercred(struct sock *sk, struct sock *peersk) @@ -761,6 +800,7 @@ static void copy_peercred(struct sock *sk, struct sock *peersk) spin_lock(&sk->sk_peer_lock); sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); + pidfs_get_pid(sk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); } @@ -770,6 +810,7 @@ static int unix_listen(struct socket *sock, int backlog) int err; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); + struct unix_peercred peercred = {}; err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) @@ -777,6 +818,9 @@ static int unix_listen(struct socket *sock, int backlog) err = -EINVAL; if (!READ_ONCE(u->addr)) goto out; /* No listens on an unbound socket */ + err = prepare_peercred(&peercred); + if (err) + goto out; unix_state_lock(sk); if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) goto out_unlock; @@ -786,11 +830,12 @@ static int unix_listen(struct socket *sock, int backlog) WRITE_ONCE(sk->sk_state, TCP_LISTEN); /* set credentials so connect can copy them */ - update_peercred(sk); + update_peercred(sk, &peercred); err = 0; out_unlock: unix_state_unlock(sk); + drop_peercred(&peercred); out: return err; } @@ -1525,6 +1570,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; struct unix_sock *u = unix_sk(sk), *newu, *otheru; + struct unix_peercred peercred = {}; struct net *net = sock_net(sk); struct sk_buff *skb = NULL; unsigned char state; @@ -1561,6 +1607,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto out; } + err = prepare_peercred(&peercred); + if (err) + goto out; + /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); if (!skb) { @@ -1636,7 +1686,7 @@ restart: unix_peer(newsk) = sk; newsk->sk_state = TCP_ESTABLISHED; newsk->sk_type = sk->sk_type; - init_peercred(newsk); + init_peercred(newsk, &peercred); newu = unix_sk(newsk); newu->listener = other; RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); @@ -1695,20 +1745,33 @@ out_free_skb: out_free_sk: unix_release_sock(newsk, 0); out: + drop_peercred(&peercred); return err; } static int unix_socketpair(struct socket *socka, struct socket *sockb) { + struct unix_peercred ska_peercred = {}, skb_peercred = {}; struct sock *ska = socka->sk, *skb = sockb->sk; + int err; + + err = prepare_peercred(&ska_peercred); + if (err) + return err; + + err = prepare_peercred(&skb_peercred); + if (err) { + drop_peercred(&ska_peercred); + return err; + } /* Join our sockets back to back */ sock_hold(ska); sock_hold(skb); unix_peer(ska) = skb; unix_peer(skb) = ska; - init_peercred(ska); - init_peercred(skb); + init_peercred(ska, &ska_peercred); + init_peercred(skb, &skb_peercred); ska->sk_state = TCP_ESTABLISHED; skb->sk_state = TCP_ESTABLISHED; -- 2.51.0 From a71f402acd71a942e59c16270ad61dee06de6e24 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 25 Apr 2025 10:11:32 +0200 Subject: [PATCH 12/16] pidfs: get rid of __pidfd_prepare() Fold it into pidfd_prepare() and rename PIDFD_CLONE to PIDFD_STALE to indicate that the passed pid might not have task linkage and no explicit check for that should be performed. Link: https://lore.kernel.org/20250425-work-pidfs-net-v2-3-450a19461e75@kernel.org Reviewed-by: Oleg Nesterov Reviewed-by: David Rheinsberg Signed-off-by: Christian Brauner --- fs/pidfs.c | 22 +++++----- include/linux/pid.h | 2 +- include/uapi/linux/pidfd.h | 2 +- kernel/fork.c | 83 +++++++++++++------------------------- 4 files changed, 44 insertions(+), 65 deletions(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index ec5f9ad09bb8..9d993ecadad7 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -768,7 +768,7 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path, { enum pid_type type; - if (flags & PIDFD_CLONE) + if (flags & PIDFD_STALE) return true; /* @@ -777,10 +777,14 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path, * pidfd has been allocated perform another check that the pid * is still alive. If it is exit information is available even * if the task gets reaped before the pidfd is returned to - * userspace. The only exception is PIDFD_CLONE where no task - * linkage has been established for @pid yet and the kernel is - * in the middle of process creation so there's nothing for - * pidfs to miss. + * userspace. The only exception are indicated by PIDFD_STALE: + * + * (1) The kernel is in the middle of task creation and thus no + * task linkage has been established yet. + * (2) The caller knows @pid has been registered in pidfs at a + * time when the task was still alive. + * + * In both cases exit information will have been reported. */ if (flags & PIDFD_THREAD) type = PIDTYPE_PID; @@ -874,11 +878,11 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) int ret; /* - * Ensure that PIDFD_CLONE can be passed as a flag without + * Ensure that PIDFD_STALE can be passed as a flag without * overloading other uapi pidfd flags. */ - BUILD_BUG_ON(PIDFD_CLONE == PIDFD_THREAD); - BUILD_BUG_ON(PIDFD_CLONE == PIDFD_NONBLOCK); + BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD); + BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK); ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); if (ret < 0) @@ -887,7 +891,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) if (!pidfs_pid_valid(pid, &path, flags)) return ERR_PTR(-ESRCH); - flags &= ~PIDFD_CLONE; + flags &= ~PIDFD_STALE; pidfd_file = dentry_open(&path, flags, current_cred()); /* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */ if (!IS_ERR(pidfd_file)) diff --git a/include/linux/pid.h b/include/linux/pid.h index 311ecebd7d56..453ae6d8a68d 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -77,7 +77,7 @@ struct file; struct pid *pidfd_pid(const struct file *file); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags); -int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret); +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file); void do_notify_pidfd(struct task_struct *task); static inline struct pid *get_pid(struct pid *pid) diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index 2970ef44655a..8c1511edd0e9 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -12,7 +12,7 @@ #define PIDFD_THREAD O_EXCL #ifdef __KERNEL__ #include -#define PIDFD_CLONE CLONE_PIDFD +#define PIDFD_STALE CLONE_PIDFD #endif /* Flags for pidfd_send_signal(). */ diff --git a/kernel/fork.c b/kernel/fork.c index f7403e1fb0d4..1d95f4dae327 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2035,55 +2035,11 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -/** - * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd - * @pid: the struct pid for which to create a pidfd - * @flags: flags of the new @pidfd - * @ret: Where to return the file for the pidfd. - * - * Allocate a new file that stashes @pid and reserve a new pidfd number in the - * caller's file descriptor table. The pidfd is reserved but not installed yet. - * - * The helper doesn't perform checks on @pid which makes it useful for pidfds - * created via CLONE_PIDFD where @pid has no task attached when the pidfd and - * pidfd file are prepared. - * - * If this function returns successfully the caller is responsible to either - * call fd_install() passing the returned pidfd and pidfd file as arguments in - * order to install the pidfd into its file descriptor table or they must use - * put_unused_fd() and fput() on the returned pidfd and pidfd file - * respectively. - * - * This function is useful when a pidfd must already be reserved but there - * might still be points of failure afterwards and the caller wants to ensure - * that no pidfd is leaked into its file descriptor table. - * - * Return: On success, a reserved pidfd is returned from the function and a new - * pidfd file is returned in the last argument to the function. On - * error, a negative error code is returned from the function and the - * last argument remains unchanged. - */ -static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) -{ - struct file *pidfd_file; - - CLASS(get_unused_fd, pidfd)(O_CLOEXEC); - if (pidfd < 0) - return pidfd; - - pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR); - if (IS_ERR(pidfd_file)) - return PTR_ERR(pidfd_file); - - *ret = pidfd_file; - return take_fd(pidfd); -} - /** * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd * @flags: flags of the new @pidfd - * @ret: Where to return the pidfd. + * @ret_file: return the new pidfs file * * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. @@ -2106,16 +2062,26 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re * error, a negative error code is returned from the function and the * last argument remains unchanged. */ -int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file) { + struct file *pidfs_file; + /* - * While holding the pidfd waitqueue lock removing the task - * linkage for the thread-group leader pid (PIDTYPE_TGID) isn't - * possible. Thus, if there's still task linkage for PIDTYPE_PID - * not having thread-group leader linkage for the pid means it - * wasn't a thread-group leader in the first place. + * PIDFD_STALE is only allowed to be passed if the caller knows + * that @pid is already registered in pidfs and thus + * PIDFD_INFO_EXIT information is guaranteed to be available. */ - scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) { + if (!(flags & PIDFD_STALE)) { + /* + * While holding the pidfd waitqueue lock removing the + * task linkage for the thread-group leader pid + * (PIDTYPE_TGID) isn't possible. Thus, if there's still + * task linkage for PIDTYPE_PID not having thread-group + * leader linkage for the pid means it wasn't a + * thread-group leader in the first place. + */ + guard(spinlock_irq)(&pid->wait_pidfd.lock); + /* Task has already been reaped. */ if (!pid_has_task(pid, PIDTYPE_PID)) return -ESRCH; @@ -2128,7 +2094,16 @@ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) return -ENOENT; } - return __pidfd_prepare(pid, flags, ret); + CLASS(get_unused_fd, pidfd)(O_CLOEXEC); + if (pidfd < 0) + return pidfd; + + pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR); + if (IS_ERR(pidfs_file)) + return PTR_ERR(pidfs_file); + + *ret_file = pidfs_file; + return take_fd(pidfd); } static void __delayed_free_task(struct rcu_head *rhp) @@ -2477,7 +2452,7 @@ __latent_entropy struct task_struct *copy_process( * Note that no task has been attached to @pid yet indicate * that via CLONE_PIDFD. */ - retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile); + retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; -- 2.51.0 From 20b70e58961b855e61f2c6a7a19a38a64ad1bed6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 25 Apr 2025 10:11:33 +0200 Subject: [PATCH 13/16] net, pidfs: enable handing out pidfds for reaped sk->sk_peer_pid Now that all preconditions are met, allow handing out pidfs for reaped sk->sk_peer_pids. Thanks to Alexander Mikhalitsyn for pointing out that we need to limit this to AF_UNIX sockets for now. Link: https://lore.kernel.org/20250425-work-pidfs-net-v2-4-450a19461e75@kernel.org Reviewed-by: David Rheinsberg Reviewed-by: Kuniyuki Iwashima Signed-off-by: Christian Brauner --- net/core/sock.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index b969d2210656..180f441f4ab6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -148,6 +148,8 @@ #include +#include + #include "dev.h" static DEFINE_MUTEX(proto_list_mutex); @@ -1879,6 +1881,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, { struct pid *peer_pid; struct file *pidfd_file = NULL; + unsigned int flags = 0; int pidfd; if (len > sizeof(pidfd)) @@ -1891,18 +1894,17 @@ int sk_getsockopt(struct sock *sk, int level, int optname, if (!peer_pid) return -ENODATA; - pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); + /* The use of PIDFD_STALE requires stashing of struct pid + * on pidfs with pidfs_register_pid() and only AF_UNIX + * were prepared for this. + */ + if (sk->sk_family == AF_UNIX) + flags = PIDFD_STALE; + + pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file); put_pid(peer_pid); - if (pidfd < 0) { - /* - * dbus-broker relies on -EINVAL being returned - * to indicate ESRCH. Paper over it until this - * is fixed in userspace. - */ - if (pidfd == -ESRCH) - pidfd = -EINVAL; + if (pidfd < 0) return pidfd; - } if (copy_to_sockptr(optval, &pidfd, len) || copy_to_sockptr(optlen, &len, sizeof(int))) { -- 2.51.0 From e194d2067c958827810a7a7282dff8773633ad8c Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Fri, 11 Apr 2025 17:09:41 +0200 Subject: [PATCH 14/16] selftests: coredump: Properly initialize pointer The buffer pointer "line" is not initialized. This pointer is passed to getline(). It can still work if the stack is zero-initialized, because getline() can work with a NULL pointer as buffer. But this is obviously broken. This bug shows up while running the test on a riscv64 machine. Fix it by properly initializing the pointer. Fixes: 15858da53542 ("selftests: coredump: Add stackdump test") Signed-off-by: Nam Cao Link: https://lore.kernel.org/4fb9b6fb3e0040481bacc258c44b4aab5c4df35d.1744383419.git.namcao@linutronix.de Signed-off-by: Christian Brauner --- tools/testing/selftests/coredump/stackdump_test.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c index 137b2364a082..c23cf95c3f6d 100644 --- a/tools/testing/selftests/coredump/stackdump_test.c +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -138,10 +138,12 @@ TEST_F(coredump, stackdump) ASSERT_NE(file, NULL); /* Step 4: Make sure all stack pointer values are non-zero */ + line = NULL; for (i = 0; -1 != getline(&line, &line_length, file); ++i) { stack = strtoull(line, NULL, 10); ASSERT_NE(stack, 0); } + free(line); ASSERT_EQ(i, 1 + NUM_THREAD_SPAWN); -- 2.51.0 From 6f5bf9f37f06668ead446e2997f2b431db8963ce Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Fri, 11 Apr 2025 17:09:42 +0200 Subject: [PATCH 15/16] selftests: coredump: Fix test failure for slow machines The test waits for coredump to finish by busy-waiting for the stack_values file to be created. The maximum wait time is 10 seconds. This doesn't work for slow machine (qemu-system-riscv64), because coredump takes longer. Fix it by waiting for the crashing child process to finish first. Fixes: 15858da53542 ("selftests: coredump: Add stackdump test") Signed-off-by: Nam Cao Link: https://lore.kernel.org/ee657f3fc8e19657cf7aaa366552d6347728f371.1744383419.git.namcao@linutronix.de Signed-off-by: Christian Brauner --- tools/testing/selftests/coredump/stackdump_test.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c index c23cf95c3f6d..9da10fb5e597 100644 --- a/tools/testing/selftests/coredump/stackdump_test.c +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -96,7 +96,7 @@ TEST_F(coredump, stackdump) char *test_dir, *line; size_t line_length; char buf[PATH_MAX]; - int ret, i; + int ret, i, status; FILE *file; pid_t pid; @@ -129,6 +129,10 @@ TEST_F(coredump, stackdump) /* * Step 3: Wait for the stackdump script to write the stack pointers to the stackdump file */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_TRUE(WCOREDUMP(status)); + for (i = 0; i < 10; ++i) { file = fopen(STACKDUMP_FILE, "r"); if (file) -- 2.51.0 From c6e888d02d51d1e9f9abf1587e6a5618375cac9f Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Fri, 11 Apr 2025 17:09:43 +0200 Subject: [PATCH 16/16] selftests: coredump: Raise timeout to 2 minutes The test's runtime (nearly 20s) is dangerously close to the limit (30s) on qemu-system-riscv64: $ time ./stackdump_test > /dev/null real 0m19.210s user 0m0.077s sys 0m0.359s There could be machines slower than qemu-system-riscv64. Therefore raise the test timeout to 2 minutes to be safe. Fixes: 15858da53542 ("selftests: coredump: Add stackdump test") Signed-off-by: Nam Cao Link: https://lore.kernel.org/dd636084d55e7828782728d087fa2298dcab1c8b.1744383419.git.namcao@linutronix.de Signed-off-by: Christian Brauner --- tools/testing/selftests/coredump/stackdump_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c index 9da10fb5e597..fe3c728cd6be 100644 --- a/tools/testing/selftests/coredump/stackdump_test.c +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -89,7 +89,7 @@ fail: fprintf(stderr, "Failed to cleanup stackdump test: %s\n", reason); } -TEST_F(coredump, stackdump) +TEST_F_TIMEOUT(coredump, stackdump, 120) { struct sigaction action = {}; unsigned long long stack; -- 2.51.0