The waitfd() syscall, introduced principally for DTrace's use, has turned out to
be somewhat problematic because of differences between waitid() and waitpid().
In particular, you can't use waitfd() usefully on ptrace()d multithreaded
processes because waitid() explicitly rejects __WNOTHREAD, so your waitfd will
be written to even when a corresponding waitpid() will return nothing. Also, of
course, waitid() isn't much use for communicating with ptrace()d children
because it throws away most of the information needed.
So rejig waitfd() to use wait4 (the guts of waitpid()) instead. This has both
positive and negative consequences.
On the positive side, the significant changes to kernel/exit.c which were
imposed to cater for the waitfd()'s info parameter can all be discarded: we are
much closer to mainline. (The diff between mainline kernel/exit.c and this
patch is 23 lines, down from 83 before it.)
The downside is that half the parameters to the waitfd() syscall are useless
now, because they are modelled on waitid(), which is no longer used (the which
and flags arguments), but we can't remove them without breaking the syscall's
one caller. The content read from the waitfd has also changed -- rather than
being an array of 'struct siginfo' it is an array of ints, waitpid() exit
statuses. This is not a problem in practice because all released versions of
dtrace only poll this descriptor: they never read from it.
Cross-compatibility, therefore:
- older dtrace, older kernel: works.
- older dtrace, newer kernel: works, because the syscall's parameters are
unchanged, waitfd() was being called with P_PID and zero flags, giving
identical behaviour; this dtrace doesn't work properly with multithreaded
tracees as it is, so we can ignore that case (it won't work worse)
- newer dtrace, older kernel: an attempted call with __WNOTHREAD will fail, so
it will retry without __WNOTHREAD, and succeed (this means that we could
introduce a new syscall, new_waitfd(), without those compatibility
arguments, if you prefer); for multithreaded tracees, the older kernel will
tell dtrace that lots of threads have changed state even when they haven't,
incurring substantial CPU-wasting spinning
- newer dtrace, newer kernel: works as intended.
Orabug:
21245371
Signed-off-by: Nick Alcock <nick.alcock@oracle.com>
Reviewed-by: Todd Vierling <todd.vierling@oracle.com>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
-long do_waitid(int which, pid_t upid,
- struct siginfo __user *infop, int options,
- struct rusage __user *ru);
+long do_wait4(pid_t upid, int __user *stat_addr,
+ int options, struct rusage __user *ru);
struct waitfd_ctx {
int options;
- int which;
pid_t upid;
};
poll_wait_fixed(file, ¤t->signal->wait_chldexit, wait,
POLLIN);
- value = do_waitid(ctx->which, ctx->upid, NULL,
- ctx->options | WNOHANG | WNOWAIT, NULL);
+ value = do_wait4(ctx->upid, NULL, ctx->options | WNOHANG | WNOWAIT,
+ NULL);
if (value > 0 || value == -ECHILD)
return POLLIN | POLLRDNORM;
}
/*
- * Returns a multiple of the size of a struct siginfo, or a negative
- * error code. The "count" parameter must be at least sizeof(struct siginfo)
+ * Returns a multiple of the size of a stat_addr, or a negative error code. The
+ * "count" parameter must be at least sizeof(int).
*/
static ssize_t waitfd_read(struct file *file, char __user *buf, size_t count,
loff_t *ppos)
{
struct waitfd_ctx *ctx = file->private_data;
- struct siginfo __user *info_addr = (struct siginfo *)buf;
+ int __user *stat_addr = (int *)buf;
int flags = ctx->options;
ssize_t ret, total = 0;
- count /= sizeof(struct siginfo);
+ count /= sizeof(int);
if (!count)
return -EINVAL;
flags |= WNOHANG;
do {
- ret = do_waitid(ctx->which, ctx->upid, info_addr, flags, NULL);
+ ret = do_wait4(ctx->upid, stat_addr, flags, NULL);
if (ret == 0)
ret = -EAGAIN;
if (ret == -ECHILD)
if (ret <= 0)
break;
- info_addr++;
- total += sizeof(struct siginfo);
+ stat_addr++;
+ total += sizeof(int);
} while (--count);
return total ? total : ret;
.llseek = noop_llseek,
};
-SYSCALL_DEFINE4(waitfd, int, which, pid_t, upid, int, options, int, flags)
+SYSCALL_DEFINE4(waitfd, int __maybe_unused, which, pid_t, upid, int, options,
+ int __maybe_unused, flags)
{
int ufd;
struct waitfd_ctx *ctx;
/*
- * Options validation from do_waitid()
+ * Options validation from do_wait4(), minus WNOWAIT, which is only used
+ * by our polling implementation. If WEXITED or WSTOPPED are provided,
+ * silently remove them (for backward compatibility with older callers).
*/
- if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
- return -EINVAL;
- if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
+ options &= ~(WEXITED | WSTOPPED);
+ if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
+ __WNOTHREAD|__WCLONE|__WALL))
return -EINVAL;
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
ctx->options = options;
ctx->upid = upid;
- ctx->which = which;
ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx,
O_RDWR | flags | ((options & WNOHANG) ?
put_task_struct(p);
infop = wo->wo_info;
if (infop) {
- int put_user_retval;
-
- put_user_retval = put_user(SIGCHLD, &infop->si_signo);
- put_user_retval |= put_user(0, &infop->si_errno);
- put_user_retval |= put_user((short)why, &infop->si_code);
- put_user_retval |= put_user(pid, &infop->si_pid);
- put_user_retval |= put_user(uid, &infop->si_uid);
- put_user_retval |= put_user(status, &infop->si_status);
-
- if (put_user_retval != 0)
- retval = put_user_retval;
+ if (!retval)
+ retval = put_user(SIGCHLD, &infop->si_signo);
+ if (!retval)
+ retval = put_user(0, &infop->si_errno);
+ if (!retval)
+ retval = put_user((short)why, &infop->si_code);
+ if (!retval)
+ retval = put_user(pid, &infop->si_pid);
+ if (!retval)
+ retval = put_user(uid, &infop->si_uid);
+ if (!retval)
+ retval = put_user(status, &infop->si_status);
}
if (!retval)
retval = pid;
return retval;
}
-long do_waitid(int which, pid_t upid,
- struct siginfo __user *infop, int options,
- struct rusage __user *ru)
+SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
+ infop, int, options, struct rusage __user *, ru)
{
-
struct wait_opts wo;
struct pid *pid = NULL;
enum pid_type type;
wo.wo_stat = NULL;
wo.wo_rusage = ru;
ret = do_wait(&wo);
- put_pid(pid);
-
- return ret;
-}
-
-SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
- infop, int, options, struct rusage __user *, ru)
-{
- long ret;
-
- if (infop && !access_ok (VERIFY_WRITE, infop, sizeof(siginfo_t)))
- return -EFAULT;
-
- ret = do_waitid(which, upid, infop, options, ru);
if (ret > 0) {
ret = 0;
} else if (infop) {
- int put_user_ret;
-
/*
* For a WNOHANG return, clear out all the fields
* we would set so the user can easily tell the
* difference.
*/
- put_user_ret = __put_user(0, &infop->si_signo);
- put_user_ret |= __put_user(0, &infop->si_errno);
- put_user_ret |= __put_user(0, &infop->si_code);
- put_user_ret |= __put_user(0, &infop->si_pid);
- put_user_ret |= __put_user(0, &infop->si_uid);
- put_user_ret |= __put_user(0, &infop->si_status);
-
- if (put_user_ret != 0)
- ret = put_user_ret;
+ if (!ret)
+ ret = put_user(0, &infop->si_signo);
+ if (!ret)
+ ret = put_user(0, &infop->si_errno);
+ if (!ret)
+ ret = put_user(0, &infop->si_code);
+ if (!ret)
+ ret = put_user(0, &infop->si_pid);
+ if (!ret)
+ ret = put_user(0, &infop->si_uid);
+ if (!ret)
+ ret = put_user(0, &infop->si_status);
}
+ put_pid(pid);
return ret;
}
-SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
- int, options, struct rusage __user *, ru)
+long do_wait4(pid_t upid, int __user *stat_addr,
+ int options, struct rusage __user *ru)
{
struct wait_opts wo;
struct pid *pid = NULL;
enum pid_type type;
long ret;
- if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
+ /*
+ * As for wait4(), except that waitfd() additionally needs WNOWAIT.
+ */
+ if (options & ~(WNOHANG|WNOWAIT|WUNTRACED|WCONTINUED|
__WNOTHREAD|__WCLONE|__WALL))
return -EINVAL;
return ret;
}
+SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
+ int, options, struct rusage __user *, ru)
+{
+ long ret;
+
+ if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
+ __WNOTHREAD|__WCLONE|__WALL))
+ return -EINVAL;
+
+ ret = do_wait4(upid, stat_addr, options, ru);
+
+ return ret;
+}
+
#ifdef __ARCH_WANT_SYS_WAITPID
/*
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
+#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
{
pid_t die_pid, ptrace_pid;
int die_fd, ptrace_fd;
- siginfo_t info;
+ int status;
struct pollfd pfd[2];
int procs_left = 2;
/*
* Fork off two children, one of which waits for a ptrace().
- * Both just sleep after that.
+ * Both just sleep after that. Make sure we can use __WNOTHREAD,
+ * __WALL, and WUNTRACED without getting an -EINVAL.
*/
die_pid = fork();
sleeper();
}
- die_fd = waitfd(P_PID, die_pid, WEXITED | WSTOPPED, 0);
- ptrace_fd = waitfd(P_PID, ptrace_pid, WEXITED | WSTOPPED, 0);
+ die_fd = waitfd(P_PID, die_pid, 0, 0);
+ ptrace_fd = waitfd(P_PID, ptrace_pid, __WNOTHREAD | __WALL | WUNTRACED, 0);
if (die_fd < 0 || ptrace_fd < 0) {
perror("Cannot waitfd()");
perror ("poll() failed");
if (pfd[0].revents != 0) {
- if ((bytes = read(die_fd, &info, sizeof (siginfo_t))) < sizeof (siginfo_t)) {
+ if ((bytes = read(die_fd, &status, sizeof (int))) < sizeof (int)) {
fprintf(stderr, "Only read %zi bytes\n", bytes);
exit(1);
}
- printf("die_fd returned code %i, status %i via waitfd read: revents are %x\n", info.si_code, info.si_status, pfd[0].revents);
+ printf("die_fd returned %i via waitfd read: revents are %x\n", status, pfd[0].revents);
pfd[0].fd *= -1;
procs_left--;
}
if (pfd[1].revents != 0) {
- memset(&info, 0, sizeof (siginfo_t));
- waitid(P_PID, ptrace_pid, &info, WEXITED | WSTOPPED | WNOHANG);
- if (info.si_pid != ptrace_pid) {
- fprintf(stderr, "waitfd said PID %i was ready, but waitid() says it isn't: %i\n",
- ptrace_pid, info.si_pid);
+ pid_t check_pid;
+ status = 0;
+ check_pid = waitpid(ptrace_pid, &status, __WNOTHREAD | __WALL | WUNTRACED | WNOHANG);
+ if (check_pid < 0) {
+ fprintf(stderr, "waitpid() failed: %s\n", strerror(errno));
exit(1);
}
- printf("ptrace_fd returned code %i, status %i via waitid; revents are %x\n", info.si_code, info.si_status, pfd[1].revents);
+ if (check_pid != ptrace_pid) {
+ fprintf(stderr, "waitfd() said PID %i was ready, but waitpid() says it isn't: %i\n",
+ ptrace_pid, check_pid);
+ exit(1);
+ }
+ printf("ptrace_fd returned status %i via waitpid; revents are %x\n", status, pfd[1].revents);
pfd[1].fd *= -1;
procs_left--;
}