]> www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
wait: add waitfd(), and a testcase for it
authorNick Alcock <nick.alcock@oracle.com>
Tue, 4 Jun 2013 15:52:46 +0000 (16:52 +0100)
committerNick Alcock <nick.alcock@oracle.com>
Mon, 29 Jun 2015 21:41:45 +0000 (22:41 +0100)
This syscall, of prototype

    int waitfd(int which, pid_t upid, int options, int flags);

returns a pollable file descriptor from which a 'struct siginfo' can be read
whenever waitid() or waitpid() would return (when child processes die or
ptrace()d tracees undergo an appropriate state change).

The which, upid and options arguments are as to waitid(); the flags argument is
fd flags as to open() or fcntl(F_SETFL), to which O_RDWR is automatically added.
WNOHANG in the options is automatically translated into O_NONBLOCK in the flags,
and vice versa.

No compat wrappers are in place for this syscall: 32-bit calls with a 64-bit
kernel will return a 64-bit version of 'struct siginfo'.

Current bugs:
  - select/poll/epoll do not wake up the process yet, even when they should.

Signed-off-by: Nick Alcock <nick.alcock@oracle.com>
arch/x86/syscalls/syscall_32.tbl
arch/x86/syscalls/syscall_64.tbl
fs/Makefile
fs/waitfd.c [new file with mode: 0644]
init/Kconfig
kernel/dtrace/Kconfig
kernel/exit.c
kernel/sys_ni.c
tools/testing/selftests/waitfd/Makefile [new file with mode: 0644]
tools/testing/selftests/waitfd/waitfd.c [new file with mode: 0644]

index ef8187f9d28d96651d3d52e39d4441c60a408332..da49ccb036bdb1eabd3e6f6326b59c8fb96b4805 100644 (file)
 356    i386    memfd_create            sys_memfd_create
 357    i386    bpf                     sys_bpf
 358    i386    execveat                sys_execveat                    stub32_execveat
+# This one is a temporary number, designed for no clashes.
+# Nothing but DTrace should use it.
+473    i386    waitfd                  sys_waitfd
index 9ef32d5f1b19e67ed10b69c67be5f53806c19ffa..afa9c1378c3d8eff2c6509fe612167f11d12011a 100644 (file)
 320    common  kexec_file_load         sys_kexec_file_load
 321    common  bpf                     sys_bpf
 322    64      execveat                stub_execveat
+# This one is a temporary number, designed for no clashes.
+# Nothing but DTrace should use it.
+473    common  waitfd                  sys_waitfd
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
index cb92fd4c31729cc5cf799a3278c57ad3c1af6314..5401bd793ae9187316e700404920171652aa85a9 100644 (file)
@@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES)     += anon_inodes.o
 obj-$(CONFIG_SIGNALFD)         += signalfd.o
 obj-$(CONFIG_TIMERFD)          += timerfd.o
 obj-$(CONFIG_EVENTFD)          += eventfd.o
+obj-$(CONFIG_WAITFD)           += waitfd.o
 obj-$(CONFIG_AIO)               += aio.o
 obj-$(CONFIG_FS_DAX)           += dax.o
 obj-$(CONFIG_FILE_LOCKING)      += locks.o
diff --git a/fs/waitfd.c b/fs/waitfd.c
new file mode 100644 (file)
index 0000000..a833ce5
--- /dev/null
@@ -0,0 +1,131 @@
+/*
+ *  fs/waitfd.c
+ *
+ *  Copyright (C) 2008  Red Hat, Casey Dahlin <cdahlin@redhat.com>
+ *
+ *  Largely derived from fs/signalfd.c
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/list.h>
+#include <linux/anon_inodes.h>
+#include <linux/syscalls.h>
+
+/*
+ * Core of the waitid() system call, implemented in kernel/exit.c.
+ * Declared here so the waitfd file operations can call it directly.
+ */
+long do_waitid(int which, pid_t upid,
+              struct siginfo __user *infop, int options,
+              struct rusage __user *ru);
+
+/*
+ * Per-descriptor state: the waitid()-style arguments given to waitfd().
+ * Stored in file->private_data and handed to do_waitid() on every read()
+ * and poll() of the descriptor.
+ */
+struct waitfd_ctx {
+       /* W* option flags passed to do_waitid() */
+       int     options;
+       /* 'which' argument passed to do_waitid() */
+       int     which;
+       /* 'upid' argument passed to do_waitid() */
+       pid_t   upid;
+};
+
+/*
+ * ->release handler: free the context that waitfd() attached to the file.
+ * Always returns 0.
+ */
+static int waitfd_release(struct inode *inode, struct file *file)
+{
+       struct waitfd_ctx *ctx = file->private_data;
+
+       kfree(ctx);
+       return 0;
+}
+
+/*
+ * ->poll handler.
+ *
+ * Registers on the calling task's wait_chldexit queue, then probes whether
+ * a waitid() with the stored arguments would return right now.
+ *
+ * Returns POLLIN | POLLRDNORM when do_waitid() reports a matching child, or
+ * reports -ECHILD (read() turns that into a 0-byte result); returns 0
+ * otherwise.
+ */
+static unsigned int waitfd_poll(struct file *file, poll_table *wait)
+{
+       struct waitfd_ctx *ctx = file->private_data;
+       long value;
+
+       poll_wait(file, &current->signal->wait_chldexit, wait);
+
+       /*
+        * WNOHANG keeps the probe from sleeping; WNOWAIT leaves the child
+        * in a waitable state so the event is still there for read().
+        */
+       value = do_waitid(ctx->which, ctx->upid, NULL,
+                          ctx->options | WNOHANG | WNOWAIT, NULL);
+       if (value > 0 || value == -ECHILD)
+               return POLLIN | POLLRDNORM;
+
+       return 0;
+}
+
+/*
+ * ->read handler: copy one 'struct siginfo' per pending child state change
+ * into the user buffer.
+ *
+ * @buf:   user buffer, treated as an array of struct siginfo
+ * @count: buffer size in bytes; must be at least sizeof(struct siginfo),
+ *         otherwise -EINVAL is returned
+ *
+ * Returns a multiple of sizeof(struct siginfo) on success, 0 when
+ * do_waitid() reports -ECHILD, -EAGAIN when nothing is pending on a
+ * non-blocking descriptor, or another negative error code from do_waitid().
+ *
+ * Only the wait for the first record may block; once one record has been
+ * collected the rest of the buffer is filled opportunistically, so data
+ * already gathered is returned promptly instead of being held back until
+ * yet another child changes state.
+ */
+static ssize_t waitfd_read(struct file *file, char __user *buf, size_t count,
+                            loff_t *ppos)
+{
+       struct waitfd_ctx *ctx = file->private_data;
+       struct siginfo __user *info_addr = (struct siginfo __user *)buf;
+       int flags = ctx->options;
+       ssize_t ret, total = 0;
+
+       count /= sizeof(struct siginfo);
+       if (!count)
+               return -EINVAL;
+
+       if (file->f_flags & O_NONBLOCK)
+               flags |= WNOHANG;
+
+       /*
+        * With WNOWAIT nothing is reaped, so every further iteration would
+        * keep reporting the same unreaped child: hand out one record only.
+        */
+       if (flags & WNOWAIT)
+               count = 1;
+
+       do {
+               ret = do_waitid(ctx->which, ctx->upid, info_addr, flags, NULL);
+               if (ret == 0)
+                       ret = -EAGAIN;
+               if (ret == -ECHILD)
+                       ret = 0;
+               if (ret <= 0)
+                       break;
+
+               info_addr++;
+               total += sizeof(struct siginfo);
+
+               /* We have data to return: never sleep for additional records. */
+               flags |= WNOHANG;
+       } while (--count);
+
+       return total ? total : ret;
+}
+
+/*
+ * File operations for the anonymous inode behind a waitfd descriptor.
+ * Only release, poll, read and a no-op llseek are provided; there is no
+ * write handler.
+ */
+static const struct file_operations waitfd_fops = {
+       .release        = waitfd_release,
+       .poll           = waitfd_poll,
+       .read           = waitfd_read,
+       .llseek         = noop_llseek,
+};
+
+/*
+ * waitfd() - create a file descriptor that reports child state changes.
+ *
+ * @which, @upid, @options: as for waitid().  @options must contain at
+ *     least one of WEXITED, WSTOPPED or WCONTINUED and nothing outside
+ *     WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED, otherwise -EINVAL is
+ *     returned.
+ * @flags: file flags for the new descriptor; O_RDWR is always added, and
+ *     O_NONBLOCK is added when @options contains WNOHANG.
+ *
+ * Returns the new file descriptor, -ENOMEM if the context cannot be
+ * allocated, or the error from anon_inode_getfd().
+ */
+SYSCALL_DEFINE4(waitfd, int, which, pid_t, upid, int, options, int, flags)
+{
+       int ufd;
+       struct waitfd_ctx *ctx;
+
+       /*
+        * Options validation from do_waitid()
+        */
+       if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
+               return -EINVAL;
+       if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
+               return -EINVAL;
+
+       /*
+        * WNOHANG only determines the descriptor's initial O_NONBLOCK state.
+        * From then on the fd's own nonblocking state is used, since that
+        * can change, so WNOHANG is not kept in the stored options.
+        */
+       if (options & WNOHANG)
+               flags |= O_NONBLOCK;
+
+       ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+       if (!ctx)
+               return -ENOMEM;
+
+       ctx->options = options & ~WNOHANG;
+       ctx->upid = upid;
+       ctx->which = which;
+
+       /*
+        * The context must be complete before this call.  Once the fd is
+        * installed another thread may close it, which frees ctx through
+        * waitfd_release(), so ctx must not be touched after success.
+        */
+       ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx, O_RDWR | flags);
+       if (ufd < 0)
+               kfree(ctx);
+
+       return ufd;
+}
index 3fcaedfb970de98b05755ba27f798f8a780cab77..75242761fe6d24c5b80aae5a9c6131ab14a5d144 100644 (file)
@@ -1505,6 +1505,20 @@ config EPOLL
          Disabling this option will cause the kernel to be built without
          support for epoll family of system calls.
 
+config WAITFD
+       bool "Enable waitfd() system call" if EXPERT
+       select ANON_INODES
+       default n
+       help
+         Enable the waitfd() system call that allows receiving child state
+         changes from a file descriptor.
+
+         Note: this system call is not upstream: its syscall number is not
+         finalized, and the call itself should only be used by DTrace.
+
+         If unsure, say N.
+
+
 config SIGNALFD
        bool "Enable signalfd() system call" if EXPERT
        select ANON_INODES
index 8411d527b46a44cef6ad0bd8cb9e45d248cfe509..a9c352f15cdd63443f93438b6deb6b1605948c5d 100644 (file)
@@ -8,6 +8,7 @@ menuconfig DTRACE
        default y
        depends on X86_64 && !DEBUG_LOCK_ALLOC
        select KALLSYMS
+       select WAITFD
        select CTF if (!DT_DISABLE_CTF)
        select STRIP_ASM_SYMS if (!DT_DISABLE_CTF)
        select DEBUG_INFO if (!DT_DISABLE_CTF)
index 7ff3c4aed9c6fc86487d3185ee751377fcedbe45..9ef11c30c41ef6c4a4987810b1ac8a5a5d806465 100644 (file)
@@ -949,10 +949,8 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
        put_task_struct(p);
        infop = wo->wo_info;
        if (infop) {
-               if (!retval)
-                       retval = put_user(SIGCHLD, &infop->si_signo);
-               if (!retval)
-                       retval = put_user(0, &infop->si_errno);
+               retval = put_user(SIGCHLD, &infop->si_signo);
+               retval |= put_user(0, &infop->si_errno);
                if (!retval)
                        retval = put_user((short)why, &infop->si_code);
                if (!retval)
@@ -1522,9 +1520,11 @@ end:
        return retval;
 }
 
-SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
-               infop, int, options, struct rusage __user *, ru)
+long do_waitid(int which, pid_t upid,
+              struct siginfo __user *infop, int options,
+              struct rusage __user *ru)
 {
+
        struct wait_opts wo;
        struct pid *pid = NULL;
        enum pid_type type;
@@ -1563,6 +1563,20 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
        wo.wo_stat      = NULL;
        wo.wo_rusage    = ru;
        ret = do_wait(&wo);
+       put_pid(pid);
+
+       return ret;
+}
+
+SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
+               infop, int, options, struct rusage __user *, ru)
+{
+       long ret;
+
+       if (infop && !access_ok (VERIFY_WRITE, infop, sizeof(siginfo_t)))
+               return -EFAULT;
+
+       ret = do_waitid(which, upid, infop, options, ru);
 
        if (ret > 0) {
                ret = 0;
@@ -1572,21 +1586,14 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
                 * we would set so the user can easily tell the
                 * difference.
                 */
-               if (!ret)
-                       ret = put_user(0, &infop->si_signo);
-               if (!ret)
-                       ret = put_user(0, &infop->si_errno);
-               if (!ret)
-                       ret = put_user(0, &infop->si_code);
-               if (!ret)
-                       ret = put_user(0, &infop->si_pid);
-               if (!ret)
-                       ret = put_user(0, &infop->si_uid);
-               if (!ret)
-                       ret = put_user(0, &infop->si_status);
+               ret = __put_user(0, &infop->si_signo);
+               ret |= __put_user(0, &infop->si_errno);
+               ret |= __put_user(0, &infop->si_code);
+               ret |= __put_user(0, &infop->si_pid);
+               ret |= __put_user(0, &infop->si_uid);
+               ret |= __put_user(0, &infop->si_status);
        }
 
-       put_pid(pid);
        return ret;
 }
 
index 7995ef5868d8f3f2c5ec299209c3d3160457ba7a..1ce33ea49b90ae0d4b8cdc2127e07af2e17468af 100644 (file)
@@ -206,6 +206,7 @@ cond_syscall(sys_ioprio_set);
 cond_syscall(sys_ioprio_get);
 
 /* New file descriptors */
+cond_syscall(sys_waitfd);
 cond_syscall(sys_signalfd);
 cond_syscall(sys_signalfd4);
 cond_syscall(compat_sys_signalfd);
diff --git a/tools/testing/selftests/waitfd/Makefile b/tools/testing/selftests/waitfd/Makefile
new file mode 100644 (file)
index 0000000..f85c80b
--- /dev/null
@@ -0,0 +1,28 @@
+# Build and run the waitfd selftest.  Only x86 targets are supported.
+
+# Normalize the architecture name: i?86 is mapped to i386, and both i386
+# and x86_64 end up as ARCH=X86 with the matching defines in CFLAGS.
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/)
+ifeq ($(ARCH),i386)
+        ARCH := X86
+       CFLAGS := -DCONFIG_X86_32 -D__i386__
+endif
+ifeq ($(ARCH),x86_64)
+       ARCH := X86
+       CFLAGS := -DCONFIG_X86_64 -D__x86_64__
+endif
+
+# Search the surrounding kernel tree for headers.
+CFLAGS += -I../../../../arch/x86/include/generated/
+CFLAGS += -I../../../../include/
+CFLAGS += -I../../../../usr/include/
+CFLAGS += -I../../../../arch/x86/include/
+
+# Build the test binary on x86; on anything else just print a notice.
+all:
+ifeq ($(ARCH),X86)
+       gcc $(CFLAGS) waitfd.c -o waitfd
+else
+       echo "Not an x86 target, can't build waitfd selftest"
+endif
+
+# Run the test binary and print a [FAIL] marker if it exits non-zero.
+run_tests: all
+       @./waitfd || echo "waitfd: [FAIL]"
+
+clean:
+       rm -fr ./waitfd
diff --git a/tools/testing/selftests/waitfd/waitfd.c b/tools/testing/selftests/waitfd/waitfd.c
new file mode 100644 (file)
index 0000000..e3a37fd
--- /dev/null
@@ -0,0 +1,109 @@
+/* waitfd testcase. */
+
+#define _GNU_SOURCE 1
+#include <linux/unistd.h>
+#include <sys/syscall.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <string.h>
+#include <poll.h>
+
+/*
+ * waitfd() has no permanent syscall number yet.  If the headers in use do
+ * not define __NR_waitfd, fall back to the temporary number assigned in
+ * the x86 syscall tables.
+ */
+#ifndef __NR_waitfd
+#define __NR_waitfd 473
+#endif
+
+/*
+ * Raw waitfd() system call wrapper.
+ *
+ * Returns the new file descriptor, or -1 with errno set on failure.
+ */
+int waitfd(int which, pid_t upid, int options, int flags)
+{
+        return syscall(__NR_waitfd, which, upid, options, flags);
+}
+
+/* Child body: idle for ten seconds, then terminate with exit status 0. */
+void sleeper(void)
+{
+        (void)sleep(10);
+        exit(EXIT_SUCCESS);
+}
+
+/*
+ * waitfd selftest driver.
+ *
+ * Forks two children, opens one waitfd per child and polls both:
+ *   - the first child's state change is consumed by read()ing the waitfd;
+ *   - the second child requests tracing and is sent SIGABRT; its state
+ *     change is consumed with a plain waitid() once its waitfd polls ready.
+ *
+ * Returns 0 once both descriptors have reported; exits with status 1 on
+ * any failure.
+ */
+int main (void)
+{
+        pid_t die_pid, ptrace_pid;
+        int die_fd, ptrace_fd;
+        siginfo_t info;
+        struct pollfd pfd[2];
+        int procs_left = 2;
+
+        memset(pfd, 0, sizeof(pfd));
+
+        /*
+         * Fork off two children, one of which waits for a ptrace().
+         * Both just sleep after that.
+         */
+
+        die_pid = fork();
+        if (die_pid < 0) {
+                perror("Cannot fork()");
+                exit(1);
+        }
+        if (die_pid == 0)
+                sleeper();
+
+        ptrace_pid = fork();
+        if (ptrace_pid < 0) {
+                /*
+                 * Bail out now: carrying on would end in kill(-1, SIGABRT),
+                 * which signals every process we are allowed to signal.
+                 */
+                perror("Cannot fork()");
+                kill(die_pid, SIGKILL);
+                exit(1);
+        }
+        if (ptrace_pid == 0) {
+                ptrace(PTRACE_TRACEME, 0, 0, 0);
+                sleeper();
+        }
+
+        die_fd = waitfd(P_PID, die_pid, WEXITED | WSTOPPED, 0);
+        ptrace_fd = waitfd(P_PID, ptrace_pid, WEXITED | WSTOPPED, 0);
+
+        if (die_fd < 0 || ptrace_fd < 0) {
+                perror("Cannot waitfd()");
+                exit(1);
+        }
+
+        pfd[0].fd = die_fd;
+        pfd[0].events = POLLIN;
+        pfd[1].fd = ptrace_fd;
+        pfd[1].events = POLLIN;
+
+        /*
+         * Hit the ptrace PID with a signal
+         */
+        kill(ptrace_pid, SIGABRT);
+
+        while (procs_left > 0) {
+                ssize_t bytes;
+
+                /* revents is meaningless after a failed poll(): give up. */
+                if (poll(pfd, 2, -1) < 0) {
+                        perror ("poll() failed");
+                        exit(1);
+                }
+
+                if (pfd[0].revents != 0) {
+                        bytes = read(die_fd, &info, sizeof (siginfo_t));
+                        /*
+                         * Test for a negative result first: comparing -1
+                         * directly against a size_t would convert it to a
+                         * huge unsigned value and hide the error.
+                         */
+                        if (bytes < 0 || (size_t)bytes < sizeof (siginfo_t)) {
+                                fprintf(stderr, "Only read %zi bytes\n", bytes);
+                                exit(1);
+                        }
+
+                        printf("die_fd returned code %i, status %i via waitfd read: revents are %x\n", info.si_code, info.si_status, pfd[0].revents);
+                        /* A negative fd makes poll() ignore this entry. */
+                        pfd[0].fd = -1;
+                        procs_left--;
+                }
+
+                if (pfd[1].revents != 0) {
+                        memset(&info, 0, sizeof (siginfo_t));
+                        waitid(P_PID, ptrace_pid, &info, WEXITED | WSTOPPED | WNOHANG);
+                        if (info.si_pid != ptrace_pid) {
+                                fprintf(stderr, "waitfd said PID %i was ready, but waitid() says it isn't: %i\n",
+                                    ptrace_pid, info.si_pid);
+                                exit(1);
+                        }
+                        printf("ptrace_fd returned code %i, status %i via waitid; revents are %x\n", info.si_code, info.si_status, pfd[1].revents);
+                        /* A negative fd makes poll() ignore this entry. */
+                        pfd[1].fd = -1;
+                        procs_left--;
+                }
+        }
+
+        return 0;
+}