From: Nick Alcock Date: Tue, 4 Jun 2013 15:52:46 +0000 (+0100) Subject: wait: add waitfd(), and a testcase for it X-Git-Tag: v4.1.12-92~313^2~77 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=95293bb2e56cba1a49fcfa02554e9669661fd60a;p=users%2Fjedix%2Flinux-maple.git wait: add waitfd(), and a testcase for it This syscall, of prototype int waitfd(int which, pid_t upid, int options, int flags); yields a pollable file descriptor which yields a 'struct siginfo_t' whenever waitid() or waitpid() would return (when child processes die or ptrace()d tracees undergo an appropriate state change). The which, upid and options arguments are as to waitid(); the flags argument is fd flags as to open() or fcntl(F_SETFL), to which O_RDWR is automatically added. WNOHANG in the options is automatically translated into O_NONBLOCK in the flags, and vice versa. No compat wrappers are in place for this syscall: 32-bit calls with a 64-bit kernel will return a 64-bit version of 'struct siginfo'. Current bugs: - select/poll/epoll is not waking up the process yet, even when it should. Signed-off-by: Nick Alcock --- diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index ef8187f9d28d9..da49ccb036bdb 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -365,3 +365,6 @@ 356 i386 memfd_create sys_memfd_create 357 i386 bpf sys_bpf 358 i386 execveat sys_execveat stub32_execveat +# This one is a temporary number, designed for no clashes. +# Nothing but DTrace should use it. +473 i386 waitfd sys_waitfd diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 9ef32d5f1b19e..afa9c1378c3d8 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -329,6 +329,9 @@ 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf 322 64 execveat stub_execveat +# This one is a temporary number, designed for no clashes. +# Nothing but DTrace should use it. 
+473 common waitfd sys_waitfd # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/fs/Makefile b/fs/Makefile index cb92fd4c31729..5401bd793ae91 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o +obj-$(CONFIG_WAITFD) += waitfd.o obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FILE_LOCKING) += locks.o diff --git a/fs/waitfd.c b/fs/waitfd.c new file mode 100644 index 0000000000000..a833ce5a7b242 --- /dev/null +++ b/fs/waitfd.c @@ -0,0 +1,131 @@ +/* + * fs/waitfd.c + * + * Copyright (C) 2008 Red Hat, Casey Dahlin + * + * Largely derived from fs/signalfd.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +long do_waitid(int which, pid_t upid, + struct siginfo __user *infop, int options, + struct rusage __user *ru); + +struct waitfd_ctx { + int options; + int which; + pid_t upid; +}; + +static int waitfd_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static unsigned int waitfd_poll(struct file *file, poll_table *wait) +{ + struct waitfd_ctx *ctx = file->private_data; + long value; + + printk(KERN_INFO "DEBUG: %i: about to sleep on waitqueue at %p\n", current->pid, &current->signal->wait_chldexit); + poll_wait(file, &current->signal->wait_chldexit, wait); + printk(KERN_INFO "DEBUG: waitfd poll woken up and checking pid %i, options are %i\n", ctx->upid, ctx->options); + + value = do_waitid(ctx->which, ctx->upid, NULL, + ctx->options | WNOHANG | WNOWAIT, NULL); + if (value > 0 || value == -ECHILD) + return POLLIN | POLLRDNORM; + + printk(KERN_INFO "DEBUG: waitfd poll returning zilch\n"); + + return 0; +} + +/* + * Returns a multiple of the size of a struct siginfo, or a negative + * error code. 
The "count" parameter must be at least sizeof(struct siginfo) + */ +static ssize_t waitfd_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct waitfd_ctx *ctx = file->private_data; + struct siginfo __user *info_addr = (struct siginfo *)buf; + int flags = ctx->options; + ssize_t ret, total = 0; + + count /= sizeof(struct siginfo); + if (!count) + return -EINVAL; + + if (file->f_flags & O_NONBLOCK) + flags |= WNOHANG; + + do { + ret = do_waitid(ctx->which, ctx->upid, info_addr, flags, NULL); + if (ret == 0) + ret = -EAGAIN; + if (ret == -ECHILD) + ret = 0; + if (ret <= 0) + break; + + info_addr++; + total += sizeof(struct siginfo); + } while (--count); + + return total ? total : ret; +} + +static const struct file_operations waitfd_fops = { + .release = waitfd_release, + .poll = waitfd_poll, + .read = waitfd_read, + .llseek = noop_llseek, +}; + +SYSCALL_DEFINE4(waitfd, int, which, pid_t, upid, int, options, int, flags) +{ + int ufd; + struct waitfd_ctx *ctx; + + /* + * Options validation from do_waitid() + */ + if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) + return -EINVAL; + if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) + return -EINVAL; + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->options = options; + ctx->upid = upid; + ctx->which = which; + + ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx, + O_RDWR | flags | ((options & WNOHANG) ? + O_NONBLOCK | 0 : 0)); + /* + * Use the fd's nonblocking state from now on, since that can change. + */ + ctx->options &= ~WNOHANG; + + if (ufd < 0) + kfree(ctx); + + return ufd; +} diff --git a/init/Kconfig b/init/Kconfig index 3fcaedfb970de..75242761fe6d2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1505,6 +1505,20 @@ config EPOLL Disabling this option will cause the kernel to be built without support for epoll family of system calls. 
+config WAITFD + bool "Enable waitfd() system call" if EXPERT + select ANON_INODES + default n + help + Enable the waitfd() system call that allows receiving child state + changes from a file descriptor. + + Note: this system call is not upstream: its syscall number is not + finalized, and the call itself should only be used by DTrace. + + If unsure, say N. + + config SIGNALFD bool "Enable signalfd() system call" if EXPERT select ANON_INODES diff --git a/kernel/dtrace/Kconfig b/kernel/dtrace/Kconfig index 8411d527b46a4..a9c352f15cdd6 100644 --- a/kernel/dtrace/Kconfig +++ b/kernel/dtrace/Kconfig @@ -8,6 +8,7 @@ menuconfig DTRACE default y depends on X86_64 && !DEBUG_LOCK_ALLOC select KALLSYMS + select WAITFD select CTF if (!DT_DISABLE_CTF) select STRIP_ASM_SYMS if (!DT_DISABLE_CTF) select DEBUG_INFO if (!DT_DISABLE_CTF) diff --git a/kernel/exit.c b/kernel/exit.c index 7ff3c4aed9c6f..9ef11c30c41ef 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -949,10 +949,8 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, put_task_struct(p); infop = wo->wo_info; if (infop) { - if (!retval) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); + retval = put_user(SIGCHLD, &infop->si_signo); + retval |= put_user(0, &infop->si_errno); if (!retval) retval = put_user((short)why, &infop->si_code); if (!retval) @@ -1522,9 +1520,11 @@ end: return retval; } -SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, - infop, int, options, struct rusage __user *, ru) +long do_waitid(int which, pid_t upid, + struct siginfo __user *infop, int options, + struct rusage __user *ru) { + struct wait_opts wo; struct pid *pid = NULL; enum pid_type type; @@ -1563,6 +1563,20 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, wo.wo_stat = NULL; wo.wo_rusage = ru; ret = do_wait(&wo); + put_pid(pid); + + return ret; +} + +SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct 
siginfo __user *, + infop, int, options, struct rusage __user *, ru) +{ + long ret; + + if (infop && !access_ok (VERIFY_WRITE, infop, sizeof(siginfo_t))) + return -EFAULT; + + ret = do_waitid(which, upid, infop, options, ru); if (ret > 0) { ret = 0; @@ -1572,21 +1586,14 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, * we would set so the user can easily tell the * difference. */ - if (!ret) - ret = put_user(0, &infop->si_signo); - if (!ret) - ret = put_user(0, &infop->si_errno); - if (!ret) - ret = put_user(0, &infop->si_code); - if (!ret) - ret = put_user(0, &infop->si_pid); - if (!ret) - ret = put_user(0, &infop->si_uid); - if (!ret) - ret = put_user(0, &infop->si_status); + ret = __put_user(0, &infop->si_signo); + ret |= __put_user(0, &infop->si_errno); + ret |= __put_user(0, &infop->si_code); + ret |= __put_user(0, &infop->si_pid); + ret |= __put_user(0, &infop->si_uid); + ret |= __put_user(0, &infop->si_status); } - put_pid(pid); return ret; } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7995ef5868d8f..1ce33ea49b90a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -206,6 +206,7 @@ cond_syscall(sys_ioprio_set); cond_syscall(sys_ioprio_get); /* New file descriptors */ +cond_syscall(sys_waitfd); cond_syscall(sys_signalfd); cond_syscall(sys_signalfd4); cond_syscall(compat_sys_signalfd); diff --git a/tools/testing/selftests/waitfd/Makefile b/tools/testing/selftests/waitfd/Makefile new file mode 100644 index 0000000000000..f85c80b54f05e --- /dev/null +++ b/tools/testing/selftests/waitfd/Makefile @@ -0,0 +1,28 @@ +uname_M := $(shell uname -m 2>/dev/null || echo not) +ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) +ifeq ($(ARCH),i386) + ARCH := X86 + CFLAGS := -DCONFIG_X86_32 -D__i386__ +endif +ifeq ($(ARCH),x86_64) + ARCH := X86 + CFLAGS := -DCONFIG_X86_64 -D__x86_64__ +endif + +CFLAGS += -I../../../../arch/x86/include/generated/ +CFLAGS += -I../../../../include/ +CFLAGS += -I../../../../usr/include/ +CFLAGS += 
-I../../../../arch/x86/include/ + +all: +ifeq ($(ARCH),X86) + gcc $(CFLAGS) waitfd.c -o waitfd +else + echo "Not an x86 target, can't build waitfd selftest" +endif + +run_tests: all + @./waitfd || echo "waitfd: [FAIL]" + +clean: + rm -fr ./waitfd diff --git a/tools/testing/selftests/waitfd/waitfd.c b/tools/testing/selftests/waitfd/waitfd.c new file mode 100644 index 0000000000000..e3a37fd7fcbd6 --- /dev/null +++ b/tools/testing/selftests/waitfd/waitfd.c @@ -0,0 +1,109 @@ +/* waitfd testcase. */ + +#define _GNU_SOURCE 1 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#ifndef SYS_waitfd +#define SYS_waitfd 473 +#endif +#endif + +int waitfd(int which, pid_t upid, int options, int flags) +{ + return syscall(__NR_waitfd, which, upid, options, flags); +} + +void sleeper(void) +{ + sleep(10); + exit(0); +} + +int main (void) +{ + pid_t die_pid, ptrace_pid; + int die_fd, ptrace_fd; + siginfo_t info; + struct pollfd pfd[2]; + int procs_left = 2; + + memset(pfd, 0, sizeof(pfd)); + + /* + * Fork off two children, one of which waits for a ptrace(). + * Both just sleep after that. 
+ */ + + die_pid = fork(); + + if (die_pid == 0) + sleeper(); + + ptrace_pid = fork(); + if (ptrace_pid == 0) { + ptrace(PTRACE_TRACEME, 0, 0, 0); + sleeper(); + } + + die_fd = waitfd(P_PID, die_pid, WEXITED | WSTOPPED, 0); + ptrace_fd = waitfd(P_PID, ptrace_pid, WEXITED | WSTOPPED, 0); + + if (die_fd < 0 || ptrace_fd < 0) { + perror("Cannot waitfd()"); + exit(1); + } + + pfd[0].fd = die_fd; + pfd[0].events = POLLIN; + pfd[1].fd = ptrace_fd; + pfd[1].events = POLLIN; + + /* + * Hit the ptrace PID with a signal + */ + kill(ptrace_pid, SIGABRT); + + while (procs_left > 0) { + ssize_t bytes; + + if (poll(pfd, 2, -1) < 0) + perror ("poll() failed"); + + if (pfd[0].revents != 0) { + if ((bytes = read(die_fd, &info, sizeof (siginfo_t))) < sizeof (siginfo_t)) { + fprintf(stderr, "Only read %zi bytes\n", bytes); + exit(1); + } + + printf("die_fd returned code %i, status %i via waitfd read: revents are %x\n", info.si_code, info.si_status, pfd[0].revents); + pfd[0].fd *= -1; + procs_left--; + } + + if (pfd[1].revents != 0) { + memset(&info, 0, sizeof (siginfo_t)); + waitid(P_PID, ptrace_pid, &info, WEXITED | WSTOPPED | WNOHANG); + if (info.si_pid != ptrace_pid) { + fprintf(stderr, "waitfd said PID %i was ready, but waitid() says it isn't: %i\n", + ptrace_pid, info.si_pid); + exit(1); + } + printf("ptrace_fd returned code %i, status %i via waitid; revents are %x\n", info.si_code, info.si_status, pfd[1].revents); + pfd[1].fd *= -1; + procs_left--; + } + } + + return 0; +}