356 i386 memfd_create sys_memfd_create
357 i386 bpf sys_bpf
358 i386 execveat sys_execveat stub32_execveat
+# This is a temporary syscall number, chosen to avoid clashes with
+# upstream allocations.  Nothing but DTrace should use it.
+473 i386 waitfd sys_waitfd
320 common kexec_file_load sys_kexec_file_load
321 common bpf sys_bpf
322 64 execveat stub_execveat
+# This is a temporary syscall number, chosen to avoid clashes with
+# upstream allocations.  Nothing but DTrace should use it.
+473 common waitfd sys_waitfd
#
# x32-specific system call numbers start at 512 to avoid cache impact
obj-$(CONFIG_SIGNALFD) += signalfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
+obj-$(CONFIG_WAITFD) += waitfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FILE_LOCKING) += locks.o
--- /dev/null
+/*
+ * fs/waitfd.c
+ *
+ * Copyright (C) 2008 Red Hat, Casey Dahlin <cdahlin@redhat.com>
+ *
+ * Largely derived from fs/signalfd.c
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/list.h>
+#include <linux/anon_inodes.h>
+#include <linux/syscalls.h>
+
+/*
+ * Forward declaration of the waitid() worker that waitfd_poll() and
+ * waitfd_read() call below.  It is defined outside this file.
+ * NOTE(review): a shared header would be a safer home for this prototype
+ * than a private copy here -- confirm where the definition lives.
+ */
+long do_waitid(int which, pid_t upid,
+ struct siginfo __user *infop, int options,
+ struct rusage __user *ru);
+
+/*
+ * Per-descriptor state, allocated by the waitfd() syscall, stored in
+ * file->private_data and freed by waitfd_release().  The three fields are
+ * the arguments handed to do_waitid() on every poll and read.
+ */
+struct waitfd_ctx {
+ /* wait options (WEXITED, WSTOPPED, ...); the syscall clears WNOHANG */
+ int options;
+ /* "which" argument forwarded unchanged to do_waitid() */
+ int which;
+ /* "upid" argument forwarded unchanged to do_waitid() */
+ pid_t upid;
+};
+
+/*
+ * Release handler: frees the context stored in file->private_data.
+ * Always returns 0.
+ */
+static int waitfd_release(struct inode *inode, struct file *file)
+{
+	struct waitfd_ctx *ctx = file->private_data;
+
+	kfree(ctx);
+
+	return 0;
+}
+
+/*
+ * Poll handler for a waitfd descriptor.
+ *
+ * Registers on the calling task's signal->wait_chldexit wait queue and then
+ * probes with do_waitid(), adding WNOHANG | WNOWAIT to the stored options
+ * and passing no siginfo or rusage buffer.
+ *
+ * Returns POLLIN | POLLRDNORM when the probe yields a positive value or
+ * -ECHILD, and 0 otherwise.  -ECHILD counts as readable because
+ * waitfd_read() turns it into a zero-length read instead of blocking.
+ */
+static unsigned int waitfd_poll(struct file *file, poll_table *wait)
+{
+	struct waitfd_ctx *ctx = file->private_data;
+	long value;
+
+	poll_wait(file, &current->signal->wait_chldexit, wait);
+
+	value = do_waitid(ctx->which, ctx->upid, NULL,
+			  ctx->options | WNOHANG | WNOWAIT, NULL);
+	if (value > 0 || value == -ECHILD)
+		return POLLIN | POLLRDNORM;
+
+	return 0;
+}
+
+/*
+ * Read handler for a waitfd descriptor.
+ *
+ * Fills the user buffer with as many struct siginfo records as fit in
+ * "count" bytes, one per successful do_waitid() call.
+ *
+ * Returns the number of bytes stored (a multiple of sizeof(struct siginfo)),
+ * 0 when do_waitid() reports -ECHILD before anything was stored, -EAGAIN
+ * when do_waitid() returns 0 before anything was stored, -EINVAL when
+ * "count" is smaller than one struct siginfo, or another negative error
+ * code from do_waitid().
+ *
+ * Only the first record may block (and only if the file is not
+ * O_NONBLOCK).  Once one record has been stored the remaining calls use
+ * WNOHANG, so the caller gets what is available instead of sleeping until
+ * the buffer is full.
+ */
+static ssize_t waitfd_read(struct file *file, char __user *buf, size_t count,
+			   loff_t *ppos)
+{
+	struct waitfd_ctx *ctx = file->private_data;
+	struct siginfo __user *info_addr = (struct siginfo __user *)buf;
+	int flags = ctx->options;
+	ssize_t ret, total = 0;
+
+	count /= sizeof(struct siginfo);
+	if (!count)
+		return -EINVAL;
+
+	if (file->f_flags & O_NONBLOCK)
+		flags |= WNOHANG;
+
+	do {
+		ret = do_waitid(ctx->which, ctx->upid, info_addr, flags, NULL);
+		if (ret == 0)
+			ret = -EAGAIN;
+		if (ret == -ECHILD)
+			ret = 0;
+		if (ret <= 0)
+			break;
+
+		info_addr++;
+		total += sizeof(struct siginfo);
+
+		/* We have data to return: never sleep waiting for more. */
+		flags |= WNOHANG;
+	} while (--count);
+
+	return total ? total : ret;
+}
+
+/*
+ * File operations installed on every waitfd descriptor.  Only release,
+ * poll and read are implemented by this file; llseek uses the kernel's
+ * noop_llseek helper.
+ */
+static const struct file_operations waitfd_fops = {
+ .release = waitfd_release,
+ .poll = waitfd_poll,
+ .read = waitfd_read,
+ .llseek = noop_llseek,
+};
+
+/*
+ * waitfd() - create a file descriptor that reports child state changes.
+ *
+ * @which, @upid: forwarded unchanged to do_waitid() on each poll/read.
+ * @options:      waitid()-style options; at least one of WEXITED, WSTOPPED
+ *                or WCONTINUED is required.  WNOHANG is not stored: it only
+ *                selects O_NONBLOCK as the descriptor's initial state.
+ * @flags:        extra open flags OR-ed into the new descriptor's flags.
+ *                NOTE(review): @flags is not validated here -- confirm
+ *                whether unknown bits should be rejected with -EINVAL.
+ *
+ * Returns the new descriptor, -EINVAL for bad @options, -ENOMEM if the
+ * context cannot be allocated, or the error from anon_inode_getfd().
+ */
+SYSCALL_DEFINE4(waitfd, int, which, pid_t, upid, int, options, int, flags)
+{
+	int ufd;
+	struct waitfd_ctx *ctx;
+
+	/*
+	 * Options validation from do_waitid()
+	 */
+	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
+		return -EINVAL;
+	if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
+		return -EINVAL;
+
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	/*
+	 * Translate WNOHANG into the fd's nonblocking state and drop it from
+	 * the stored options: the fd state is authoritative from now on,
+	 * since it can change.
+	 */
+	if (options & WNOHANG)
+		flags |= O_NONBLOCK;
+
+	ctx->options = options & ~WNOHANG;
+	ctx->upid = upid;
+	ctx->which = which;
+
+	/*
+	 * The context must be final before this call: once the descriptor is
+	 * installed another thread may read from it or close it (which frees
+	 * ctx), so ctx must not be touched afterwards on success.
+	 */
+	ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx, O_RDWR | flags);
+	if (ufd < 0)
+		kfree(ctx);
+
+	return ufd;
+}
Disabling this option will cause the kernel to be built without
support for epoll family of system calls.
+config WAITFD
+ bool "Enable waitfd() system call" if EXPERT
+ select ANON_INODES
+ default n
+ help
+ Enable the waitfd() system call that allows receiving child state
+ changes from a file descriptor.
+
+ Note: this system call is not upstream: its syscall number is not
+ finalized, and the call itself should only be used by DTrace.
+
+ If unsure, say N.
+
+
config SIGNALFD
bool "Enable signalfd() system call" if EXPERT
select ANON_INODES
default y
depends on X86_64 && !DEBUG_LOCK_ALLOC
select KALLSYMS
+ select WAITFD
select CTF if (!DT_DISABLE_CTF)
select STRIP_ASM_SYMS if (!DT_DISABLE_CTF)
select DEBUG_INFO if (!DT_DISABLE_CTF)
put_task_struct(p);
infop = wo->wo_info;
if (infop) {
- if (!retval)
- retval = put_user(SIGCHLD, &infop->si_signo);
- if (!retval)
- retval = put_user(0, &infop->si_errno);
+ retval = put_user(SIGCHLD, &infop->si_signo);
+ retval |= put_user(0, &infop->si_errno);
if (!retval)
retval = put_user((short)why, &infop->si_code);
if (!retval)
return retval;
}
-SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
- infop, int, options, struct rusage __user *, ru)
+long do_waitid(int which, pid_t upid,
+ struct siginfo __user *infop, int options,
+ struct rusage __user *ru)
{
+
struct wait_opts wo;
struct pid *pid = NULL;
enum pid_type type;
wo.wo_stat = NULL;
wo.wo_rusage = ru;
ret = do_wait(&wo);
+ put_pid(pid);
+
+ return ret;
+}
+
+SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
+ infop, int, options, struct rusage __user *, ru)
+{
+ long ret;
+
+ if (infop && !access_ok (VERIFY_WRITE, infop, sizeof(siginfo_t)))
+ return -EFAULT;
+
+ ret = do_waitid(which, upid, infop, options, ru);
if (ret > 0) {
ret = 0;
* we would set so the user can easily tell the
* difference.
*/
- if (!ret)
- ret = put_user(0, &infop->si_signo);
- if (!ret)
- ret = put_user(0, &infop->si_errno);
- if (!ret)
- ret = put_user(0, &infop->si_code);
- if (!ret)
- ret = put_user(0, &infop->si_pid);
- if (!ret)
- ret = put_user(0, &infop->si_uid);
- if (!ret)
- ret = put_user(0, &infop->si_status);
+ ret = __put_user(0, &infop->si_signo);
+ ret |= __put_user(0, &infop->si_errno);
+ ret |= __put_user(0, &infop->si_code);
+ ret |= __put_user(0, &infop->si_pid);
+ ret |= __put_user(0, &infop->si_uid);
+ ret |= __put_user(0, &infop->si_status);
}
- put_pid(pid);
return ret;
}
cond_syscall(sys_ioprio_get);
/* New file descriptors */
+cond_syscall(sys_waitfd);
cond_syscall(sys_signalfd);
cond_syscall(sys_signalfd4);
cond_syscall(compat_sys_signalfd);
--- /dev/null
+# Build and run the waitfd selftest.  Only x86 (32- and 64-bit) is supported.
+
+# Derive ARCH from the host machine name unless the caller supplied one.
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/)
+ifeq ($(ARCH),i386)
+        ARCH := X86
+        CFLAGS := -DCONFIG_X86_32 -D__i386__
+endif
+ifeq ($(ARCH),x86_64)
+        ARCH := X86
+        CFLAGS := -DCONFIG_X86_64 -D__x86_64__
+endif
+
+# Pick up headers from the kernel tree this selftest lives in.
+CFLAGS += -I../../../../arch/x86/include/generated/
+CFLAGS += -I../../../../include/
+CFLAGS += -I../../../../usr/include/
+CFLAGS += -I../../../../arch/x86/include/
+
+# None of these targets names a file; keep stray files from masking them.
+.PHONY: all run_tests clean
+
+all:
+ifeq ($(ARCH),X86)
+	gcc $(CFLAGS) waitfd.c -o waitfd
+else
+	echo "Not an x86 target, can't build waitfd selftest"
+endif
+
+run_tests: all
+	@./waitfd || echo "waitfd: [FAIL]"
+
+clean:
+	rm -fr ./waitfd
--- /dev/null
+/* waitfd testcase. */
+
+#define _GNU_SOURCE 1
+#include <linux/unistd.h>
+#include <sys/syscall.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <string.h>
+#include <poll.h>
+
+/*
+ * waitfd() has no libc wrapper.  If the installed headers do not know the
+ * syscall yet, fall back to the temporary number this patch assigns in the
+ * x86 syscall tables.
+ */
+#ifndef __NR_waitfd
+#define __NR_waitfd 473
+#endif
+
+/*
+ * Thin wrapper around the raw waitfd syscall.  Returns the new file
+ * descriptor, or -1 with errno set on failure.
+ */
+int waitfd(int which, pid_t upid, int options, int flags)
+{
+	return syscall(__NR_waitfd, which, upid, options, flags);
+}
+
+/* Child body: sleep for ten seconds, then exit with status 0. */
+void sleeper(void)
+{
+	const unsigned int nap_seconds = 10;
+
+	sleep(nap_seconds);
+	exit(0);
+}
+
+/*
+ * Test driver: forks two children, opens one waitfd per child, and polls
+ * both descriptors until each has reported once.  The first child is
+ * checked through read() on its waitfd, the second (ptrace'd and hit with
+ * SIGABRT) through a cross-check with waitid().
+ *
+ * Returns 0 on success; exits with status 1 on any failure.
+ */
+int main (void)
+{
+	pid_t die_pid, ptrace_pid;
+	int die_fd, ptrace_fd;
+	siginfo_t info;
+	struct pollfd pfd[2];
+	int procs_left = 2;
+
+	memset(pfd, 0, sizeof(pfd));
+
+	/*
+	 * Fork off two children, one of which waits for a ptrace().
+	 * Both just sleep after that.
+	 */
+
+	die_pid = fork();
+	if (die_pid < 0) {
+		perror("fork() failed");
+		exit(1);
+	}
+	if (die_pid == 0)
+		sleeper();
+
+	ptrace_pid = fork();
+	if (ptrace_pid < 0) {
+		/*
+		 * Must not carry on: kill(-1, ...) below would signal every
+		 * process we are allowed to signal.
+		 */
+		perror("fork() failed");
+		exit(1);
+	}
+	if (ptrace_pid == 0) {
+		ptrace(PTRACE_TRACEME, 0, 0, 0);
+		sleeper();
+	}
+
+	die_fd = waitfd(P_PID, die_pid, WEXITED | WSTOPPED, 0);
+	ptrace_fd = waitfd(P_PID, ptrace_pid, WEXITED | WSTOPPED, 0);
+
+	if (die_fd < 0 || ptrace_fd < 0) {
+		perror("Cannot waitfd()");
+		exit(1);
+	}
+
+	pfd[0].fd = die_fd;
+	pfd[0].events = POLLIN;
+	pfd[1].fd = ptrace_fd;
+	pfd[1].events = POLLIN;
+
+	/*
+	 * Hit the ptrace PID with a signal
+	 */
+	kill(ptrace_pid, SIGABRT);
+
+	while (procs_left > 0) {
+		ssize_t bytes;
+
+		/* revents is meaningless after a failed poll(): give up. */
+		if (poll(pfd, 2, -1) < 0) {
+			perror ("poll() failed");
+			exit(1);
+		}
+
+		if (pfd[0].revents != 0) {
+			bytes = read(die_fd, &info, sizeof (siginfo_t));
+			/*
+			 * Compare as signed: against a bare sizeof the -1
+			 * error return would be converted to a huge unsigned
+			 * value and slip through.
+			 */
+			if (bytes < (ssize_t) sizeof (siginfo_t)) {
+				fprintf(stderr, "Only read %zi bytes\n", bytes);
+				exit(1);
+			}
+
+			printf("die_fd returned code %i, status %i via waitfd read: revents are %x\n", info.si_code, info.si_status, pfd[0].revents);
+			/* A negative fd makes poll() ignore this entry. */
+			pfd[0].fd *= -1;
+			procs_left--;
+		}
+
+		if (pfd[1].revents != 0) {
+			memset(&info, 0, sizeof (siginfo_t));
+			waitid(P_PID, ptrace_pid, &info, WEXITED | WSTOPPED | WNOHANG);
+			if (info.si_pid != ptrace_pid) {
+				fprintf(stderr, "waitfd said PID %i was ready, but waitid() says it isn't: %i\n",
+					ptrace_pid, info.si_pid);
+				exit(1);
+			}
+			printf("ptrace_fd returned code %i, status %i via waitid; revents are %x\n", info.si_code, info.si_status, pfd[1].revents);
+			/* A negative fd makes poll() ignore this entry. */
+			pfd[1].fd *= -1;
+			procs_left--;
+		}
+	}
+
+	return 0;
+}