struct mm_struct *mm;
 };
 
+/*
+ * Pairs a parent (orig) userfaultfd context with the child (new)
+ * context created for it at fork time.  Instances are queued on a
+ * per-fork list (see dup_userfaultfd()) so that one UFFD_EVENT_FORK
+ * event per distinct parent context can be delivered after fork's
+ * mmap_sem is dropped (see dup_userfaultfd_complete()).
+ */
+struct userfaultfd_fork_ctx {
+       struct userfaultfd_ctx *orig;
+       struct userfaultfd_ctx *new;
+       struct list_head list;
+};
+
 struct userfaultfd_wait_queue {
        struct uffd_msg msg;
        wait_queue_t wq;
        return ret;
 }
 
-static int __maybe_unused userfaultfd_event_wait_completion(
-               struct userfaultfd_ctx *ctx,
-               struct userfaultfd_wait_queue *ewq)
+static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
+                                            struct userfaultfd_wait_queue *ewq)
 {
        int ret = 0;
 
        __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
 }
 
+/*
+ * Called from dup_mmap() for each vma copied into the child.
+ *
+ * If the parent vma carries a userfaultfd context with
+ * UFFD_FEATURE_EVENT_FORK enabled, attach a child context to the new
+ * vma — creating one and queueing an entry on @fcs the first time this
+ * parent context is seen, or reusing the child context already created
+ * for an earlier vma with the same parent context.  Otherwise the
+ * child vma does not inherit any uffd state.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure.
+ */
+int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
+{
+       struct userfaultfd_ctx *ctx = NULL, *octx;
+       struct userfaultfd_fork_ctx *fctx;
+
+       octx = vma->vm_userfaultfd_ctx.ctx;
+       if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+               /* No fork tracking requested: strip uffd state from the child vma. */
+               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+               vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+               return 0;
+       }
+
+       /* Reuse the child ctx if this parent ctx was already duplicated
+        * for a previously copied vma in this same fork. */
+       list_for_each_entry(fctx, fcs, list)
+               if (fctx->orig == octx) {
+                       ctx = fctx->new;
+                       break;
+               }
+
+       if (!ctx) {
+               fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
+               if (!fctx)
+                       return -ENOMEM;
+
+               ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
+               if (!ctx) {
+                       kfree(fctx);
+                       return -ENOMEM;
+               }
+
+               /* Child ctx mirrors the parent's flags/features and pins
+                * the child's mm for its own lifetime. */
+               atomic_set(&ctx->refcount, 1);
+               ctx->flags = octx->flags;
+               ctx->state = UFFD_STATE_RUNNING;
+               ctx->features = octx->features;
+               ctx->released = false;
+               ctx->mm = vma->vm_mm;
+               atomic_inc(&ctx->mm->mm_users);
+
+               /* Hold a reference on the parent ctx until the fork event
+                * has been delivered (dropped via event completion). */
+               userfaultfd_ctx_get(octx);
+               fctx->orig = octx;
+               fctx->new = ctx;
+               list_add_tail(&fctx->list, fcs);
+       }
+
+       vma->vm_userfaultfd_ctx.ctx = ctx;
+       return 0;
+}
+
+/*
+ * Deliver one UFFD_EVENT_FORK message for a parent/child ctx pair and
+ * block until the uffd reader has consumed it.  The raw child ctx
+ * pointer is stashed in msg.arg.reserved.reserved1; the read path
+ * (resolve_userfault_fork()) converts it into a file descriptor before
+ * the message reaches userspace.
+ */
+static int dup_fctx(struct userfaultfd_fork_ctx *fctx)
+{
+       struct userfaultfd_ctx *ctx = fctx->orig;
+       struct userfaultfd_wait_queue ewq;
+
+       msg_init(&ewq.msg);
+
+       ewq.msg.event = UFFD_EVENT_FORK;
+       ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
+
+       return userfaultfd_event_wait_completion(ctx, &ewq);
+}
+
+/*
+ * Called from dup_mmap() after mmap_sem has been released: deliver one
+ * fork event per distinct parent ctx queued by dup_userfaultfd(), then
+ * free the tracking structures.
+ *
+ * NOTE(review): after the first dup_fctx() failure the remaining
+ * entries are freed without their events being delivered — confirm the
+ * ctx references taken in dup_userfaultfd() cannot leak on that path.
+ */
+void dup_userfaultfd_complete(struct list_head *fcs)
+{
+       int ret = 0;
+       struct userfaultfd_fork_ctx *fctx, *n;
+
+       list_for_each_entry_safe(fctx, n, fcs, list) {
+               if (!ret)
+                       ret = dup_fctx(fctx);
+               list_del(&fctx->list);
+               kfree(fctx);
+       }
+}
+
 static int userfaultfd_release(struct inode *inode, struct file *file)
 {
        struct userfaultfd_ctx *ctx = file->private_data;
        }
 }
 
+static const struct file_operations userfaultfd_fops;
+
+/*
+ * Turn a dequeued UFFD_EVENT_FORK message into something userspace can
+ * use: allocate an fd backed by an anonymous inode for the child ctx
+ * @new, clear the kernel pointer that was stashed in reserved1, and
+ * report the fd in msg.arg.fork.ufd.
+ *
+ * The O_CLOEXEC/O_NONBLOCK status flags are inherited from the child
+ * ctx (UFFD_SHARED_FCNTL_FLAGS).  Runs in the read path, outside the
+ * waitqueue locks, because fd/file allocation may sleep.
+ *
+ * Returns 0 on success or a negative errno (fd slot or file
+ * allocation failure).
+ */
+static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
+                                 struct userfaultfd_ctx *new,
+                                 struct uffd_msg *msg)
+{
+       int fd;
+       struct file *file;
+       unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS;
+
+       fd = get_unused_fd_flags(flags);
+       if (fd < 0)
+               return fd;
+
+       file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new,
+                                 O_RDWR | flags);
+       if (IS_ERR(file)) {
+               /* Release the reserved fd slot; no file was installed. */
+               put_unused_fd(fd);
+               return PTR_ERR(file);
+       }
+
+       fd_install(fd, file);
+       /* Scrub the raw kernel pointer before the msg is copied to userspace. */
+       msg->arg.reserved.reserved1 = 0;
+       msg->arg.fork.ufd = fd;
+
+       return 0;
+}
+
 static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
                                    struct uffd_msg *msg)
 {
        ssize_t ret;
        DECLARE_WAITQUEUE(wait, current);
        struct userfaultfd_wait_queue *uwq;
+       /*
+        * Handling fork event requires sleeping operations, so
+        * we drop the event_wqh lock, then do these ops, then
+        * lock it back and wake up the waiter. While the lock is
+        * dropped the ewq may go away so we keep track of it
+        * carefully.
+        */
+       LIST_HEAD(fork_event);
+       struct userfaultfd_ctx *fork_nctx = NULL;
 
        /* always take the fd_wqh lock before the fault_pending_wqh lock */
        spin_lock(&ctx->fd_wqh.lock);
                if (uwq) {
                        *msg = uwq->msg;
 
+                       if (uwq->msg.event == UFFD_EVENT_FORK) {
+                               fork_nctx = (struct userfaultfd_ctx *)
+                                       (unsigned long)
+                                       uwq->msg.arg.reserved.reserved1;
+                               list_move(&uwq->wq.task_list, &fork_event);
+                               spin_unlock(&ctx->event_wqh.lock);
+                               ret = 0;
+                               break;
+                       }
+
                        userfaultfd_event_complete(ctx, uwq);
                        spin_unlock(&ctx->event_wqh.lock);
                        ret = 0;
        __set_current_state(TASK_RUNNING);
        spin_unlock(&ctx->fd_wqh.lock);
 
+       if (!ret && msg->event == UFFD_EVENT_FORK) {
+               ret = resolve_userfault_fork(ctx, fork_nctx, msg);
+
+               if (!ret) {
+                       spin_lock(&ctx->event_wqh.lock);
+                       if (!list_empty(&fork_event)) {
+                               uwq = list_first_entry(&fork_event,
+                                                      typeof(*uwq),
+                                                      wq.task_list);
+                               list_del(&uwq->wq.task_list);
+                               __add_wait_queue(&ctx->event_wqh, &uwq->wq);
+                               userfaultfd_event_complete(ctx, uwq);
+                       }
+                       spin_unlock(&ctx->event_wqh.lock);
+               }
+       }
+
        return ret;
 }
 
 
  * means the userland is reading).
  */
 #define UFFD_API ((__u64)0xAA)
-/*
- * After implementing the respective features it will become:
- * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
- *                           UFFD_FEATURE_EVENT_FORK)
- */
-#define UFFD_API_FEATURES (0)
+#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK)
 #define UFFD_API_IOCTLS                                \
        ((__u64)1 << _UFFDIO_REGISTER |         \
         (__u64)1 << _UFFDIO_UNREGISTER |       \
                        __u64   address;
                } pagefault;
 
+               struct {
+                       __u32   ufd;
+               } fork;
+
                struct {
                        /* unused reserved fields */
                        __u64   reserved1;
  * Start at 0x12 and not at 0 to be more strict against bugs.
  */
 #define UFFD_EVENT_PAGEFAULT   0x12
-#if 0 /* not available yet */
 #define UFFD_EVENT_FORK                0x13
-#endif
 
 /* flags for UFFD_EVENT_PAGEFAULT */
 #define UFFD_PAGEFAULT_FLAG_WRITE      (1<<0)  /* If this was a write fault */
         * are to be considered implicitly always enabled in all kernels as
         * long as the uffdio_api.api requested matches UFFD_API.
         */
-#if 0 /* not available yet */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP         (1<<0)
 #define UFFD_FEATURE_EVENT_FORK                        (1<<1)
-#endif
        __u64 features;
 
        __u64 ioctls;
 
 #include <linux/rmap.h>
 #include <linux/ksm.h>
 #include <linux/acct.h>
+#include <linux/userfaultfd_k.h>
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/freezer.h>
        struct rb_node **rb_link, *rb_parent;
        int retval;
        unsigned long charge;
+       LIST_HEAD(uf);
 
        uprobe_start_dup_mmap();
        if (down_write_killable(&oldmm->mmap_sem)) {
                if (retval)
                        goto fail_nomem_policy;
                tmp->vm_mm = mm;
+               retval = dup_userfaultfd(tmp, &uf);
+               if (retval)
+                       goto fail_nomem_anon_vma_fork;
                if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
-               tmp->vm_flags &=
-                       ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
+               tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
                tmp->vm_next = tmp->vm_prev = NULL;
-               tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
                file = tmp->vm_file;
                if (file) {
                        struct inode *inode = file_inode(file);
        up_write(&mm->mmap_sem);
        flush_tlb_mm(oldmm);
        up_write(&oldmm->mmap_sem);
+       dup_userfaultfd_complete(&uf);
 fail_uprobe_end:
        uprobe_end_dup_mmap();
        return retval;