struct userfaultfd_wait_queue {
        struct uffd_msg msg;
        wait_queue_t wq;
+       /*
+        * Only relevant when queued in fault_wqh and only used by the
+        * read operation to avoid reading the same userfault twice.
+        */
        bool pending;
        struct userfaultfd_ctx *ctx;
 };
 
        uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
        ret = 0;
-       /* don't wake the pending ones to avoid reads to block */
-       if (uwq->pending && !ACCESS_ONCE(uwq->ctx->released))
-               goto out;
        /* len == 0 means wake all */
        start = range->start;
        len = range->len;
        struct mm_struct *mm = vma->vm_mm;
        struct userfaultfd_ctx *ctx;
        struct userfaultfd_wait_queue uwq;
+       int ret;
 
        BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
 
+       ret = VM_FAULT_SIGBUS;
        ctx = vma->vm_userfaultfd_ctx.ctx;
        if (!ctx)
-               return VM_FAULT_SIGBUS;
+               goto out;
 
        BUG_ON(ctx->mm != mm);
 
         * caller of handle_userfault to release the mmap_sem.
         */
        if (unlikely(ACCESS_ONCE(ctx->released)))
-               return VM_FAULT_SIGBUS;
+               goto out;
 
        /*
         * Check that we can return VM_FAULT_RETRY.
                        dump_stack();
                }
 #endif
-               return VM_FAULT_SIGBUS;
+               goto out;
        }
 
        /*
         * Handle nowait, not much to do other than tell it to retry
         * and wait.
         */
+       ret = VM_FAULT_RETRY;
        if (flags & FAULT_FLAG_RETRY_NOWAIT)
-               return VM_FAULT_RETRY;
+               goto out;
 
        /* take the reference before dropping the mmap_sem */
        userfaultfd_ctx_get(ctx);
         * through poll/read().
         */
        __add_wait_queue(&ctx->fault_wqh, &uwq.wq);
-       for (;;) {
-               set_current_state(TASK_KILLABLE);
-               if (!uwq.pending || ACCESS_ONCE(ctx->released) ||
-                   fatal_signal_pending(current))
-                       break;
-               spin_unlock(&ctx->fault_wqh.lock);
+       set_current_state(TASK_KILLABLE);
+       spin_unlock(&ctx->fault_wqh.lock);
 
+       if (likely(!ACCESS_ONCE(ctx->released) &&
+                  !fatal_signal_pending(current))) {
                wake_up_poll(&ctx->fd_wqh, POLLIN);
                schedule();
+               ret |= VM_FAULT_MAJOR;
+       }
 
+       __set_current_state(TASK_RUNNING);
+       /* see finish_wait() comment for why list_empty_careful() */
+       if (!list_empty_careful(&uwq.wq.task_list)) {
                spin_lock(&ctx->fault_wqh.lock);
+               list_del_init(&uwq.wq.task_list);
+               spin_unlock(&ctx->fault_wqh.lock);
        }
-       __remove_wait_queue(&ctx->fault_wqh, &uwq.wq);
-       __set_current_state(TASK_RUNNING);
-       spin_unlock(&ctx->fault_wqh.lock);
 
        /*
         * ctx may go away after this if the userfault pseudo fd is
         */
        userfaultfd_ctx_put(ctx);
 
-       return VM_FAULT_RETRY;
+out:
+       return ret;
 }
 
 static int userfaultfd_release(struct inode *inode, struct file *file)
        case UFFD_STATE_WAIT_API:
                return POLLERR;
        case UFFD_STATE_RUNNING:
+               /*
+                * poll() never guarantees that a read won't block:
+                * userfaults can be woken before they're read().
+                */
+               if (unlikely(!(file->f_flags & O_NONBLOCK)))
+                       return POLLERR;
                spin_lock(&ctx->fault_wqh.lock);
                ret = find_userfault(ctx, NULL);
                spin_unlock(&ctx->fault_wqh.lock);
 }
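
For reference, a minimal userland sketch of the poll()/read() protocol that the O_NONBLOCK check above enforces. It is not part of the patch: it assumes `uffd` is a userfaultfd already opened non blocking and registered, that read() returns one struct uffd_msg per userfault as in the uapi side of this series, and that handle_one_userfault() is an application specific helper.

/* hypothetical userland helper, not part of this patch */
#include <errno.h>
#include <poll.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

/* application specific resolution of one userfault (assumed) */
extern void handle_one_userfault(int uffd, const struct uffd_msg *msg);

static int drain_userfaults(int uffd)
{
        struct pollfd pfd = { .fd = uffd, .events = POLLIN };
        struct uffd_msg msg;
        ssize_t ret;

        for (;;) {
                if (poll(&pfd, 1, -1) < 0)
                        return -1;
                if (pfd.revents & POLLERR)
                        return -1;
                for (;;) {
                        ret = read(uffd, &msg, sizeof(msg));
                        if (ret == sizeof(msg)) {
                                handle_one_userfault(uffd, &msg);
                                continue;
                        }
                        /*
                         * POLLIN only means a userfault was queued at
                         * some point: it may have been woken before we
                         * could read it, so EAGAIN on the non blocking
                         * fd is expected and just means "poll again".
                         */
                        if (ret < 0 && errno == EAGAIN)
                                break;
                        return -1;      /* real error or unexpected EOF */
                }
        }
}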
 
 /*
- * This is mostly needed to re-wakeup those userfaults that were still
- * pending when userland wake them up the first time. We don't wake
- * the pending one to avoid blocking reads to block, or non blocking
- * read to return -EAGAIN, if used with POLLIN, to avoid userland
- * doubts on why POLLIN wasn't reliable.
+ * userfaultfd_wake is needed in case a userfault is still in flight
+ * by the time a UFFDIO_COPY (or other ioctl variant) completes. The
+ * page may well be mapped by then and repeating the fault wouldn't
+ * lead to a userfault anymore, but before scheduling in TASK_KILLABLE
+ * mode handle_userfault() doesn't recheck the pagetables and it
+ * doesn't serialize against UFFDIO_COPY (or other ioctl variants).
+ * Ultimately the knowledge of which pages are mapped is left to
+ * userland, which is responsible for handling the race between
+ * read() userfaults and background UFFDIO_COPY (or other ioctl
+ * variants) if those run in separate concurrent threads.
+ *
+ * userfaultfd_wake may be used in combination with
+ * UFFDIO_*_MODE_DONTWAKE to wake up userfaults in batches.
  */
 static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
                            unsigned long arg)
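
To illustrate the batching mentioned in the comment above, here is a userland sketch that resolves a range with UFFDIO_COPY in DONTWAKE mode and then issues a single UFFDIO_WAKE (i.e. the userfaultfd_wake path) for the whole range. The ioctl names and the struct uffdio_copy/uffdio_range layouts are assumed from the uapi side of this series, and copy_batch_then_wake() is hypothetical.

/* hypothetical userland sketch, not part of this patch */
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/*
 * Resolve a batch of userfaults without waking each one, then wake
 * the whole range once at the end via UFFDIO_WAKE.
 */
static int copy_batch_then_wake(int uffd, unsigned long dst,
                                unsigned long src,
                                unsigned long page_size, int nr_pages)
{
        struct uffdio_copy copy;
        struct uffdio_range range;
        int i;

        for (i = 0; i < nr_pages; i++) {
                copy.dst = dst + i * page_size;
                copy.src = src + i * page_size;
                copy.len = page_size;
                copy.mode = UFFDIO_COPY_MODE_DONTWAKE;
                if (ioctl(uffd, UFFDIO_COPY, &copy) < 0)
                        return -1;
        }

        /* one wakeup for the whole range instead of one per page */
        range.start = dst;
        range.len = nr_pages * page_size;
        return ioctl(uffd, UFFDIO_WAKE, &range);
}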