mshare: Basic page table sharing support
author     Matthew Wilcox (Oracle) <willy@infradead.org>
           Tue, 11 Aug 2020 20:14:43 +0000 (16:14 -0400)
committer  Matthew Wilcox (Oracle) <willy@infradead.org>
           Tue, 11 Aug 2020 20:14:43 +0000 (16:14 -0400)
There are many bugs with this; in particular the kernel will hit
a VM_BUG_ON_PAGE if a page table is shared as its refcount will be
decremented to 0.  Also we don't currently reparent VMAs to the
newly created MM.  And the refcount on the MM isn't maintained.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
include/linux/mm.h
include/trace/events/mmflags.h
mm/internal.h
mm/memory.c
mm/mshare.c

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dc7b87310c1035c3e1cc02305e09d31fae7e2fb0..582a6040736cfce7ac914e05d019df016c921470 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -289,12 +289,18 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_NOHUGEPAGE  0x40000000      /* MADV_NOHUGEPAGE marked this vma */
 #define VM_MERGEABLE   0x80000000      /* KSM may merge identical pages */
 
+#ifdef CONFIG_64BIT
+#define VM_SHARED_PT   (1UL << 32)
+#else
+#define VM_SHARED_PT   0
+#endif
+
 #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
-#define VM_HIGH_ARCH_BIT_0     32      /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_1     33      /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_2     34      /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_3     35      /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_4     36      /* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_0     33      /* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_1     34      /* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_2     35      /* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_3     36      /* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_4     37      /* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
 #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
 #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
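Note that VM_SHARED_PT claims bit 32 outright, which is why every VM_HIGH_ARCH_BIT_* above shifts up by one. A compile-time guard along the following lines (not part of the patch; BUILD_BUG_ON comes from <linux/build_bug.h>) would catch a future collision between the new flag and the relocated high arch flags:

    /* Hypothetical guard, not in the patch: fail the build if the
     * shared page-table flag ever overlaps a high arch flag.  On
     * 32-bit kernels VM_SHARED_PT is 0, so the check is trivially
     * satisfied there. */
    #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
    static inline void check_vm_shared_pt_bit(void)
    {
            BUILD_BUG_ON(VM_SHARED_PT & (VM_HIGH_ARCH_0 | VM_HIGH_ARCH_1 |
                                         VM_HIGH_ARCH_2 | VM_HIGH_ARCH_3 |
                                         VM_HIGH_ARCH_4));
    }
    #endif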
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 5fb7520343863648160da1564621e83422985d97..9cc30b79f9bdc7bd7233b44ab2bd16d47d892b0d 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -162,7 +162,8 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY,  "softdirty"     )               \
        {VM_MIXEDMAP,                   "mixedmap"      },              \
        {VM_HUGEPAGE,                   "hugepage"      },              \
        {VM_NOHUGEPAGE,                 "nohugepage"    },              \
-       {VM_MERGEABLE,                  "mergeable"     }               \
+       {VM_MERGEABLE,                  "mergeable"     },              \
+       {VM_SHARED_PT,                  "sharedpt"      }
 
 #define show_vma_flags(flags)                                          \
        (flags) ? __print_flags(flags, "|",                             \
diff --git a/mm/internal.h b/mm/internal.h
index 9886db20d94fe810eb484e9f1b4550d45aa8b6e7..85611d81183f0cc0e047f0ba4a2f78638f7f6da7 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -613,4 +613,11 @@ static inline bool is_migrate_highatomic_page(struct page *page)
 
 void setup_zone_pageset(struct zone *zone);
 extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
+
+extern vm_fault_t find_shared_vma(struct vm_area_struct **,
+               unsigned long *addrp);
+static inline bool vma_is_shared(const struct vm_area_struct *vma)
+{
+       return vma->vm_flags & VM_SHARED_PT;
+}
 #endif /* __MM_INTERNAL_H */
diff --git a/mm/memory.c b/mm/memory.c
index 3ecad55103adb89bc6ff213e6ce617745a966990..e6c5af86bb0d8cb6ced03ed0227a52ac7e4cbea8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4367,6 +4367,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                unsigned int flags)
 {
        vm_fault_t ret;
+       bool shared = false;
 
        __set_current_state(TASK_RUNNING);
 
@@ -4376,6 +4377,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        /* do counter updates before entering really critical section. */
        check_sync_rss_stat(current);
 
+       if (unlikely(vma_is_shared(vma))) {
+               ret = find_shared_vma(&vma, &address);
+               if (ret)
+                       return ret;
+               if (!vma)
+                       return VM_FAULT_SIGSEGV;
+               shared = true;
+       }
+
        if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
                                            flags & FAULT_FLAG_INSTRUCTION,
                                            flags & FAULT_FLAG_REMOTE))
@@ -4393,6 +4403,9 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        else
                ret = __handle_mm_fault(vma, address, flags);
 
+       if (shared)
+               mmap_read_unlock(vma->vm_mm);
+
        if (flags & FAULT_FLAG_USER) {
                mem_cgroup_exit_user_fault();
                /*
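The asymmetric unlock above pairs with a lock taken inside find_shared_vma() (added in mm/mshare.c below), which returns 0 holding the guest mm's mmap lock for read. A condensed illustration of the intended flow, using the patch's own helpers (a sketch of intent, not the patch verbatim; it folds the hugetlb branch into __handle_mm_fault and adds the unlock on the failure path):

    /* Condensed illustration (not the patch verbatim) of the shared-VMA
     * fault path.  The guest mm pointer is stashed in vm_private_data
     * by mshare_mmap() below; find_shared_vma() rewrites vma/address
     * into the guest mm and returns 0 with that mm's lock held. */
    static vm_fault_t fault_shared_vma(struct vm_area_struct *vma,
                    unsigned long address, unsigned int flags)
    {
            struct mm_struct *guest = vma->vm_private_data;
            vm_fault_t ret = find_shared_vma(&vma, &address);

            if (ret)                        /* PGD entry copied; no lock held */
                    return ret;
            if (!vma) {
                    mmap_read_unlock(guest); /* drop the lock before failing */
                    return VM_FAULT_SIGSEGV;
            }
            ret = __handle_mm_fault(vma, address, flags);
            mmap_read_unlock(vma->vm_mm);   /* lock from find_shared_vma() */
            return ret;
    }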
diff --git a/mm/mshare.c b/mm/mshare.c
index 75eb0796584f5a94e5333fe5186f564894b974e2..f79e6519c94a9b85663dad478b7284b5d06253c9 100644
--- a/mm/mshare.c
+++ b/mm/mshare.c
@@ -2,6 +2,33 @@
 #include <linux/fs.h>
 #include <linux/sched/mm.h>
 #include <linux/syscalls.h>
+#include "internal.h"
+
+/* Returns holding the guest mm's lock for read.  Caller must release. */
+vm_fault_t find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp)
+{
+       struct vm_area_struct *vma, *host = *vmap;
+       struct mm_struct *mm = host->vm_private_data;
+       unsigned long guest_addr = *addrp - host->vm_start;
+       pgd_t pgd = *pgd_offset(mm, guest_addr);
+       pgd_t *host_pgd = pgd_offset(current->mm, *addrp);
+
+       if (!pgd_same(*host_pgd, pgd)) {
+               set_pgd(host_pgd, pgd);
+               return VM_FAULT_NOPAGE;
+       }
+
+       mmap_read_lock(mm);
+       vma = find_vma(mm, guest_addr);
+
+       /* XXX: expand stack? */
+       if (vma && vma->vm_start > guest_addr)
+               vma = NULL;
+
+       *addrp = guest_addr;
+       *vmap = vma;
+       return 0;
+}
 
 static ssize_t mshare_read(struct kiocb *iocb, struct iov_iter *iov)
 {
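Here is why copying a single pgd_t suffices to share page tables: find_shared_vma() rebases the faulting address by the host VMA's vm_start, and if the host's PGD slot differs from the guest's it simply copies the entry, because one PGD slot governs PGDIR_SIZE of address space and everything beneath it (P4D/PUD/PMD/PTE) is then walked in common. For scale, illustrative userspace arithmetic (values for 4-level x86-64, not kernel code):

    #include <stdio.h>

    int main(void)
    {
            unsigned int page_shift = 12;   /* 4 KiB base pages */
            unsigned int ptrs_bits = 9;     /* 512 entries per table level */
            /* 4-level paging: PGDIR_SHIFT = 12 + 3 * 9 = 39 */
            unsigned long pgdir_size = 1UL << (page_shift + 3 * ptrs_bits);

            /* one copied pgd_t aliases 512 GiB of page tables */
            printf("one pgd_t spans %lu GiB\n", pgdir_size >> 30);
            return 0;
    }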
@@ -17,6 +44,18 @@ static ssize_t mshare_read(struct kiocb *iocb, struct iov_iter *iov)
        return ret;
 }
 
+static int mshare_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       struct mm_struct *mm = file->private_data;
+
+       if ((vma->vm_start | vma->vm_end) & (PGDIR_SIZE - 1))
+               return -EINVAL;
+
+       vma->vm_flags |= VM_SHARED_PT;
+       vma->vm_private_data = mm;
+       return 0;
+}
+
 static int mshare_release(struct inode *inode, struct file *file)
 {
        struct mm_struct *mm = file->private_data;
@@ -28,6 +67,7 @@ static int mshare_release(struct inode *inode, struct file *file)
 
 static const struct file_operations mshare_fops = {
        .read_iter = mshare_read,
+       .mmap = mshare_mmap,
        .release = mshare_release,
 };
 
@@ -35,7 +75,9 @@ SYSCALL_DEFINE3(mshare, unsigned long, addr, unsigned long, len,
                unsigned long, flags)
 {
        struct mm_struct *mm;
+       struct vm_area_struct *vma;
        int fd;
+       int i = 0;
 
        if ((addr | len) & (PGDIR_SIZE - 1))
                return -EINVAL;
@@ -50,7 +92,30 @@ SYSCALL_DEFINE3(mshare, unsigned long, addr, unsigned long, len,
        if (!mm->task_size)
                mm->task_size--;
 
-       fd = anon_inode_getfd("mshare", &mshare_fops, mm, O_RDWR);
+       mmap_write_lock(current->mm);
+
+       vma = find_vma(current->mm, addr + len);
+       if (vma && vma->vm_start < addr + len)
+               goto unlock;
+       vma = find_vma(current->mm, addr);
+       if (vma && vma->vm_start < addr)
+               goto unlock;
+
+       while (addr < mm->task_size) {
+               mm->pgd[i++] = *pgd_offset(current->mm, addr);
+               addr += PGDIR_SIZE;
+       }
+       mmap_write_unlock(current->mm);
 
+       fd = anon_inode_getfd("mshare", &mshare_fops, mm, O_RDWR);
+       if (fd < 0)
+               goto nofd;
+out:
        return fd;
+unlock:
+       mmap_write_unlock(current->mm);
+       fd = -EINVAL;
+nofd:
+       mmput(mm);
+       goto out;
 }
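Taken together, the intended userspace flow is: one process calls mshare() over a PGDIR_SIZE-aligned range it has mapped, passes the returned fd to another process (e.g. via SCM_RIGHTS), and the receiver mmaps the fd at a PGDIR_SIZE-aligned address so both walk the same page tables. A hedged sketch; the syscall number is a placeholder, since the patch does not assign one:

    /* Usage sketch for this prototype.  __NR_mshare is a placeholder --
     * the patch shown here does not wire up a syscall number. */
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef __NR_mshare
    #define __NR_mshare     442             /* placeholder, not from the patch */
    #endif

    #define PGDIR_SIZE      (1UL << 39)     /* 4-level x86-64 */

    int main(void)
    {
            /* addr and len must be PGDIR_SIZE-aligned or the syscall
             * (and mshare_mmap) returns -EINVAL. */
            unsigned long addr = PGDIR_SIZE;
            unsigned long len = PGDIR_SIZE;
            long fd = syscall(__NR_mshare, addr, len, 0UL);

            if (fd < 0) {
                    perror("mshare");
                    return 1;
            }

            /* In the receiving process, after obtaining fd: map it at a
             * PGDIR_SIZE-aligned address to share the creator's tables. */
            void *p = mmap((void *)addr, len, PROT_READ | PROT_WRITE,
                           MAP_SHARED | MAP_FIXED, fd, 0);
            if (p == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }
            return 0;
    }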