mm: introduce memfd_secret system call to create "secret" memory areas
author Mike Rapoport <rppt@linux.ibm.com>
Thu, 31 Dec 2020 22:05:03 +0000 (22:05 +0000)
committer Johannes Weiner <hannes@cmpxchg.org>
Thu, 31 Dec 2020 22:05:03 +0000 (22:05 +0000)
Introduce "memfd_secret" system call with the ability to create memory
areas that are visible only in the context of the owning process and are
mapped neither into other processes nor into the kernel page tables.

The user will create a file descriptor using the memfd_secret() system
call. The memory areas created by mmap() calls from this file descriptor
will be unmapped from the kernel direct map and they will be only mapped in
the page table of the owning mm.

The secret memory remains accessible in the process context using uaccess
primitives, but it is not accessible using direct/linear map addresses.

Functions in the follow_page()/get_user_pages() family will refuse to return
a page that belongs to the secret memory area.

A page that was a part of the secret memory area is cleared when it is
freed.

The following example demonstrates creation of a secret mapping (error
handling is omitted):

fd = memfd_secret(0);
ftruncate(fd, MAP_SIZE);
ptr = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

Link: https://lkml.kernel.org/r/20201203062949.5484-6-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Hagen Paul Pfeifer <hagen@jauu.net>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Elena Reshetova <elena.reshetova@intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Bottomley <jejb@linux.ibm.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Palmer Dabbelt <palmerdabbelt@google.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tycho Andersen <tycho@tycho.ws>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
arch/x86/Kconfig
include/linux/secretmem.h [new file with mode: 0644]
include/uapi/linux/magic.h
kernel/sys_ni.c
mm/Kconfig
mm/Makefile
mm/gup.c
mm/secretmem.c [new file with mode: 0644]

index 6ea13f4ad07e90ef0722aac3a40c32c6df59c896..42d84d86f1f4642606735aa7ce626ac394d799c9 100644 (file)
@@ -42,7 +42,7 @@ config FORCE_DYNAMIC_FTRACE
         in order to test the non static function tracing in the
         generic code, as other architectures still use it. But we
         only need to keep it around for x86_64. No need to keep it
-        for x86_32. For x86_32, force DYNAMIC_FTRACE. 
+        for x86_32. For x86_32, force DYNAMIC_FTRACE.
 #
 # Arch settings
 #
diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
new file mode 100644 (file)
index 0000000..70e7db9
--- /dev/null
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _LINUX_SECRETMEM_H
+#define _LINUX_SECRETMEM_H
+
+#include <linux/types.h>
+
+/*
+ * Only pointers to these structures are used below, so forward
+ * declarations are enough and the header does not depend on the include
+ * order chosen by its users.
+ */
+struct vm_area_struct;
+struct page;
+
+#ifdef CONFIG_SECRETMEM
+
+/* Returns true if @vma is a secretmem mapping. */
+bool vma_is_secretmem(struct vm_area_struct *vma);
+/* Returns true if @page belongs to a secretmem file. */
+bool page_is_secretmem(struct page *page);
+
+#else
+
+/* Without CONFIG_SECRETMEM nothing can be secret memory. */
+static inline bool vma_is_secretmem(struct vm_area_struct *vma)
+{
+       return false;
+}
+
+static inline bool page_is_secretmem(struct page *page)
+{
+       return false;
+}
+
+#endif /* CONFIG_SECRETMEM */
+
+#endif /* _LINUX_SECRETMEM_H */
index f3956fc11de68cf20e535449cb7200ce20207c8b..35687dcb1a42952ab6ecebee1f1b620f21dd510e 100644 (file)
@@ -97,5 +97,6 @@
 #define DEVMEM_MAGIC           0x454d444d      /* "DMEM" */
 #define Z3FOLD_MAGIC           0x33
 #define PPC_CMM_MAGIC          0xc7571590
+#define SECRETMEM_MAGIC                0x5345434d      /* "SECM" */
 
 #endif /* __LINUX_MAGIC_H__ */
index 769ad6225ab14234265d47b5f9248cfb35432fa7..869aa6b5bf3454ecdb1912ad7773a86291669297 100644 (file)
@@ -355,6 +355,8 @@ COND_SYSCALL(pkey_mprotect);
 COND_SYSCALL(pkey_alloc);
 COND_SYSCALL(pkey_free);
 
+/* memfd_secret */
+COND_SYSCALL(memfd_secret);
 
 /*
  * Architecture specific weak syscall entries.
index f730605b8dcf4971f37a01fbd311dd6d82d44422..7204ee9041850b5898ebd2339191f7f78a35d793 100644 (file)
@@ -875,4 +875,7 @@ config MAPPING_DIRTY_HELPERS
 config KMAP_LOCAL
        bool
 
+config SECRETMEM
+       def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
+
 endmenu
index a1af02ba8f3f10a5b131af574652e0d4e3cb05c4..6b581f8337e832700344b18ce4e3c8275f7bbac9 100644 (file)
@@ -121,3 +121,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
 obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
 obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
 obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
+obj-$(CONFIG_SECRETMEM) += secretmem.o
index e4c224cd9661f9c4d1d8372a37a19684cca3e3d0..3e086b073624e8248af19aef5902e56f624f4432 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -10,6 +10,7 @@
 #include <linux/rmap.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/secretmem.h>
 
 #include <linux/sched/signal.h>
 #include <linux/rwsem.h>
@@ -759,6 +760,9 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
        struct follow_page_context ctx = { NULL };
        struct page *page;
 
+       if (vma_is_secretmem(vma))
+               return NULL;
+
        page = follow_page_mask(vma, address, foll_flags, &ctx);
        if (ctx.pgmap)
                put_dev_pagemap(ctx.pgmap);
@@ -892,6 +896,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
        if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
                return -EOPNOTSUPP;
 
+       if (vma_is_secretmem(vma))
+               return -EFAULT;
+
        if (write) {
                if (!(vm_flags & VM_WRITE)) {
                        if (!(gup_flags & FOLL_FORCE))
@@ -2031,6 +2038,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);
 
+               if (page_is_secretmem(page))
+                       goto pte_unmap;
+
                head = try_grab_compound_head(page, 1, flags);
                if (!head)
                        goto pte_unmap;
diff --git a/mm/secretmem.c b/mm/secretmem.c
new file mode 100644 (file)
index 0000000..781aaac
--- /dev/null
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2020
+ *
+ * Author: Mike Rapoport <rppt@linux.ibm.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/memfd.h>
+#include <linux/bitops.h>
+#include <linux/printk.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/pseudo_fs.h>
+#include <linux/secretmem.h>
+#include <linux/set_memory.h>
+#include <linux/sched/signal.h>
+
+#include <uapi/linux/magic.h>
+
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) "secretmem: " fmt
+
+/*
+ * Define mode and flag masks to allow validation of the system call
+ * parameters.
+ *
+ * SECRETMEM_MODE_MASK is currently 0, i.e. no mode bits are defined yet,
+ * and SECRETMEM_FLAGS_MASK is simply an alias for it.
+ */
+#define SECRETMEM_MODE_MASK    (0x0)
+#define SECRETMEM_FLAGS_MASK   SECRETMEM_MODE_MASK
+
+/* Per-file context, allocated in secretmem_file_create(). */
+struct secretmem_ctx {
+       /* syscall flags masked with SECRETMEM_MODE_MASK */
+       unsigned int mode;
+};
+
+/*
+ * Allocate a single page for a secretmem mapping using the @gfp mask
+ * supplied by the caller.
+ *
+ * FIXME: use a cache of large pages to reduce the direct map
+ * fragmentation
+ */
+static struct page *secretmem_alloc_page(gfp_t gfp)
+{
+       struct page *new_page = alloc_page(gfp);
+
+       return new_page;
+}
+
+/*
+ * Fault handler for secretmem mappings.
+ *
+ * A fault at or beyond i_size is rejected with vmf_error(-EINVAL). If the
+ * page cache already holds a page for the faulting offset, that page is
+ * returned. Otherwise a new page is allocated, inserted into the page
+ * cache, removed from the kernel direct map (followed by a kernel TLB
+ * flush for its address), marked up to date and returned still locked
+ * (VM_FAULT_LOCKED).
+ */
+static vm_fault_t secretmem_fault(struct vm_fault *vmf)
+{
+       struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+       struct inode *inode = file_inode(vmf->vma->vm_file);
+       pgoff_t offset = vmf->pgoff;
+       unsigned long addr;
+       struct page *page;
+       int err;
+
+       if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
+               return vmf_error(-EINVAL);
+
+retry:
+       page = find_get_page(mapping, offset);
+       if (page) {
+               vmf->page = page;
+               return 0;
+       }
+
+       page = secretmem_alloc_page(vmf->gfp_mask);
+       if (!page)
+               return vmf_error(-ENOMEM);
+
+       err = add_to_page_cache(page, mapping, offset, vmf->gfp_mask);
+       if (unlikely(err)) {
+               put_page(page);
+               /*
+                * -EEXIST means a concurrent fault inserted a page at this
+                * offset first; that is not an error, look the page up
+                * again instead of failing the fault.
+                */
+               if (err == -EEXIST)
+                       goto retry;
+               return vmf_error(err);
+       }
+
+       err = set_direct_map_invalid_noflush(page, 1);
+       if (err)
+               goto err_del_page_cache;
+
+       addr = (unsigned long)page_address(page);
+       flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+
+       __SetPageUptodate(page);
+
+       vmf->page = page;
+       return VM_FAULT_LOCKED;
+
+err_del_page_cache:
+       delete_from_page_cache(page);
+       /*
+        * The page is still locked from the successful add_to_page_cache();
+        * unlock it before the last reference is dropped so that a locked
+        * page is never handed back to the allocator.
+        */
+       unlock_page(page);
+       put_page(page);
+       return vmf_error(err);
+}
+
+/*
+ * VMA operations installed by secretmem_mmap(). The address of this table
+ * is also what vma_is_secretmem() compares against.
+ */
+static const struct vm_operations_struct secretmem_vm_ops = {
+       .fault = secretmem_fault,
+};
+
+/*
+ * ->mmap() handler for secretmem files.
+ *
+ * Private mappings are refused with -EINVAL. The mapping is always marked
+ * VM_LOCKED, so the request is first validated with mlock_future_check();
+ * -EAGAIN is returned when that check fails.
+ */
+static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       unsigned long nr_bytes = vma->vm_end - vma->vm_start;
+
+       if (!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE)))
+               return -EINVAL;
+
+       if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED,
+                              nr_bytes))
+               return -EAGAIN;
+
+       vma->vm_flags |= VM_LOCKED;
+       vma->vm_ops = &secretmem_vm_ops;
+
+       return 0;
+}
+
+/* A VMA is a secretmem mapping iff it uses secretmem_vm_ops. */
+bool vma_is_secretmem(struct vm_area_struct *vma)
+{
+       const struct vm_operations_struct *ops = vma->vm_ops;
+
+       return ops == &secretmem_vm_ops;
+}
+
+/* File operations of a secretmem file; only mmap() is provided here. */
+static const struct file_operations secretmem_fops = {
+       .mmap           = secretmem_mmap,
+};
+
+/*
+ * ->isolate_page() callback: unconditionally returns false.
+ * NOTE(review): presumably this keeps secretmem pages from being isolated
+ * for migration -- confirm against the migration core.
+ */
+static bool secretmem_isolate_page(struct page *page, isolate_mode_t mode)
+{
+       return false;
+}
+
+/* ->migratepage() callback: every migration request fails with -EBUSY. */
+static int secretmem_migratepage(struct address_space *mapping,
+                                struct page *newpage, struct page *page,
+                                enum migrate_mode mode)
+{
+       return -EBUSY;
+}
+
+/*
+ * ->freepage() callback: restore the page's default direct map entry and
+ * then zero the page, so its former contents do not survive the free.
+ * NOTE(review): the return value of set_direct_map_default_noflush() is
+ * ignored; presumably the page has to be mapped again before
+ * clear_highpage() can write to it -- confirm.
+ */
+static void secretmem_freepage(struct page *page)
+{
+       set_direct_map_default_noflush(page, 1);
+       clear_highpage(page);
+}
+
+/*
+ * Address space operations of secretmem inodes. The address of this table
+ * is also what page_is_secretmem() compares against.
+ */
+static const struct address_space_operations secretmem_aops = {
+       .freepage       = secretmem_freepage,
+       .migratepage    = secretmem_migratepage,
+       .isolate_page   = secretmem_isolate_page,
+};
+
+/*
+ * A page is secret memory iff it has a mapping and that mapping uses
+ * secretmem_aops.
+ */
+bool page_is_secretmem(struct page *page)
+{
+       struct address_space *as = page_mapping(page);
+
+       return as && as->a_ops == &secretmem_aops;
+}
+
+static struct vfsmount *secretmem_mnt;
+
+/*
+ * Create the file that backs a memfd_secret() descriptor.
+ *
+ * An anonymous inode on the secretmem mount is combined with a pseudo file
+ * using secretmem_fops. The inode's mapping is made unevictable and
+ * switched to secretmem_aops, and a freshly allocated context is stored in
+ * both the mapping's private_data and file->private_data.
+ *
+ * Returns the new file or an ERR_PTR() on failure.
+ */
+static struct file *secretmem_file_create(unsigned long flags)
+{
+       struct secretmem_ctx *ctx;
+       struct inode *inode;
+       struct file *file;
+
+       inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
+       if (IS_ERR(inode))
+               return ERR_CAST(inode);
+
+       ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+       if (!ctx) {
+               file = ERR_PTR(-ENOMEM);
+               goto err_free_inode;
+       }
+
+       file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
+                                O_RDWR, &secretmem_fops);
+       if (IS_ERR(file))
+               goto err_free_ctx;
+
+       ctx->mode = flags & SECRETMEM_MODE_MASK;
+       file->private_data = ctx;
+
+       /* pretend we are a normal file with zero size */
+       inode->i_mode |= S_IFREG;
+       inode->i_size = 0;
+
+       inode->i_mapping->a_ops = &secretmem_aops;
+       inode->i_mapping->private_data = ctx;
+       mapping_set_unevictable(inode->i_mapping);
+
+       return file;
+
+err_free_ctx:
+       kfree(ctx);
+err_free_inode:
+       iput(inode);
+       return file;
+}
+
+/*
+ * memfd_secret() - create a file descriptor for a secret memory area.
+ * @flags: O_CLOEXEC and/or bits from SECRETMEM_FLAGS_MASK; any other bit
+ *         is rejected with -EINVAL.
+ *
+ * Returns the new file descriptor, or a negative error code.
+ */
+SYSCALL_DEFINE1(memfd_secret, unsigned long, flags)
+{
+       struct file *file;
+       int fd;
+
+       /* make sure local flags do not conflict with global fcntl.h */
+       BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC);
+
+       if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC))
+               return -EINVAL;
+
+       fd = get_unused_fd_flags(flags & O_CLOEXEC);
+       if (fd < 0)
+               return fd;
+
+       file = secretmem_file_create(flags);
+       if (IS_ERR(file)) {
+               /* give the reserved descriptor back before bailing out */
+               put_unused_fd(fd);
+               return PTR_ERR(file);
+       }
+
+       file->f_flags |= O_LARGEFILE;
+       fd_install(fd, file);
+
+       return fd;
+}
+
+/*
+ * ->evict_inode(): drop all pages of the inode and free the per-file
+ * context.
+ *
+ * The context is read from inode->i_mapping->private_data because that is
+ * where secretmem_file_create() stores it. Nothing in this file ever sets
+ * inode->i_private, so freeing that pointer instead would leak the
+ * context. The pointer is fetched before the inode is torn down; it is
+ * NULL (and kfree() a no-op) when the inode is dropped before a context
+ * was attached.
+ */
+static void secretmem_evict_inode(struct inode *inode)
+{
+       struct secretmem_ctx *ctx = inode->i_mapping->private_data;
+
+       truncate_inode_pages_final(&inode->i_data);
+       clear_inode(inode);
+       kfree(ctx);
+}
+
+/* Superblock operations installed by secretmem_init_fs_context(). */
+static const struct super_operations secretmem_super_ops = {
+       .evict_inode = secretmem_evict_inode,
+};
+
+/*
+ * Set up the pseudo filesystem context for secretmem with SECRETMEM_MAGIC
+ * and secretmem_super_ops. Returns 0, or -ENOMEM if init_pseudo() fails.
+ */
+static int secretmem_init_fs_context(struct fs_context *fc)
+{
+       struct pseudo_fs_context *pfc;
+
+       pfc = init_pseudo(fc, SECRETMEM_MAGIC);
+       if (!pfc)
+               return -ENOMEM;
+
+       pfc->ops = &secretmem_super_ops;
+       return 0;
+}
+
+/* The "secretmem" filesystem type; mounted by secretmem_init(). */
+static struct file_system_type secretmem_fs = {
+       .name           = "secretmem",
+       .init_fs_context = secretmem_init_fs_context,
+       .kill_sb        = kill_anon_super,
+};
+
+/*
+ * Mount the secretmem filesystem and remember the mount in secretmem_mnt.
+ * Returns 0 on success or the error reported by kern_mount().
+ */
+static int secretmem_init(void)
+{
+       secretmem_mnt = kern_mount(&secretmem_fs);
+       if (IS_ERR(secretmem_mnt))
+               return PTR_ERR(secretmem_mnt);
+
+       return 0;
+}
+fs_initcall(secretmem_init);