]> www.infradead.org Git - users/willy/linux.git/commitdiff
RDMA/core: Create mmap database and cookie helper functions
authorMichal Kalderon <michal.kalderon@marvell.com>
Wed, 30 Oct 2019 09:44:11 +0000 (11:44 +0200)
committerJason Gunthorpe <jgg@mellanox.com>
Wed, 6 Nov 2019 17:08:00 +0000 (13:08 -0400)
Create some common APIs for adding entries to the mmap_xa xarray, searching
for an entry, and freeing one.

The general approach is copied from the EFA driver and improved to be more
general and do more to help the drivers. Integration with the core allows
a reference counted scheme with a free function so that the driver can
know when its mmaps are all gone.

This significant new functionality will be helpful for drivers to have the
correct lifetime model for mmap objects.

Link: https://lore.kernel.org/r/20191030094417.16866-3-michal.kalderon@marvell.com
Signed-off-by: Ariel Elior <ariel.elior@marvell.com>
Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
drivers/infiniband/core/device.c
drivers/infiniband/core/ib_core_uverbs.c
drivers/infiniband/core/rdma_core.c
drivers/infiniband/core/uverbs_cmd.c
include/rdma/ib_verbs.h

index f8d383ceae054fa7cfc17de312bc7335c5f53c7f..e785bebaf16e21801cbd79adaf70e57544419b7a 100644 (file)
@@ -2642,6 +2642,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
        SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
        SET_DEVICE_OP(dev_ops, map_phys_fmr);
        SET_DEVICE_OP(dev_ops, mmap);
+       SET_DEVICE_OP(dev_ops, mmap_free);
        SET_DEVICE_OP(dev_ops, modify_ah);
        SET_DEVICE_OP(dev_ops, modify_cq);
        SET_DEVICE_OP(dev_ops, modify_device);
index b74d2a2fb342c3350ce85d417fbec4fa0cbea320..aacd84a45de624d3f7cb8aaa6b92025687b50f4d 100644 (file)
@@ -71,3 +71,239 @@ int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
        return 0;
 }
 EXPORT_SYMBOL(rdma_user_mmap_io);
+
+/**
+ * rdma_user_mmap_entry_get_pgoff() - Look up an mmap entry by page offset
+ *
+ * @ucontext: user context whose mmap_xa is searched
+ * @pgoff: the mmap offset in pages (byte offset >> PAGE_SHIFT)
+ *
+ * Used when userspace calls mmap() with an offset it previously obtained
+ * from the driver (see rdma_user_mmap_get_offset()), i.e. for an entry that
+ * was registered with rdma_user_mmap_entry_insert(). Only the first page
+ * offset of an entry matches. On success a reference is taken so that the
+ * entry cannot be erased from the xarray while the caller is using it.
+ *
+ * Return: a referenced entry, or NULL if @pgoff does not name a usable
+ * entry. The reference must be dropped with rdma_user_mmap_entry_put().
+ */
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext,
+                              unsigned long pgoff)
+{
+       struct rdma_user_mmap_entry *found;
+
+       /* Offsets above U32_MAX are never looked up */
+       if (pgoff > U32_MAX)
+               return NULL;
+
+       xa_lock(&ucontext->mmap_xa);
+       found = xa_load(&ucontext->mmap_xa, pgoff);
+
+       /*
+        * A zero refcount means the entry is already being deleted.
+        * driver_removed means that no further mmaps are possible and we are
+        * only waiting for the active VMAs to be closed. The reference is
+        * attempted last, so it is never taken for an entry that fails one
+        * of the earlier checks.
+        */
+       if (!(found && found->start_pgoff == pgoff &&
+             !found->driver_removed && kref_get_unless_zero(&found->ref)))
+               found = NULL;
+
+       xa_unlock(&ucontext->mmap_xa);
+
+       if (!found)
+               return NULL;
+
+       ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] returned\n",
+                 pgoff, found->npages);
+
+       return found;
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_get_pgoff);
+
+/**
+ * rdma_user_mmap_entry_get() - Look up the mmap entry matching a VMA
+ *
+ * @ucontext: associated user context
+ * @vma: the vma being mmap'd into
+ *
+ * Same lookup as rdma_user_mmap_entry_get_pgoff(), keyed by @vma->vm_pgoff,
+ * with two extra requirements: the VMA must be a shared mapping and its
+ * length must be exactly the entry's npages * PAGE_SIZE.
+ *
+ * Return: a referenced entry, or NULL if there is no match. The reference
+ * must be dropped with rdma_user_mmap_entry_put().
+ */
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get(struct ib_ucontext *ucontext,
+                        struct vm_area_struct *vma)
+{
+       struct rdma_user_mmap_entry *found;
+
+       /* Mappings without VM_SHARED are rejected */
+       if (!(vma->vm_flags & VM_SHARED))
+               return NULL;
+
+       found = rdma_user_mmap_entry_get_pgoff(ucontext, vma->vm_pgoff);
+       if (!found)
+               return NULL;
+
+       if (found->npages * PAGE_SIZE == vma->vm_end - vma->vm_start)
+               return found;
+
+       /* Size mismatch: give back the reference taken by the lookup */
+       rdma_user_mmap_entry_put(found);
+       return NULL;
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_get);
+
+/*
+ * kref release callback for an mmap entry, run when the last reference is
+ * dropped. Erases every xarray index the entry occupies and then hands the
+ * entry to the driver's mmap_free() op, if the driver provides one.
+ */
+static void rdma_user_mmap_entry_free(struct kref *kref)
+{
+       struct rdma_user_mmap_entry *entry =
+               container_of(kref, struct rdma_user_mmap_entry, ref);
+       struct ib_ucontext *ucontext = entry->ucontext;
+       struct xarray *xa = &ucontext->mmap_xa;
+       unsigned long page;
+
+       /*
+        * The entry was stored once per page it covers, so erase each of
+        * those indices. Erasure is deferred until all VMAs are closed so
+        * that the mmap offsets remain unique.
+        */
+       xa_lock(xa);
+       for (page = 0; page < entry->npages; page++)
+               __xa_erase(xa, entry->start_pgoff + page);
+       xa_unlock(xa);
+
+       ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] removed\n",
+                 entry->start_pgoff, entry->npages);
+
+       /* The mmap_free op is optional */
+       if (!ucontext->device->ops.mmap_free)
+               return;
+
+       ucontext->device->ops.mmap_free(entry);
+}
+
+/**
+ * rdma_user_mmap_entry_put() - Drop reference to the mmap entry
+ *
+ * @entry: an entry in the mmap_xa; not checked for NULL here
+ *
+ * This function is called when the mapping is closed if it was
+ * an io mapping or when the driver is done with the entry for
+ * some other reason.
+ * It should be called once for every successful
+ * rdma_user_mmap_entry_get() or rdma_user_mmap_entry_get_pgoff() when
+ * the entry is no longer needed. When the refcount reaches zero,
+ * rdma_user_mmap_entry_free() erases the entry from the mmap_xa and
+ * calls the driver's mmap_free() op, if one is set.
+ */
+void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry)
+{
+       kref_put(&entry->ref, rdma_user_mmap_entry_free);
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_put);
+
+/**
+ * rdma_user_mmap_entry_remove() - Drop reference to entry and
+ *                                mark it as unmmapable
+ *
+ * @entry: the entry to remove; may be NULL, in which case nothing is done
+ *
+ * Drivers can call this to prevent userspace from creating more mappings for
+ * entry, however existing mmaps continue to exist and ops->mmap_free() will
+ * not be called until all user mmaps are destroyed.
+ */
+void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry)
+{
+       if (!entry)
+               return;
+
+       /*
+        * rdma_user_mmap_entry_get_pgoff() tests driver_removed while
+        * holding the mmap_xa lock, so publish the flag under the same lock.
+        * Any lookup that takes the lock after this point is then guaranteed
+        * to see the flag. The lock is released before kref_put() because
+        * the release callback takes it again.
+        */
+       xa_lock(&entry->ucontext->mmap_xa);
+       entry->driver_removed = true;
+       xa_unlock(&entry->ucontext->mmap_xa);
+
+       kref_put(&entry->ref, rdma_user_mmap_entry_free);
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_remove);
+
+/**
+ * rdma_user_mmap_entry_insert() - Insert an entry to the mmap_xa
+ *
+ * @ucontext: associated user context.
+ * @entry: the entry to insert into the mmap_xa
+ * @length: length of the address that will be mmapped
+ *
+ * This function should be called by drivers that use the rdma_user_mmap
+ * interface for implementing their mmap syscall. A database of mmap offsets
+ * is handled in the core and helper functions are provided to insert entries
+ * into the database and extract entries when the user calls mmap with the
+ * given offset.  The function allocates a unique page offset that should be
+ * provided to user, the user will use the offset to retrieve information such
+ * as address to be mapped and how.
+ *
+ * The entry's refcount, ucontext, npages and start_pgoff are initialized
+ * here. @length is rounded up to whole pages and @entry is stored at every
+ * page index of the range it occupies.
+ *
+ * Return: 0 on success, -EINVAL if @entry is NULL, and -ENOMEM if no free
+ * range could be found or storing the entry failed.
+ */
+int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext,
+                               struct rdma_user_mmap_entry *entry,
+                               size_t length)
+{
+       struct ib_uverbs_file *ufile = ucontext->ufile;
+       XA_STATE(xas, &ucontext->mmap_xa, 0);
+       u32 xa_first, xa_last, npages;
+       int err;
+       u32 i;
+
+       if (!entry)
+               return -EINVAL;
+
+       kref_init(&entry->ref);
+       entry->ucontext = ucontext;
+
+       /*
+        * We want the whole allocation to be done without interruption from a
+        * different thread. The allocation requires finding a free range and
+        * storing. During the xa_insert the lock could be released, possibly
+        * allowing another thread to choose the same range. The umap_lock
+        * mutex is held across both steps for that reason.
+        */
+       mutex_lock(&ufile->umap_lock);
+
+       xa_lock(&ucontext->mmap_xa);
+
+       /* We want to find an empty range of npages consecutive indices */
+       npages = (u32)DIV_ROUND_UP(length, PAGE_SIZE);
+       entry->npages = npages;
+       while (true) {
+               /* First find an empty index; give up if there is none */
+               xas_find_marked(&xas, U32_MAX, XA_FREE_MARK);
+               if (xas.xa_node == XAS_RESTART)
+                       goto err_unlock;
+
+               xa_first = xas.xa_index;
+
+               /* Is there enough room below U32_MAX to have the range? */
+               if (check_add_overflow(xa_first, npages, &xa_last))
+                       goto err_unlock;
+
+               /*
+                * Now look for the next present entry. If an entry doesn't
+                * exist, we found an empty range and can proceed. Otherwise
+                * the search is repeated from the current cursor position.
+                */
+               xas_next_entry(&xas, xa_last - 1);
+               if (xas.xa_node == XAS_BOUNDS || xas.xa_index >= xa_last)
+                       break;
+       }
+
+       for (i = xa_first; i < xa_last; i++) {
+               err = __xa_insert(&ucontext->mmap_xa, i, entry, GFP_KERNEL);
+               if (err)
+                       goto err_undo;
+       }
+
+       /*
+        * Internally the kernel uses a page offset, in libc this is a byte
+        * offset. Drivers should not return pgoff to userspace.
+        */
+       entry->start_pgoff = xa_first;
+       xa_unlock(&ucontext->mmap_xa);
+       mutex_unlock(&ufile->umap_lock);
+
+       ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#x] inserted\n",
+                 entry->start_pgoff, npages);
+
+       return 0;
+
+err_undo:
+       for (; i > xa_first; i--)
+               __xa_erase(&ucontext->mmap_xa, i - 1);
+
+err_unlock:
+       xa_unlock(&ucontext->mmap_xa);
+       mutex_unlock(&ufile->umap_lock);
+       return -ENOMEM;
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_insert);
index ccf4d069c25c995a1078b31c3892c9585a3143c2..6c72773faf2911406191b67c76caef4da54a04d7 100644 (file)
@@ -817,6 +817,7 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
        rdma_restrack_del(&ucontext->res);
 
        ib_dev->ops.dealloc_ucontext(ucontext);
+       WARN_ON(!xa_empty(&ucontext->mmap_xa));
        kfree(ucontext);
 
        ufile->ucontext = NULL;
index 14a80fd9f464733ada16c52ab492abc95e92210d..06ed32c8662facea14196c44349b2a0f0c757d05 100644 (file)
@@ -252,6 +252,8 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
        ucontext->closing = false;
        ucontext->cleanup_retryable = false;
 
+       xa_init_flags(&ucontext->mmap_xa, XA_FLAGS_ALLOC);
+
        ret = get_unused_fd_flags(O_CLOEXEC);
        if (ret < 0)
                goto err_free;
index 0626b62ed107d519603ca239866607458e19af29..8865ec28180aa3133fc7e8586944f567fe712c57 100644 (file)
@@ -1473,6 +1473,7 @@ struct ib_ucontext {
         * Implementation details of the RDMA core, don't use in drivers:
         */
        struct rdma_restrack_entry res;
+       struct xarray mmap_xa;
 };
 
 struct ib_uobject {
@@ -2258,6 +2259,21 @@ struct iw_cm_conn_param;
 
 #define DECLARE_RDMA_OBJ_SIZE(ib_struct) size_t size_##ib_struct
 
+struct rdma_user_mmap_entry {
+       struct kref ref;                /* dropped with rdma_user_mmap_entry_put() */
+       struct ib_ucontext *ucontext;   /* owning context, set on insert */
+       unsigned long start_pgoff;      /* first page offset, set on insert */
+       size_t npages;                  /* number of page offsets occupied */
+       bool driver_removed;            /* set by rdma_user_mmap_entry_remove() */
+};
+
+/*
+ * Convert an entry's starting page offset into the byte offset that the
+ * user should pass to libc's mmap().
+ */
+static inline u64
+rdma_user_mmap_get_offset(const struct rdma_user_mmap_entry *entry)
+{
+       /* Widen to 64 bits before shifting */
+       u64 pgoff = entry->start_pgoff;
+
+       return pgoff << PAGE_SHIFT;
+}
+
 /**
  * struct ib_device_ops - InfiniBand device operations
  * This structure defines all the InfiniBand device operations, providers will
@@ -2370,6 +2386,13 @@ struct ib_device_ops {
                              struct ib_udata *udata);
        void (*dealloc_ucontext)(struct ib_ucontext *context);
        int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma);
+       /**
+        * This will be called once refcount of an entry in mmap_xa reaches
+        * zero. The type of the memory that was mapped may differ between
+        * entries and is opaque to the rdma_user_mmap interface.
+        * Therefore needs to be implemented by the driver in mmap_free.
+        */
+       void (*mmap_free)(struct rdma_user_mmap_entry *entry);
        void (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
        int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
        void (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
@@ -2815,6 +2838,18 @@ static inline int rdma_user_mmap_io(struct ib_ucontext *ucontext,
        return -EINVAL;
 }
 #endif
+int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext,
+                               struct rdma_user_mmap_entry *entry,
+                               size_t length);
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext,
+                              unsigned long pgoff);
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get(struct ib_ucontext *ucontext,
+                        struct vm_area_struct *vma);
+void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry);
+
+void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry);
 
 static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)
 {