RDMA/umem: Store ODP access mask information in PFN
author     Leon Romanovsky <leonro@nvidia.com>
           Tue, 28 Nov 2023 13:56:53 +0000 (15:56 +0200)
committer  Leon Romanovsky <leon@kernel.org>
           Thu, 3 Oct 2024 16:05:52 +0000 (19:05 +0300)
In preparation for removing dma_list, store the access mask in the PFN
pointer rather than in dma_addr_t.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
drivers/infiniband/core/umem_odp.c
drivers/infiniband/hw/mlx5/mlx5_ib.h
drivers/infiniband/hw/mlx5/odp.c
include/rdma/ib_umem_odp.h
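
For context, here is a minimal, self-contained sketch of the encoding change
(user-space C with stand-in flag values, not the real kernel definitions): the
old scheme packed ODP_READ_ALLOWED_BIT/ODP_WRITE_ALLOWED_BIT into the low bits
of each dma_list entry, while the new scheme keeps the DMA address untouched
and reads the access state from the HMM pfn_list entry (HMM_PFN_VALID,
HMM_PFN_WRITE, plus the HMM_PFN_DMA_MAPPED flag used by this series).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins only; the real bits come from the kernel headers. */
#define ODP_READ_ALLOWED_BIT   (1ULL << 0)   /* old: packed into dma_addr_t */
#define ODP_WRITE_ALLOWED_BIT  (1ULL << 1)
#define ODP_DMA_ADDR_MASK      (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))

#define HMM_PFN_VALID       (1UL << (8 * sizeof(unsigned long) - 1)) /* new: pfn_list flags */
#define HMM_PFN_WRITE       (1UL << (8 * sizeof(unsigned long) - 2))
#define HMM_PFN_DMA_MAPPED  (1UL << (8 * sizeof(unsigned long) - 3))

int main(void)
{
	/* Old scheme: access bits live in the low bits of the DMA address. */
	uint64_t dma_entry = (0x12345000ULL & ODP_DMA_ADDR_MASK) |
			     ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT;
	bool old_writable = dma_entry & ODP_WRITE_ALLOWED_BIT;

	/* New scheme: the DMA address stays clean; access state rides in the pfn entry. */
	uint64_t dma_addr = 0x12345000ULL;
	unsigned long pfn_entry = 0x9abcdUL | HMM_PFN_VALID | HMM_PFN_WRITE |
				  HMM_PFN_DMA_MAPPED;
	bool new_writable = pfn_entry & HMM_PFN_WRITE;

	printf("old: addr=%#llx writable=%d\n",
	       (unsigned long long)(dma_entry & ODP_DMA_ADDR_MASK), old_writable);
	printf("new: addr=%#llx writable=%d mapped=%d\n",
	       (unsigned long long)dma_addr, new_writable,
	       !!(pfn_entry & HMM_PFN_DMA_MAPPED));
	return 0;
}

Since a non-invalidating permission change (read-only to writable) now only
updates the pfn entry, the resync of flags into the mapped DMA address goes
away, which clears the path for removing dma_list in follow-up patches.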

index 01cbf7f55b3ae4158cd0266178d84e8713f7701b..72885eca4181e0a12411f5d5625e26d81128501a 100644
@@ -307,22 +307,11 @@ EXPORT_SYMBOL(ib_umem_odp_release);
 static int ib_umem_odp_map_dma_single_page(
                struct ib_umem_odp *umem_odp,
                unsigned int dma_index,
-               struct page *page,
-               u64 access_mask)
+               struct page *page)
 {
        struct ib_device *dev = umem_odp->umem.ibdev;
        dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
 
-       if (*dma_addr) {
-               /*
-                * If the page is already dma mapped it means it went through
-                * a non-invalidating trasition, like read-only to writable.
-                * Resync the flags.
-                */
-               *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
-               return 0;
-       }
-
        *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
                                    DMA_BIDIRECTIONAL);
        if (ib_dma_mapping_error(dev, *dma_addr)) {
@@ -330,7 +319,6 @@ static int ib_umem_odp_map_dma_single_page(
                return -EFAULT;
        }
        umem_odp->npages++;
-       *dma_addr |= access_mask;
        return 0;
 }
 
@@ -366,9 +354,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
        struct hmm_range range = {};
        unsigned long timeout;
 
-       if (access_mask == 0)
-               return -EINVAL;
-
        if (user_virt < ib_umem_start(umem_odp) ||
            user_virt + bcnt > ib_umem_end(umem_odp))
                return -EFAULT;
@@ -394,7 +379,7 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
        if (fault) {
                range.default_flags = HMM_PFN_REQ_FAULT;
 
-               if (access_mask & ODP_WRITE_ALLOWED_BIT)
+               if (access_mask & HMM_PFN_WRITE)
                        range.default_flags |= HMM_PFN_REQ_WRITE;
        }
 
@@ -426,22 +411,17 @@ retry:
        for (pfn_index = 0; pfn_index < num_pfns;
                pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {
 
-               if (fault) {
-                       /*
-                        * Since we asked for hmm_range_fault() to populate
-                        * pages it shouldn't return an error entry on success.
-                        */
-                       WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
-                       WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
-               } else {
-                       if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
-                               WARN_ON(umem_odp->dma_list[dma_index]);
-                               continue;
-                       }
-                       access_mask = ODP_READ_ALLOWED_BIT;
-                       if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
-                               access_mask |= ODP_WRITE_ALLOWED_BIT;
-               }
+               /*
+                * Since we asked for hmm_range_fault() to populate
+                * pages it shouldn't return an error entry on success.
+                */
+               WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
+               WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
+               if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
+                       continue;
+
+               if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED)
+                       continue;
 
                hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
                /* If a hugepage was detected and ODP wasn't set for, the umem
@@ -456,13 +436,13 @@ retry:
                }
 
                ret = ib_umem_odp_map_dma_single_page(
-                               umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
-                               access_mask);
+                               umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]));
                if (ret < 0) {
                        ibdev_dbg(umem_odp->umem.ibdev,
                                  "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
                        break;
                }
+               range.hmm_pfns[pfn_index] |= HMM_PFN_DMA_MAPPED;
        }
        /* upon success lock should stay on hold for the callee */
        if (!ret)
@@ -482,7 +462,6 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
                                 u64 bound)
 {
-       dma_addr_t dma_addr;
        dma_addr_t dma;
        int idx;
        u64 addr;
@@ -493,34 +472,33 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
        virt = max_t(u64, virt, ib_umem_start(umem_odp));
        bound = min_t(u64, bound, ib_umem_end(umem_odp));
        for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
+               unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
+               struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
+
                idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
                dma = umem_odp->dma_list[idx];
 
-               /* The access flags guaranteed a valid DMA address in case was NULL */
-               if (dma) {
-                       unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
-                       struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
-
-                       dma_addr = dma & ODP_DMA_ADDR_MASK;
-                       ib_dma_unmap_page(dev, dma_addr,
-                                         BIT(umem_odp->page_shift),
-                                         DMA_BIDIRECTIONAL);
-                       if (dma & ODP_WRITE_ALLOWED_BIT) {
-                               struct page *head_page = compound_head(page);
-                               /*
-                                * set_page_dirty prefers being called with
-                                * the page lock. However, MMU notifiers are
-                                * called sometimes with and sometimes without
-                                * the lock. We rely on the umem_mutex instead
-                                * to prevent other mmu notifiers from
-                                * continuing and allowing the page mapping to
-                                * be removed.
-                                */
-                               set_page_dirty(head_page);
-                       }
-                       umem_odp->dma_list[idx] = 0;
-                       umem_odp->npages--;
+               if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_VALID))
+                       continue;
+               if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_DMA_MAPPED))
+                       continue;
+
+               ib_dma_unmap_page(dev, dma, BIT(umem_odp->page_shift),
+                                 DMA_BIDIRECTIONAL);
+               if (umem_odp->pfn_list[pfn_idx] & HMM_PFN_WRITE) {
+                       struct page *head_page = compound_head(page);
+                       /*
+                        * set_page_dirty prefers being called with
+                        * the page lock. However, MMU notifiers are
+                        * called sometimes with and sometimes without
+                        * the lock. We rely on the umem_mutex instead
+                        * to prevent other mmu notifiers from
+                        * continuing and allowing the page mapping to
+                        * be removed.
+                        */
+                       set_page_dirty(head_page);
                }
+               umem_odp->npages--;
        }
 }
 EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
index 23fd72f7f63df9fab1ba63cef098113d62648180..3e4aaa6319dbb1874d1f04eb44bee61e8a3eac59 100644
@@ -336,6 +336,7 @@ struct mlx5_ib_flow_db {
 #define MLX5_IB_UPD_XLT_PD           BIT(4)
 #define MLX5_IB_UPD_XLT_ACCESS       BIT(5)
 #define MLX5_IB_UPD_XLT_INDIRECT      BIT(6)
+#define MLX5_IB_UPD_XLT_DOWNGRADE     BIT(7)
 
 /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
  *
index 4b37446758fd4efe5c776eef211038ee1abc4780..96eda5db45456b0453b65ca8f1cd44fa3e75ef55 100644
@@ -34,6 +34,7 @@
 #include <linux/kernel.h>
 #include <linux/dma-buf.h>
 #include <linux/dma-resv.h>
+#include <linux/hmm.h>
 
 #include "mlx5_ib.h"
 #include "cmd.h"
@@ -158,22 +159,12 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
        }
 }
 
-static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
-{
-       u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
-
-       if (umem_dma & ODP_READ_ALLOWED_BIT)
-               mtt_entry |= MLX5_IB_MTT_READ;
-       if (umem_dma & ODP_WRITE_ALLOWED_BIT)
-               mtt_entry |= MLX5_IB_MTT_WRITE;
-
-       return mtt_entry;
-}
-
 static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
                         struct mlx5_ib_mr *mr, int flags)
 {
        struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+       bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE;
+       unsigned long pfn;
        dma_addr_t pa;
        size_t i;
 
@@ -181,8 +172,17 @@ static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
                return;
 
        for (i = 0; i < nentries; i++) {
+               pfn = odp->pfn_list[idx + i];
+               if (!(pfn & HMM_PFN_VALID))
+                       /* Initial ODP init */
+                       continue;
+
                pa = odp->dma_list[idx + i];
-               pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
+               pa |= MLX5_IB_MTT_READ;
+               if ((pfn & HMM_PFN_WRITE) && !downgrade)
+                       pa |= MLX5_IB_MTT_WRITE;
+
+               pas[i] = cpu_to_be64(pa);
        }
 }
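
To make the permission derivation above easy to follow on its own, here is an
illustrative mirror of the new populate_mtt() logic as a standalone helper; the
MLX5_IB_MTT_* and HMM_PFN_WRITE values are stand-ins, not the definitions from
mlx5_ib.h or <linux/hmm.h>.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in values for illustration only. */
#define MLX5_IB_MTT_READ   (1ULL << 0)
#define MLX5_IB_MTT_WRITE  (1ULL << 1)
#define HMM_PFN_WRITE      (1UL << (8 * sizeof(unsigned long) - 2))

/* Mirrors the new populate_mtt() permission logic: every present page gets
 * read access; write access is set only when the pfn entry carries
 * HMM_PFN_WRITE and the XLT update is not a downgrade. */
static uint64_t mtt_entry(uint64_t dma_addr, unsigned long pfn, bool downgrade)
{
	uint64_t pa = dma_addr | MLX5_IB_MTT_READ;

	if ((pfn & HMM_PFN_WRITE) && !downgrade)
		pa |= MLX5_IB_MTT_WRITE;
	return pa;
}

int main(void)
{
	unsigned long pfn = HMM_PFN_WRITE;	/* writable page */

	printf("normal:    %#llx\n", (unsigned long long)mtt_entry(0x1000, pfn, false));
	printf("downgrade: %#llx\n", (unsigned long long)mtt_entry(0x1000, pfn, true));
	return 0;
}

The MLX5_IB_UPD_XLT_DOWNGRADE flag added in mlx5_ib.h is what carries the
downgrade decision from pagefault_real_mr() down to this point.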
 
@@ -286,8 +286,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
                 * estimate the cost of another UMR vs. the cost of bigger
                 * UMR.
                 */
-               if (umem_odp->dma_list[idx] &
-                   (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+               if (umem_odp->pfn_list[idx] & HMM_PFN_VALID) {
                        if (!in_block) {
                                blk_start_idx = idx;
                                in_block = 1;
@@ -668,7 +667,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
 {
        int page_shift, ret, np;
        bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
-       u64 access_mask;
+       u64 access_mask = 0;
        u64 start_idx;
        bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
        u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
@@ -676,12 +675,14 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
        if (flags & MLX5_PF_FLAGS_ENABLE)
                xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
 
+       if (flags & MLX5_PF_FLAGS_DOWNGRADE)
+               xlt_flags |= MLX5_IB_UPD_XLT_DOWNGRADE;
+
        page_shift = odp->page_shift;
        start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
-       access_mask = ODP_READ_ALLOWED_BIT;
 
        if (odp->umem.writable && !downgrade)
-               access_mask |= ODP_WRITE_ALLOWED_BIT;
+               access_mask |= HMM_PFN_WRITE;
 
        np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
        if (np < 0)
index c0c1215925eb74b652b59620d3cf871ba2e4518a..f99911b478c4ca9a0487cb041e7e7168a7103fbd 100644
@@ -8,6 +8,7 @@
 
 #include <rdma/ib_umem.h>
 #include <rdma/ib_verbs.h>
+#include <linux/hmm.h>
 
 struct ib_umem_odp {
        struct ib_umem umem;
@@ -68,19 +69,6 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
               umem_odp->page_shift;
 }
 
-/*
- * The lower 2 bits of the DMA address signal the R/W permissions for
- * the entry. To upgrade the permissions, provide the appropriate
- * bitmask to the map_dma_pages function.
- *
- * Be aware that upgrading a mapped address might result in change of
- * the DMA address for the page.
- */
-#define ODP_READ_ALLOWED_BIT  (1<<0ULL)
-#define ODP_WRITE_ALLOWED_BIT (1<<1ULL)
-
-#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))
-
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 
 struct ib_umem_odp *