mlx4_ib: Use optimal numbers of MTT entries.
author     Yishai Hadas <yishaih@mellanox.com>
           Sun, 15 Apr 2012 14:57:24 +0000 (17:57 +0300)
committer  Mukesh Kacker <mukesh.kacker@oracle.com>
           Tue, 7 Jul 2015 21:38:08 +0000 (14:38 -0700)
Auto-detect contiguous physical pages and reduce the number of MTT
entries in the MPT accordingly, taking into account alignment
differences between virtual and physical addresses.
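
As a rough illustration of the idea, a small stand-alone sketch with a
made-up DMA layout (hypothetical, and it skips the virtual/physical
alignment handling the real code performs): physically contiguous
scatter entries are merged into blocks, a common block shift is derived
from the block boundaries, and far fewer MTT entries are needed than
with one entry per 4K page.

#include <stdio.h>
#include <stdint.h>

struct chunk { uint64_t addr, len; };   /* stand-in for one sg entry */

int main(void)
{
        /* two physically contiguous 2M chunks plus one separate 2M chunk */
        struct chunk sg[] = {
                { 0x100200000ULL, 0x200000ULL },
                { 0x100400000ULL, 0x200000ULL },
                { 0x200000000ULL, 0x200000ULL },
        };
        unsigned int n = sizeof(sg) / sizeof(sg[0]);
        unsigned int shift = 31;        /* start from the maximum block shift */
        uint64_t start = sg[0].addr, len = 0, total = 0, pages_4k = 0;
        unsigned int i;

        for (i = 0; i <= n; i++) {
                if (i < n && start + len == sg[i].addr) {
                        len += sg[i].len;       /* still the same block */
                        continue;
                }
                /*
                 * Block ended: its start and end addresses limit the shift
                 * (12 is the 4K page shift, the minimum).
                 */
                while (shift > 12 &&
                       ((start | (start + len)) & ((1ULL << shift) - 1)))
                        shift--;
                total += len;
                if (i < n) {
                        start = sg[i].addr;
                        len = sg[i].len;
                }
        }
        for (i = 0; i < n; i++)
                pages_4k += sg[i].len >> 12;

        printf("4K MTT entries: %llu, with %u-bit blocks: %llu\n",
               (unsigned long long)pages_4k, shift,
               (unsigned long long)(total >> shift));
        return 0;
}

With this layout the sketch reports 1536 entries at 4K granularity
versus 3 entries with 2M (21-bit) blocks.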

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Vladimir Sokolovsky <vlad@mellanox.com>
(Ported from Mellanox OFED 2.4)

Signed-off-by: Mukesh Kacker <mukesh.kacker@oracle.com>
drivers/infiniband/hw/mlx4/mr.c
include/linux/mlx4/device.h

diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index e0d271782d0a0012577a100e4a59c27f17f9366d..3519d4547b20ca5edd9ce757fcfbef0809baeafd 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -86,50 +86,302 @@ err_free:
        return ERR_PTR(err);
 }
 
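+/*
+ * Write one run of physically contiguous pages (a "block") into the MTT,
+ * expanding it into entries of mtt_size bytes each and flushing the pages[]
+ * buffer to mlx4_write_mtt() whenever it fills up.
+ */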
+static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev,
+                                               struct mlx4_mtt *mtt,
+                                               u64 mtt_size,
+                                               u64 mtt_shift,
+                                               u64 len,
+                                               u64 cur_start_addr,
+                                               u64 *pages,
+                                               int *start_index,
+                                               int *npages)
+{
+       int k;
+       int err = 0;
+       u64 mtt_entries;
+       u64 cur_end_addr = cur_start_addr + len;
+       u64 cur_end_addr_aligned = 0;
+
+       len += (cur_start_addr & (mtt_size-1ULL));
+       cur_end_addr_aligned = round_up(cur_end_addr, mtt_size);
+       len += (cur_end_addr_aligned - cur_end_addr);
+       if (len & (mtt_size - 1ULL)) {
+               WARN(1,
+                    "write_block: len %llx is not aligned to mtt_size %llx\n",
+                    len, mtt_size);
+               return -EINVAL;
+       }
+
+       mtt_entries = len >> mtt_shift;
+
+       /*
+        * Align the MTT start address to the mtt_size.  This is needed to
+        * handle MRs that start in the middle of an MTT record; it was not
+        * required before because the DMA addresses were always page
+        * aligned, and the page size was also the MTT size.
+        */
+       cur_start_addr = round_down(cur_start_addr, mtt_size);
+       /* A new block is started ...*/
+       for (k = 0; k < mtt_entries; ++k) {
+               pages[*npages] = cur_start_addr + (mtt_size * k);
+               (*npages)++;
+               /*
+                * Be friendly to mlx4_write_mtt() and
+                * pass it chunks of appropriate size.
+                */
+               if (*npages == PAGE_SIZE / sizeof(u64)) {
+                       err = mlx4_write_mtt(dev->dev,
+                                       mtt, *start_index,
+                                       *npages, pages);
+                       if (err)
+                               return err;
+
+                       (*start_index) += *npages;
+                       *npages = 0;
+               }
+       }
+
+       return 0;
+}
+
 int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
                           struct ib_umem *umem)
 {
        u64 *pages;
-       int i, k, entry;
-       int n;
-       int len;
+       int entry;
+       u64 len = 0;
        int err = 0;
+       u64 mtt_size;
+       u64 cur_start_addr = 0;
+       u64 mtt_shift;
+       int start_index = 0;
+       int npages = 0;
        struct scatterlist *sg;
 
        pages = (u64 *) __get_free_page(GFP_KERNEL);
        if (!pages)
                return -ENOMEM;
 
-       i = n = 0;
+       mtt_shift = mtt->page_shift;
+       mtt_size = 1ULL << mtt_shift;
 
        for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-               len = sg_dma_len(sg) >> mtt->page_shift;
-               for (k = 0; k < len; ++k) {
-                       pages[i++] = sg_dma_address(sg) +
-                               umem->page_size * k;
-                       /*
-                        * Be friendly to mlx4_write_mtt() and
-                        * pass it chunks of appropriate size.
-                        */
-                       if (i == PAGE_SIZE / sizeof (u64)) {
-                               err = mlx4_write_mtt(dev->dev, mtt, n,
-                                                    i, pages);
-                               if (err)
-                                       goto out;
-                               n += i;
-                               i = 0;
+                       if (cur_start_addr + len ==
+                           sg_dma_address(sg)) {
+                               /* still the same block */
+                               len += sg_dma_len(sg);
+                               continue;
                        }
-               }
+                       /*
+                        * A new block starts here.  If len is misaligned,
+                        * write an extra mtt entry to cover the misaligned
+                        * area (round up the division).
+                        */
+                       err = mlx4_ib_umem_write_mtt_block(dev,
+                                               mtt, mtt_size, mtt_shift,
+                                               len, cur_start_addr,
+                                               pages,
+                                               &start_index,
+                                               &npages);
+                       if (err)
+                               goto out;
+
+                       cur_start_addr = sg_dma_address(sg);
+                       len = sg_dma_len(sg);
+       }
+
+       /* Handle the last block */
+       if (len > 0) {
+               /*
+                * If len is misaligned, write an extra mtt entry to cover
+                * the misaligned area (round up the division).
+                */
+               err = mlx4_ib_umem_write_mtt_block(dev,
+                                               mtt, mtt_size, mtt_shift,
+                                               len, cur_start_addr,
+                                               pages,
+                                               &start_index,
+                                               &npages);
+               if (err)
+                       goto out;
        }
 
-       if (i)
-               err = mlx4_write_mtt(dev->dev, mtt, n, i, pages);
+
+       if (npages)
+               err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages);
 
 out:
        free_page((unsigned long) pages);
        return err;
 }
 
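+/*
+ * Return the alignment of ptr in bits: the index of its least significant
+ * set bit (e.g. 0x3000 -> 12, i.e. 4K aligned).
+ */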
+static inline u64 alignment_of(u64 ptr)
+{
+       return ilog2(ptr & (~(ptr-1)));
+}
+
+static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start,
+                                               u64 current_block_end,
+                                               u64 block_shift)
+{
+       /*
+        * Check whether the new block is aligned at least as well as the
+        * previous one: its start address must have zeros in the low
+        * block_shift bits.
+        */
+       if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0)
+               /*
+                * Not as well aligned as the previous block: reduce the
+                * mtt size accordingly, taking the lowest set bit of the
+                * new block's start address.
+                */
+               block_shift = alignment_of(next_block_start);
+
+       /*
+        * Check whether the end of the previous block is aligned as well
+        * as its start.
+        */
+       if ((current_block_end & ((1ULL << block_shift) - 1ULL)) != 0)
+               /*
+                * Not as well aligned as the start of the block: reduce
+                * the mtt size accordingly.
+                */
+               block_shift = alignment_of(current_block_end);
+
+       return block_shift;
+}
+
+/*
+ * Calculate the optimal mtt size based on contiguous pages.
+ *
+ * Returns the block shift to use and fills *num_of_mtts with the number of
+ * MTT entries needed, including the extra entries required when the first
+ * and last chunks are not aligned to the calculated mtt size.  Chunks in
+ * the middle are already handled as part of the mtt shift calculation for
+ * both their start and end addresses.
+ */
+static int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
+                                               u64 start_va,
+                                               int *num_of_mtts)
+{
+       u64 block_shift = MLX4_MAX_MTT_SHIFT;
+       u64 current_block_len = 0;
+       u64 current_block_start = 0;
+       u64 misalignment_bits;
+       u64 first_block_start = 0;
+       u64 last_block_end = 0;
+       u64 total_len = 0;
+       u64 last_block_aligned_end = 0;
+       u64 min_shift = ilog2(umem->page_size);
+       struct scatterlist *sg;
+       int i;
+       u64 next_block_start;
+       u64 current_block_end;
+
+       for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
+               /*
+                * Initialization: save the first chunk start as
+                * current_block_start - "block" means contiguous pages.
+                */
+               if (current_block_len == 0 && current_block_start == 0) {
+                       first_block_start = current_block_start =
+                               sg_dma_address(sg);
+                       /*
+                        * Find the bits that differ between the physical
+                        * address and the virtual address at the start of
+                        * the MR.  ib_umem_get() aligned start_va to a page
+                        * boundary, so align it to the same boundary here.
+                        *
+                        * misalignment_bits is needed to handle the case of
+                        * a single memory region, where the rest of the
+                        * logic will not reduce the block size.  Using a
+                        * block size larger than the alignment of the
+                        * misalignment bits could lead to the virtual page
+                        * number being used instead of the physical one,
+                        * resulting in access to the wrong data.
+                        */
+                       misalignment_bits =
+                               (start_va & ~((u64)umem->page_size - 1ULL)) ^
+                               current_block_start;
+                       block_shift = min(alignment_of(misalignment_bits),
+                                         block_shift);
+               }
+
+               /*
+                * Go over the scatter entries and check whether each one
+                * continues the previous one.
+                */
+               next_block_start = sg_dma_address(sg);
+               current_block_end = current_block_start + current_block_len;
+               /* If we have a split (non-contiguous) between two blocks */
+               if (current_block_end != next_block_start) {
+                       block_shift = mlx4_ib_umem_calc_block_mtt(
+                                       next_block_start,
+                                       current_block_end,
+                                       block_shift);
+
+                       /*
+                        * If we reached the minimum shift for a 4K page,
+                        * stop the loop.
+                        */
+                       if (block_shift <= min_shift)
+                               goto end;
+
+                       /*
+                        * Add the length of the block that just ended to
+                        * the total; misalignment of the first and last
+                        * blocks is accounted for after the loop.
+                        */
+                       total_len += current_block_len;
+
+                       /* Start a new block */
+                       current_block_start = next_block_start;
+                       current_block_len = sg_dma_len(sg);
+                       continue;
+               }
+               /*
+                * The scatter entry is another part of the current block:
+                * increase the block size.  An entry in the scatterlist can
+                * be larger than 4K (one page) because the DMA mapping may
+                * merge several pages together.
+                */
+               current_block_len += sg_dma_len(sg);
+       }
+
+       /* Account for the last block in the total len */
+       total_len += current_block_len;
+       /* Add to the first block the misalignment that it suffers from. */
+       total_len += (first_block_start & ((1ULL << block_shift) - 1ULL));
+       last_block_end = current_block_start + current_block_len;
+       last_block_aligned_end = round_up(last_block_end, 1ULL << block_shift);
+       total_len += (last_block_aligned_end - last_block_end);
+
+       WARN((total_len & ((1ULL << block_shift) - 1ULL)),
+            "misaligned total length detected (%llu, %llu)!",
+            total_len, block_shift);
+
+       *num_of_mtts = total_len >> block_shift;
+end:
+       if (block_shift < min_shift) {
+               /*
+                * If the shift is less than the minimum, warn and fall
+                * back to the minimum shift.
+                */
+               WARN(1,
+                    "mlx4_ib_umem_calc_optimal_mtt_size - unexpected shift %llu\n",
+                    block_shift);
+
+               block_shift = min_shift;
+       }
+       return block_shift;
+}
+
 struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                                  u64 virt_addr, int access_flags,
                                  struct ib_udata *udata)
@@ -154,7 +406,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        }
 
        n = ib_umem_page_count(mr->umem);
-       shift = ilog2(mr->umem->page_size);
+       shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start, &n);
 
        err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
                            convert_access(access_flags), n, shift, &mr->mmr);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 83e80ab9450048d121b739bd23b85ccb14a39b36..1ed388f598c5d7c5b5190ba8fe62d0e4bfcae55c 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -383,6 +383,10 @@ enum {
        MLX4_MTT_FLAG_PRESENT           = 1
 };
 
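+/* Largest block shift used when merging contiguous pages into MTT entries */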
+enum {
+       MLX4_MAX_MTT_SHIFT              = 31
+};
+
 enum mlx4_qp_region {
        MLX4_QP_REGION_FW = 0,
        MLX4_QP_REGION_RSS_RAW_ETH,