nvme-pci: add support for sgl metadata
Author:     Keith Busch <kbusch@kernel.org>
AuthorDate: Fri, 15 Nov 2024 21:41:21 +0000 (13:41 -0800)
Commit:     Keith Busch <kbusch@kernel.org>
CommitDate: Mon, 18 Nov 2024 17:17:25 +0000 (09:17 -0800)
Supporting this mode allows creating and merging multi-segment metadata
requests that wouldn't otherwise be possible. It also allows directly
using user-space requests that straddle physically discontiguous pages.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
include/linux/nvme.h
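
Why is the new limit 15? The metadata SGL is carved out of the driver's
existing small descriptor pool (dev->prp_small_pool). A minimal sketch of
the arithmetic, assuming that pool's historical 256-byte block size;
NVME_SMALL_POOL_SIZE is a hypothetical name used here for illustration only:

	/*
	 * Sketch, not part of the patch: one 256-byte pool block holds
	 * sixteen 16-byte SGL descriptors -- one segment descriptor
	 * plus up to NVME_MAX_META_SEGS (15) data descriptors.
	 */
	#include <linux/build_bug.h>
	#include <linux/nvme.h>

	#define NVME_SMALL_POOL_SIZE	256	/* assumed dma_pool block size */
	#define NVME_MAX_META_SEGS	15

	static_assert(sizeof(struct nvme_sgl_desc) == 16);
	static_assert((NVME_MAX_META_SEGS + 1) * sizeof(struct nvme_sgl_desc) ==
		      NVME_SMALL_POOL_SIZE);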

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 900719c4c70c15af480d13bb08bf7119ae5f0e69..5ef284a376cc7d806f97892d496449b035ce9687 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -1126,6 +1126,13 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
        return ctrl->sgls & ((1 << 0) | (1 << 1));
 }
 
+static inline bool nvme_ctrl_meta_sgl_supported(struct nvme_ctrl *ctrl)
+{
+       if (ctrl->ops->flags & NVME_F_FABRICS)
+               return true;
+       return ctrl->sgls & NVME_CTRL_SGLS_MSDS;
+}
+
 #ifdef CONFIG_NVME_HOST_AUTH
 int __init nvme_init_auth(void);
 void __exit nvme_exit_auth(void);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 5f2e3ad2cc521be206540ad2df0e66565a1fab9b..c6c3ae3a7c434d9f6a97920b3dbea01c49615926 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -43,6 +43,7 @@
  */
 #define NVME_MAX_KB_SZ 8192
 #define NVME_MAX_SEGS  128
+#define NVME_MAX_META_SEGS 15
 #define NVME_MAX_NR_ALLOCATIONS        5
 
 static int use_threaded_interrupts;
@@ -144,6 +145,7 @@ struct nvme_dev {
        struct sg_table *hmb_sgt;
 
        mempool_t *iod_mempool;
+       mempool_t *iod_meta_mempool;
 
        /* shadow doorbell buffer support: */
        __le32 *dbbuf_dbs;
@@ -239,6 +241,8 @@ struct nvme_iod {
        dma_addr_t first_dma;
        dma_addr_t meta_dma;
        struct sg_table sgt;
+       struct sg_table meta_sgt;
+       union nvme_descriptor meta_list;
        union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
 };
 
@@ -506,6 +510,14 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
        spin_unlock(&nvmeq->sq_lock);
 }
 
+static inline bool nvme_pci_metadata_use_sgls(struct nvme_dev *dev,
+                                             struct request *req)
+{
+       if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl))
+               return false;
+       return req->nr_integrity_segments > 1;
+}
+
 static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
                                     int nseg)
 {
@@ -518,6 +530,8 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
                return false;
        if (!nvmeq->qid)
                return false;
+       if (nvme_pci_metadata_use_sgls(dev, req))
+               return true;
        if (!sgl_threshold || avg_seg_size < sgl_threshold)
                return false;
        return true;
@@ -780,7 +794,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
                struct bio_vec bv = req_bvec(req);
 
                if (!is_pci_p2pdma_page(bv.bv_page)) {
-                       if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
+                       if (!nvme_pci_metadata_use_sgls(dev, req) &&
+                           (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
                             bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
                                return nvme_setup_prp_simple(dev, req,
                                                             &cmnd->rw, &bv);
@@ -824,11 +839,69 @@ out_free_sg:
        return ret;
 }
 
-static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
-               struct nvme_command *cmnd)
+static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
+                                            struct request *req)
+{
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       struct nvme_rw_command *cmnd = &iod->cmd.rw;
+       struct nvme_sgl_desc *sg_list;
+       struct scatterlist *sgl, *sg;
+       unsigned int entries;
+       dma_addr_t sgl_dma;
+       int rc, i;
+
+       iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC);
+       if (!iod->meta_sgt.sgl)
+               return BLK_STS_RESOURCE;
+
+       sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments);
+       iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req,
+                                                          iod->meta_sgt.sgl);
+       if (!iod->meta_sgt.orig_nents)
+               goto out_free_sg;
+
+       rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req),
+                            DMA_ATTR_NO_WARN);
+       if (rc)
+               goto out_free_sg;
+
+       sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma);
+       if (!sg_list)
+               goto out_unmap_sg;
+
+       entries = iod->meta_sgt.nents;
+       iod->meta_list.sg_list = sg_list;
+       iod->meta_dma = sgl_dma;
+
+       cmnd->flags = NVME_CMD_SGL_METASEG;
+       cmnd->metadata = cpu_to_le64(sgl_dma);
+
+       sgl = iod->meta_sgt.sgl;
+       if (entries == 1) {
+               nvme_pci_sgl_set_data(sg_list, sgl);
+               return BLK_STS_OK;
+       }
+
+       sgl_dma += sizeof(*sg_list);
+       nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries);
+       for_each_sg(sgl, sg, entries, i)
+               nvme_pci_sgl_set_data(&sg_list[i + 1], sg);
+
+       return BLK_STS_OK;
+
+out_unmap_sg:
+       dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
+out_free_sg:
+       mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
+       return BLK_STS_RESOURCE;
+}
+
+static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev,
+                                            struct request *req)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct bio_vec bv = rq_integrity_vec(req);
+       struct nvme_command *cmnd = &iod->cmd;
 
        iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
        if (dma_mapping_error(dev->dev, iod->meta_dma))
@@ -837,6 +910,13 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
        return BLK_STS_OK;
 }
 
+static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req)
+{
+       if (nvme_pci_metadata_use_sgls(dev, req))
+               return nvme_pci_setup_meta_sgls(dev, req);
+       return nvme_pci_setup_meta_mptr(dev, req);
+}
+
 static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -845,6 +925,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
        iod->aborted = false;
        iod->nr_allocations = -1;
        iod->sgt.nents = 0;
+       iod->meta_sgt.nents = 0;
 
        ret = nvme_setup_cmd(req->q->queuedata, req);
        if (ret)
@@ -857,7 +938,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
        }
 
        if (blk_integrity_rq(req)) {
-               ret = nvme_map_metadata(dev, req, &iod->cmd);
+               ret = nvme_map_metadata(dev, req);
                if (ret)
                        goto out_unmap_data;
        }
@@ -955,17 +1036,31 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
        *rqlist = requeue_list;
 }
 
+static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
+                                               struct request *req)
+{
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+       if (!iod->meta_sgt.nents) {
+               dma_unmap_page(dev->dev, iod->meta_dma,
+                              rq_integrity_vec(req).bv_len,
+                              rq_dma_dir(req));
+               return;
+       }
+
+       dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list,
+                     iod->meta_dma);
+       dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
+       mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
+}
+
 static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
        struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
        struct nvme_dev *dev = nvmeq->dev;
 
-       if (blk_integrity_rq(req)) {
-               struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-
-               dma_unmap_page(dev->dev, iod->meta_dma,
-                              rq_integrity_vec(req).bv_len, rq_dma_dir(req));
-       }
+       if (blk_integrity_rq(req))
+               nvme_unmap_metadata(dev, req);
 
        if (blk_rq_nr_phys_segments(req))
                nvme_unmap_data(dev, req);
@@ -2761,6 +2856,7 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
 
 static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
 {
+       size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
        size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS;
 
        dev->iod_mempool = mempool_create_node(1,
@@ -2769,7 +2865,18 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
                        dev_to_node(dev->dev));
        if (!dev->iod_mempool)
                return -ENOMEM;
+
+       dev->iod_meta_mempool = mempool_create_node(1,
+                       mempool_kmalloc, mempool_kfree,
+                       (void *)meta_size, GFP_KERNEL,
+                       dev_to_node(dev->dev));
+       if (!dev->iod_meta_mempool)
+               goto free;
+
        return 0;
+free:
+       mempool_destroy(dev->iod_mempool);
+       return -ENOMEM;
 }
 
 static void nvme_free_tagset(struct nvme_dev *dev)
@@ -2834,6 +2941,11 @@ static void nvme_reset_work(struct work_struct *work)
        if (result)
                goto out;
 
+       if (nvme_ctrl_meta_sgl_supported(&dev->ctrl))
+               dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS;
+       else
+               dev->ctrl.max_integrity_segments = 1;
+
        nvme_dbbuf_dma_alloc(dev);
 
        result = nvme_setup_host_mem(dev);
@@ -3101,11 +3213,6 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
        dev->ctrl.max_hw_sectors = min_t(u32,
                NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9);
        dev->ctrl.max_segments = NVME_MAX_SEGS;
-
-       /*
-        * There is no support for SGLs for metadata (yet), so we are limited to
-        * a single integrity segment for the separate metadata pointer.
-        */
        dev->ctrl.max_integrity_segments = 1;
        return dev;
 
@@ -3168,6 +3275,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        if (result)
                goto out_disable;
 
+       if (nvme_ctrl_meta_sgl_supported(&dev->ctrl))
+               dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS;
+       else
+               dev->ctrl.max_integrity_segments = 1;
+
        nvme_dbbuf_dma_alloc(dev);
 
        result = nvme_setup_host_mem(dev);
@@ -3210,6 +3322,7 @@ out_disable:
        nvme_free_queues(dev, 0);
 out_release_iod_mempool:
        mempool_destroy(dev->iod_mempool);
+       mempool_destroy(dev->iod_meta_mempool);
 out_release_prp_pools:
        nvme_release_prp_pools(dev);
 out_dev_unmap:
@@ -3275,6 +3388,7 @@ static void nvme_remove(struct pci_dev *pdev)
        nvme_dbbuf_dma_free(dev);
        nvme_free_queues(dev, 0);
        mempool_destroy(dev->iod_mempool);
+       mempool_destroy(dev->iod_meta_mempool);
        nvme_release_prp_pools(dev);
        nvme_dev_unmap(dev);
        nvme_uninit_ctrl(&dev->ctrl);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 0a6e22038ce361e678326d16677045c2b9b6fcc5..5873ce859cc8b54d3e13a3d557dc2157f7daab11 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -389,6 +389,7 @@ enum {
        NVME_CTRL_CTRATT_PREDICTABLE_LAT        = 1 << 5,
        NVME_CTRL_CTRATT_NAMESPACE_GRANULARITY  = 1 << 7,
        NVME_CTRL_CTRATT_UUID_LIST              = 1 << 9,
+       NVME_CTRL_SGLS_MSDS                     = 1 << 19,
 };
 
 struct nvme_lbaf {
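
When a request carries more than one metadata segment, the new
nvme_pci_setup_meta_sgls() above lays all descriptors out in a single pool
block: descriptor 0 is a "last segment" descriptor whose address points at
the data descriptors that follow it in the same block. The two helpers it
calls predate this patch and are outside the diff context; a sketch of
their behavior, paraphrased from the PCIe driver, with field encodings per
the NVMe SGL descriptor format:

	#include <linux/nvme.h>
	#include <linux/scatterlist.h>

	/* Fill one SGL data-block descriptor from a mapped scatterlist entry. */
	static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
					  struct scatterlist *sg)
	{
		sge->addr = cpu_to_le64(sg_dma_address(sg));
		sge->length = cpu_to_le32(sg_dma_len(sg));
		sge->type = NVME_SGL_FMT_DATA_DESC << 4;
	}

	/*
	 * Fill the leading "last segment" descriptor covering 'entries'
	 * data descriptors starting at 'dma_addr' (the address just past
	 * this descriptor in the same pool block).
	 */
	static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
					 dma_addr_t dma_addr, int entries)
	{
		sge->addr = cpu_to_le64(dma_addr);
		sge->length = cpu_to_le32(entries * sizeof(*sge));
		sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
	}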