nvme-pci: convert to blk_rq_dma_map  (branch: nvme-dma-map)
author     Christoph Hellwig <hch@lst.de>    Sat, 10 May 2025 05:24:59 +0000 (07:24 +0200)
committer  Christoph Hellwig <hch@lst.de>    Sun, 11 May 2025 05:57:59 +0000 (07:57 +0200)
Use the blk_rq_dma_map API to DMA map requests instead of
scatterlists.
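
For reference, the iterator pattern at the core of the new setup helpers looks
roughly as follows. This is only a minimal sketch assembled from the calls
added by this patch (the PRP path additionally splits each returned range into
NVME_CTRL_PAGE_SIZE chunks); descriptor allocation and most error handling are
elided, and iod/dev simply mirror the driver's existing structures:

        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct blk_dma_iter iter;

        /* start the DMA iterator; on failure iter.status carries the error */
        if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
                return iter.status;

        do {
                /* consume iter.addr / iter.len, e.g. fill a PRP or SGL entry */
                iod->total_len += iter.len;
        } while (blk_rq_dma_map_iter_next(req, dev->dev, &iod->dma_state, &iter));

        /* iter_next returns false both when the request is exhausted and on error */
        if (iter.status != BLK_STS_OK)
                return iter.status;

At unmap time the driver first calls blk_rq_dma_unmap(); only when that
returns false does it walk the PRPs or SGLs recorded in the SQE and
dma_unmap_page() each entry itself.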

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Jens Axboe <axboe@kernel.dk>
[ Leon: squashed optimization patch from Kanchan ]
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
[ Leon: rewrote original patch due to rebases and addition of metadata support ]
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 42fa3ddf81c2bbfa27a13adfa2d6c9646c39fe9a..63f4e20d4faf53fe48e506460bf99897893270e9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -7,7 +7,7 @@
 #include <linux/acpi.h>
 #include <linux/async.h>
 #include <linux/blkdev.h>
-#include <linux/blk-mq.h>
+#include <linux/blk-mq-dma.h>
 #include <linux/blk-integrity.h>
 #include <linux/dmi.h>
 #include <linux/init.h>
@@ -26,7 +26,6 @@
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/io-64-nonatomic-hi-lo.h>
 #include <linux/sed-opal.h>
-#include <linux/pci-p2pdma.h>
 
 #include "trace.h"
 #include "nvme.h"
@@ -145,9 +144,6 @@ struct nvme_dev {
        bool hmb;
        struct sg_table *hmb_sgt;
 
-       mempool_t *iod_mempool;
-       mempool_t *iod_meta_mempool;
-
        /* shadow doorbell buffer support: */
        __le32 *dbbuf_dbs;
        dma_addr_t dbbuf_dbs_dma_addr;
@@ -228,6 +224,12 @@ enum nvme_iod_flags {
 
        /* uses the small descriptor pool */
        IOD_SMALL_POOL          = 1U << 1,
+
+       /* single segment dma mapping */
+       IOD_SINGLE_SEGMENT      = 1U << 2,
+
+       /* single meta SGL entry without indirection */
+       IOD_SINGLE_META_SGL     = 1U << 3,
 };
 
 /*
@@ -237,14 +239,15 @@ struct nvme_iod {
        struct nvme_request req;
        struct nvme_command cmd;
        u8 flags;
+
        u8 nr_descriptors;
-       unsigned int dma_len;   /* length of single DMA segment mapping */
-       dma_addr_t first_dma;
-       dma_addr_t meta_dma;
-       struct sg_table sgt;
-       struct sg_table meta_sgt;
-       void *meta_descriptor;
+       unsigned int total_len;
+       struct dma_iova_state dma_state;
        void *descriptors[NVME_MAX_NR_DESCRIPTORS];
+
+       unsigned int total_meta_len;
+       struct dma_iova_state dma_meta_state;
+       void *meta_descriptor;
 };
 
 static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
@@ -511,32 +514,44 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
        spin_unlock(&nvmeq->sq_lock);
 }
 
-static inline bool nvme_pci_metadata_use_sgls(struct nvme_dev *dev,
-                                             struct request *req)
+enum nvme_use_sgl {
+       SGL_UNSUPPORTED,
+       SGL_SUPPORTED,
+       SGL_FORCED,
+};
+
+static inline bool nvme_pci_metadata_use_sgls(struct request *req)
 {
-       if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl))
-               return false;
        return req->nr_integrity_segments > 1 ||
                nvme_req(req)->flags & NVME_REQ_USERCMD;
 }
 
-static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
-                                    int nseg)
+static inline enum nvme_use_sgl nvme_pci_use_sgls(struct nvme_dev *dev,
+               struct request *req)
 {
        struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
-       unsigned int avg_seg_size;
 
-       avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
+       if (nvmeq->qid && nvme_ctrl_sgl_supported(&dev->ctrl)) {
+               if (nvme_req(req)->flags & NVME_REQ_USERCMD)
+                       return SGL_FORCED;
+               if (nvme_pci_metadata_use_sgls(req))
+                       return SGL_FORCED;
+               return SGL_SUPPORTED;
+       }
 
-       if (!nvme_ctrl_sgl_supported(&dev->ctrl))
-               return false;
-       if (!nvmeq->qid)
-               return false;
-       if (nvme_pci_metadata_use_sgls(dev, req))
-               return true;
-       if (!sgl_threshold || avg_seg_size < sgl_threshold)
-               return nvme_req(req)->flags & NVME_REQ_USERCMD;
-       return true;
+       return SGL_UNSUPPORTED;
+}
+
+static unsigned int nvme_pci_avg_seg_size(struct request *req)
+{
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       unsigned int nseg;
+
+       if (blk_rq_dma_map_coalesce(&iod->dma_state))
+               nseg = 1;
+       else
+               nseg = blk_rq_nr_phys_segments(req);
+       return DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
 }
 
 static inline struct dma_pool *nvme_dma_pool(struct nvme_dev *dev,
@@ -547,11 +562,24 @@ static inline struct dma_pool *nvme_dma_pool(struct nvme_dev *dev,
        return dev->prp_page_pool;
 }
 
+static inline bool nvme_pci_cmd_use_sgl(struct nvme_command *cmd)
+{
+       return cmd->common.flags &
+               (NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG);
+}
+
+static inline dma_addr_t nvme_pci_first_desc_dma_addr(struct nvme_command *cmd)
+{
+       if (nvme_pci_cmd_use_sgl(cmd))
+               return le64_to_cpu(cmd->common.dptr.sgl.addr);
+       return le64_to_cpu(cmd->common.dptr.prp2);
+}
+
 static void nvme_free_descriptors(struct nvme_dev *dev, struct request *req)
 {
        const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-       dma_addr_t dma_addr = iod->first_dma;
+       dma_addr_t dma_addr = nvme_pci_first_desc_dma_addr(&iod->cmd);
        int i;
 
        if (iod->nr_descriptors == 1) {
@@ -569,67 +597,182 @@ static void nvme_free_descriptors(struct nvme_dev *dev, struct request *req)
        }
 }
 
+static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
+{
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       enum dma_data_direction dir = rq_dma_dir(req);
+       int length = iod->total_len;
+       dma_addr_t dma_addr;
+       int i, desc;
+       __le64 *prp_list;
+       u32 dma_len;
+
+       dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp1);
+       dma_len = min_t(u32, length,
+               NVME_CTRL_PAGE_SIZE - (dma_addr & (NVME_CTRL_PAGE_SIZE - 1)));
+       length -= dma_len;
+       if (!length) {
+               dma_unmap_page(dev->dev, dma_addr, dma_len, dir);
+               return;
+       }
+
+       if (length <= NVME_CTRL_PAGE_SIZE) {
+               dma_unmap_page(dev->dev, dma_addr, dma_len, dir);
+               dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp2);
+               dma_unmap_page(dev->dev, dma_addr, length, dir);
+               return;
+       }
+
+       i = 0;
+       desc = 0;
+       prp_list = iod->descriptors[desc];
+       do {
+               dma_unmap_page(dev->dev, dma_addr, dma_len, dir);
+               if (i == NVME_CTRL_PAGE_SIZE >> 3) {
+                       prp_list = iod->descriptors[++desc];
+                       i = 0;
+               }
+
+               dma_addr = le64_to_cpu(prp_list[i++]);
+               dma_len = min(length, NVME_CTRL_PAGE_SIZE);
+               length -= dma_len;
+       } while (length);
+}
+
+static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
+{
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       dma_addr_t sqe_dma_addr = le64_to_cpu(iod->cmd.common.dptr.sgl.addr);
+       unsigned int sqe_dma_len = le32_to_cpu(iod->cmd.common.dptr.sgl.length);
+       struct nvme_sgl_desc *sg_list = iod->descriptors[0];
+       enum dma_data_direction dir = rq_dma_dir(req);
+
+       if (iod->nr_descriptors) {
+               unsigned int nr_entries = sqe_dma_len / sizeof(*sg_list), i;
+
+               for (i = 0; i < nr_entries; i++)
+                       dma_unmap_page(dev->dev, le64_to_cpu(sg_list[i].addr),
+                               le32_to_cpu(sg_list[i].length), dir);
+       } else {
+               dma_unmap_page(dev->dev, sqe_dma_addr, sqe_dma_len, dir);
+       }
+}
+
 static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 
-       if (iod->dma_len) {
-               dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
-                              rq_dma_dir(req));
+       if (iod->flags & IOD_SINGLE_SEGMENT) {
+               static_assert(offsetof(union nvme_data_ptr, prp1) ==
+                               offsetof(union nvme_data_ptr, sgl.addr));
+               dma_unmap_page(dev->dev, le64_to_cpu(iod->cmd.common.dptr.prp1),
+                               iod->total_len, rq_dma_dir(req));
                return;
        }
 
-       WARN_ON_ONCE(!iod->sgt.nents);
+       if (!blk_rq_dma_unmap(req, dev->dev, &iod->dma_state, iod->total_len)) {
+               if (nvme_pci_cmd_use_sgl(&iod->cmd))
+                       nvme_free_sgls(dev, req);
+               else
+                       nvme_free_prps(dev, req);
+       }
 
-       dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
-       nvme_free_descriptors(dev, req);
-       mempool_free(iod->sgt.sgl, dev->iod_mempool);
+       if (iod->nr_descriptors)
+               nvme_free_descriptors(dev, req);
 }
 
-static void nvme_print_sgl(struct scatterlist *sgl, int nents)
+static blk_status_t nvme_pci_setup_data_simple(struct nvme_dev *dev,
+               struct request *req, enum nvme_use_sgl use_sgl)
 {
-       int i;
-       struct scatterlist *sg;
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       struct nvme_rw_command *cmd = &iod->cmd.rw;
+       struct bio_vec bv = req_bvec(req);
+       unsigned int prp1_offset = bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
+       bool prp_possible = prp1_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2;
+       dma_addr_t dma_addr;
+
+       /*
+        * The simple mapping in dma_map_bvec does not support P2P mappings, so
+        * always defer to the fully featured iterator-based version for them.
+        */
+       if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA))
+               return BLK_STS_AGAIN;
+
+       /*
+        * While we can map all single-segment mappings with a single SGL, PRPs
+        * are limited in the aligned length that can be supported with just the
+        * two PRP entries in the SQE.
+        */
+       if (!use_sgl && !prp_possible)
+               return BLK_STS_AGAIN;
 
-       for_each_sg(sgl, sg, nents, i) {
-               dma_addr_t phys = sg_phys(sg);
-               pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
-                       "dma_address:%pad dma_length:%d\n",
-                       i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
-                       sg_dma_len(sg));
+       dma_addr = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
+       if (dma_mapping_error(dev->dev, dma_addr))
+               return BLK_STS_RESOURCE;
+       iod->total_len = bv.bv_len;
+       iod->flags |= IOD_SINGLE_SEGMENT;
+
+       if (use_sgl == SGL_FORCED || !prp_possible) {
+               cmd->flags = NVME_CMD_SGL_METABUF;
+               cmd->dptr.sgl.addr = cpu_to_le64(dma_addr);
+               cmd->dptr.sgl.length = cpu_to_le32(bv.bv_len);
+               cmd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
+       } else {
+               unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - prp1_offset;
+
+               cmd->dptr.prp1 = cpu_to_le64(dma_addr);
+               if (bv.bv_len > first_prp_len)
+                       cmd->dptr.prp2 = cpu_to_le64(dma_addr + first_prp_len);
+               else
+                       cmd->dptr.prp2 = 0;
        }
+
+       return BLK_STS_OK;
 }
 
-static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
-               struct request *req, struct nvme_rw_command *cmnd)
+static blk_status_t nvme_pci_setup_data_prp(struct nvme_dev *dev,
+               struct request *req, struct blk_dma_iter *iter)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-       int length = blk_rq_payload_bytes(req);
-       struct scatterlist *sg = iod->sgt.sgl;
-       int dma_len = sg_dma_len(sg);
-       u64 dma_addr = sg_dma_address(sg);
-       int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
+       struct nvme_rw_command *cmnd = &iod->cmd.rw;
+       unsigned int length = blk_rq_payload_bytes(req);
+       dma_addr_t prp1_dma, prp2_dma = 0;
+       unsigned int prp_len, i;
        __le64 *prp_list;
-       dma_addr_t prp_dma;
-       int i;
 
-       length -= (NVME_CTRL_PAGE_SIZE - offset);
-       if (length <= 0) {
-               iod->first_dma = 0;
+       /*
+        * PRP1 always points to the start of the DMA transfers.
+        *
+        * This is the only PRP (except for the list entries) that could be
+        * non-aligned.
+        */
+       prp1_dma = iter->addr;
+       prp_len = min(length, NVME_CTRL_PAGE_SIZE -
+                       (iter->addr & (NVME_CTRL_PAGE_SIZE - 1)));
+       iod->total_len += prp_len;
+       iter->addr += prp_len;
+       iter->len -= prp_len;
+       length -= prp_len;
+       if (!length)
                goto done;
-       }
 
-       dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
-       if (dma_len) {
-               dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
-       } else {
-               sg = sg_next(sg);
-               dma_addr = sg_dma_address(sg);
-               dma_len = sg_dma_len(sg);
+       if (!iter->len) {
+               if (!blk_rq_dma_map_iter_next(req, dev->dev, &iod->dma_state,
+                               iter)) {
+                       if (WARN_ON_ONCE(!iter->status))
+                               goto bad_sgl;
+                       goto done;
+               }
        }
 
+       /*
+        * PRP2 is usually a list, but can point to data if all data to be
+        * transferred fits into PRP1 + PRP2:
+        */
        if (length <= NVME_CTRL_PAGE_SIZE) {
-               iod->first_dma = dma_addr;
+               prp2_dma = iter->addr;
+               iod->total_len += length;
                goto done;
        }
 
@@ -638,58 +781,83 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
                iod->flags |= IOD_SMALL_POOL;
 
        prp_list = dma_pool_alloc(nvme_dma_pool(dev, iod), GFP_ATOMIC,
-                       &prp_dma);
-       if (!prp_list)
-               return BLK_STS_RESOURCE;
+                       &prp2_dma);
+       if (!prp_list) {
+               iter->status = BLK_STS_RESOURCE;
+               goto done;
+       }
        iod->descriptors[iod->nr_descriptors++] = prp_list;
-       iod->first_dma = prp_dma;
+
        i = 0;
        for (;;) {
+               prp_list[i++] = cpu_to_le64(iter->addr);
+               prp_len = min(length, NVME_CTRL_PAGE_SIZE);
+               if (WARN_ON_ONCE(iter->len < prp_len))
+                       goto bad_sgl;
+
+               iod->total_len += prp_len;
+               iter->addr += prp_len;
+               iter->len -= prp_len;
+               length -= prp_len;
+               if (!length)
+                       break;
+
+               if (iter->len == 0) {
+                       if (!blk_rq_dma_map_iter_next(req, dev->dev,
+                                       &iod->dma_state, iter)) {
+                               if (WARN_ON_ONCE(!iter->status))
+                                       goto bad_sgl;
+                               goto done;
+                       }
+               }
+
+               /*
+                * If we've filled the entire descriptor, allocate a new one
+                * that is pointed to by the last entry in the previous PRP
+                * list.  To accommodate that, move the last actual entry to
+                * the new descriptor.
+                */
                if (i == NVME_CTRL_PAGE_SIZE >> 3) {
                        __le64 *old_prp_list = prp_list;
+                       dma_addr_t prp_list_dma;
 
                        prp_list = dma_pool_alloc(dev->prp_page_pool,
-                                       GFP_ATOMIC, &prp_dma);
-                       if (!prp_list)
-                               goto free_prps;
+                                       GFP_ATOMIC, &prp_list_dma);
+                       if (!prp_list) {
+                               iter->status = BLK_STS_RESOURCE;
+                               goto done;
+                       }
                        iod->descriptors[iod->nr_descriptors++] = prp_list;
+
                        prp_list[0] = old_prp_list[i - 1];
-                       old_prp_list[i - 1] = cpu_to_le64(prp_dma);
+                       old_prp_list[i - 1] = cpu_to_le64(prp_list_dma);
                        i = 1;
                }
-               prp_list[i++] = cpu_to_le64(dma_addr);
-               dma_len -= NVME_CTRL_PAGE_SIZE;
-               dma_addr += NVME_CTRL_PAGE_SIZE;
-               length -= NVME_CTRL_PAGE_SIZE;
-               if (length <= 0)
-                       break;
-               if (dma_len > 0)
-                       continue;
-               if (unlikely(dma_len < 0))
-                       goto bad_sgl;
-               sg = sg_next(sg);
-               dma_addr = sg_dma_address(sg);
-               dma_len = sg_dma_len(sg);
        }
+
 done:
-       cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sgt.sgl));
-       cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
-       return BLK_STS_OK;
-free_prps:
-       nvme_free_descriptors(dev, req);
-       return BLK_STS_RESOURCE;
+       /*
+        * nvme_unmap_data uses the DPTR field in the SQE to tear down the
+        * mapping, so initialize it even for failures.
+        */
+       cmnd->dptr.prp1 = cpu_to_le64(prp1_dma);
+       cmnd->dptr.prp2 = cpu_to_le64(prp2_dma);
+       if (unlikely(iter->status))
+               nvme_unmap_data(dev, req);
+       return iter->status;
+
 bad_sgl:
-       WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents),
-                       "Invalid SGL for payload:%d nents:%d\n",
-                       blk_rq_payload_bytes(req), iod->sgt.nents);
+       dev_err_once(dev->dev,
+               "Incorrectly formed request for payload:%d nents:%d\n",
+               blk_rq_payload_bytes(req), blk_rq_nr_phys_segments(req));
        return BLK_STS_IOERR;
 }
 
 static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
-               struct scatterlist *sg)
+               struct blk_dma_iter *iter)
 {
-       sge->addr = cpu_to_le64(sg_dma_address(sg));
-       sge->length = cpu_to_le32(sg_dma_len(sg));
+       sge->addr = cpu_to_le64(iter->addr);
+       sge->length = cpu_to_le32(iter->len);
        sge->type = NVME_SGL_FMT_DATA_DESC << 4;
 }
 
@@ -701,21 +869,22 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
        sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
 }
 
-static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
-               struct request *req, struct nvme_rw_command *cmd)
+static blk_status_t nvme_pci_setup_data_sgl(struct nvme_dev *dev,
+               struct request *req, struct blk_dma_iter *iter)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       struct nvme_rw_command *cmd = &iod->cmd.rw;
+       unsigned int entries = blk_rq_nr_phys_segments(req);
        struct nvme_sgl_desc *sg_list;
-       struct scatterlist *sg = iod->sgt.sgl;
-       unsigned int entries = iod->sgt.nents;
        dma_addr_t sgl_dma;
-       int i = 0;
+       unsigned int mapped = 0;
 
-       /* setting the transfer type as SGL */
+       /* set the transfer type as SGL */
        cmd->flags = NVME_CMD_SGL_METABUF;
 
-       if (entries == 1) {
-               nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
+       if (entries == 1 || blk_rq_dma_map_coalesce(&iod->dma_state)) {
+               nvme_pci_sgl_set_data(&cmd->dptr.sgl, iter);
+               iod->total_len += iter->len;
                return BLK_STS_OK;
        }
 
@@ -726,168 +895,145 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
        if (!sg_list)
                return BLK_STS_RESOURCE;
        iod->descriptors[iod->nr_descriptors++] = sg_list;
-       iod->first_dma = sgl_dma;
 
-       nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
        do {
-               nvme_pci_sgl_set_data(&sg_list[i++], sg);
-               sg = sg_next(sg);
-       } while (--entries > 0);
+               if (WARN_ON_ONCE(mapped == entries)) {
+                       iter->status = BLK_STS_IOERR;
+                       break;
+               }
+               nvme_pci_sgl_set_data(&sg_list[mapped++], iter);
+               iod->total_len += iter->len;
+       } while (blk_rq_dma_map_iter_next(req, dev->dev, &iod->dma_state,
+                               iter));
 
-       return BLK_STS_OK;
+       nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, mapped);
+       if (unlikely(iter->status))
+               nvme_free_sgls(dev, req);
+       return iter->status;
 }
 
-static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
-               struct request *req, struct nvme_rw_command *cmnd,
-               struct bio_vec *bv)
+static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-       unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
-       unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
-
-       iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
-       if (dma_mapping_error(dev->dev, iod->first_dma))
-               return BLK_STS_RESOURCE;
-       iod->dma_len = bv->bv_len;
-
-       cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
-       if (bv->bv_len > first_prp_len)
-               cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
-       else
-               cmnd->dptr.prp2 = 0;
-       return BLK_STS_OK;
-}
+       enum nvme_use_sgl use_sgl = nvme_pci_use_sgls(dev, req);
+       struct blk_dma_iter iter;
+       blk_status_t ret;
 
-static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
-               struct request *req, struct nvme_rw_command *cmnd,
-               struct bio_vec *bv)
-{
-       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       if (blk_rq_nr_phys_segments(req) == 1) {
+               /*
+                * Try to skip the DMA iterator for single segment requests, as
+                * that significantly improves performance for workloads with
+                * small I/O sizes.
+                */
+               ret = nvme_pci_setup_data_simple(dev, req, use_sgl);
+               if (ret != BLK_STS_AGAIN)
+                       return ret;
+       }
 
-       iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
-       if (dma_mapping_error(dev->dev, iod->first_dma))
-               return BLK_STS_RESOURCE;
-       iod->dma_len = bv->bv_len;
+       if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
+               return iter.status;
 
-       cmnd->flags = NVME_CMD_SGL_METABUF;
-       cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
-       cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
-       cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
-       return BLK_STS_OK;
+       if (use_sgl == SGL_FORCED ||
+           (use_sgl == SGL_SUPPORTED &&
+            (!sgl_threshold || nvme_pci_avg_seg_size(req) < sgl_threshold)))
+               return nvme_pci_setup_data_sgl(dev, req, &iter);
+       return nvme_pci_setup_data_prp(dev, req, &iter);
 }
 
-static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
-               struct nvme_command *cmnd)
+static void nvme_unmap_metadata(struct nvme_dev *dev, struct request *req)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-       blk_status_t ret = BLK_STS_RESOURCE;
-       int rc;
+       struct nvme_sgl_desc *sg_list = iod->meta_descriptor;
+       enum dma_data_direction dir = rq_dma_dir(req);
+       dma_addr_t meta_dma = le64_to_cpu(iod->cmd.common.metadata);
 
-       if (blk_rq_nr_phys_segments(req) == 1) {
-               struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
-               struct bio_vec bv = req_bvec(req);
-
-               if (!is_pci_p2pdma_page(bv.bv_page)) {
-                       if (!nvme_pci_metadata_use_sgls(dev, req) &&
-                           (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
-                            bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
-                               return nvme_setup_prp_simple(dev, req,
-                                                            &cmnd->rw, &bv);
-
-                       if (nvmeq->qid && sgl_threshold &&
-                           nvme_ctrl_sgl_supported(&dev->ctrl))
-                               return nvme_setup_sgl_simple(dev, req,
-                                                            &cmnd->rw, &bv);
-               }
+       if (WARN_ON_ONCE(!iod->total_meta_len))
+               return;
+
+       /*
+        * If we are not using metadata SGLs, we must have a single input
+        * segment.
+        *
+        * Note that it would be nice to always use the linear buffer when using
+        * IOVA mappings and kernel buffers to avoid the SGL indirection, but
+        * that's left for a future optimization.
+        */
+       if (!(iod->cmd.common.flags & NVME_CMD_SGL_METASEG)) {
+               dma_unmap_page(dev->dev, meta_dma, iod->total_meta_len, dir);
+               return;
        }
 
-       iod->dma_len = 0;
-       iod->sgt.sgl = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
-       if (!iod->sgt.sgl)
-               return BLK_STS_RESOURCE;
-       sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req));
-       iod->sgt.orig_nents = blk_rq_map_sg(req, iod->sgt.sgl);
-       if (!iod->sgt.orig_nents)
-               goto out_free_sg;
+       if (blk_rq_dma_unmap(req, dev->dev, &iod->dma_meta_state,
+                             iod->total_meta_len)) {
+               ;
+       } else if (iod->flags & IOD_SINGLE_META_SGL) {
+               dma_unmap_page(dev->dev, le64_to_cpu(sg_list[0].addr),
+                              le32_to_cpu(sg_list[0].length), dir);
+       } else {
+               unsigned int nr_entries = le32_to_cpu(sg_list[0].length) /
+                               sizeof(*sg_list);
+               unsigned int i;
 
-       rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req),
-                            DMA_ATTR_NO_WARN);
-       if (rc) {
-               if (rc == -EREMOTEIO)
-                       ret = BLK_STS_TARGET;
-               goto out_free_sg;
+               for (i = 1; i <= nr_entries; i++)
+                       dma_unmap_page(dev->dev, le64_to_cpu(sg_list[i].addr),
+                                      le32_to_cpu(sg_list[i].length), dir);
        }
 
-       if (nvme_pci_use_sgls(dev, req, iod->sgt.nents))
-               ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
-       else
-               ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
-       if (ret != BLK_STS_OK)
-               goto out_unmap_sg;
-       return BLK_STS_OK;
-
-out_unmap_sg:
-       dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
-out_free_sg:
-       mempool_free(iod->sgt.sgl, dev->iod_mempool);
-       return ret;
+       dma_pool_free(dev->prp_small_pool, sg_list, meta_dma);
 }
 
 static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
                                             struct request *req)
 {
+       unsigned int entries = req->nr_integrity_segments;
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-       struct nvme_rw_command *cmnd = &iod->cmd.rw;
+       struct nvme_rw_command *cmd = &iod->cmd.rw;
        struct nvme_sgl_desc *sg_list;
-       struct scatterlist *sgl, *sg;
-       unsigned int entries;
+       struct blk_dma_iter iter;
+       unsigned int mapped = 0;
        dma_addr_t sgl_dma;
-       int rc, i;
-
-       iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC);
-       if (!iod->meta_sgt.sgl)
-               return BLK_STS_RESOURCE;
 
-       sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments);
-       iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req,
-                                                          iod->meta_sgt.sgl);
-       if (!iod->meta_sgt.orig_nents)
-               goto out_free_sg;
-
-       rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req),
-                            DMA_ATTR_NO_WARN);
-       if (rc)
-               goto out_free_sg;
+       /*
+        * XXX: this needs to use different helpers iterating over the
+        * metadata.
+        */
+       if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_meta_state,
+                                      &iter))
+               return iter.status;
 
        sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma);
        if (!sg_list)
-               goto out_unmap_sg;
-
-       entries = iod->meta_sgt.nents;
+               return BLK_STS_RESOURCE;
        iod->meta_descriptor = sg_list;
-       iod->meta_dma = sgl_dma;
 
-       cmnd->flags = NVME_CMD_SGL_METASEG;
-       cmnd->metadata = cpu_to_le64(sgl_dma);
+       cmd->flags = NVME_CMD_SGL_METASEG;
+       cmd->metadata = cpu_to_le64(sgl_dma);
 
-       sgl = iod->meta_sgt.sgl;
-       if (entries == 1) {
-               nvme_pci_sgl_set_data(sg_list, sgl);
+       if (entries == 1 || blk_rq_dma_map_coalesce(&iod->dma_meta_state)) {
+               nvme_pci_sgl_set_data(sg_list, &iter);
+               iod->total_meta_len += iter.len;
+               iod->flags |= IOD_SINGLE_META_SGL;
                return BLK_STS_OK;
        }
 
-       sgl_dma += sizeof(*sg_list);
-       nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries);
-       for_each_sg(sgl, sg, entries, i)
-               nvme_pci_sgl_set_data(&sg_list[i + 1], sg);
 
-       return BLK_STS_OK;
+       do {
+               if (WARN_ON_ONCE(mapped == entries)) {
+                       iter.status = BLK_STS_IOERR;
+                       break;
+               }
+               nvme_pci_sgl_set_data(&sg_list[++mapped], &iter);
+               iod->total_meta_len += iter.len;
+       } while (blk_rq_dma_map_iter_next(req, dev->dev, &iod->dma_meta_state,
+                                &iter));
+       nvme_pci_sgl_set_seg(&sg_list[0], sgl_dma + sizeof(*sg_list), mapped);
 
-out_unmap_sg:
-       dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
-out_free_sg:
-       mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
-       return BLK_STS_RESOURCE;
+       if (unlikely(iter.status))
+               nvme_unmap_metadata(dev, req);
+       return iter.status;
 }
 
 static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev,
@@ -896,11 +1042,14 @@ static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev,
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct bio_vec bv = rq_integrity_vec(req);
        struct nvme_command *cmnd = &iod->cmd;
+       dma_addr_t meta_dma;
 
-       iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
-       if (dma_mapping_error(dev->dev, iod->meta_dma))
+       meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
+       if (dma_mapping_error(dev->dev, meta_dma))
                return BLK_STS_IOERR;
-       cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
+       iod->total_meta_len = bv.bv_len;
+       cmnd->rw.metadata = cpu_to_le64(meta_dma);
        return BLK_STS_OK;
 }
 
@@ -909,7 +1058,7 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req)
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 
        if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) &&
-           nvme_pci_metadata_use_sgls(dev, req))
+           nvme_pci_metadata_use_sgls(req))
                return nvme_pci_setup_meta_sgls(dev, req);
        return nvme_pci_setup_meta_mptr(dev, req);
 }
@@ -921,15 +1070,15 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
 
        iod->flags = 0;
        iod->nr_descriptors = 0;
-       iod->sgt.nents = 0;
-       iod->meta_sgt.nents = 0;
+       iod->total_len = 0;
+       iod->total_meta_len = 0;
 
        ret = nvme_setup_cmd(req->q->queuedata, req);
        if (ret)
                return ret;
 
        if (blk_rq_nr_phys_segments(req)) {
-               ret = nvme_map_data(dev, req, &iod->cmd);
+               ret = nvme_map_data(dev, req);
                if (ret)
                        goto out_free_cmd;
        }
@@ -1033,23 +1182,6 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
        *rqlist = requeue_list;
 }
 
-static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
-                                               struct request *req)
-{
-       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-
-       if (!iod->meta_sgt.nents) {
-               dma_unmap_page(dev->dev, iod->meta_dma,
-                              rq_integrity_vec(req).bv_len,
-                              rq_dma_dir(req));
-               return;
-       }
-
-       dma_pool_free(dev->prp_small_pool, iod->meta_descriptor, iod->meta_dma);
-       dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
-       mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
-}
-
 static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
        struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
@@ -2866,31 +2998,6 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
        dma_pool_destroy(dev->prp_small_pool);
 }
 
-static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
-{
-       size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
-       size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS;
-
-       dev->iod_mempool = mempool_create_node(1,
-                       mempool_kmalloc, mempool_kfree,
-                       (void *)alloc_size, GFP_KERNEL,
-                       dev_to_node(dev->dev));
-       if (!dev->iod_mempool)
-               return -ENOMEM;
-
-       dev->iod_meta_mempool = mempool_create_node(1,
-                       mempool_kmalloc, mempool_kfree,
-                       (void *)meta_size, GFP_KERNEL,
-                       dev_to_node(dev->dev));
-       if (!dev->iod_meta_mempool)
-               goto free;
-
-       return 0;
-free:
-       mempool_destroy(dev->iod_mempool);
-       return -ENOMEM;
-}
-
 static void nvme_free_tagset(struct nvme_dev *dev)
 {
        if (dev->tagset.tags)
@@ -3259,15 +3366,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        if (result)
                goto out_dev_unmap;
 
-       result = nvme_pci_alloc_iod_mempool(dev);
-       if (result)
-               goto out_release_prp_pools;
-
        dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
        result = nvme_pci_enable(dev);
        if (result)
-               goto out_release_iod_mempool;
+               goto out_release_prp_pools;
 
        result = nvme_alloc_admin_tag_set(&dev->ctrl, &dev->admin_tagset,
                                &nvme_mq_admin_ops, sizeof(struct nvme_iod));
@@ -3334,9 +3437,6 @@ out_disable:
        nvme_dev_remove_admin(dev);
        nvme_dbbuf_dma_free(dev);
        nvme_free_queues(dev, 0);
-out_release_iod_mempool:
-       mempool_destroy(dev->iod_mempool);
-       mempool_destroy(dev->iod_meta_mempool);
 out_release_prp_pools:
        nvme_release_prp_pools(dev);
 out_dev_unmap:
@@ -3401,8 +3501,6 @@ static void nvme_remove(struct pci_dev *pdev)
        nvme_dev_remove_admin(dev);
        nvme_dbbuf_dma_free(dev);
        nvme_free_queues(dev, 0);
-       mempool_destroy(dev->iod_mempool);
-       mempool_destroy(dev->iod_meta_mempool);
        nvme_release_prp_pools(dev);
        nvme_dev_unmap(dev);
        nvme_uninit_ctrl(&dev->ctrl);