 module_param(tx_spare_buf_size, uint, 0400);
 MODULE_PARM_DESC(tx_spare_buf_size, "Size used to allocate tx spare buffer");
 
+static unsigned int tx_sgl = 1;
+module_param(tx_sgl, uint, 0600);
+MODULE_PARM_DESC(tx_sgl, "Minimum number of frags when using dma_map_sg() to optimize the IOMMU mapping");
+
+#define HNS3_SGL_SIZE(nfrag)   (sizeof(struct scatterlist) * (nfrag) + \
+                                sizeof(struct sg_table))
+#define HNS3_MAX_SGL_SIZE      ALIGN(HNS3_SGL_SIZE(HNS3_MAX_TSO_BD_NUM),\
+                                     dma_get_cache_alignment())
+
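As a rough illustration of the sizing above, the standalone sketch below mirrors the HNS3_SGL_SIZE()/HNS3_MAX_SGL_SIZE math; the struct layouts, the 63-BD TSO limit and the 64-byte cache alignment are illustrative assumptions, not the driver's exact values. Since tx_sgl is registered with 0600 permissions, it should also be tunable at runtime through the usual /sys/module/hns3/parameters/tx_sgl entry.

/* Standalone sketch of the per-skb sg metadata sizing; struct sizes,
 * the TSO BD limit and the cache alignment are assumptions for
 * illustration only.
 */
#include <stdio.h>
#include <stddef.h>

struct demo_scatterlist {
	unsigned long page_link;
	unsigned int offset;
	unsigned int length;
	unsigned long long dma_address;
};

struct demo_sg_table {
	struct demo_scatterlist *sgl;
	unsigned int nents;
	unsigned int orig_nents;
};

#define DEMO_MAX_TSO_BD_NUM	63	/* assumed HNS3_MAX_TSO_BD_NUM */
#define DEMO_CACHE_ALIGN	64	/* assumed dma_get_cache_alignment() */

#define DEMO_ALIGN(x, a)	(((x) + (a) - 1) / (a) * (a))
#define DEMO_SGL_SIZE(nfrag)	(sizeof(struct demo_scatterlist) * (nfrag) + \
				 sizeof(struct demo_sg_table))

int main(void)
{
	size_t worst = DEMO_ALIGN(DEMO_SGL_SIZE(DEMO_MAX_TSO_BD_NUM),
				  DEMO_CACHE_ALIGN);

	/* This is the per-skb metadata that hns3_can_use_tx_sgl() requires
	 * to be free in the tx spare buffer before taking the sg path.
	 */
	printf("sg metadata per skb: %zu bytes\n", worst);
	return 0;
}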
 #define DEFAULT_MSG_LEVEL (NETIF_MSG_PROBE | NETIF_MSG_LINK | \
                           NETIF_MSG_IFDOWN | NETIF_MSG_IFUP)
 
        return true;
 }
 
+static bool hns3_can_use_tx_sgl(struct hns3_enet_ring *ring,
+                               struct sk_buff *skb,
+                               u32 space)
+{
+       if (skb->len <= ring->tx_copybreak || !tx_sgl ||
+           (!skb_has_frag_list(skb) &&
+            skb_shinfo(skb)->nr_frags < tx_sgl))
+               return false;
+
+       if (space < HNS3_MAX_SGL_SIZE) {
+               u64_stats_update_begin(&ring->syncp);
+               ring->stats.tx_spare_full++;
+               u64_stats_update_end(&ring->syncp);
+               return false;
+       }
+
+       return true;
+}
+
 static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring)
 {
        struct hns3_tx_spare *tx_spare;
 
        /* This tx spare buffer is only really reclaimed after calling
         * hns3_tx_spare_update(), so it is still safe to use the info in
-        * the tx buffer to do the dma sync after tx_spare->next_to_clean
-        * is moved forword.
+        * the tx buffer to do the dma sync or sg unmapping after
+        * tx_spare->next_to_clean is moved forward.
         */
        if (cb->type & (DESC_TYPE_BOUNCE_HEAD | DESC_TYPE_BOUNCE_ALL)) {
                dma_addr_t dma = tx_spare->dma + ntc;
 
                dma_sync_single_for_cpu(ring_to_dev(ring), dma, len,
                                        DMA_TO_DEVICE);
+       } else {
+               struct sg_table *sgt = tx_spare->buf + ntc;
+
+               dma_unmap_sg(ring_to_dev(ring), sgt->sgl, sgt->orig_nents,
+                            DMA_TO_DEVICE);
        }
 }
 
        return bd_num;
 }
 
+static int hns3_handle_tx_sgl(struct hns3_enet_ring *ring,
+                             struct sk_buff *skb)
+{
+       struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use];
+       u32 nfrag = skb_shinfo(skb)->nr_frags + 1;
+       struct sg_table *sgt;
+       int i, bd_num = 0;
+       dma_addr_t dma;
+       u32 cb_len;
+       int nents;
+
+       if (skb_has_frag_list(skb))
+               nfrag = HNS3_MAX_TSO_BD_NUM;
+
+       /* hns3_can_use_tx_sgl() has already checked the spare space,
+        * so hns3_tx_spare_alloc() below is guaranteed to return a buffer.
+        */
+       sgt = hns3_tx_spare_alloc(ring, HNS3_SGL_SIZE(nfrag),
+                                 &dma, &cb_len);
+
+       /* the scatterlist array is laid out right after the sg_table */
+       sgt->sgl = (struct scatterlist *)(sgt + 1);
+       sg_init_table(sgt->sgl, nfrag);
+       nents = skb_to_sgvec(skb, sgt->sgl, 0, skb->len);
+       if (unlikely(nents < 0)) {
+               hns3_tx_spare_rollback(ring, cb_len);
+               u64_stats_update_begin(&ring->syncp);
+               ring->stats.skb2sgl_err++;
+               u64_stats_update_end(&ring->syncp);
+               return -ENOMEM;
+       }
+
+       sgt->orig_nents = nents;
+       sgt->nents = dma_map_sg(ring_to_dev(ring), sgt->sgl, sgt->orig_nents,
+                               DMA_TO_DEVICE);
+       if (unlikely(!sgt->nents)) {
+               hns3_tx_spare_rollback(ring, cb_len);
+               u64_stats_update_begin(&ring->syncp);
+               ring->stats.map_sg_err++;
+               u64_stats_update_end(&ring->syncp);
+               return -ENOMEM;
+       }
+
+       desc_cb->priv = skb;
+       desc_cb->length = cb_len;
+       desc_cb->dma = dma;
+       desc_cb->type = DESC_TYPE_SGL_SKB;
+
+       for (i = 0; i < sgt->nents; i++)
+               bd_num += hns3_fill_desc(ring, sg_dma_address(sgt->sgl + i),
+                                        sg_dma_len(sgt->sgl + i));
+
+       u64_stats_update_begin(&ring->syncp);
+       ring->stats.tx_sgl++;
+       u64_stats_update_end(&ring->syncp);
+
+       return bd_num;
+}
+
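For readers less familiar with the scatter-gather DMA API, the mapping sequence inside hns3_handle_tx_sgl() boils down to the generic pattern sketched below; the helper name and the dev/max_frags parameters are placeholders of mine, not hns3 identifiers, and the driver's statistics and rollback handling are omitted.

/* Generic sketch of the skb -> scatterlist -> DMA mapping steps used by
 * the sg path above; names are placeholders, not hns3 code.
 */
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/dma-mapping.h>

static int demo_map_skb_as_sgl(struct device *dev, struct sk_buff *skb,
			       struct sg_table *sgt, unsigned int max_frags)
{
	int nents;

	/* The scatterlist array is assumed to sit right behind the
	 * sg_table, exactly as hns3_handle_tx_sgl() lays it out.
	 */
	sgt->sgl = (struct scatterlist *)(sgt + 1);
	sg_init_table(sgt->sgl, max_frags);

	/* Translate the linear part and every fragment into sg entries. */
	nents = skb_to_sgvec(skb, sgt->sgl, 0, skb->len);
	if (nents < 0)
		return -ENOMEM;

	sgt->orig_nents = nents;

	/* One dma_map_sg() call maps all entries; with an IOMMU adjacent
	 * entries may be merged, so sgt->nents can end up smaller than
	 * orig_nents.
	 */
	sgt->nents = dma_map_sg(dev, sgt->sgl, sgt->orig_nents,
				DMA_TO_DEVICE);
	if (!sgt->nents)
		return -ENOMEM;

	/* The caller then walks sgt->nents mapped segments with
	 * sg_dma_address()/sg_dma_len() to fill tx descriptors, and later
	 * calls dma_unmap_sg() with orig_nents when the skb is reclaimed.
	 */
	return 0;
}

Note the design choice this illustrates: unlike the bounce path, the sg path copies no payload into the spare buffer; the spare buffer only holds the sg_table plus scatterlist metadata, while the skb data is mapped in place.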
 static int hns3_handle_desc_filling(struct hns3_enet_ring *ring,
                                    struct sk_buff *skb)
 {
 
        space = hns3_tx_spare_space(ring);
 
+       if (hns3_can_use_tx_sgl(ring, skb, space))
+               return hns3_handle_tx_sgl(ring, skb);
+
        if (hns3_can_use_tx_bounce(ring, skb, space))
                return hns3_handle_tx_bounce(ring, skb);
 
                        tx_drop += ring->stats.over_max_recursion;
                        tx_drop += ring->stats.hw_limitation;
                        tx_drop += ring->stats.copy_bits_err;
+                       tx_drop += ring->stats.skb2sgl_err;
+                       tx_drop += ring->stats.map_sg_err;
                        tx_errors += ring->stats.sw_err_cnt;
                        tx_errors += ring->stats.tx_vlan_err;
                        tx_errors += ring->stats.tx_l4_proto_err;
                        tx_errors += ring->stats.over_max_recursion;
                        tx_errors += ring->stats.hw_limitation;
                        tx_errors += ring->stats.copy_bits_err;
+                       tx_errors += ring->stats.skb2sgl_err;
+                       tx_errors += ring->stats.map_sg_err;
                } while (u64_stats_fetch_retry_irq(&ring->syncp, start));
 
                /* fetch the rx stats */
                             struct hns3_desc_cb *cb, int budget)
 {
        if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_HEAD |
-                       DESC_TYPE_BOUNCE_ALL))
+                       DESC_TYPE_BOUNCE_ALL | DESC_TYPE_SGL_SKB))
                napi_consume_skb(cb->priv, budget);
        else if (!HNAE3_IS_TX_RING(ring) && cb->pagecnt_bias)
                __page_frag_cache_drain(cb->priv, cb->pagecnt_bias);
        else if ((cb->type & DESC_TYPE_PAGE) && cb->length)
                dma_unmap_page(ring_to_dev(ring), cb->dma, cb->length,
                               ring_to_dma_dir(ring));
-       else if (cb->type & (DESC_TYPE_BOUNCE_ALL | DESC_TYPE_BOUNCE_HEAD))
+       else if (cb->type & (DESC_TYPE_BOUNCE_ALL | DESC_TYPE_BOUNCE_HEAD |
+                            DESC_TYPE_SGL_SKB))
                hns3_tx_spare_reclaim_cb(ring, cb);
 }
 
                desc_cb = &ring->desc_cb[ntc];
 
                if (desc_cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_ALL |
-                                    DESC_TYPE_BOUNCE_HEAD)) {
+                                    DESC_TYPE_BOUNCE_HEAD |
+                                    DESC_TYPE_SGL_SKB)) {
                        (*pkts)++;
                        (*bytes) += desc_cb->send_bytes;
                }