static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi)
 {
        struct device *dev = ice_pf_to_dev(vsi->back);
-       int i;
+       struct ice_tx_desc *tx_desc;
+       int i, j;
 
        for (i = 0; i < vsi->num_xdp_txq; i++) {
                u16 xdp_q_idx = vsi->alloc_txq + i;
                xdp_ring->reg_idx = vsi->txq_map[xdp_q_idx];
                xdp_ring->vsi = vsi;
                xdp_ring->netdev = NULL;
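+               /* next_rs/next_dd track where the next RS bit will be set
+                * and where the matching DD writeback is expected
+                */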
+               xdp_ring->next_dd = ICE_TX_THRESH - 1;
+               xdp_ring->next_rs = ICE_TX_THRESH - 1;
                xdp_ring->dev = dev;
                xdp_ring->count = vsi->num_tx_desc;
                WRITE_ONCE(vsi->xdp_rings[i], xdp_ring);
                if (ice_setup_tx_ring(xdp_ring))
                        goto free_xdp_rings;
                ice_set_ring_xdp(xdp_ring);
                xdp_ring->xsk_pool = ice_tx_xsk_pool(xdp_ring);
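+               /* prime every descriptor with the DD bit set */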
+               for (j = 0; j < xdp_ring->count; j++) {
+                       tx_desc = ICE_TX_DESC(xdp_ring, j);
+                       tx_desc->cmd_type_offset_bsz = cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE);
+               }
        }
 
        ice_for_each_rxq(vsi, i)
 
                total_bytes += tx_buf->bytecount;
                total_pkts += tx_buf->gso_segs;
 
-               if (ice_ring_is_xdp(tx_ring))
-                       page_frag_free(tx_buf->raw_buf);
-               else
-                       /* free the skb */
-                       napi_consume_skb(tx_buf->skb, napi_budget);
+               /* free the skb */
+               napi_consume_skb(tx_buf->skb, napi_budget);
 
                /* unmap skb header data */
                dma_unmap_single(tx_ring->dev,
 
        ice_update_tx_ring_stats(tx_ring, total_pkts, total_bytes);
 
-       if (ice_ring_is_xdp(tx_ring))
-               return !!budget;
-
        netdev_tx_completed_queue(txring_txq(tx_ring), total_pkts,
                                  total_bytes);
 
         * budget and be more aggressive about cleaning up the Tx descriptors.
         */
        ice_for_each_tx_ring(tx_ring, q_vector->tx) {
-               bool wd = tx_ring->xsk_pool ?
-                         ice_clean_tx_irq_zc(tx_ring, budget) :
-                         ice_clean_tx_irq(tx_ring, budget);
+               bool wd;
+
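+               /* XDP Tx rings are cleaned from the transmit path, so
+                * report them as done here
+                */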
+               if (tx_ring->xsk_pool)
+                       wd = ice_clean_tx_irq_zc(tx_ring, budget);
+               else if (ice_ring_is_xdp(tx_ring))
+                       wd = true;
+               else
+                       wd = ice_clean_tx_irq(tx_ring, budget);
 
                if (!wd)
                        clean_complete = false;
 
 #define ICE_MAX_CHAINED_RX_BUFS        5
 #define ICE_MAX_BUF_TXD                8
 #define ICE_MIN_TX_LEN         17
+#define ICE_TX_THRESH          32
 
 /* The size limit for a transmit buffer in a descriptor is (16K - 1).
  * In order to align with the read requests we will align the value to
        struct ice_vsi *vsi;            /* Backreference to associated VSI */
        /* CL2 - 2nd cacheline starts here */
        dma_addr_t dma;                 /* physical address of ring */
+       struct xsk_buff_pool *xsk_pool;
        u16 next_to_use;
        u16 next_to_clean;
+       u16 next_rs;                    /* next descriptor to set RS bit on */
+       u16 next_dd;                    /* next descriptor expected to have DD set */
+       u16 q_handle;                   /* Queue handle per TC */
+       u16 reg_idx;                    /* HW register index of the ring */
        u16 count;                      /* Number of descriptors */
        u16 q_index;                    /* Queue number of ring */
-       struct xsk_buff_pool *xsk_pool;
-
        /* stats structs */
        struct ice_q_stats      stats;
        struct u64_stats_sync syncp;
        DECLARE_BITMAP(xps_state, ICE_TX_NBITS);        /* XPS Config State */
        struct ice_ptp_tx *tx_tstamps;
        u32 txq_teid;                   /* Added Tx queue TEID */
-       u16 q_handle;                   /* Queue handle per TC */
-       u16 reg_idx;                    /* HW register index of the ring */
 #define ICE_TX_FLAGS_RING_XDP          BIT(0)
        u8 flags;
        u8 dcb_tc;                      /* Traffic class of ring */
 
 
 #include "ice_txrx_lib.h"
 #include "ice_eswitch.h"
+#include "ice_lib.h"
 
 /**
  * ice_release_rx_desc - Store the new tail and head values
        napi_gro_receive(&rx_ring->q_vector->napi, skb);
 }
 
+/**
+ * ice_clean_xdp_irq - Reclaim resources after transmit completes on XDP ring
+ * @xdp_ring: XDP ring to clean
+ */
+static void ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
+{
+       unsigned int total_bytes = 0, total_pkts = 0;
+       u16 ntc = xdp_ring->next_to_clean;
+       struct ice_tx_desc *next_dd_desc;
+       u16 next_dd = xdp_ring->next_dd;
+       struct ice_tx_buf *tx_buf;
+       int i;
+
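+       /* nothing to do until HW has written back the DD bit for the
+        * descriptor that closes the oldest batch
+        */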
+       next_dd_desc = ICE_TX_DESC(xdp_ring, next_dd);
+       if (!(next_dd_desc->cmd_type_offset_bsz &
+           cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE)))
+               return;
+
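+       /* a full batch of ICE_TX_THRESH descriptors has completed; free and
+        * unmap the associated buffers
+        */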
+       for (i = 0; i < ICE_TX_THRESH; i++) {
+               tx_buf = &xdp_ring->tx_buf[ntc];
+
+               total_bytes += tx_buf->bytecount;
+               /* tx_buf->gso_segs is normally used here, but for XDP
+                * frames it is always 1
+                */
+               total_pkts++;
+
+               page_frag_free(tx_buf->raw_buf);
+               dma_unmap_single(xdp_ring->dev, dma_unmap_addr(tx_buf, dma),
+                                dma_unmap_len(tx_buf, len), DMA_TO_DEVICE);
+               dma_unmap_len_set(tx_buf, len, 0);
+               tx_buf->raw_buf = NULL;
+
+               ntc++;
+               if (ntc >= xdp_ring->count)
+                       ntc = 0;
+       }
+
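+       /* clear the DD marker and advance next_dd by one threshold, wrapping
+        * it back to the first batch boundary when it runs past the ring end
+        */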
+       next_dd_desc->cmd_type_offset_bsz = 0;
+       xdp_ring->next_dd += ICE_TX_THRESH;
+       if (xdp_ring->next_dd > xdp_ring->count)
+               xdp_ring->next_dd = ICE_TX_THRESH - 1;
+       xdp_ring->next_to_clean = ntc;
+       ice_update_tx_ring_stats(xdp_ring, total_pkts, total_bytes);
+}
+
 /**
  * ice_xmit_xdp_ring - submit single packet to XDP ring for transmission
  * @data: packet data pointer
        struct ice_tx_buf *tx_buf;
        dma_addr_t dma;
 
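+       /* reclaim completed descriptors once the number of free ones drops
+        * below the threshold
+        */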
+       if (ICE_DESC_UNUSED(xdp_ring) < ICE_TX_THRESH)
+               ice_clean_xdp_irq(xdp_ring);
+
        if (!unlikely(ICE_DESC_UNUSED(xdp_ring))) {
                xdp_ring->tx_stats.tx_busy++;
                return ICE_XDP_CONSUMED;
 
        tx_desc = ICE_TX_DESC(xdp_ring, i);
        tx_desc->buf_addr = cpu_to_le64(dma);
-       tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TXD_LAST_DESC_CMD, 0,
+       tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TX_DESC_CMD_EOP, 0,
                                                      size, 0);
 
-       /* Make certain all of the status bits have been updated
-        * before next_to_watch is written.
-        */
-       smp_wmb();
-
        i++;
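+       /* on ring wrap, set RS on the descriptor at next_rs and restart the
+        * RS cursor at the first batch boundary
+        */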
-       if (i == xdp_ring->count)
+       if (i == xdp_ring->count) {
                i = 0;
-
-       tx_buf->next_to_watch = tx_desc;
+               tx_desc = ICE_TX_DESC(xdp_ring, xdp_ring->next_rs);
+               tx_desc->cmd_type_offset_bsz |=
+                       cpu_to_le64(ICE_TX_DESC_CMD_RS << ICE_TXD_QW1_CMD_S);
+               xdp_ring->next_rs = ICE_TX_THRESH - 1;
+       }
        xdp_ring->next_to_use = i;
 
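+       /* once the producer has moved past next_rs, request a writeback (RS)
+        * for that batch and push next_rs one threshold further
+        */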
+       if (i > xdp_ring->next_rs) {
+               tx_desc = ICE_TX_DESC(xdp_ring, xdp_ring->next_rs);
+               tx_desc->cmd_type_offset_bsz |=
+                       cpu_to_le64(ICE_TX_DESC_CMD_RS << ICE_TXD_QW1_CMD_S);
+               xdp_ring->next_rs += ICE_TX_THRESH;
+       }
+
        return ICE_XDP_TX;
 }