}
 }
 
+static void fbnic_clean_twq1(struct fbnic_napi_vector *nv, bool pp_allow_direct,
+                            struct fbnic_ring *ring, bool discard,
+                            unsigned int hw_head)
+{
+       u64 total_bytes = 0, total_packets = 0;
+       unsigned int head = ring->head;
+
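+       /* Walk from the driver's head to the head reported by hardware,
+        * returning each completed XDP buffer to the page pool. Non-AL
+        * descriptors carry no address/length, so they hold no buffer.
+        */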
+       while (hw_head != head) {
+               struct page *page;
+               u64 twd;
+
+               if (unlikely(!(ring->desc[head] & FBNIC_TWD_TYPE(AL))))
+                       goto next_desc;
+
+               twd = le64_to_cpu(ring->desc[head]);
+               page = ring->tx_buf[head];
+
+               total_bytes += FIELD_GET(FBNIC_TWD_LEN_MASK, twd);
+               /* Each XDP_TX frame occupies a single AL descriptor */
+               total_packets++;
+
+               page_pool_put_page(nv->page_pool, page, -1, pp_allow_direct);
+next_desc:
+               head++;
+               head &= ring->size_mask;
+       }
+
+       if (!total_bytes)
+               return;
+
+       ring->head = head;
+
+       /* On discard (ring teardown) count the flushed frames as dropped */
+       if (discard) {
+               u64_stats_update_begin(&ring->stats.syncp);
+               ring->stats.dropped += total_packets;
+               u64_stats_update_end(&ring->stats.syncp);
+               return;
+       }
+
+       u64_stats_update_begin(&ring->stats.syncp);
+       ring->stats.bytes += total_bytes;
+       ring->stats.packets += total_packets;
+       u64_stats_update_end(&ring->stats.syncp);
+}
+
 static void fbnic_clean_tsq(struct fbnic_napi_vector *nv,
                            struct fbnic_ring *ring,
                            u64 tcd, int *ts_head, int *head0)
 }
 
 static void fbnic_clean_twq(struct fbnic_napi_vector *nv, int napi_budget,
-                           struct fbnic_q_triad *qt, s32 ts_head, s32 head0)
+                           struct fbnic_q_triad *qt, s32 ts_head, s32 head0,
+                           s32 head1)
 {
        if (head0 >= 0)
                fbnic_clean_twq0(nv, napi_budget, &qt->sub0, false, head0);
        else if (ts_head >= 0)
                fbnic_clean_twq0(nv, napi_budget, &qt->sub0, false, ts_head);
+
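+       /* Recycling pages directly to the page pool cache is only safe in
+        * NAPI context. With no budget (e.g. netpoll) record head1 and
+        * defer the TWQ1 cleanup to the next NAPI poll.
+        */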
+       if (head1 >= 0) {
+               qt->cmpl.deferred_head = -1;
+               if (napi_budget)
+                       fbnic_clean_twq1(nv, true, &qt->sub1, false, head1);
+               else
+                       qt->cmpl.deferred_head = head1;
+       }
 }
 
 static void
                int napi_budget)
 {
        struct fbnic_ring *cmpl = &qt->cmpl;
+       s32 head1 = cmpl->deferred_head;
        s32 head0 = -1, ts_head = -1;
        __le64 *raw_tcd, done;
        u32 head = cmpl->head;
 
                switch (FIELD_GET(FBNIC_TCD_TYPE_MASK, tcd)) {
                case FBNIC_TCD_TYPE_0:
-                       if (!(tcd & FBNIC_TCD_TWQ1))
+                       if (tcd & FBNIC_TCD_TWQ1)
+                               head1 = FIELD_GET(FBNIC_TCD_TYPE0_HEAD1_MASK,
+                                                 tcd);
+                       else
                                head0 = FIELD_GET(FBNIC_TCD_TYPE0_HEAD0_MASK,
                                                  tcd);
                        /* Currently all err status bits are related to
        }
 
        /* Unmap and free processed buffers */
-       fbnic_clean_twq(nv, napi_budget, qt, ts_head, head0);
+       fbnic_clean_twq(nv, napi_budget, qt, ts_head, head0, head1);
 }
 
 static void fbnic_clean_bdq(struct fbnic_napi_vector *nv, int napi_budget,
        fbn->tx[txr->q_idx] = NULL;
 }
 
+static void fbnic_remove_xdp_ring(struct fbnic_net *fbn,
+                                 struct fbnic_ring *xdpr)
+{
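+       /* Only XDP rings registered with the netdev carry the stats flag */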
+       if (!(xdpr->flags & FBNIC_RING_F_STATS))
+               return;
+
+       /* Remove pointer to the XDP ring */
+       WARN_ON(fbn->tx[xdpr->q_idx] && fbn->tx[xdpr->q_idx] != xdpr);
+       fbn->tx[xdpr->q_idx] = NULL;
+}
+
 static void fbnic_remove_rx_ring(struct fbnic_net *fbn,
                                 struct fbnic_ring *rxr)
 {
 
        for (i = 0; i < nv->txt_count; i++) {
                fbnic_remove_tx_ring(fbn, &nv->qt[i].sub0);
+               fbnic_remove_xdp_ring(fbn, &nv->qt[i].sub1);
                fbnic_remove_tx_ring(fbn, &nv->qt[i].cmpl);
        }
 
        ring->doorbell = doorbell;
        ring->q_idx = q_idx;
        ring->flags = flags;
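+       /* -1 indicates no deferred TWQ1 completion work is pending */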
+       ring->deferred_head = -1;
 }
 
 static int fbnic_alloc_napi_vector(struct fbnic_dev *fbd, struct fbnic_net *fbn,
 {
        int txt_count = txq_count, rxt_count = rxq_count;
        u32 __iomem *uc_addr = fbd->uc_addr0;
+       int xdp_count = 0, qt_count, err;
        struct fbnic_napi_vector *nv;
        struct fbnic_q_triad *qt;
-       int qt_count, err;
        u32 __iomem *db;
 
+       /* Reserve a Tx Queue Triad to host the XDP ring (TWQ1); even an
+        * Rx-only NAPI vector needs one Tx triad for its XDP queue.
+        */
+       if (rxq_count) {
+               xdp_count = 1;
+               if (!txt_count)
+                       txt_count = 1;
+       }
+
        qt_count = txt_count + rxq_count;
        if (!qt_count)
                return -EINVAL;
        qt = nv->qt;
 
        while (txt_count) {
+               u8 flags = FBNIC_RING_F_CTX | FBNIC_RING_F_STATS;
+
                /* Configure Tx queue */
                db = &uc_addr[FBNIC_QUEUE(txq_idx) + FBNIC_QUEUE_TWQ0_TAIL];
 
                /* Assign Tx queue to netdev if applicable */
                if (txq_count > 0) {
-                       u8 flags = FBNIC_RING_F_CTX | FBNIC_RING_F_STATS;
-
                        fbnic_ring_init(&qt->sub0, db, txq_idx, flags);
                        fbn->tx[txq_idx] = &qt->sub0;
                                        FBNIC_RING_F_DISABLED);
                }
 
+               /* Configure XDP queue */
+               db = &uc_addr[FBNIC_QUEUE(txq_idx) + FBNIC_QUEUE_TWQ1_TAIL];
+
+               /* Assign XDP queue to netdev if applicable
+                *
+                * The setup differs from a regular Tx queue:
+                * 1. We only need one XDP Tx queue per NAPI vector.
+                * 2. We associate it with the first Rx queue index.
+                * 3. The hardware side is addressed by the Tx queue index.
+                * 4. The netdev queue index is offset by FBNIC_MAX_TXQS.
+                */
+               if (xdp_count > 0) {
+                       unsigned int xdp_idx = FBNIC_MAX_TXQS + rxq_idx;
+
+                       fbnic_ring_init(&qt->sub1, db, xdp_idx, flags);
+                       fbn->tx[xdp_idx] = &qt->sub1;
+                       xdp_count--;
+               } else {
+                       fbnic_ring_init(&qt->sub1, db, 0,
+                                       FBNIC_RING_F_DISABLED);
+               }
+
                /* Configure Tx completion queue */
                db = &uc_addr[FBNIC_QUEUE(txq_idx) + FBNIC_QUEUE_TCQ_HEAD];
                fbnic_ring_init(&qt->cmpl, db, 0, 0);
                qt--;
 
                fbnic_remove_tx_ring(fbn, &qt->sub0);
+               fbnic_remove_xdp_ring(fbn, &qt->sub1);
                fbnic_remove_tx_ring(fbn, &qt->cmpl);
 
                txt_count++;
        if (err)
                return err;
 
+       err = fbnic_alloc_tx_ring_resources(fbn, &qt->sub1);
+       if (err)
+               goto free_sub0;
+
        err = fbnic_alloc_tx_ring_resources(fbn, &qt->cmpl);
        if (err)
                goto free_sub1;
        return 0;
 
 free_sub1:
+       fbnic_free_ring_resources(dev, &qt->sub1);
+free_sub0:
        fbnic_free_ring_resources(dev, &qt->sub0);
        return err;
 }
        fbnic_ring_wr32(txr, FBNIC_QUEUE_TWQ0_CTL, twq_ctl);
 }
 
+static void fbnic_disable_twq1(struct fbnic_ring *txr)
+{
+       u32 twq_ctl = fbnic_ring_rd32(txr, FBNIC_QUEUE_TWQ1_CTL);
+
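+       /* Clear only the enable bit so other TWQ1 control state is kept */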
+       twq_ctl &= ~FBNIC_QUEUE_TWQ_CTL_ENABLE;
+
+       fbnic_ring_wr32(txr, FBNIC_QUEUE_TWQ1_CTL, twq_ctl);
+}
+
 static void fbnic_disable_tcq(struct fbnic_ring *txr)
 {
        fbnic_ring_wr32(txr, FBNIC_QUEUE_TCQ_CTL, 0);
                        struct fbnic_q_triad *qt = &nv->qt[t];
 
                        fbnic_disable_twq0(&qt->sub0);
+                       fbnic_disable_twq1(&qt->sub1);
                        fbnic_disable_tcq(&qt->cmpl);
                }
 
 
                        /* Clean the work queues of unprocessed work */
                        fbnic_clean_twq0(nv, 0, &qt->sub0, true, qt->sub0.tail);
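+                       /* Not in NAPI context: the page pool's direct
+                        * recycling path must not be used for these pages.
+                        */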
+                       fbnic_clean_twq1(nv, false, &qt->sub1, true,
+                                        qt->sub1.tail);
 
                        /* Reset completion queue descriptor ring */
                        memset(qt->cmpl.desc, 0, qt->cmpl.size);
        fbnic_ring_wr32(twq, FBNIC_QUEUE_TWQ0_CTL, FBNIC_QUEUE_TWQ_CTL_ENABLE);
 }
 
+static void fbnic_enable_twq1(struct fbnic_ring *twq)
+{
+       u32 log_size = fls(twq->size_mask);
+
+       if (!twq->size_mask)
+               return;
+
+       /* Reset head/tail */
+       fbnic_ring_wr32(twq, FBNIC_QUEUE_TWQ1_CTL, FBNIC_QUEUE_TWQ_CTL_RESET);
+       twq->tail = 0;
+       twq->head = 0;
+
+       /* Store descriptor ring address and size */
+       fbnic_ring_wr32(twq, FBNIC_QUEUE_TWQ1_BAL, lower_32_bits(twq->dma));
+       fbnic_ring_wr32(twq, FBNIC_QUEUE_TWQ1_BAH, upper_32_bits(twq->dma));
+
+       /* Write lower 4 bits of the log size; a 64K-entry ring encodes as 0 */
+       fbnic_ring_wr32(twq, FBNIC_QUEUE_TWQ1_SIZE, log_size & 0xf);
+
+       fbnic_ring_wr32(twq, FBNIC_QUEUE_TWQ1_CTL, FBNIC_QUEUE_TWQ_CTL_ENABLE);
+}
+
 static void fbnic_enable_tcq(struct fbnic_napi_vector *nv,
                             struct fbnic_ring *tcq)
 {
                        struct fbnic_q_triad *qt = &nv->qt[t];
 
                        fbnic_enable_twq0(&qt->sub0);
+                       fbnic_enable_twq1(&qt->sub1);
                        fbnic_enable_tcq(nv, &qt->cmpl);
                }