#define BNXT_RX_DMA_OFFSET NET_SKB_PAD
 #define BNXT_RX_COPY_THRESH 256
 
-#define BNXT_TX_PUSH_THRESH 92
+#define BNXT_TX_PUSH_THRESH 164
 
 enum board_idx {
        BCM57301,
        }
 
        if (free_size == bp->tx_ring_size && length <= bp->tx_push_thresh) {
-               struct tx_push_bd *push = txr->tx_push;
-               struct tx_bd *tx_push = &push->txbd1;
-               struct tx_bd_ext *tx_push1 = &push->txbd2;
-               void *pdata = tx_push1 + 1;
-               int j;
+               struct tx_push_buffer *tx_push_buf = txr->tx_push;
+               struct tx_push_bd *tx_push = &tx_push_buf->push_bd;
+               struct tx_bd_ext *tx_push1 = &tx_push->txbd2;
+               void *pdata = tx_push_buf->data;
+               u64 *end;
+               int j, push_len;
 
                /* Set COAL_NOW to be ready quickly for the next push */
                tx_push->tx_bd_len_flags_type =
                tx_push1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags);
                tx_push1->tx_bd_cfa_action = cpu_to_le32(cfa_action);
 
+               end = PTR_ALIGN(pdata + length + 1, 8) - 1;
+               *end = 0;
+
                skb_copy_from_linear_data(skb, pdata, len);
                pdata += len;
                for (j = 0; j < last_frag; j++) {
                        pdata += skb_frag_size(frag);
                }
 
-               memcpy(txbd, tx_push, sizeof(*txbd));
+               txbd->tx_bd_len_flags_type = tx_push->tx_bd_len_flags_type;
+               txbd->tx_bd_haddr = txr->data_mapping;
                prod = NEXT_TX(prod);
                txbd = &txr->tx_desc_ring[TX_RING(prod)][TX_IDX(prod)];
                memcpy(txbd, tx_push1, sizeof(*txbd));
                prod = NEXT_TX(prod);
-               push->doorbell =
+               tx_push->doorbell =
                        cpu_to_le32(DB_KEY_TX_PUSH | DB_LONG_TX_PUSH | prod);
                txr->tx_prod = prod;
 
                netdev_tx_sent_queue(txq, skb->len);
 
-               __iowrite64_copy(txr->tx_doorbell, push,
-                                (length + sizeof(*push) + 8) / 8);
+               push_len = (length + sizeof(*tx_push) + 7) / 8;
+               if (push_len > 16) {
+                       __iowrite64_copy(txr->tx_doorbell, tx_push_buf, 16);
+                       __iowrite64_copy(txr->tx_doorbell + 4, tx_push_buf + 1,
+                                        push_len - 16);
+               } else {
+                       __iowrite64_copy(txr->tx_doorbell, tx_push_buf,
+                                        push_len);
+               }
 
                tx_buf->is_push = 1;
-
                goto tx_done;
        }
 
                push_size  = L1_CACHE_ALIGN(sizeof(struct tx_push_bd) +
                                        bp->tx_push_thresh);
 
-               if (push_size > 128) {
+               if (push_size > 256) {
                        push_size = 0;
                        bp->tx_push_thresh = 0;
                }
                        return rc;
 
                if (bp->tx_push_size) {
-                       struct tx_bd *txbd;
                        dma_addr_t mapping;
 
                        /* One pre-allocated DMA buffer to backup
                        if (!txr->tx_push)
                                return -ENOMEM;
 
-                       txbd = &txr->tx_push->txbd1;
-
                        mapping = txr->tx_push_mapping +
                                sizeof(struct tx_push_bd);
-                       txbd->tx_bd_haddr = cpu_to_le64(mapping);
+                       txr->data_mapping = cpu_to_le64(mapping);
 
-                       memset(txbd + 1, 0, sizeof(struct tx_bd_ext));
+                       memset(txr->tx_push, 0, sizeof(struct tx_push_bd));
                }
                ring->queue_id = bp->q_info[j].queue_id;
                if (i % bp->tx_nr_rings_per_tc == (bp->tx_nr_rings_per_tc - 1))