static u64 hwcsum(enum chip_type chip, const struct sk_buff *skb)
 {
        int csum_type;
-       const struct iphdr *iph = ip_hdr(skb);
+       bool inner_hdr_csum = false;
+       u16 proto, ver;
 
-       if (iph->version == 4) {
-               if (iph->protocol == IPPROTO_TCP)
+       if (skb->encapsulation &&
+           (CHELSIO_CHIP_VERSION(chip) > CHELSIO_T5))
+               inner_hdr_csum = true;
+
+       if (inner_hdr_csum) {
+               ver = inner_ip_hdr(skb)->version;
+               proto = (ver == 4) ? inner_ip_hdr(skb)->protocol :
+                       inner_ipv6_hdr(skb)->nexthdr;
+       } else {
+               ver = ip_hdr(skb)->version;
+               proto = (ver == 4) ? ip_hdr(skb)->protocol :
+                       ipv6_hdr(skb)->nexthdr;
+       }
+
+       if (ver == 4) {
+               if (proto == IPPROTO_TCP)
                        csum_type = TX_CSUM_TCPIP;
-               else if (iph->protocol == IPPROTO_UDP)
+               else if (proto == IPPROTO_UDP)
                        csum_type = TX_CSUM_UDPIP;
                else {
 nocsum:                        /*
                /*
                 * this doesn't work with extension headers
                 */
-               const struct ipv6hdr *ip6h = (const struct ipv6hdr *)iph;
-
-               if (ip6h->nexthdr == IPPROTO_TCP)
+               if (proto == IPPROTO_TCP)
                        csum_type = TX_CSUM_TCPIP6;
-               else if (ip6h->nexthdr == IPPROTO_UDP)
+               else if (proto == IPPROTO_UDP)
                        csum_type = TX_CSUM_UDPIP6;
                else
                        goto nocsum;
        }
 
        if (likely(csum_type >= TX_CSUM_TCPIP)) {
-               u64 hdr_len = TXPKT_IPHDR_LEN_V(skb_network_header_len(skb));
-               int eth_hdr_len = skb_network_offset(skb) - ETH_HLEN;
+               int eth_hdr_len, l4_len;
+               u64 hdr_len;
+
+               if (inner_hdr_csum) {
+                       /* This allows checksum offload for all encapsulated
+                        * packets like GRE, etc.
+                        */
+                       l4_len = skb_inner_network_header_len(skb);
+                       eth_hdr_len = skb_inner_network_offset(skb) - ETH_HLEN;
+               } else {
+                       l4_len = skb_network_header_len(skb);
+                       eth_hdr_len = skb_network_offset(skb) - ETH_HLEN;
+               }
+               hdr_len = TXPKT_IPHDR_LEN_V(l4_len);
 
                if (CHELSIO_CHIP_VERSION(chip) <= CHELSIO_T5)
                        hdr_len |= TXPKT_ETHHDR_LEN_V(eth_hdr_len);
 netdev_tx_t t4_eth_xmit(struct sk_buff *skb, struct net_device *dev)
 {
        u32 wr_mid, ctrl0, op;
-       u64 cntrl, *end;
+       u64 cntrl, *end, *sgl;
        int qidx, credits;
        unsigned int flits, ndesc;
        struct adapter *adap;
                                 TX_CSUM_TCPIP6 : TX_CSUM_TCPIP) |
                                 TXPKT_IPHDR_LEN_V(l3hdr_len);
                }
+               sgl = (u64 *)(cpl + 1); /* sgl start here */
+               if (unlikely((u8 *)sgl >= (u8 *)q->q.stat)) {
+                       /* If the current position is already at the end of the
+                        * txq, wrap it around to the start of the queue and
+                        * update the end pointer as well.
+                        */
+                       if (sgl == (u64 *)q->q.stat) {
+                               int left = (u8 *)end - (u8 *)q->q.stat;
+
+                               end = (void *)q->q.desc + left;
+                               sgl = (void *)q->q.desc;
+                       }
+               }
                q->tso++;
                q->tx_cso += ssi->gso_segs;
        } else {
                wr->op_immdlen = htonl(FW_WR_OP_V(op) |
                                       FW_WR_IMMDLEN_V(len));
                cpl = (void *)(wr + 1);
+               sgl = (u64 *)(cpl + 1);
                if (skb->ip_summed == CHECKSUM_PARTIAL) {
                        cntrl = hwcsum(adap->params.chip, skb) |
                                TXPKT_IPCSUM_DIS_F;
        cpl->ctrl1 = cpu_to_be64(cntrl);
 
        if (immediate) {
-               cxgb4_inline_tx_skb(skb, &q->q, cpl + 1);
+               cxgb4_inline_tx_skb(skb, &q->q, sgl);
                dev_consume_skb_any(skb);
        } else {
                int last_desc;
 
-               cxgb4_write_sgl(skb, &q->q, (struct ulptx_sgl *)(cpl + 1),
-                               end, 0, addr);
+               cxgb4_write_sgl(skb, &q->q, (void *)sgl, end, 0, addr);
                skb_orphan(skb);
 
                last_desc = q->q.pidx + ndesc - 1;
 }
 
 static void do_gro(struct sge_eth_rxq *rxq, const struct pkt_gl *gl,
-                  const struct cpl_rx_pkt *pkt)
+                  const struct cpl_rx_pkt *pkt, unsigned long tnl_hdr_len)
 {
        struct adapter *adapter = rxq->rspq.adap;
        struct sge *s = &adapter->sge;
        }
 
        copy_frags(skb, gl, s->pktshift);
+       if (tnl_hdr_len)
+               skb->csum_level = 1;
        skb->len = gl->tot_len - s->pktshift;
        skb->data_len = skb->len;
        skb->truesize += skb->data_len;
        struct sge *s = &q->adap->sge;
        int cpl_trace_pkt = is_t4(q->adap->params.chip) ?
                            CPL_TRACE_PKT : CPL_TRACE_PKT_T5;
-       u16 err_vec;
+       u16 err_vec, tnl_hdr_len = 0;
        struct port_info *pi;
        int ret = 0;
 
 
        pkt = (const struct cpl_rx_pkt *)rsp;
        /* Compressed error vector is enabled for T6 only */
-       if (q->adap->params.tp.rx_pkt_encap)
+       if (q->adap->params.tp.rx_pkt_encap) {
                err_vec = T6_COMPR_RXERR_VEC_G(be16_to_cpu(pkt->err_vec));
-       else
+               tnl_hdr_len = T6_RX_TNLHDR_LEN_G(ntohs(pkt->err_vec));
+       } else {
                err_vec = be16_to_cpu(pkt->err_vec);
+       }
 
        csum_ok = pkt->csum_calc && !err_vec &&
                  (q->netdev->features & NETIF_F_RXCSUM);
-       if ((pkt->l2info & htonl(RXF_TCP_F)) &&
+       if (((pkt->l2info & htonl(RXF_TCP_F)) ||
+            tnl_hdr_len) &&
            (q->netdev->features & NETIF_F_GRO) && csum_ok && !pkt->ip_frag) {
-               do_gro(rxq, si, pkt);
+               do_gro(rxq, si, pkt, tnl_hdr_len);
                return 0;
        }
 
                } else if (pkt->l2info & htonl(RXF_IP_F)) {
                        __sum16 c = (__force __sum16)pkt->csum;
                        skb->csum = csum_unfold(c);
-                       skb->ip_summed = CHECKSUM_COMPLETE;
+
+                       if (tnl_hdr_len) {
+                               skb->ip_summed = CHECKSUM_UNNECESSARY;
+                               skb->csum_level = 1;
+                       } else {
+                               skb->ip_summed = CHECKSUM_COMPLETE;
+                       }
                        rxq->stats.rx_cso++;
                }
        } else {