        rq->wqe_sz = (priv->params.lro_en) ? priv->params.lro_wqe_sz :
                                             MLX5E_SW2HW_MTU(priv->netdev->mtu);
+       rq->wqe_sz = SKB_DATA_ALIGN(rq->wqe_sz + MLX5E_NET_IP_ALIGN);
 
        for (i = 0; i < wq_sz; i++) {
                struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
+               u32 byte_count = rq->wqe_sz - MLX5E_NET_IP_ALIGN;
 
                wqe->data.lkey       = c->mkey_be;
-               wqe->data.byte_count = cpu_to_be32(rq->wqe_sz);
+               wqe->data.byte_count =
+                       cpu_to_be32(byte_count | MLX5_HW_START_PADDING);
        }
 
        rq->pdev    = c->pdev;
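
Taken together, the hunks above change how the RX buffer is sized: rq->wqe_sz now
includes the MLX5E_NET_IP_ALIGN headroom and is rounded up with SKB_DATA_ALIGN(),
while the byte count posted in each WQE drops that headroom again and ORs in
MLX5_HW_START_PADDING, the most significant bit of the 32-bit byte_count field
(aliased to MLX5_INLINE_SEG in the final hunk below), to request HW start padding
for the scatter entry. A minimal userspace sketch of the arithmetic, with assumed
example values for MLX5E_NET_IP_ALIGN (2), the cacheline size (64) and the
hardware frame size:

#include <stdint.h>
#include <stdio.h>

/* ALIGN_UP() mimics the kernel's SKB_DATA_ALIGN(), which rounds up to
 * SMP_CACHE_BYTES; all constants below are illustrative assumptions.
 */
#define ALIGN_UP(x, a)   (((x) + (a) - 1) & ~((uint32_t)((a) - 1)))
#define NET_IP_ALIGN_EX  2u           /* assumed MLX5E_NET_IP_ALIGN  */
#define CACHELINE_EX     64u          /* assumed SMP_CACHE_BYTES     */
#define HW_START_PAD_EX  0x80000000u  /* MLX5_HW_START_PADDING (MSB) */

int main(void)
{
	uint32_t hw_mtu = 1522;  /* example MLX5E_SW2HW_MTU() result */

	/* Buffer size: headroom for the 2-byte shift, rounded to a cacheline. */
	uint32_t wqe_sz = ALIGN_UP(hw_mtu + NET_IP_ALIGN_EX, CACHELINE_EX);

	/* WQE byte_count: space usable by the packet, start-padding flag set. */
	uint32_t byte_count = (wqe_sz - NET_IP_ALIGN_EX) | HW_START_PAD_EX;

	printf("wqe_sz=%u byte_count=0x%08x (len=%u)\n",
	       wqe_sz, byte_count, byte_count & ~HW_START_PAD_EX);
	return 0;
}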
 
        if (unlikely(!skb))
                return -ENOMEM;
 
-       skb_reserve(skb, MLX5E_NET_IP_ALIGN);
-
        dma_addr = dma_map_single(rq->pdev,
                                  /* hw start padding */
-                                 skb->data - MLX5E_NET_IP_ALIGN,
-                                 /* hw   end padding */
+                                 skb->data,
+                                 /* hw end padding */
                                  rq->wqe_sz,
                                  DMA_FROM_DEVICE);
 
        if (unlikely(dma_mapping_error(rq->pdev, dma_addr)))
                goto err_free_skb;
 
+       skb_reserve(skb, MLX5E_NET_IP_ALIGN);
+
        *((dma_addr_t *)skb->cb) = dma_addr;
        wqe->data.addr = cpu_to_be64(dma_addr + MLX5E_NET_IP_ALIGN);
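
In the allocation path the whole rq->wqe_sz buffer is now mapped starting at
skb->data, and skb_reserve() is moved to after the mapping, so the address passed
to dma_map_single() is the true start of the buffer. The address the hardware
sees is still dma_addr + MLX5E_NET_IP_ALIGN, so the window the device may write
to sits entirely inside the mapping, with the small headroom in front available
for start padding. A toy layout check (userspace, illustrative values only):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define NET_IP_ALIGN_EX 2u  /* assumed MLX5E_NET_IP_ALIGN */

int main(void)
{
	uint64_t dma_addr = 0x1000;  /* start of the mapped RX buffer        */
	uint32_t wqe_sz   = 1536;    /* mapped length, stands for rq->wqe_sz */

	uint64_t hw_addr = dma_addr + NET_IP_ALIGN_EX;  /* wqe->data.addr     */
	uint32_t hw_len  = wqe_sz - NET_IP_ALIGN_EX;    /* WQE byte_count len */

	/* Headroom the device may use for start padding ... */
	assert(hw_addr - dma_addr == NET_IP_ALIGN_EX);
	/* ... and the device's window never extends past the mapping. */
	assert(hw_addr + hw_len == dma_addr + wqe_sz);

	printf("mapping [%#llx, %#llx), hw window [%#llx, %#llx)\n",
	       (unsigned long long)dma_addr,
	       (unsigned long long)(dma_addr + wqe_sz),
	       (unsigned long long)hw_addr,
	       (unsigned long long)(hw_addr + hw_len));
	return 0;
}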
 
 
                dma_unmap_single(rq->pdev,
                                 *((dma_addr_t *)skb->cb),
-                                skb_end_offset(skb),
+                                rq->wqe_sz,
                                 DMA_FROM_DEVICE);
 
                if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
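
The unmap in the completion path now uses rq->wqe_sz, the same length that was
handed to dma_map_single() above; skb_end_offset() reflects the skb's own
allocation layout and need not equal the mapped length. A toy sketch of the
pairing rule, with hypothetical names (toy_map/toy_unmap stand in for the DMA
API here):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct toy_mapping {
	void  *cpu_addr;  /* address handed to the "map" call                  */
	size_t len;       /* length recorded at map time (the rq->wqe_sz role) */
};

static struct toy_mapping toy_map(void *buf, size_t len)
{
	return (struct toy_mapping){ .cpu_addr = buf, .len = len };
}

static void toy_unmap(struct toy_mapping m, size_t len)
{
	/* Unmapping with a size different from the mapped one is a bug. */
	assert(len == m.len);
	printf("unmapped %zu bytes at %p\n", len, m.cpu_addr);
}

int main(void)
{
	char buf[1536];                 /* stands in for the RX buffer */
	size_t wqe_sz = sizeof(buf);    /* stands in for rq->wqe_sz    */
	struct toy_mapping m = toy_map(buf, wqe_sz);

	toy_unmap(m, wqe_sz);           /* matches the mapped length   */
	return 0;
}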
 
        MLX5_INLINE_SEG = 0x80000000,
 };
 
+enum {
+       MLX5_HW_START_PADDING = MLX5_INLINE_SEG,
+};
+
 enum {
        MLX5_MIN_PKEY_TABLE_SIZE = 128,
        MLX5_MAX_LOG_PKEY_TABLE  = 5,