 #define MLX5E_TX_CQ_POLL_BUDGET        128
 #define MLX5E_UPDATE_STATS_INTERVAL    200 /* msecs */
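+/* max back-to-back blue flame doorbells before falling back to a regular
+ * doorbell ring; replenished in the xmit path once the SQ fully drains
+ */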
+#define MLX5E_SQ_BF_BUDGET             16
 
 static const char vport_strings[][ETH_GSTRING_LEN] = {
        /* vport statistics */
        /* dirtied @xmit */
        u16                        pc ____cacheline_aligned_in_smp;
        u32                        dma_fifo_pc;
-       u32                        bf_offset;
+       u16                        bf_offset;
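+       /* prev_cc caches the last seen completion counter; bf_budget is the
+        * remaining number of back-to-back blue flame doorbells
+        */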
+       u16                        prev_cc;
+       u8                         bf_budget;
        struct mlx5e_sq_stats      stats;
 
        struct mlx5e_cq            cq;
        struct mlx5_wq_cyc         wq;
        u32                        dma_fifo_mask;
        void __iomem              *uar_map;
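+       /* write-combining mapping of this UAR's blue flame registers */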
+       void __iomem              *uar_bf_map;
        struct netdev_queue       *txq;
        u32                        sqn;
-       u32                        bf_buf_size;
+       u16                        bf_buf_size;
        u16                        max_inline;
        u16                        edge;
        struct device             *pdev;
                             struct mlx5e_params *new_params);
 
 static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
-                                     struct mlx5e_tx_wqe *wqe)
+                                     struct mlx5e_tx_wqe *wqe, int bf_sz)
 {
+       u16 ofst = MLX5_BF_OFFSET + sq->bf_offset;
+
        /* ensure wqe is visible to device before updating doorbell record */
        dma_wmb();
 
         */
        wmb();
 
-       mlx5_write64((__be32 *)&wqe->ctrl,
-                    sq->uar_map + MLX5_BF_OFFSET + sq->bf_offset,
-                    NULL);
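+       /* with bf_sz set, push the WQE itself through the write-combining
+        * blue flame buffer so the device need not DMA-read the descriptor;
+        * otherwise write just the control segment to the doorbell register
+        * as before (bf_offset alternates between the two halves of the
+        * blue flame register)
+        */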
+       if (bf_sz) {
+               __iowrite64_copy(sq->uar_bf_map + ofst, &wqe->ctrl, bf_sz);
+
+               /* flush the write-combining mapped buffer */
+               wmb();
+
+       } else {
+               mlx5_write64((__be32 *)&wqe->ctrl, sq->uar_map + ofst, NULL);
+       }
 
        sq->bf_offset ^= sq->bf_buf_size;
 }
 
 
        sq->wq.db       = &sq->wq.db[MLX5_SND_DBR];
        sq->uar_map     = sq->uar.map;
+       sq->uar_bf_map  = sq->uar.bf_map;
        sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2;
        sq->max_inline  = param->max_inline;
 
        txq_ix = c->ix + tc * priv->params.num_channels;
        sq->txq = netdev_get_tx_queue(priv->netdev, txq_ix);
 
-       sq->pdev    = c->pdev;
-       sq->mkey_be = c->mkey_be;
-       sq->channel = c;
-       sq->tc      = tc;
-       sq->edge    = (sq->wq.sz_m1 + 1) - MLX5_SEND_WQE_MAX_WQEBBS;
+       sq->pdev      = c->pdev;
+       sq->mkey_be   = c->mkey_be;
+       sq->channel   = c;
+       sq->tc        = tc;
+       sq->edge      = (sq->wq.sz_m1 + 1) - MLX5_SEND_WQE_MAX_WQEBBS;
+       sq->bf_budget = MLX5E_SQ_BF_BUDGET;
        priv->txq_to_sq_map[txq_ix] = sq;
 
        return 0;
 
 
        if (notify_hw) {
                cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
-               mlx5e_tx_notify_hw(sq, wqe);
+               mlx5e_tx_notify_hw(sq, wqe, 0);
        }
 }
 
 }
 
 static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq,
-                                           struct sk_buff *skb)
+                                           struct sk_buff *skb, bool bf)
 {
        /* Some NIC TX decisions, e.g loopback, are based on the packet
         * headers and occur before the data gather.
         */
 #define MLX5E_MIN_INLINE (ETH_HLEN + 2/*vlan tag*/)
 
-       if (skb_headlen(skb) <= sq->max_inline)
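+       /* with blue flame, inline the whole linear part when it fits in
+        * max_inline so the headers need not be gathered from host memory
+        */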
+       if (bf && (skb_headlen(skb) <= sq->max_inline))
                return skb_headlen(skb);
 
        return MLX5E_MIN_INLINE;
 
        u8  opcode = MLX5_OPCODE_SEND;
        dma_addr_t dma_addr = 0;
+       bool bf = false;
        u16 headlen;
        u16 ds_cnt;
        u16 ihs;
        else
                sq->stats.csum_offload_none++;
 
+       if (sq->cc != sq->prev_cc) {
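+               /* completions arrived since the last xmit: re-arm the blue
+                * flame budget only if the SQ has completely drained
+                * (cc == pc), otherwise zero it
+                */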
+               sq->prev_cc = sq->cc;
+               sq->bf_budget = (sq->cc == sq->pc) ? MLX5E_SQ_BF_BUDGET : 0;
+       }
+
        if (skb_is_gso(skb)) {
                u32 payload_len;
 
                sq->stats.tso_packets++;
                sq->stats.tso_bytes += payload_len;
        } else {
-               ihs = mlx5e_get_inline_hdr_size(sq, skb);
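+               /* use blue flame only while budget remains, no further skbs
+                * are pending (xmit_more) and the packet data is linear
+                */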
+               bf = sq->bf_budget &&
+                    !skb->xmit_more &&
+                    !skb_shinfo(skb)->nr_frags;
+               ihs = mlx5e_get_inline_hdr_size(sq, skb, bf);
                MLX5E_TX_SKB_CB(skb)->num_bytes = max_t(unsigned int, skb->len,
                                                        ETH_ZLEN);
        }
        }
 
        if (!skb->xmit_more || netif_xmit_stopped(sq->txq)) {
+               int bf_sz = 0;
+
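+               /* __iowrite64_copy() copies in 8-byte units, so one 64-byte
+                * WQEBB is 8 units (hence num_wqebbs << 3)
+                */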
+               if (bf && sq->uar_bf_map)
+                       bf_sz = MLX5E_TX_SKB_CB(skb)->num_wqebbs << 3;
+
                cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
-               mlx5e_tx_notify_hw(sq, wqe);
+               mlx5e_tx_notify_hw(sq, wqe, bf_sz);
        }
 
        /* fill sq edge with nops to avoid wqe wrap around */
        while ((sq->pc & wq->sz_m1) > sq->edge)
                mlx5e_send_nop(sq, false);
 
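+       /* a send that bypassed blue flame zeroes the budget until the SQ
+        * drains again; a blue flame send consumes one unit
+        */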
+       sq->bf_budget = bf ? sq->bf_budget - 1 : 0;
+
        sq->stats.packets++;
        return NETDEV_TX_OK;
 
 
 }
 #endif
 
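+/* Map the device's BAR 0, which holds the UARs and their blue flame
+ * registers, as a single write-combining io_mapping; each UAR's blue
+ * flame page is later mapped out of this area.
+ */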
+static int map_bf_area(struct mlx5_core_dev *dev)
+{
+       resource_size_t bf_start = pci_resource_start(dev->pdev, 0);
+       resource_size_t bf_len = pci_resource_len(dev->pdev, 0);
+
+       dev->priv.bf_mapping = io_mapping_create_wc(bf_start, bf_len);
+
+       return dev->priv.bf_mapping ? 0 : -ENOMEM;
+}
+
+static void unmap_bf_area(struct mlx5_core_dev *dev)
+{
+       if (dev->priv.bf_mapping)
+               io_mapping_free(dev->priv.bf_mapping);
+}
+
 static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
 {
        struct mlx5_priv *priv = &dev->priv;
                goto err_stop_eqs;
        }
 
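+       /* blue flame is optional: on failure just warn and continue with
+        * regular doorbells
+        */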
+       if (map_bf_area(dev))
+               dev_err(&pdev->dev, "Failed to map blue flame area\n");
+
        err = mlx5_irq_set_affinity_hints(dev);
        if (err) {
                dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
-               goto err_free_comp_eqs;
+               goto err_unmap_bf_area;
        }
 
        MLX5_INIT_DOORBELL_LOCK(&priv->cq_uar_lock);
 
        return 0;
 
-err_free_comp_eqs:
+err_unmap_bf_area:
+       unmap_bf_area(dev);
+
        free_comp_eqs(dev);
 
 err_stop_eqs:
        mlx5_cleanup_qp_table(dev);
        mlx5_cleanup_cq_table(dev);
        mlx5_irq_clear_affinity_hints(dev);
+       unmap_bf_area(dev);
        free_comp_eqs(dev);
        mlx5_stop_eqs(dev);
        mlx5_free_uuars(dev, &priv->uuari);
 
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/io-mapping.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/cmd.h>
 #include "mlx5_core.h"
                goto err_free_uar;
        }
 
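+       /* map this UAR's blue flame page out of the write-combining BAR
+        * mapping; bf_map is left NULL when the WC mapping is unavailable
+        */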
+       if (mdev->priv.bf_mapping)
+               uar->bf_map = io_mapping_map_wc(mdev->priv.bf_mapping,
+                                               uar->index << PAGE_SHIFT);
+
        return 0;
 
 err_free_uar:
 
 void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar)
 {
+       io_mapping_unmap(uar->bf_map);
        iounmap(uar->map);
        mlx5_cmd_free_uar(mdev, uar->index);
 }
 
        u32                     index;
        struct list_head        bf_list;
        unsigned                free_bf_bmap;
-       void __iomem           *wc_map;
+       void __iomem           *bf_map;
        void __iomem           *map;
 };
 
        struct mlx5_uuar_info   uuari;
        MLX5_DECLARE_DOORBELL_LOCK(cq_uar_lock);
 
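+       /* write-combining mapping of the blue flame area in BAR 0 */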
+       struct io_mapping       *bf_mapping;
+
        /* pages stuff */
        struct workqueue_struct *pg_wq;
        struct rb_root          page_root;