 #include "ib_verbs.h"
 #include <rdma/bnxt_re-abi.h>
 
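+/* Translate IB core access flags into the bnxt_qplib encoding. */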
+static int __from_ib_access_flags(int iflags)
+{
+       int qflags = 0;
+
+       if (iflags & IB_ACCESS_LOCAL_WRITE)
+               qflags |= BNXT_QPLIB_ACCESS_LOCAL_WRITE;
+       if (iflags & IB_ACCESS_REMOTE_READ)
+               qflags |= BNXT_QPLIB_ACCESS_REMOTE_READ;
+       if (iflags & IB_ACCESS_REMOTE_WRITE)
+               qflags |= BNXT_QPLIB_ACCESS_REMOTE_WRITE;
+       if (iflags & IB_ACCESS_REMOTE_ATOMIC)
+               qflags |= BNXT_QPLIB_ACCESS_REMOTE_ATOMIC;
+       if (iflags & IB_ACCESS_MW_BIND)
+               qflags |= BNXT_QPLIB_ACCESS_MW_BIND;
+       if (iflags & IB_ZERO_BASED)
+               qflags |= BNXT_QPLIB_ACCESS_ZERO_BASED;
+       if (iflags & IB_ACCESS_ON_DEMAND)
+               qflags |= BNXT_QPLIB_ACCESS_ON_DEMAND;
+       return qflags;
+}
+
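+/* Translate bnxt_qplib access flags back into the IB core encoding. */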
+static enum ib_access_flags __to_ib_access_flags(int qflags)
+{
+       enum ib_access_flags iflags = 0;
+
+       if (qflags & BNXT_QPLIB_ACCESS_LOCAL_WRITE)
+               iflags |= IB_ACCESS_LOCAL_WRITE;
+       if (qflags & BNXT_QPLIB_ACCESS_REMOTE_WRITE)
+               iflags |= IB_ACCESS_REMOTE_WRITE;
+       if (qflags & BNXT_QPLIB_ACCESS_REMOTE_READ)
+               iflags |= IB_ACCESS_REMOTE_READ;
+       if (qflags & BNXT_QPLIB_ACCESS_REMOTE_ATOMIC)
+               iflags |= IB_ACCESS_REMOTE_ATOMIC;
+       if (qflags & BNXT_QPLIB_ACCESS_MW_BIND)
+               iflags |= IB_ACCESS_MW_BIND;
+       if (qflags & BNXT_QPLIB_ACCESS_ZERO_BASED)
+               iflags |= IB_ZERO_BASED;
+       if (qflags & BNXT_QPLIB_ACCESS_ON_DEMAND)
+               iflags |= IB_ACCESS_ON_DEMAND;
+       return iflags;
+}
+
 static int bnxt_re_build_sgl(struct ib_sge *ib_sg_list,
                             struct bnxt_qplib_sge *sg_list, int num)
 {
        return IB_LINK_LAYER_ETHERNET;
 }
 
+#define        BNXT_RE_FENCE_PBL_SIZE  DIV_ROUND_UP(BNXT_RE_FENCE_BYTES, PAGE_SIZE)
+
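+/*
+ * Prebuild the Type-1 bind-MW WQE used as the "phantom" fence WQE for
+ * HW workaround 9060.  It is built once per PD; only bind.r_key is
+ * refreshed at each (re)bind via bnxt_re_bind_fence_mw().
+ */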
+static void bnxt_re_create_fence_wqe(struct bnxt_re_pd *pd)
+{
+       struct bnxt_re_fence_data *fence = &pd->fence;
+       struct ib_mr *ib_mr = &fence->mr->ib_mr;
+       struct bnxt_qplib_swqe *wqe = &fence->bind_wqe;
+
+       memset(wqe, 0, sizeof(*wqe));
+       wqe->type = BNXT_QPLIB_SWQE_TYPE_BIND_MW;
+       wqe->wr_id = BNXT_QPLIB_FENCE_WRID;
+       wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP;
+       wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE;
+       wqe->bind.zero_based = false;
+       wqe->bind.parent_l_key = ib_mr->lkey;
+       wqe->bind.va = (u64)(unsigned long)fence->va;
+       wqe->bind.length = fence->size;
+       wqe->bind.access_cntl = __from_ib_access_flags(IB_ACCESS_REMOTE_READ);
+       wqe->bind.mw_type = SQ_BIND_MW_TYPE_TYPE1;
+
+       /* Save the initial rkey in the fence structure for now;
+        * wqe->bind.r_key will be set at (re)bind time.
+        */
+       fence->bind_rkey = ib_inc_rkey(fence->mw->rkey);
+}
+
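+/*
+ * Post a copy of the prebuilt fence WQE on the QP's SQ with a fresh
+ * rkey and ring the SQ doorbell.  Runs under the SQ lock taken by
+ * send_phantom_wqe().
+ */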
+static int bnxt_re_bind_fence_mw(struct bnxt_qplib_qp *qplib_qp)
+{
+       struct bnxt_re_qp *qp = container_of(qplib_qp, struct bnxt_re_qp,
+                                            qplib_qp);
+       struct ib_pd *ib_pd = qp->ib_qp.pd;
+       struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
+       struct bnxt_re_fence_data *fence = &pd->fence;
+       struct bnxt_qplib_swqe *fence_wqe = &fence->bind_wqe;
+       struct bnxt_qplib_swqe wqe;
+       int rc;
+
+       memcpy(&wqe, fence_wqe, sizeof(wqe));
+       wqe.bind.r_key = fence->bind_rkey;
+       fence->bind_rkey = ib_inc_rkey(fence->bind_rkey);
+
+       dev_dbg(rdev_to_dev(qp->rdev),
+               "Posting bind fence-WQE: rkey: %#x QP: %d PD: %p\n",
+               wqe.bind.r_key, qp->qplib_qp.id, pd);
+       rc = bnxt_qplib_post_send(&qp->qplib_qp, &wqe);
+       if (rc) {
+               dev_err(rdev_to_dev(qp->rdev), "Failed to bind fence-WQE\n");
+               return rc;
+       }
+       bnxt_qplib_post_send_db(&qp->qplib_qp);
+
+       return rc;
+}
+
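+/* Tear down the fence resources in reverse order: MW, MR, DMA mapping. */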
+static void bnxt_re_destroy_fence_mr(struct bnxt_re_pd *pd)
+{
+       struct bnxt_re_fence_data *fence = &pd->fence;
+       struct bnxt_re_dev *rdev = pd->rdev;
+       struct device *dev = &rdev->en_dev->pdev->dev;
+       struct bnxt_re_mr *mr = fence->mr;
+
+       if (fence->mw) {
+               bnxt_re_dealloc_mw(fence->mw);
+               fence->mw = NULL;
+       }
+       if (mr) {
+               if (mr->ib_mr.rkey)
+                       bnxt_qplib_dereg_mrw(&rdev->qplib_res, &mr->qplib_mr,
+                                            true);
+               if (mr->ib_mr.lkey)
+                       bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr);
+               kfree(mr);
+               fence->mr = NULL;
+       }
+       if (fence->dma_addr) {
+               dma_unmap_single(dev, fence->dma_addr, BNXT_RE_FENCE_BYTES,
+                                DMA_BIDIRECTIONAL);
+               fence->dma_addr = 0;
+       }
+}
+
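+/*
+ * Set up the per-PD fence for kernel consumers: DMA-map the fence
+ * buffer, allocate and register an MR over it, allocate a Type-1 MW,
+ * and prebuild the bind WQE.
+ */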
+static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd)
+{
+       int mr_access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_MW_BIND;
+       struct bnxt_re_fence_data *fence = &pd->fence;
+       struct bnxt_re_dev *rdev = pd->rdev;
+       struct device *dev = &rdev->en_dev->pdev->dev;
+       struct bnxt_re_mr *mr = NULL;
+       dma_addr_t dma_addr = 0;
+       struct ib_mw *mw;
+       u64 pbl_tbl;
+       int rc;
+
+       dma_addr = dma_map_single(dev, fence->va, BNXT_RE_FENCE_BYTES,
+                                 DMA_BIDIRECTIONAL);
+       rc = dma_mapping_error(dev, dma_addr);
+       if (rc) {
+               dev_err(rdev_to_dev(rdev), "Failed to dma-map fence-MR-mem\n");
+               rc = -EIO;
+               fence->dma_addr = 0;
+               goto fail;
+       }
+       fence->dma_addr = dma_addr;
+
+       /* Allocate a MR */
+       mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+       if (!mr) {
+               rc = -ENOMEM;
+               goto fail;
+       }
+       fence->mr = mr;
+       mr->rdev = rdev;
+       mr->qplib_mr.pd = &pd->qplib_pd;
+       mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR;
+       mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags);
+       rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr);
+       if (rc) {
+               dev_err(rdev_to_dev(rdev), "Failed to alloc fence-HW-MR\n");
+               goto fail;
+       }
+
+       /* Register MR */
+       mr->ib_mr.lkey = mr->qplib_mr.lkey;
+       mr->qplib_mr.va = (u64)(unsigned long)fence->va;
+       mr->qplib_mr.total_size = BNXT_RE_FENCE_BYTES;
+       pbl_tbl = dma_addr;
+       rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, &pbl_tbl,
+                              BNXT_RE_FENCE_PBL_SIZE, false);
+       if (rc) {
+               dev_err(rdev_to_dev(rdev), "Failed to register fence-MR\n");
+               goto fail;
+       }
+       mr->ib_mr.rkey = mr->qplib_mr.rkey;
+
+       /* Create a fence MW only for kernel consumers */
+       mw = bnxt_re_alloc_mw(&pd->ib_pd, IB_MW_TYPE_1, NULL);
+       if (IS_ERR(mw)) {
+               dev_err(rdev_to_dev(rdev),
+                       "Failed to create fence-MW for PD: %p\n", pd);
+               rc = PTR_ERR(mw);
+               goto fail;
+       }
+       fence->mw = mw;
+
+       bnxt_re_create_fence_wqe(pd);
+       return 0;
+
+fail:
+       bnxt_re_destroy_fence_mr(pd);
+       return rc;
+}
+
 /* Protection Domains */
 int bnxt_re_dealloc_pd(struct ib_pd *ib_pd)
 {
        struct bnxt_re_dev *rdev = pd->rdev;
        int rc;
 
+       bnxt_re_destroy_fence_mr(pd);
        if (ib_pd->uobject && pd->dpi.dbr) {
                struct ib_ucontext *ib_uctx = ib_pd->uobject->context;
                struct bnxt_re_ucontext *ucntx;
                }
        }
 
+       if (!udata)
+               if (bnxt_re_create_fence_mr(pd))
+                       dev_warn(rdev_to_dev(rdev),
+                                "Failed to create Fence-MR\n");
        return &pd->ib_pd;
 dbfail:
        (void)bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl,
        /* Shadow QP SQ depth should be same as QP1 RQ depth */
        qp->qplib_qp.sq.max_wqe = qp1_qp->rq.max_wqe;
        qp->qplib_qp.sq.max_sge = 2;
+       /* Q full delta can be 1 since it is an internal QP */
+       qp->qplib_qp.sq.q_full_delta = 1;
 
        qp->qplib_qp.scq = qp1_qp->scq;
        qp->qplib_qp.rcq = qp1_qp->rcq;
 
        qp->qplib_qp.rq.max_wqe = qp1_qp->rq.max_wqe;
        qp->qplib_qp.rq.max_sge = qp1_qp->rq.max_sge;
+       /* Q full delta can be 1 since it is an internal QP */
+       qp->qplib_qp.rq.q_full_delta = 1;
 
        qp->qplib_qp.mtu = qp1_qp->mtu;
 
        qp->qplib_qp.sig_type = ((qp_init_attr->sq_sig_type ==
                                  IB_SIGNAL_ALL_WR) ? true : false);
 
-       entries = roundup_pow_of_two(qp_init_attr->cap.max_send_wr + 1);
-       qp->qplib_qp.sq.max_wqe = min_t(u32, entries,
-                                       dev_attr->max_qp_wqes + 1);
-
        qp->qplib_qp.sq.max_sge = qp_init_attr->cap.max_send_sge;
        if (qp->qplib_qp.sq.max_sge > dev_attr->max_qp_sges)
                qp->qplib_qp.sq.max_sge = dev_attr->max_qp_sges;
                qp->qplib_qp.rq.max_wqe = min_t(u32, entries,
                                                dev_attr->max_qp_wqes + 1);
 
+               qp->qplib_qp.rq.q_full_delta = qp->qplib_qp.rq.max_wqe -
+                                               qp_init_attr->cap.max_recv_wr;
+
                qp->qplib_qp.rq.max_sge = qp_init_attr->cap.max_recv_sge;
                if (qp->qplib_qp.rq.max_sge > dev_attr->max_qp_sges)
                        qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges;
        qp->qplib_qp.mtu = ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu));
 
        if (qp_init_attr->qp_type == IB_QPT_GSI) {
+               /* Allocate 1 more than what's provided */
+               entries = roundup_pow_of_two(qp_init_attr->cap.max_send_wr + 1);
+               qp->qplib_qp.sq.max_wqe = min_t(u32, entries,
+                                               dev_attr->max_qp_wqes + 1);
+               qp->qplib_qp.sq.q_full_delta = qp->qplib_qp.sq.max_wqe -
+                                               qp_init_attr->cap.max_send_wr;
                qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges;
                if (qp->qplib_qp.rq.max_sge > dev_attr->max_qp_sges)
                        qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges;
                }
 
        } else {
+               /* Allocate 128 + 1 more than what's provided */
+               entries = roundup_pow_of_two(qp_init_attr->cap.max_send_wr +
+                                            BNXT_QPLIB_RESERVED_QP_WRS + 1);
+               qp->qplib_qp.sq.max_wqe = min_t(u32, entries,
+                                               dev_attr->max_qp_wqes +
+                                               BNXT_QPLIB_RESERVED_QP_WRS + 1);
+               qp->qplib_qp.sq.q_full_delta = BNXT_QPLIB_RESERVED_QP_WRS + 1;
+
+               /*
+                * Reserve one slot for the phantom WQE.  The application
+                * may then post one extra entry; allow that rather than
+                * hit an unexpected queue-full condition.
+                */
+               qp->qplib_qp.sq.q_full_delta -= 1;
+
                qp->qplib_qp.max_rd_atomic = dev_attr->max_qp_rd_atom;
                qp->qplib_qp.max_dest_rd_atomic = dev_attr->max_qp_init_rd_atom;
                if (udata) {
        }
 }
 
-static int __from_ib_access_flags(int iflags)
-{
-       int qflags = 0;
-
-       if (iflags & IB_ACCESS_LOCAL_WRITE)
-               qflags |= BNXT_QPLIB_ACCESS_LOCAL_WRITE;
-       if (iflags & IB_ACCESS_REMOTE_READ)
-               qflags |= BNXT_QPLIB_ACCESS_REMOTE_READ;
-       if (iflags & IB_ACCESS_REMOTE_WRITE)
-               qflags |= BNXT_QPLIB_ACCESS_REMOTE_WRITE;
-       if (iflags & IB_ACCESS_REMOTE_ATOMIC)
-               qflags |= BNXT_QPLIB_ACCESS_REMOTE_ATOMIC;
-       if (iflags & IB_ACCESS_MW_BIND)
-               qflags |= BNXT_QPLIB_ACCESS_MW_BIND;
-       if (iflags & IB_ZERO_BASED)
-               qflags |= BNXT_QPLIB_ACCESS_ZERO_BASED;
-       if (iflags & IB_ACCESS_ON_DEMAND)
-               qflags |= BNXT_QPLIB_ACCESS_ON_DEMAND;
-       return qflags;
-};
-
-static enum ib_access_flags __to_ib_access_flags(int qflags)
-{
-       enum ib_access_flags iflags = 0;
-
-       if (qflags & BNXT_QPLIB_ACCESS_LOCAL_WRITE)
-               iflags |= IB_ACCESS_LOCAL_WRITE;
-       if (qflags & BNXT_QPLIB_ACCESS_REMOTE_WRITE)
-               iflags |= IB_ACCESS_REMOTE_WRITE;
-       if (qflags & BNXT_QPLIB_ACCESS_REMOTE_READ)
-               iflags |= IB_ACCESS_REMOTE_READ;
-       if (qflags & BNXT_QPLIB_ACCESS_REMOTE_ATOMIC)
-               iflags |= IB_ACCESS_REMOTE_ATOMIC;
-       if (qflags & BNXT_QPLIB_ACCESS_MW_BIND)
-               iflags |= IB_ACCESS_MW_BIND;
-       if (qflags & BNXT_QPLIB_ACCESS_ZERO_BASED)
-               iflags |= IB_ZERO_BASED;
-       if (qflags & BNXT_QPLIB_ACCESS_ON_DEMAND)
-               iflags |= IB_ACCESS_ON_DEMAND;
-       return iflags;
-};
-
 static int bnxt_re_modify_shadow_qp(struct bnxt_re_dev *rdev,
                                    struct bnxt_re_qp *qp1_qp,
                                    int qp_attr_mask)
                entries = roundup_pow_of_two(qp_attr->cap.max_send_wr);
                qp->qplib_qp.sq.max_wqe = min_t(u32, entries,
                                                dev_attr->max_qp_wqes + 1);
+               qp->qplib_qp.sq.q_full_delta = qp->qplib_qp.sq.max_wqe -
+                                               qp_attr->cap.max_send_wr;
+               /*
+                * Reserve one slot for the phantom WQE.  Some applications
+                * may post one extra entry in this case; allow that rather
+                * than hit an unexpected queue-full condition.
+                */
+               qp->qplib_qp.sq.q_full_delta -= 1;
                qp->qplib_qp.sq.max_sge = qp_attr->cap.max_send_sge;
                if (qp->qplib_qp.rq.max_wqe) {
                        entries = roundup_pow_of_two(qp_attr->cap.max_recv_wr);
                        qp->qplib_qp.rq.max_wqe =
                                min_t(u32, entries, dev_attr->max_qp_wqes + 1);
+                       qp->qplib_qp.rq.q_full_delta = qp->qplib_qp.rq.max_wqe -
+                                                      qp_attr->cap.max_recv_wr;
                        qp->qplib_qp.rq.max_sge = qp_attr->cap.max_recv_sge;
                } else {
                        /* SRQ was used prior, just ignore the RQ caps */
                wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
 }
 
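+/*
+ * Post the fence bind as a "phantom" WQE (HW workaround 9060),
+ * serialized against normal posts by the QP's SQ lock.
+ */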
+static int send_phantom_wqe(struct bnxt_re_qp *qp)
+{
+       struct bnxt_qplib_qp *lib_qp = &qp->qplib_qp;
+       unsigned long flags;
+       int rc = 0;
+
+       spin_lock_irqsave(&qp->sq_lock, flags);
+
+       rc = bnxt_re_bind_fence_mw(lib_qp);
+       if (!rc) {
+               lib_qp->sq.phantom_wqe_cnt++;
+               dev_dbg(&lib_qp->sq.hwq.pdev->dev,
+                       "qp %#x sq->prod %#x sw_prod %#x phantom_wqe_cnt %d\n",
+                       lib_qp->id, lib_qp->sq.hwq.prod,
+                       HWQ_CMP(lib_qp->sq.hwq.prod, &lib_qp->sq.hwq),
+                       lib_qp->sq.phantom_wqe_cnt);
+       }
+
+       spin_unlock_irqrestore(&qp->sq_lock, flags);
+       return rc;
+}
+
 int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
 {
        struct bnxt_re_cq *cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq);
        struct bnxt_re_qp *qp;
        struct bnxt_qplib_cqe *cqe;
        int i, ncqe, budget;
+       struct bnxt_qplib_q *sq;
+       struct bnxt_qplib_qp *lib_qp;
        u32 tbl_idx;
        struct bnxt_re_sqp_entries *sqp_entry = NULL;
        unsigned long flags;
        }
        cqe = &cq->cql[0];
        while (budget) {
-               ncqe = bnxt_qplib_poll_cq(&cq->qplib_cq, cqe, budget);
+               lib_qp = NULL;
+               ncqe = bnxt_qplib_poll_cq(&cq->qplib_cq, cqe, budget, &lib_qp);
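+               /* The qplib layer flagged this QP for WA 9060:
+                * post the phantom fence WQE before handling the CQEs.
+                */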
+               if (lib_qp) {
+                       sq = &lib_qp->sq;
+                       if (sq->send_phantom) {
+                               qp = container_of(lib_qp,
+                                                 struct bnxt_re_qp, qplib_qp);
+                               if (send_phantom_wqe(qp) == -ENOMEM)
+                                       dev_err(rdev_to_dev(cq->rdev),
+                                               "Phantom failed! Scheduled to send again\n");
+                               else
+                                       sq->send_phantom = false;
+                       }
+               }
+
                if (!ncqe)
                        break;
 
        return ERR_PTR(rc);
 }
 
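+/* Memory Windows */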
+struct ib_mw *bnxt_re_alloc_mw(struct ib_pd *ib_pd, enum ib_mw_type type,
+                              struct ib_udata *udata)
+{
+       struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
+       struct bnxt_re_dev *rdev = pd->rdev;
+       struct bnxt_re_mw *mw;
+       int rc;
+
+       mw = kzalloc(sizeof(*mw), GFP_KERNEL);
+       if (!mw)
+               return ERR_PTR(-ENOMEM);
+       mw->rdev = rdev;
+       mw->qplib_mw.pd = &pd->qplib_pd;
+
+       mw->qplib_mw.type = (type == IB_MW_TYPE_1 ?
+                              CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE1 :
+                              CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B);
+       rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mw->qplib_mw);
+       if (rc) {
+               dev_err(rdev_to_dev(rdev), "Allocate MW failed!\n");
+               goto fail;
+       }
+       mw->ib_mw.rkey = mw->qplib_mw.rkey;
+
+       atomic_inc(&rdev->mw_count);
+       return &mw->ib_mw;
+
+fail:
+       kfree(mw);
+       return ERR_PTR(rc);
+}
+
+int bnxt_re_dealloc_mw(struct ib_mw *ib_mw)
+{
+       struct bnxt_re_mw *mw = container_of(ib_mw, struct bnxt_re_mw, ib_mw);
+       struct bnxt_re_dev *rdev = mw->rdev;
+       int rc;
+
+       rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &mw->qplib_mw);
+       if (rc) {
+               dev_err(rdev_to_dev(rdev), "Free MW failed: %#x\n", rc);
+               return rc;
+       }
+
+       kfree(mw);
+       atomic_dec(&rdev->mw_count);
+       return rc;
+}
+
 /* Fast Memory Regions */
 struct ib_fmr *bnxt_re_alloc_fmr(struct ib_pd *ib_pd, int mr_access_flags,
                                 struct ib_fmr_attr *fmr_attr)
 
                rc = -EINVAL;
                goto done;
        }
-       if (HWQ_CMP((sq->hwq.prod + 1), &sq->hwq) ==
-           HWQ_CMP(sq->hwq.cons, &sq->hwq)) {
+
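+       /* Check for SQ full, honouring the q_full_delta headroom reserved
+        * at QP creation time.
+        */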
+       if (bnxt_qplib_queue_full(sq)) {
+               dev_err(&sq->hwq.pdev->dev,
+                       "QPLIB: prod = %#x cons = %#x qdepth = %#x delta = %#x",
+                       sq->hwq.prod, sq->hwq.cons, sq->hwq.max_elements,
+                       sq->q_full_delta);
                rc = -ENOMEM;
                goto done;
        }
                rc = -EINVAL;
                goto done;
        }
-       if (HWQ_CMP((rq->hwq.prod + 1), &rq->hwq) ==
-           HWQ_CMP(rq->hwq.cons, &rq->hwq)) {
+       if (bnxt_qplib_queue_full(rq)) {
                dev_err(&rq->hwq.pdev->dev,
                        "QPLIB: FP: QP (0x%x) RQ is full!", qp->id);
                rc = -EINVAL;
        return rc;
 }
 
+/* Note: SQEs are valid from sw_sq_cons up to cqe_sq_cons (exclusive).
+ *       CQEs are tracked from sw_cq_cons to max_elements but are valid
+ *       only if VALID=1.
+ */
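+/*
+ * WA 9060: when a REQ completion marked in psn_search is seen, flag
+ * send_phantom so the CQ poller posts a fence-bind (phantom) WQE, then
+ * peek ahead in the CQ until the phantom's completion (wr_id ==
+ * BNXT_QPLIB_FENCE_WRID) comes back before resuming normal processing.
+ */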
+static int do_wa9060(struct bnxt_qplib_qp *qp, struct bnxt_qplib_cq *cq,
+                    u32 cq_cons, u32 sw_sq_cons, u32 cqe_sq_cons)
+{
+       struct bnxt_qplib_q *sq = &qp->sq;
+       struct bnxt_qplib_swq *swq;
+       u32 peek_sw_cq_cons, peek_raw_cq_cons, peek_sq_cons_idx;
+       struct cq_base *peek_hwcqe, **peek_hw_cqe_ptr;
+       struct cq_req *peek_req_hwcqe;
+       struct bnxt_qplib_qp *peek_qp;
+       struct bnxt_qplib_q *peek_sq;
+       int i, rc = 0;
+
+       /* Normal mode */
+       /* Check for the psn_search marking before completing */
+       swq = &sq->swq[sw_sq_cons];
+       if (swq->psn_search &&
+           le32_to_cpu(swq->psn_search->flags_next_psn) & 0x80000000) {
+               /* Unmark */
+               swq->psn_search->flags_next_psn = cpu_to_le32
+                       (le32_to_cpu(swq->psn_search->flags_next_psn)
+                                    & ~0x80000000);
+               dev_dbg(&cq->hwq.pdev->dev,
+                       "FP: Process Req cq_cons=0x%x qp=0x%x sq cons sw=0x%x cqe=0x%x marked!\n",
+                       cq_cons, qp->id, sw_sq_cons, cqe_sq_cons);
+               sq->condition = true;
+               sq->send_phantom = true;
+
+               /* TODO: Only ARM if the previous SQE is ARMALL */
+               bnxt_qplib_arm_cq(cq, DBR_DBR_TYPE_CQ_ARMALL);
+
+               rc = -EAGAIN;
+               goto out;
+       }
+       if (sq->condition) {
+               /* Peek at the completions */
+               peek_raw_cq_cons = cq->hwq.cons;
+               peek_sw_cq_cons = cq_cons;
+               i = cq->hwq.max_elements;
+               while (i--) {
+                       peek_sw_cq_cons = HWQ_CMP((peek_sw_cq_cons), &cq->hwq);
+                       peek_hw_cqe_ptr = (struct cq_base **)cq->hwq.pbl_ptr;
+                       peek_hwcqe = &peek_hw_cqe_ptr[CQE_PG(peek_sw_cq_cons)]
+                                                    [CQE_IDX(peek_sw_cq_cons)];
+                       /* If the next hwcqe is VALID */
+                       if (CQE_CMP_VALID(peek_hwcqe, peek_raw_cq_cons,
+                                         cq->hwq.max_elements)) {
+                               /* If the next hwcqe is a REQ */
+                               if ((peek_hwcqe->cqe_type_toggle &
+                                   CQ_BASE_CQE_TYPE_MASK) ==
+                                   CQ_BASE_CQE_TYPE_REQ) {
+                                       peek_req_hwcqe = (struct cq_req *)
+                                                        peek_hwcqe;
+                                       peek_qp = (struct bnxt_qplib_qp *)
+                                               ((unsigned long)
+                                                le64_to_cpu
+                                                (peek_req_hwcqe->qp_handle));
+                                       peek_sq = &peek_qp->sq;
+                                       peek_sq_cons_idx =
+                                               HWQ_CMP(le16_to_cpu(
+                                               peek_req_hwcqe->sq_cons_idx) - 1,
+                                               &sq->hwq);
+                                       /* If the hwcqe's sq's wr_id matches */
+                                       if (peek_sq == sq &&
+                                           sq->swq[peek_sq_cons_idx].wr_id ==
+                                           BNXT_QPLIB_FENCE_WRID) {
+                                               /*
+                                                *  Unbreak only if the phantom
+                                                *  comes back
+                                                */
+                                               dev_dbg(&cq->hwq.pdev->dev,
+                                                       "FP:Got Phantom CQE");
+                                               sq->condition = false;
+                                               sq->single = true;
+                                               rc = 0;
+                                               goto out;
+                                       }
+                               }
+                               /* Valid but not the phantom, so keep looping */
+                       } else {
+                               /* Not valid yet, just exit and wait */
+                               rc = -EINVAL;
+                               goto out;
+                       }
+                       peek_sw_cq_cons++;
+                       peek_raw_cq_cons++;
+               }
+               dev_err(&cq->hwq.pdev->dev,
+                       "Should not have come here! cq_cons=0x%x qp=0x%x sq cons sw=0x%x hw=0x%x",
+                       cq_cons, qp->id, sw_sq_cons, cqe_sq_cons);
+               rc = -EINVAL;
+       }
+out:
+       return rc;
+}
+
 static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq,
                                     struct cq_req *hwcqe,
-                                    struct bnxt_qplib_cqe **pcqe, int *budget)
+                                    struct bnxt_qplib_cqe **pcqe, int *budget,
+                                    u32 cq_cons, struct bnxt_qplib_qp **lib_qp)
 {
        struct bnxt_qplib_qp *qp;
        struct bnxt_qplib_q *sq;
        struct bnxt_qplib_cqe *cqe;
-       u32 sw_cons, cqe_cons;
+       u32 sw_sq_cons, cqe_sq_cons;
+       struct bnxt_qplib_swq *swq;
        int rc = 0;
 
        qp = (struct bnxt_qplib_qp *)((unsigned long)
        }
        sq = &qp->sq;
 
-       cqe_cons = HWQ_CMP(le16_to_cpu(hwcqe->sq_cons_idx), &sq->hwq);
-       if (cqe_cons > sq->hwq.max_elements) {
+       cqe_sq_cons = HWQ_CMP(le16_to_cpu(hwcqe->sq_cons_idx), &sq->hwq);
+       if (cqe_sq_cons > sq->hwq.max_elements) {
                dev_err(&cq->hwq.pdev->dev,
                        "QPLIB: FP: CQ Process req reported ");
                dev_err(&cq->hwq.pdev->dev,
                        "QPLIB: sq_cons_idx 0x%x which exceeded max 0x%x",
-                       cqe_cons, sq->hwq.max_elements);
+                       cqe_sq_cons, sq->hwq.max_elements);
                return -EINVAL;
        }
        /* If we were in the middle of flushing the SQ, continue */
 
        /* Require to walk the sq's swq to fabricate CQEs for all previously
         * signaled SWQEs due to CQE aggregation from the current sq cons
-        * to the cqe_cons
+        * to the cqe_sq_cons
         */
        cqe = *pcqe;
        while (*budget) {
-               sw_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq);
-               if (sw_cons == cqe_cons)
+               sw_sq_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq);
+               if (sw_sq_cons == cqe_sq_cons)
+                       /* Done */
                        break;
+
+               swq = &sq->swq[sw_sq_cons];
                memset(cqe, 0, sizeof(*cqe));
                cqe->opcode = CQ_BASE_CQE_TYPE_REQ;
                cqe->qp_handle = (u64)(unsigned long)qp;
                cqe->src_qp = qp->id;
-               cqe->wr_id = sq->swq[sw_cons].wr_id;
-               cqe->type = sq->swq[sw_cons].type;
+               cqe->wr_id = swq->wr_id;
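+               /* The fence (phantom) WQE is internal; don't report it */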
+               if (cqe->wr_id == BNXT_QPLIB_FENCE_WRID)
+                       goto skip;
+               cqe->type = swq->type;
 
                /* For the last CQE, check for status.  For errors, regardless
                 * of the request being signaled or not, it must complete with
                 * the hwcqe error status
                 */
-               if (HWQ_CMP((sw_cons + 1), &sq->hwq) == cqe_cons &&
+               if (HWQ_CMP((sw_sq_cons + 1), &sq->hwq) == cqe_sq_cons &&
                    hwcqe->status != CQ_REQ_STATUS_OK) {
                        cqe->status = hwcqe->status;
                        dev_err(&cq->hwq.pdev->dev,
                                "QPLIB: FP: CQ Processed Req ");
                        dev_err(&cq->hwq.pdev->dev,
                                "QPLIB: wr_id[%d] = 0x%llx with status 0x%x",
-                               sw_cons, cqe->wr_id, cqe->status);
+                               sw_sq_cons, cqe->wr_id, cqe->status);
                        cqe++;
                        (*budget)--;
                        sq->flush_in_progress = true;
                        /* Must block new posting of SQ and RQ */
                        qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR;
+                       sq->condition = false;
+                       sq->single = false;
                } else {
-                       if (sq->swq[sw_cons].flags &
-                           SQ_SEND_FLAGS_SIGNAL_COMP) {
+                       if (swq->flags & SQ_SEND_FLAGS_SIGNAL_COMP) {
+                               /* Before we complete, do WA 9060 */
+                               if (do_wa9060(qp, cq, cq_cons, sw_sq_cons,
+                                             cqe_sq_cons)) {
+                                       *lib_qp = qp;
+                                       goto out;
+                               }
                                cqe->status = CQ_REQ_STATUS_OK;
                                cqe++;
                                (*budget)--;
                        }
                }
+skip:
                sq->hwq.cons++;
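+               /* In WA 9060 "single" mode, emit one completion per pass */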
+               if (sq->single)
+                       break;
        }
+out:
        *pcqe = cqe;
-       if (!*budget && HWQ_CMP(sq->hwq.cons, &sq->hwq) != cqe_cons) {
+       if (HWQ_CMP(sq->hwq.cons, &sq->hwq) != cqe_sq_cons) {
-               /* Out of budget */
+               /* Out of budget, or stopped early for the WA 9060 phantom */
                rc = -EAGAIN;
                goto done;
        }
+       /*
+        * Switch back to normal completion mode only after all of the
+        * WCs for this CQE have been generated.
+        */
+       sq->single = false;
        if (!sq->flush_in_progress)
                goto done;
 flush:
 }
 
 int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe,
-                      int num_cqes)
+                      int num_cqes, struct bnxt_qplib_qp **lib_qp)
 {
        struct cq_base *hw_cqe, **hw_cqe_ptr;
        unsigned long flags;
                case CQ_BASE_CQE_TYPE_REQ:
                        rc = bnxt_qplib_cq_process_req(cq,
                                                       (struct cq_req *)hw_cqe,
-                                                      &cqe, &budget);
+                                                      &cqe, &budget,
+                                                      sw_cons, lib_qp);
                        break;
                case CQ_BASE_CQE_TYPE_RES_RC:
                        rc = bnxt_qplib_cq_process_res_rc(cq,