val = PIDX(q->pend_cred / 8);
                if (!is_t4(adap->params.chip))
                        val |= DBTYPE(1);
+               val |= DBPRIO(1);
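+
+               /* The hardware takes Free List credits in units of 8
+                * descriptors, hence the division by 8 above and the
+                * "pend_cred &= 7" below that keeps the residue.
+                */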
                wmb();
-               t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL), DBPRIO(1) |
-                            QID(q->cntxt_id) | val);
+
+               /* If we're on T4, use the old doorbell mechanism; otherwise
+                * use the new BAR2 mechanism.
+                */
+               if (is_t4(adap->params.chip)) {
+                       t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL),
+                                    val | QID(q->cntxt_id));
+               } else {
+                       writel(val, adap->bar2 + q->udb + SGE_UDB_KDOORBELL);
+
+                       /* This Write Memory Barrier will force the write to
+                        * the User Doorbell area to be flushed.
+                        */
+                       wmb();
+               }
                q->pend_cred &= 7;
        }
 }
  */
 static inline void ring_tx_db(struct adapter *adap, struct sge_txq *q, int n)
 {
-       unsigned int *wr, index;
-       unsigned long flags;
-
        wmb();            /* write descriptors before telling HW */
-       spin_lock_irqsave(&q->db_lock, flags);
-       if (!q->db_disabled) {
-               if (is_t4(adap->params.chip)) {
+
+       if (is_t4(adap->params.chip)) {
+               u32 val = PIDX(n);
+               unsigned long flags;
+
+               /* For T4 we need to participate in the Doorbell Recovery
+                * mechanism.
+                */
+               spin_lock_irqsave(&q->db_lock, flags);
+               if (!q->db_disabled)
                        t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL),
-                                    QID(q->cntxt_id) | PIDX(n));
+                                    QID(q->cntxt_id) | val);
+               else
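+                       /* Doorbells are currently disabled; remember the
+                        * PIDX increment so the Doorbell Recovery logic
+                        * can apply it once doorbells are re-enabled.
+                        */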
+                       q->db_pidx_inc += n;
+               q->db_pidx = q->pidx;
+               spin_unlock_irqrestore(&q->db_lock, flags);
+       } else {
+               u32 val = PIDX_T5(n);
+
+               /* T4 and later chips share the same PIDX field offset within
+                * the doorbell, but T5 and later shrank the field in order to
+                * gain a bit for Doorbell Priority.  The field was absurdly
+                * large in the first place (14 bits) so we just use the T5
+                * and later limits and warn if a PIDX Increment is too large.
+                */
+               WARN_ON(val & DBPRIO(1));
+
+               /* For T5 and later we use the Write-Combine mapped BAR2 User
+                * Doorbell mechanism.  If we're only writing a single TX
+                * Descriptor and TX Write Combining hasn't been disabled, we
+                * can use the Write Combining Gather Buffer; otherwise we use
+                * the simple doorbell.
+                */
+               if (n == 1) {
+                       int index = (q->pidx
+                                    ? (q->pidx - 1)
+                                    : (q->size - 1));
+                       unsigned int *wr = (unsigned int *)&q->desc[index];
+
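+                       /* Copy the Work Request itself into the Write
+                        * Combining Gather Buffer so the doorbell write
+                        * carries the descriptor inline, saving the
+                        * hardware the DMA read of the Work Request.
+                        */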
+                       cxgb_pio_copy((u64 __iomem *)
+                                     (adap->bar2 + q->udb +
+                                      SGE_UDB_WCDOORBELL),
+                                     (u64 *)wr);
                } else {
-                       if (n == 1) {
-                               index = q->pidx ? (q->pidx - 1) : (q->size - 1);
-                               wr = (unsigned int *)&q->desc[index];
-                               cxgb_pio_copy((u64 __iomem *)
-                                             (adap->bar2 + q->udb + 64),
-                                             (u64 *)wr);
-                       } else
-                               writel(n,  adap->bar2 + q->udb + 8);
-                       wmb();
+                       writel(val, adap->bar2 + q->udb + SGE_UDB_KDOORBELL);
                }
-       } else
-               q->db_pidx_inc += n;
-       q->db_pidx = q->pidx;
-       spin_unlock_irqrestore(&q->db_lock, flags);
+
+               /* This Write Memory Barrier will force the write to the User
+                * Doorbell area to be flushed.  This is needed to prevent
+                * writes on different CPUs for the same queue from hitting
+                * the adapter out of order.  This is required when some Work
+                * Requests take the Write Combine Gather Buffer path (user
+                * doorbell area offset [SGE_UDB_WCDOORBELL..+63]) and some
+                * take the traditional path where we simply increment the
+                * PIDX (User Doorbell area SGE_UDB_KDOORBELL) and have the
+                * hardware DMA read the actual Work Request.
+                */
+               wmb();
+       }
 }
 
 /**
        unsigned int params;
        struct sge_rspq *q = container_of(napi, struct sge_rspq, napi);
        int work_done = process_responses(q, budget);
+       u32 val;
 
        if (likely(work_done < budget)) {
                napi_complete(napi);
        } else
                params = QINTR_TIMER_IDX(7);
 
-       t4_write_reg(q->adap, MYPF_REG(SGE_PF_GTS), CIDXINC(work_done) |
-                    INGRESSQID((u32)q->cntxt_id) | SEINTARM(params));
+       val = CIDXINC(work_done) | SEINTARM(params);
+       if (is_t4(q->adap->params.chip)) {
+               t4_write_reg(q->adap, MYPF_REG(SGE_PF_GTS),
+                            val | INGRESSQID((u32)q->cntxt_id));
+       } else {
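+               /* For T5 and later, write the Going To Sleep update
+                * through the Ingress Queue's BAR2 User Doorbell area.
+                */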
+               writel(val, q->adap->bar2 + q->udb + SGE_UDB_GTS);
+               wmb();
+       }
        return work_done;
 }
 
        unsigned int credits;
        const struct rsp_ctrl *rc;
        struct sge_rspq *q = &adap->sge.intrq;
+       u32 val;
 
        spin_lock(&adap->sge.intrq_lock);
        for (credits = 0; ; credits++) {
                rspq_next(q);
        }
 
-       t4_write_reg(adap, MYPF_REG(SGE_PF_GTS), CIDXINC(credits) |
-                    INGRESSQID(q->cntxt_id) | SEINTARM(q->intr_params));
+       val = CIDXINC(credits) | SEINTARM(q->intr_params);
+       if (is_t4(adap->params.chip)) {
+               t4_write_reg(adap, MYPF_REG(SGE_PF_GTS),
+                            val | INGRESSQID(q->cntxt_id));
+       } else {
+               writel(val, adap->bar2 + q->udb + SGE_UDB_GTS);
+               wmb();
+       }
        spin_unlock(&adap->sge.intrq_lock);
        return credits;
 }
        mod_timer(&s->tx_timer, jiffies + (budget ? TX_QCHECK_PERIOD : 2));
 }
 
+/**
+ *      udb_address - return the BAR2 User Doorbell address for a Queue
+ *      @adap: the adapter
+ *      @cntxt_id: the Queue Context ID
+ *      @qpp: Queues Per Page register value (holds the fields for all PFs)
+ *
+ *      Returns the BAR2 address of the User Doorbell associated with the
+ *      indicated Queue Context ID.  Note that this is only applicable
+ *      for T5 and later.
+ */
+static u64 udb_address(struct adapter *adap, unsigned int cntxt_id,
+                      unsigned int qpp)
+{
+       u64 udb;
+       unsigned int s_qpp;
+       unsigned short udb_density;
+       unsigned long qpshift;
+       int page;
+
+       BUG_ON(is_t4(adap->params.chip));
+
+       s_qpp = (QUEUESPERPAGEPF0 +
+               (QUEUESPERPAGEPF1 - QUEUESPERPAGEPF0) * adap->fn);
+       udb_density = 1 << ((qpp >> s_qpp) & QUEUESPERPAGEPF0_MASK);
+       qpshift = PAGE_SHIFT - ilog2(udb_density);
+       udb = cntxt_id << qpshift;
+       udb &= PAGE_MASK;
+       page = udb / PAGE_SIZE;
+       udb += (cntxt_id - (page * udb_density)) * SGE_UDB_SIZE;
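+
+       /* For example, with 4KB pages and a UDB density of 32 Queues/Page
+        * (so SGE_UDB_SIZE == 128 tiles a page exactly), Queue Context
+        * ID 37 gives qpshift = 12 - 5 = 7, page 1, slot 37 - 32 = 5,
+        * i.e. BAR2 offset 4096 + 5 * 128 = 4736.
+        */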
+
+       return udb;
+}
+
+static u64 udb_address_eq(struct adapter *adap, unsigned int cntxt_id)
+{
+       return udb_address(adap, cntxt_id,
+                          t4_read_reg(adap, SGE_EGRESS_QUEUES_PER_PAGE_PF));
+}
+
+static u64 udb_address_iq(struct adapter *adap, unsigned int cntxt_id)
+{
+       return udb_address(adap, cntxt_id,
+                          t4_read_reg(adap, SGE_INGRESS_QUEUES_PER_PAGE_PF));
+}
+
 int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq,
                     struct net_device *dev, int intr_idx,
                     struct sge_fl *fl, rspq_handler_t hnd)
        iq->next_intr_params = iq->intr_params;
        iq->cntxt_id = ntohs(c.iqid);
        iq->abs_id = ntohs(c.physiqid);
+       if (!is_t4(adap->params.chip))
+               iq->udb = udb_address_iq(adap, iq->cntxt_id);
        iq->size--;                           /* subtract status entry */
        iq->netdev = dev;
        iq->handler = hnd;
                fl->pidx = fl->cidx = 0;
                fl->alloc_failed = fl->large_alloc_failed = fl->starving = 0;
                adap->sge.egr_map[fl->cntxt_id - adap->sge.egr_start] = fl;
+
+               /* Note, we must initialize the Free List User Doorbell
+                * address before refilling the Free List!
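+                * refill_fl() can ring the Free List Doorbell, and for
+                * T5 and later that write goes through fl->udb.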
+                */
+               if (!is_t4(adap->params.chip))
+                       fl->udb = udb_address_eq(adap, fl->cntxt_id);
                refill_fl(adap, fl, fl_cap(fl), GFP_KERNEL);
        }
        return 0;
 static void init_txq(struct adapter *adap, struct sge_txq *q, unsigned int id)
 {
        q->cntxt_id = id;
-       if (!is_t4(adap->params.chip)) {
-               unsigned int s_qpp;
-               unsigned short udb_density;
-               unsigned long qpshift;
-               int page;
-
-               s_qpp = QUEUESPERPAGEPF1 * adap->fn;
-               udb_density = 1 << QUEUESPERPAGEPF0_GET((t4_read_reg(adap,
-                               SGE_EGRESS_QUEUES_PER_PAGE_PF) >> s_qpp));
-               qpshift = PAGE_SHIFT - ilog2(udb_density);
-               q->udb = q->cntxt_id << qpshift;
-               q->udb &= PAGE_MASK;
-               page = q->udb / PAGE_SIZE;
-               q->udb += (q->cntxt_id - (page * udb_density)) * 128;
-       }
+       if (!is_t4(adap->params.chip))
+               q->udb = udb_address_eq(adap, q->cntxt_id);
 
        q->in_use = 0;
        q->cidx = q->pidx = 0;