mlx4_ib: add blue flame support for kernel consumers
author		Eli Cohen <eli@mellanox.co.il>
		Mon, 19 Nov 2012 11:33:28 +0000 (13:33 +0200)
committer	Mukesh Kacker <mukesh.kacker@oracle.com>
		Tue, 7 Jul 2015 21:38:11 +0000 (14:38 -0700)
Using blue flame can improve latency by allowing the HW to access the
WQE more efficiently. A consumer who wants to use blue flame has to
create the QP with inline support. When posting a send WR, the consumer
has to set IB_SEND_INLINE in the send flags. This approach is similar
to the one taken in userspace; that is, in order to use blue flame you
must use inline. However, if the send WR is too large for blue flame,
only inline is used.
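As an illustration only (not part of this patch), a kernel ULP would opt
in roughly as follows using the standard ib_verbs API; the pd, cq,
buffer and lkey parameters are placeholders, the sizes are arbitrary,
and the QP state transitions to RTS are omitted:

/* illustrative sketch; needs <rdma/ib_verbs.h> */
static int example_inline_send(struct ib_pd *pd, struct ib_cq *cq,
			       void *buf, u32 lkey)
{
	struct ib_qp_init_attr init_attr = {
		.send_cq	= cq,
		.recv_cq	= cq,
		.qp_type	= IB_QPT_RC,
		.sq_sig_type	= IB_SIGNAL_REQ_WR,
		.cap = {
			.max_send_wr	 = 64,
			.max_recv_wr	 = 64,
			.max_send_sge	 = 1,
			.max_recv_sge	 = 1,
			/* non-zero max_inline_data is what makes the QP
			 * eligible for a UAR + blue flame register */
			.max_inline_data = 64,
		},
	};
	struct ib_sge sge = {
		.addr	= (u64) (unsigned long) buf,
		.length	= 32,
		.lkey	= lkey,		/* not used by the HCA for inline data */
	};
	struct ib_send_wr wr = {
		.opcode		= IB_WR_SEND,
		.sg_list	= &sge,
		.num_sge	= 1,
		/* IB_SEND_INLINE is required to get a blue flame post */
		.send_flags	= IB_SEND_INLINE | IB_SEND_SIGNALED,
	};
	struct ib_send_wr *bad_wr;
	struct ib_qp *qp;

	qp = ib_create_qp(pd, &init_attr);
	if (IS_ERR(qp))
		return PTR_ERR(qp);

	/* ib_modify_qp() transitions to INIT/RTR/RTS omitted for brevity */

	return ib_post_send(qp, &wr, &bad_wr);
}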

A kernel consumer that creates a QP with inline support will be
allocated a UAR and a blue flame register. All QP doorbells will go to
that UAR, and blue flame posts will go to the blue flame register. We
make use of all available registers in a blue flame page.
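For reference, the doorbell decision this adds to mlx4_ib_post_send()
boils down to the following (a condensed paraphrase of the qp.c hunk
below, not a literal excerpt; mlx4_bf_copy() and wc_wmb() are defined
in that hunk):

	if (nreq == 1 && inl && size > 1 && size < qp->bf.buf_size / 16) {
		/* a single small inline WR: write the WQE itself through
		 * the write-combining blue flame register */
		mlx4_bf_copy(qp->bf.reg + qp->bf.offset, (unsigned long *) ctrl,
			     ALIGN(size * 16, 64));
		wc_wmb();
		qp->bf.offset ^= qp->bf.buf_size;	/* flip between BF halves */
	} else if (nreq) {
		/* otherwise ring the regular send doorbell on the QP's UAR */
		writel(qp->doorbell_qpn, qp->bf.uar->map + MLX4_SEND_DOORBELL);
	}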

Signed-off-by: Eli Cohen <eli@mellanox.co.il>
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
(Ported from Mellanox OFED 2.4)

Signed-off-by: Mukesh Kacker <mukesh.kacker@oracle.com>
drivers/infiniband/hw/mlx4/cq.c
drivers/infiniband/hw/mlx4/main.c
drivers/infiniband/hw/mlx4/mlx4_ib.h
drivers/infiniband/hw/mlx4/qp.c

diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index e41c3d69c3bc8228a21149df2c919a4da349543f..afcbd1025ffe674649d39a24572d35856923a4e6 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -920,7 +920,7 @@ int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
        mlx4_cq_arm(&to_mcq(ibcq)->mcq,
                    (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
                    MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT,
-                   to_mdev(ibcq->device)->uar_map,
+                   to_mdev(ibcq->device)->priv_uar.map,
                    MLX4_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->uar_lock));
 
        return 0;
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 9ff3e3a9d917d1e63358283a6397691cd89c1693..e2f683a9a8d1836fc503931c00e513aba69c5562 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2160,10 +2160,12 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
        if (mlx4_uar_alloc(dev, &ibdev->priv_uar))
                goto err_pd;
 
-       ibdev->uar_map = ioremap((phys_addr_t) ibdev->priv_uar.pfn << PAGE_SHIFT,
-                                PAGE_SIZE);
-       if (!ibdev->uar_map)
+       ibdev->priv_uar.map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT,
+               PAGE_SIZE);
+
+       if (!ibdev->priv_uar.map)
                goto err_uar;
+
        MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock);
 
        ibdev->dev = dev;
@@ -2456,7 +2458,7 @@ err_counter:
                        mlx4_counter_free(ibdev->dev, ibdev->counters[i - 1]);
 
 err_map:
-       iounmap(ibdev->uar_map);
+       iounmap(ibdev->priv_uar.map);
 
 err_uar:
        mlx4_uar_free(dev, &ibdev->priv_uar);
@@ -2569,7 +2571,7 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
        }
 #endif
 
-       iounmap(ibdev->uar_map);
+       iounmap(ibdev->priv_uar.map);
        for (p = 0; p < ibdev->num_ports; ++p)
                if (ibdev->counters[p] != -1)
                        mlx4_counter_free(ibdev->dev, ibdev->counters[p]);
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 224ed9c9c1f69885f24c2d43591c40ae2452da7f..236ba7258640e8e2737096cbfff7ad0c4a904e69 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -303,6 +303,8 @@ struct mlx4_ib_qp {
        struct mlx4_roce_smac_vlan_info pri;
        struct mlx4_roce_smac_vlan_info alt;
        u64                     reg_id;
+       int                     max_inline_data;
+       struct mlx4_bf          bf;
        struct list_head        qps_list;
        struct list_head        cq_recv_list;
        struct list_head        cq_send_list;
@@ -507,8 +509,6 @@ struct mlx4_ib_dev {
        struct ib_device        ib_dev;
        struct mlx4_dev        *dev;
        int                     num_ports;
-       void __iomem           *uar_map;
-
        struct mlx4_uar         priv_uar;
        u32                     priv_pdn;
        MLX4_DECLARE_DOORBELL_LOCK(uar_lock);
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 156a11a048f0645852c8246ba7669368f59a4480..a9428284e1b5290338844e53d23d7ac46128379b 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -42,6 +42,7 @@
 
 #include <linux/mlx4/driver.h>
 #include <linux/mlx4/qp.h>
+#include <linux/io.h>
 
 #include "mlx4_ib.h"
 #include "user.h"
@@ -117,6 +118,19 @@ static const __be32 mlx4_ib_opcode[] = {
        [IB_WR_BIND_MW]                         = cpu_to_be32(MLX4_OPCODE_BIND_MW),
 };
 
+#ifndef wc_wmb
+       #if defined(__i386__)
+               #define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
+       #elif defined(__x86_64__)
+               #define wc_wmb() asm volatile("sfence" ::: "memory")
+       #elif defined(__ia64__)
+               #define wc_wmb() asm volatile("fwb" ::: "memory")
+       #else
+               #define wc_wmb() wmb()
+       #endif
+#endif
+
+
 static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
 {
        return container_of(mqp, struct mlx4_ib_sqp, qp);
@@ -521,8 +535,7 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
        cap->max_send_sge = min(qp->sq.max_gs,
                                min(dev->dev->caps.max_sq_sg,
                                    dev->dev->caps.max_rq_sg));
-       /* We don't support inline sends for kernel QPs (yet) */
-       cap->max_inline_data = 0;
+       qp->max_inline_data = cap->max_inline_data;
 
        return 0;
 }
@@ -776,6 +789,16 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
                        *qp->db.db = 0;
                }
 
+               if (qp->max_inline_data) {
+                       err = mlx4_bf_alloc(dev->dev, &qp->bf, 0);
+                       if (err) {
+                               pr_err("failed to allocate blue flame"
+                                      " register (%d)", err);
+                               qp->bf.uar = &dev->priv_uar;
+                       }
+               } else
+                       qp->bf.uar = &dev->priv_uar;
+
                if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, gfp)) {
                        err = -ENOMEM;
                        goto err_db;
@@ -895,6 +918,9 @@ err_db:
        if (!pd->uobject && qp_has_rq(init_attr))
                mlx4_db_free(dev->dev, &qp->db);
 
+       if (qp->max_inline_data)
+               mlx4_bf_free(dev->dev, &qp->bf);
+
 err:
        if (!*caller_qp)
                kfree(qp);
@@ -1060,6 +1086,9 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
                    MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
                        free_proxy_bufs(&dev->ib_dev, qp);
                mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+               if (qp->max_inline_data)
+                       mlx4_bf_free(dev->dev, &qp->bf);
+
                if (qp->rq.wqe_cnt)
                        mlx4_db_free(dev->dev, &qp->db);
        }
@@ -1529,7 +1558,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
        if (qp->ibqp.uobject)
                context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index);
        else
-               context->usr_page = cpu_to_be32(dev->priv_uar.index);
+               context->usr_page = cpu_to_be32(qp->bf.uar->index);
 
        if (attr_mask & IB_QP_DEST_QPN)
                context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
@@ -2640,12 +2669,94 @@ static void add_zero_len_inline(void *wqe)
        inl->byte_count = cpu_to_be32(1 << 31);
 }
 
+static int lay_inline_data(struct mlx4_ib_qp *qp, struct ib_send_wr *wr,
+                          void *wqe, int *sz)
+{
+       struct mlx4_wqe_inline_seg *seg;
+       void *addr;
+       int len, seg_len;
+       int num_seg;
+       int off, to_copy;
+       int i;
+       int inl = 0;
+
+       seg = wqe;
+       wqe += sizeof *seg;
+       off = ((unsigned long)wqe) & (unsigned long)(MLX4_INLINE_ALIGN - 1);
+       num_seg = 0;
+       seg_len = 0;
+
+       for (i = 0; i < wr->num_sge; ++i) {
+               addr = (void *) (unsigned long)(wr->sg_list[i].addr);
+               len  = wr->sg_list[i].length;
+               inl += len;
+
+               if (inl > qp->max_inline_data) {
+                       inl = 0;
+                       return -1;
+               }
+
+               while (len >= MLX4_INLINE_ALIGN - off) {
+                       to_copy = MLX4_INLINE_ALIGN - off;
+                       memcpy(wqe, addr, to_copy);
+                       len -= to_copy;
+                       wqe += to_copy;
+                       addr += to_copy;
+                       seg_len += to_copy;
+                       wmb(); /* see comment below */
+                       seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
+                       seg_len = 0;
+                       seg = wqe;
+                       wqe += sizeof *seg;
+                       off = sizeof *seg;
+                       ++num_seg;
+               }
+
+               memcpy(wqe, addr, len);
+               wqe += len;
+               seg_len += len;
+               off += len;
+       }
+
+       if (seg_len) {
+               ++num_seg;
+               /*
+                * Need a barrier here to make sure
+                * all the data is visible before the
+                * byte_count field is set.  Otherwise
+                * the HCA prefetcher could grab the
+                * 64-byte chunk with this inline
+                * segment and get a valid (!=
+                * 0xffffffff) byte count but stale
+                * data, and end up sending the wrong
+                * data.
+                */
+               wmb();
+               seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
+       }
+
+       *sz = (inl + num_seg * sizeof *seg + 15) / 16;
+
+       return 0;
+}
+
+/*
+ * Avoid using memcpy() to copy to BlueFlame page, since memcpy()
+ * implementations may use move-string-buffer assembler instructions,
+ * which do not guarantee order of copying.
+ */
+static void mlx4_bf_copy(unsigned long *dst, unsigned long *src,
+                               unsigned bytecnt)
+{
+       __iowrite64_copy(dst, src, bytecnt / 8);
+}
+
 int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                      struct ib_send_wr **bad_wr)
 {
        struct mlx4_ib_qp *qp = to_mqp(ibqp);
        void *wqe;
-       struct mlx4_wqe_ctrl_seg *ctrl;
+       struct mlx4_wqe_ctrl_seg *uninitialized_var(ctrl);
        struct mlx4_wqe_data_seg *dseg;
        unsigned long flags;
        int nreq;
@@ -2659,6 +2770,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
        __be32 uninitialized_var(lso_hdr_sz);
        __be32 blh;
        int i;
+       int inl = 0;
        struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
 
        spin_lock_irqsave(&qp->sq.lock, flags);
@@ -2688,6 +2800,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                }
 
                ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+               *((u32 *) (&ctrl->vlan_tag)) = 0;
                qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
 
                ctrl->srcrb_flags =
@@ -2862,10 +2975,8 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                 * cacheline.  This avoids issues with WQE
                 * prefetching.
                 */
-
                dseg = wqe;
                dseg += wr->num_sge - 1;
-               size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);
 
                /* Add one more inline data segment for ICRC for MLX sends */
                if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
@@ -2876,8 +2987,19 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                        size += sizeof (struct mlx4_wqe_data_seg) / 16;
                }
 
-               for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
-                       set_data_seg(dseg, wr->sg_list + i);
+               if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) {
+                       int sz;
+                       err = lay_inline_data(qp, wr, wqe, &sz);
+                       if (!err) {
+                               inl = 1;
+                               size += sz;
+                       }
+               } else {
+                       size += wr->num_sge *
+                               (sizeof(struct mlx4_wqe_data_seg) / 16);
+                       for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
+                               set_data_seg(dseg, wr->sg_list + i);
+               }
 
                /*
                 * Possibly overwrite stamping in cacheline with LSO
@@ -2886,7 +3008,6 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                 */
                wmb();
                *lso_wqe = lso_hdr_sz;
-
                ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
                                    MLX4_WQE_CTRL_FENCE : 0) | size;
 
@@ -2925,7 +3046,27 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
        }
 
 out:
-       if (likely(nreq)) {
+       if (nreq == 1 && inl && size > 1 && size < qp->bf.buf_size / 16) {
+               ctrl->owner_opcode |= htonl((qp->sq_next_wqe & 0xffff) << 8);
+               /* We set the doorbell_qpn bits to 0 above as part of vlan
+                * tag initialization, so |= should be correct.
+                */
+               *(u32 *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
+               /*
+                * Make sure that descriptor is written to memory
+                * before writing to BlueFlame page.
+                */
+               wmb();
+
+               ++qp->sq.head;
+
+               mlx4_bf_copy(qp->bf.reg + qp->bf.offset, (unsigned long *) ctrl,
+                            ALIGN(size * 16, 64));
+               wc_wmb();
+
+               qp->bf.offset ^= qp->bf.buf_size;
+
+       } else if (nreq) {
                qp->sq.head += nreq;
 
                /*
@@ -2934,8 +3075,7 @@ out:
                 */
                wmb();
 
-               writel(qp->doorbell_qpn,
-                      to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL);
+               writel(qp->doorbell_qpn, qp->bf.uar->map + MLX4_SEND_DOORBELL);
 
                /*
                 * Make sure doorbells don't leak out of SQ spinlock
@@ -2943,8 +3083,10 @@ out:
                 */
                mmiowb();
 
-               stamp_send_wqe(qp, stamp, size * 16);
+       }
 
+       if (likely(nreq)) {
+               stamp_send_wqe(qp, stamp, size * 16);
                ind = pad_wraparound(qp, ind);
                qp->sq_next_wqe = ind;
        }