From fd39442aa637811a064d8a39e26cb0bb0b7c3e2d Mon Sep 17 00:00:00 2001
From: Eli Cohen
Date: Mon, 19 Nov 2012 13:33:28 +0200
Subject: [PATCH] mlx4_ib: add blue flame support for kernel consumers

Using blue flame can improve latency by allowing the HW to more
efficiently access the WQE.  A consumer who wants to use blue flame has
to create the QP with inline support.  When posting a send WR, the
consumer has to set IB_SEND_INLINE in the send flags.  This approach is
similar to the one taken in userspace; that is, in order to use blue
flame you must use inline.  However, if the send WR is too large for
blue flame, only inline is used.

A kernel consumer that creates a QP with inline support will be
allocated a UAR and a blue flame register.  All QP doorbells go to that
UAR, and blue flame posts go to the blue flame register.  We make use
of all available registers in a blue flame page.

Signed-off-by: Eli Cohen
Signed-off-by: Yishai Hadas
(Ported from Mellanox OFED 2.4)
Signed-off-by: Mukesh Kacker
---
 drivers/infiniband/hw/mlx4/cq.c      |   2 +-
 drivers/infiniband/hw/mlx4/main.c    |  12 +-
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   4 +-
 drivers/infiniband/hw/mlx4/qp.c      | 168 ++++++++++++++++++++++++---
 4 files changed, 165 insertions(+), 21 deletions(-)
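
Illustrative usage (not part of the patch; placed after the "---"
separator, where git am does not pick it up): a minimal sketch of how a
kernel consumer might request inline support and post an inline send,
assuming an existing PD and CQ.  The function and variable names
(post_inline_send_example, pd, cq, payload) are hypothetical, and the
QP state transitions and teardown are omitted.

#include <linux/err.h>
#include <linux/string.h>
#include <rdma/ib_verbs.h>

/* Illustration only -- not part of this patch. */
static int post_inline_send_example(struct ib_pd *pd, struct ib_cq *cq,
				    void *payload, u32 len)
{
	struct ib_qp_init_attr init_attr = {
		.send_cq = cq,
		.recv_cq = cq,
		.qp_type = IB_QPT_RC,
		.sq_sig_type = IB_SIGNAL_ALL_WR,
		.cap = {
			.max_send_wr     = 16,
			.max_recv_wr     = 16,
			.max_send_sge    = 1,
			.max_recv_sge    = 1,
			/*
			 * Non-zero max_inline_data (within the device
			 * limit) is what makes create_qp_common()
			 * allocate a blue flame register for this QP.
			 */
			.max_inline_data = 128,
		},
	};
	struct ib_qp *qp;
	struct ib_sge sge;
	struct ib_send_wr wr, *bad_wr;

	qp = ib_create_qp(pd, &init_attr);
	if (IS_ERR(qp))
		return PTR_ERR(qp);

	/* ... transition the QP to RTS and exchange QPNs out of band ... */

	/*
	 * With IB_SEND_INLINE the payload is copied into the WQE at post
	 * time (lay_inline_data()), so addr is a kernel virtual address
	 * and lkey is not used.
	 */
	sge.addr   = (u64) (unsigned long) payload;
	sge.length = len;
	sge.lkey   = 0;

	memset(&wr, 0, sizeof(wr));
	wr.wr_id      = 1;
	wr.sg_list    = &sge;
	wr.num_sge    = 1;
	wr.opcode     = IB_WR_SEND;
	wr.send_flags = IB_SEND_SIGNALED | IB_SEND_INLINE;

	return ib_post_send(qp, &wr, &bad_wr);
}

If the post is a single inline WR whose descriptor fits in the blue
flame buffer, mlx4_ib_post_send() copies the WQE to the blue flame
register instead of ringing the doorbell; otherwise it rings the
regular doorbell on the QP's UAR.
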
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index e41c3d69c3bc..afcbd1025ffe 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -920,7 +920,7 @@ int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
 	mlx4_cq_arm(&to_mcq(ibcq)->mcq,
 		    (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
 		    MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT,
-		    to_mdev(ibcq->device)->uar_map,
+		    to_mdev(ibcq->device)->priv_uar.map,
 		    MLX4_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->uar_lock));
 
 	return 0;
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 9ff3e3a9d917..e2f683a9a8d1 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2160,10 +2160,12 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	if (mlx4_uar_alloc(dev, &ibdev->priv_uar))
 		goto err_pd;
 
-	ibdev->uar_map = ioremap((phys_addr_t) ibdev->priv_uar.pfn << PAGE_SHIFT,
-				 PAGE_SIZE);
-	if (!ibdev->uar_map)
+	ibdev->priv_uar.map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT,
+				      PAGE_SIZE);
+
+	if (!ibdev->priv_uar.map)
 		goto err_uar;
+
 	MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock);
 
 	ibdev->dev = dev;
@@ -2456,7 +2458,7 @@ err_counter:
 		mlx4_counter_free(ibdev->dev, ibdev->counters[i - 1]);
 
 err_map:
-	iounmap(ibdev->uar_map);
+	iounmap(ibdev->priv_uar.map);
 
 err_uar:
 	mlx4_uar_free(dev, &ibdev->priv_uar);
@@ -2569,7 +2571,7 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
 	}
 #endif
 
-	iounmap(ibdev->uar_map);
+	iounmap(ibdev->priv_uar.map);
 	for (p = 0; p < ibdev->num_ports; ++p)
 		if (ibdev->counters[p] != -1)
 			mlx4_counter_free(ibdev->dev, ibdev->counters[p]);
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 224ed9c9c1f6..236ba7258640 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -303,6 +303,8 @@ struct mlx4_ib_qp {
 	struct mlx4_roce_smac_vlan_info pri;
 	struct mlx4_roce_smac_vlan_info alt;
 	u64			reg_id;
+	int			max_inline_data;
+	struct mlx4_bf		bf;
 	struct list_head	qps_list;
 	struct list_head	cq_recv_list;
 	struct list_head	cq_send_list;
@@ -507,8 +509,6 @@ struct mlx4_ib_dev {
 	struct ib_device	ib_dev;
 	struct mlx4_dev		*dev;
 	int			num_ports;
-	void __iomem		*uar_map;
-
 	struct mlx4_uar		priv_uar;
 	u32			priv_pdn;
 	MLX4_DECLARE_DOORBELL_LOCK(uar_lock);
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 156a11a048f0..a9428284e1b5 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -42,6 +42,7 @@
 #include
 #include
 
+#include
 #include "mlx4_ib.h"
 #include "user.h"
@@ -117,6 +118,19 @@ static const __be32 mlx4_ib_opcode[] = {
 	[IB_WR_BIND_MW]			= cpu_to_be32(MLX4_OPCODE_BIND_MW),
 };
 
+#ifndef wc_wmb
+	#if defined(__i386__)
+	#define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
+	#elif defined(__x86_64__)
+	#define wc_wmb() asm volatile("sfence" ::: "memory")
+	#elif defined(__ia64__)
+	#define wc_wmb() asm volatile("fwb" ::: "memory")
+	#else
+	#define wc_wmb() wmb()
+	#endif
+#endif
+
+
 static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
 {
 	return container_of(mqp, struct mlx4_ib_sqp, qp);
@@ -521,8 +535,7 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 	cap->max_send_sge = min(qp->sq.max_gs,
 				min(dev->dev->caps.max_sq_sg,
 				    dev->dev->caps.max_rq_sg));
-	/* We don't support inline sends for kernel QPs (yet) */
-	cap->max_inline_data = 0;
+	qp->max_inline_data = cap->max_inline_data;
 
 	return 0;
 }
@@ -776,6 +789,16 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 			*qp->db.db = 0;
 		}
 
+		if (qp->max_inline_data) {
+			err = mlx4_bf_alloc(dev->dev, &qp->bf, 0);
+			if (err) {
+				pr_err("failed to allocate blue flame"
+				       " register (%d)", err);
+				qp->bf.uar = &dev->priv_uar;
+			}
+		} else
+			qp->bf.uar = &dev->priv_uar;
+
 		if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2,
 				   &qp->buf, gfp)) {
 			err = -ENOMEM;
 			goto err_db;
 		}
@@ -895,6 +918,9 @@ err_db:
 	if (!pd->uobject && qp_has_rq(init_attr))
 		mlx4_db_free(dev->dev, &qp->db);
 
+	if (qp->max_inline_data)
+		mlx4_bf_free(dev->dev, &qp->bf);
+
 err:
 	if (!*caller_qp)
 		kfree(qp);
@@ -1060,6 +1086,9 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
 			      MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
 			free_proxy_bufs(&dev->ib_dev, qp);
 		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+		if (qp->max_inline_data)
+			mlx4_bf_free(dev->dev, &qp->bf);
+
 		if (qp->rq.wqe_cnt)
 			mlx4_db_free(dev->dev, &qp->db);
 	}
@@ -1529,7 +1558,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 	if (qp->ibqp.uobject)
 		context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index);
 	else
-		context->usr_page = cpu_to_be32(dev->priv_uar.index);
+		context->usr_page = cpu_to_be32(qp->bf.uar->index);
 
 	if (attr_mask & IB_QP_DEST_QPN)
 		context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
@@ -2640,12 +2669,94 @@ static void add_zero_len_inline(void *wqe)
 	inl->byte_count = cpu_to_be32(1 << 31);
 }
 
+static int lay_inline_data(struct mlx4_ib_qp *qp, struct ib_send_wr *wr,
+			   void *wqe, int *sz)
+{
+	struct mlx4_wqe_inline_seg *seg;
+	void *addr;
+	int len, seg_len;
+	int num_seg;
+	int off, to_copy;
+	int i;
+	int inl = 0;
+
+	seg = wqe;
+	wqe += sizeof *seg;
+	off = ((unsigned long)wqe) & (unsigned long)(MLX4_INLINE_ALIGN - 1);
+	num_seg = 0;
+	seg_len = 0;
+
+	for (i = 0; i < wr->num_sge; ++i) {
+		addr = (void *) (unsigned long)(wr->sg_list[i].addr);
+		len = wr->sg_list[i].length;
+		inl += len;
+
+		if (inl > qp->max_inline_data) {
+			inl = 0;
+			return -1;
+		}
+
+		while (len >= MLX4_INLINE_ALIGN - off) {
+			to_copy = MLX4_INLINE_ALIGN - off;
+			memcpy(wqe, addr, to_copy);
+			len -= to_copy;
+			wqe += to_copy;
+			addr += to_copy;
+			seg_len += to_copy;
+			wmb(); /* see comment below */
+			seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
+			seg_len = 0;
+			seg = wqe;
+			wqe += sizeof *seg;
+			off = sizeof *seg;
+			++num_seg;
+		}
+
+		memcpy(wqe, addr, len);
+		wqe += len;
+		seg_len += len;
+		off += len;
+	}
+
+	if (seg_len) {
+		++num_seg;
+		/*
+		 * Need a barrier here to make sure
+		 * all the data is visible before the
+		 * byte_count field is set. Otherwise
+		 * the HCA prefetcher could grab the
+		 * 64-byte chunk with this inline
+		 * segment and get a valid (!=
+		 * 0xffffffff) byte count but stale
+		 * data, and end up sending the wrong
+		 * data.
+		 */
+		wmb();
+		seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
+	}
+
+	*sz = (inl + num_seg * sizeof *seg + 15) / 16;
+
+	return 0;
+}
+
+/*
+ * Avoid using memcpy() to copy to BlueFlame page, since memcpy()
+ * implementations may use move-string-buffer assembler instructions,
+ * which do not guarantee order of copying.
+ */
+static void mlx4_bf_copy(unsigned long *dst, unsigned long *src,
+			 unsigned bytecnt)
+{
+	__iowrite64_copy(dst, src, bytecnt / 8);
+}
+
 int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 		      struct ib_send_wr **bad_wr)
 {
 	struct mlx4_ib_qp *qp = to_mqp(ibqp);
 	void *wqe;
-	struct mlx4_wqe_ctrl_seg *ctrl;
+	struct mlx4_wqe_ctrl_seg *uninitialized_var(ctrl);
 	struct mlx4_wqe_data_seg *dseg;
 	unsigned long flags;
 	int nreq;
@@ -2659,6 +2770,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 	__be32 uninitialized_var(lso_hdr_sz);
 	__be32 blh;
 	int i;
+	int inl = 0;
 	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
 
 	spin_lock_irqsave(&qp->sq.lock, flags);
@@ -2688,6 +2800,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 		}
 
 		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+		*((u32 *) (&ctrl->vlan_tag)) = 0;
 		qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
 
 		ctrl->srcrb_flags =
@@ -2862,10 +2975,8 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 		 * cacheline. This avoids issues with WQE
 		 * prefetching.
 		 */
-
 		dseg = wqe;
 		dseg += wr->num_sge - 1;
-		size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);
 
 		/* Add one more inline data segment for ICRC for MLX sends */
 		if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
@@ -2876,8 +2987,19 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 			size += sizeof (struct mlx4_wqe_data_seg) / 16;
 		}
 
-		for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
-			set_data_seg(dseg, wr->sg_list + i);
+		if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) {
+			int sz;
+			err = lay_inline_data(qp, wr, wqe, &sz);
+			if (!err) {
+				inl = 1;
+				size += sz;
+			}
+		} else {
+			size += wr->num_sge *
+				(sizeof(struct mlx4_wqe_data_seg) / 16);
+			for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
+				set_data_seg(dseg, wr->sg_list + i);
+		}
 
 		/*
 		 * Possibly overwrite stamping in cacheline with LSO
@@ -2886,7 +3008,6 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 		 */
 		wmb();
 		*lso_wqe = lso_hdr_sz;
-
 		ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
 				    MLX4_WQE_CTRL_FENCE : 0) | size;
 
@@ -2925,7 +3046,27 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 	}
 
 out:
-	if (likely(nreq)) {
+	if (nreq == 1 && inl && size > 1 && size < qp->bf.buf_size / 16) {
+		ctrl->owner_opcode |= htonl((qp->sq_next_wqe & 0xffff) << 8);
+		/* We set above doorbell_qpn bits to 0 as part of vlan
+		 * tag initialization, so |= should be correct.
+		 */
+		*(u32 *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
+		/*
+		 * Make sure that descriptor is written to memory
+		 * before writing to BlueFlame page.
+		 */
+		wmb();
+
+		++qp->sq.head;
+
+		mlx4_bf_copy(qp->bf.reg + qp->bf.offset, (unsigned long *) ctrl,
+			     ALIGN(size * 16, 64));
+		wc_wmb();
+
+		qp->bf.offset ^= qp->bf.buf_size;
+
+	} else if (nreq) {
 		qp->sq.head += nreq;
 
 		/*
@@ -2934,8 +3075,7 @@ out:
 		 */
 		wmb();
 
-		writel(qp->doorbell_qpn,
-		       to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL);
+		writel(qp->doorbell_qpn, qp->bf.uar->map + MLX4_SEND_DOORBELL);
 
 		/*
 		 * Make sure doorbells don't leak out of SQ spinlock
@@ -2943,8 +3083,10 @@ out:
 		 */
 		mmiowb();
 
-		stamp_send_wqe(qp, stamp, size * 16);
+	}
 
+	if (likely(nreq)) {
+		stamp_send_wqe(qp, stamp, size * 16);
 		ind = pad_wraparound(qp, ind);
 		qp->sq_next_wqe = ind;
 	}
-- 
2.50.1