event->event, rds_ib_event_str(event->event), data);
}
+static void rds_ib_cq_comp_handler_fastreg(struct ib_cq *cq, void *context)
+{
+ struct rds_ib_device *rds_ibdev = context;
+
+ tasklet_schedule(&rds_ibdev->fastreg_tasklet);
+}
+
static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context)
{
struct rds_connection *conn = context;
tasklet_schedule(&ic->i_rtasklet);
}
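+/* Drain the fastreg cq: poll completions in batches of RDS_WC_MAX and
+ * hand each one to rds_ib_fcq_handler().
+ */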
+static void poll_fcq(struct rds_ib_device *rds_ibdev, struct ib_cq *cq,
+ struct ib_wc *wcs)
+{
+ int nr, i;
+ struct ib_wc *wc;
+
+ while ((nr = ib_poll_cq(cq, RDS_WC_MAX, wcs)) > 0) {
+ for (i = 0; i < nr; i++) {
+ wc = wcs + i;
+ rds_ib_fcq_handler(rds_ibdev, wc);
+ }
+ }
+}
+
static void poll_scq(struct rds_ib_connection *ic, struct ib_cq *cq,
struct ib_wc *wcs)
{
}
}
+static void rds_ib_tasklet_fn_fastreg(unsigned long data)
+{
+ struct rds_ib_device *rds_ibdev = (struct rds_ib_device *)data;
+
+ poll_fcq(rds_ibdev, rds_ibdev->fastreg_cq, rds_ibdev->fastreg_wc);
+ ib_req_notify_cq(rds_ibdev->fastreg_cq, IB_CQ_NEXT_COMP);
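+ /* Poll once more after re-arming: a completion that slipped in
+ * between the first poll and ib_req_notify_cq() would otherwise go
+ * unnoticed until the next cq event.
+ */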
+ poll_fcq(rds_ibdev, rds_ibdev->fastreg_cq, rds_ibdev->fastreg_wc);
+}
+
void rds_ib_tasklet_fn_send(unsigned long data)
{
struct rds_ib_connection *ic = (struct rds_ib_connection *) data;
struct ib_qp_init_attr attr;
struct rds_ib_device *rds_ibdev;
int ret;
- int mr_reg, mr_inv;
+ int mr_reg;
/*
* It's normal to see a null device if an incoming connection races
if (!rds_ibdev)
return -EOPNOTSUPP;
- /* In the case of FRWR, mr registration and invalidation wrs use the
+ /* In the case of FRWR, mr registration wrs use the
* same work queue as the send wrs. To make sure that we are not
* overflowing the workqueue, we allocate separately for each operation.
- * mr_reg and mr_inv are the wr numbers allocated for reg and inv.
+ * mr_reg is the number of wrs allocated for reg.
*/
- if (rds_ibdev->use_fastreg) {
+ if (rds_ibdev->use_fastreg)
mr_reg = RDS_IB_DEFAULT_FREG_WR;
- mr_inv = 1;
- } else {
+ else
mr_reg = 0;
- mr_inv = 0;
- }
/* add the conn now so that connection establishment has the dev */
rds_ib_add_conn(rds_ibdev, conn);
- if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1 + mr_reg + mr_inv)
+ if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1 + mr_reg)
rds_ib_ring_resize(&ic->i_send_ring,
- rds_ibdev->max_wrs - 1 - mr_reg - mr_inv);
+ rds_ibdev->max_wrs - 1 - mr_reg);
if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);
ic->i_scq_vector = ibdev_get_unused_vector(rds_ibdev);
ic->i_scq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
rds_ib_cq_event_handler, conn,
- ic->i_send_ring.w_nr + 1 + mr_reg + mr_inv,
+ ic->i_send_ring.w_nr + 1 + mr_reg,
ic->i_scq_vector);
if (IS_ERR(ic->i_scq)) {
ret = PTR_ERR(ic->i_scq);
attr.event_handler = rds_ib_qp_event_handler;
attr.qp_context = conn;
/* + 1 to allow for the single ack message */
- attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1 + mr_reg + mr_inv;
+ attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1 + mr_reg;
attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
attr.cap.max_send_sge = rds_ibdev->max_sge;
attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
kfree(ic);
}
+
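+/* Tear down the device-wide fastreg qp and cq. Callers either hold
+ * fastreg_lock for write (reset path) or are on a setup failure path
+ * where no wrs can be in flight.
+ */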
+void rds_ib_destroy_fastreg(struct rds_ib_device *rds_ibdev)
+{
+ /* Posting on the fastreg qp is gated by fastreg_lock, so by this
+ * point we should have received completions for all the wrs posted.
+ */
+ WARN_ON(atomic_read(&rds_ibdev->fastreg_wrs) != RDS_IB_DEFAULT_FREG_WR);
+
+ tasklet_kill(&rds_ibdev->fastreg_tasklet);
+ if (rds_ibdev->fastreg_qp) {
+ /* Destroy qp */
+ if (ib_destroy_qp(rds_ibdev->fastreg_qp))
+ pr_err("Error destroying fastreg qp for rds_ibdev: %p\n",
+ rds_ibdev);
+ rds_ibdev->fastreg_qp = NULL;
+ }
+
+ if (rds_ibdev->fastreg_cq) {
+ /* Destroy cq and cq_vector */
+ if (ib_destroy_cq(rds_ibdev->fastreg_cq))
+ pr_err("Error destroying fastreg cq for rds_ibdev: %p\n",
+ rds_ibdev);
+ rds_ibdev->fastreg_cq = NULL;
+ ibdev_put_vector(rds_ibdev, rds_ibdev->fastreg_cq_vector);
+ }
+}
+
+int rds_ib_setup_fastreg(struct rds_ib_device *rds_ibdev)
+{
+ int ret = 0;
+ struct ib_qp_init_attr qp_init_attr;
+ struct ib_qp_attr qp_attr;
+ struct ib_port_attr port_attr;
+ int gid_index = 0;
+ union ib_gid dgid;
+
+ /* Initialise the wr budget before anything can fail: the clean_up
+ * path calls rds_ib_destroy_fastreg(), which WARN_ONs unless
+ * fastreg_wrs holds the full budget.
+ */
+ atomic_set(&rds_ibdev->fastreg_wrs, RDS_IB_DEFAULT_FREG_WR);
+
+ rds_ibdev->fastreg_cq_vector = ibdev_get_unused_vector(rds_ibdev);
+ rds_ibdev->fastreg_cq = ib_create_cq(rds_ibdev->dev,
+ rds_ib_cq_comp_handler_fastreg,
+ rds_ib_cq_event_handler,
+ rds_ibdev,
+ RDS_IB_DEFAULT_FREG_WR + 1,
+ rds_ibdev->fastreg_cq_vector);
+ if (IS_ERR(rds_ibdev->fastreg_cq)) {
+ ret = PTR_ERR(rds_ibdev->fastreg_cq);
+ rds_ibdev->fastreg_cq = NULL;
+ ibdev_put_vector(rds_ibdev, rds_ibdev->fastreg_cq_vector);
+ rds_rtd(RDS_RTD_ERR, "ib_create_cq failed: %d\n", ret);
+ goto clean_up;
+ }
+
+ ret = ib_req_notify_cq(rds_ibdev->fastreg_cq, IB_CQ_NEXT_COMP);
+ if (ret)
+ goto clean_up;
+ rds_rtd(RDS_RTD_RDMA_IB,
+ "Successfully created fast reg cq for ib_device: %p\n",
+ rds_ibdev->dev);
+
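+ /* This qp never carries data traffic; it only absorbs fastreg and
+ * local-invalidate wrs, so no recv wrs or sges are sized for it.
+ */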
+ memset(&qp_init_attr, 0, sizeof(qp_init_attr));
+ qp_init_attr.send_cq = rds_ibdev->fastreg_cq;
+ qp_init_attr.recv_cq = rds_ibdev->fastreg_cq;
+ qp_init_attr.qp_type = IB_QPT_RC;
+ /* 1 WR is used for invalidation */
+ qp_init_attr.cap.max_send_wr = RDS_IB_DEFAULT_FREG_WR + 1;
+ qp_init_attr.cap.max_recv_wr = 0;
+ qp_init_attr.cap.max_send_sge = 0;
+ qp_init_attr.cap.max_recv_sge = 0;
+
+ rds_ibdev->fastreg_qp = ib_create_qp(rds_ibdev->pd, &qp_init_attr);
+ if (IS_ERR(rds_ibdev->fastreg_qp)) {
+ ret = PTR_ERR(rds_ibdev->fastreg_qp);
+ rds_ibdev->fastreg_qp = NULL;
+ rds_rtd(RDS_RTD_ERR, "ib_create_qp failed: %d\n", ret);
+ goto clean_up;
+ }
+ rds_rtd(RDS_RTD_RDMA_IB,
+ "Successfully created fast reg qp for ib_device: %p\n",
+ rds_ibdev->dev);
+
+ /* Use modify_qp verb to change the state from RESET to INIT */
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_INIT;
+ qp_attr.pkey_index = 0;
+ qp_attr.qp_access_flags = IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE;
+ qp_attr.port_num = RDS_IB_DEFAULT_FREG_PORT_NUM;
+
+ ret = ib_modify_qp(rds_ibdev->fastreg_qp, &qp_attr, IB_QP_STATE |
+ IB_QP_PKEY_INDEX |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PORT);
+ if (ret) {
+ rds_rtd(RDS_RTD_ERR, "ib_modify_qp to IB_QPS_INIT failed: %d\n",
+ ret);
+ goto clean_up;
+ }
+ rds_rtd(RDS_RTD_RDMA_IB,
+ "Successfully moved qp to INIT state for ib_device: %p\n",
+ rds_ibdev->dev);
+
+ /* query port to get the lid */
+ ret = ib_query_port(rds_ibdev->dev, RDS_IB_DEFAULT_FREG_PORT_NUM,
+ &port_attr);
+ if (ret) {
+ rds_rtd(RDS_RTD_ERR, "ib_query_port failed: %d\n", ret);
+ goto clean_up;
+ }
+ rds_rtd(RDS_RTD_RDMA_IB,
+ "Successfully queried the port and the port is in %d state\n",
+ port_attr.state);
+
+ ret = ib_query_gid(rds_ibdev->dev, RDS_IB_DEFAULT_FREG_PORT_NUM,
+ gid_index, &dgid);
+ if (ret) {
+ rds_rtd(RDS_RTD_ERR, "ib_query_gid failed: %d\n", ret);
+ goto clean_up;
+ }
+ rds_rtd(RDS_RTD_RDMA_IB,
+ "Successfully queried the gid_index %d and the gid is " RDS_IB_GID_FMT "\n",
+ gid_index, RDS_IB_GID_ARG(dgid));
+
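+ /* The qp is connected back to itself: dest_qp_num and dlid below
+ * are our own. Fastreg and invalidate wrs complete locally without
+ * touching the wire, but the qp must still reach RTS before the
+ * HCA will process anything posted to it.
+ */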
+ /* Use modify_qp verb to change the state from INIT to RTR */
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_RTR;
+ qp_attr.path_mtu = IB_MTU_4096;
+ qp_attr.dest_qp_num = rds_ibdev->fastreg_qp->qp_num;
+ qp_attr.rq_psn = 1;
+ qp_attr.ah_attr.ah_flags = IB_AH_GRH;
+ qp_attr.ah_attr.dlid = port_attr.lid;
+ qp_attr.ah_attr.sl = 0;
+ qp_attr.ah_attr.src_path_bits = 0;
+ qp_attr.ah_attr.port_num = RDS_IB_DEFAULT_FREG_PORT_NUM;
+ qp_attr.ah_attr.grh.hop_limit = 1;
+ qp_attr.ah_attr.grh.dgid = dgid;
+ qp_attr.ah_attr.grh.sgid_index = gid_index;
+
+ ret = ib_modify_qp(rds_ibdev->fastreg_qp, &qp_attr, IB_QP_STATE |
+ IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER);
+ if (ret) {
+ rds_rtd(RDS_RTD_ERR, "ib_modify_qp to IB_QPS_RTR failed: %d\n",
+ ret);
+ goto clean_up;
+ }
+ rds_rtd(RDS_RTD_RDMA_IB,
+ "Successfully moved qp to RTR state for ib_device: %p\n",
+ rds_ibdev->dev);
+
+ /* Use modify_qp verb to change the state from RTR to RTS */
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_RTS;
+ qp_attr.sq_psn = 1;
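+ /* timeout 14 -> local ack timeout of 4.096us * 2^14 (~67ms);
+ * retry_cnt/rnr_retry 6 -> up to 6 transport and 6 RNR retries
+ * before a wr is failed.
+ */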
+ qp_attr.timeout = 14;
+ qp_attr.retry_cnt = 6;
+ qp_attr.rnr_retry = 6;
+ qp_attr.max_rd_atomic = 1;
+
+ ret = ib_modify_qp(rds_ibdev->fastreg_qp, &qp_attr, IB_QP_STATE |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN |
+ IB_QP_MAX_QP_RD_ATOMIC);
+ if (ret) {
+ rds_rtd(RDS_RTD_ERR, "ib_modify_qp to IB_QPS_RTS failed: %d\n",
+ ret);
+ goto clean_up;
+ }
+ rds_rtd(RDS_RTD_RDMA_IB,
+ "Successfully moved qp to RTS state for ib_device: %p\n",
+ rds_ibdev->dev);
+
+ tasklet_init(&rds_ibdev->fastreg_tasklet, rds_ib_tasklet_fn_fastreg,
+ (unsigned long)rds_ibdev);
+
+clean_up:
+ if (ret)
+ rds_ib_destroy_fastreg(rds_ibdev);
+ return ret;
+}
+
+void rds_ib_reset_fastreg(struct work_struct *work)
+{
+ struct rds_ib_device *rds_ibdev = container_of(work,
+ struct rds_ib_device,
+ fastreg_reset_w);
+
+ pr_warn("RDS: IB: Resetting fastreg qp\n");
+ /* Acquire write lock to stop posting on fastreg qp before resetting */
+ down_write(&rds_ibdev->fastreg_lock);
+
+ rds_ib_destroy_fastreg(rds_ibdev);
+ if (rds_ib_setup_fastreg(rds_ibdev)) {
+ /* Failing to set up the fastreg qp at this stage is unexpected.
+ * If it happens, warn and return immediately without releasing
+ * fastreg_lock, so that nothing can post on the defunct qp.
+ */
+ pr_err("RDS: IB: Failed to setup fastreg resources in %s\n",
+ __func__);
+ WARN_ON(1);
+ return;
+ }
+
+ up_write(&rds_ibdev->fastreg_lock);
+ pr_warn("RDS: IB: Finished resetting fastreg qp\n");
+}
goto out;
}
- if (conn) {
+ if (conn)
ic = conn->c_transport_data;
- } else if (rds_ibdev->use_fastreg) {
- /* TODO: Add FRWR support for RDS_GET_MR */
- ret = -EOPNOTSUPP;
- goto out;
- }
if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) {
ret = -ENODEV;
return ret;
}
-static int rds_ib_rdma_build_fastreg(struct rds_ib_mr *ibmr)
+static int rds_ib_rdma_build_fastreg(struct rds_ib_device *rds_ibdev,
+ struct rds_ib_mr *ibmr)
{
struct ib_send_wr f_wr, *failed_wr;
+ struct ib_qp *qp;
+ atomic_t *n_wrs;
int ret = 0;
- while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
- atomic_inc(&ibmr->ic->i_fastreg_wrs);
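+ /* Connected MRs ride the connection's qp and wr budget; the
+ * connection-less RDS_GET_MR path uses the device-wide fastreg qp,
+ * taking fastreg_lock shared so a concurrent reset cannot destroy
+ * the qp underneath us.
+ */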
+ if (ibmr->ic) {
+ n_wrs = &ibmr->ic->i_fastreg_wrs;
+ qp = ibmr->ic->i_cm_id->qp;
+ } else {
+ down_read(&rds_ibdev->fastreg_lock);
+ n_wrs = &rds_ibdev->fastreg_wrs;
+ qp = rds_ibdev->fastreg_qp;
+ }
+
+ while (atomic_dec_return(n_wrs) <= 0) {
+ atomic_inc(n_wrs);
/* If schedule() ends up being called here many times, we could
* replace it with wait_event() in the future.
*/
f_wr.send_flags = IB_SEND_SIGNALED;
failed_wr = &f_wr;
- ret = ib_post_send(ibmr->ic->i_cm_id->qp, &f_wr, &failed_wr);
+ ret = ib_post_send(qp, &f_wr, &failed_wr);
BUG_ON(failed_wr != &f_wr);
if (ret) {
- atomic_inc(&ibmr->ic->i_fastreg_wrs);
+ atomic_inc(n_wrs);
ibmr->fr_state = MR_IS_INVALID;
pr_warn_ratelimited("RDS/IB: %s:%d ib_post_send returned %d\n",
__func__, __LINE__, ret);
}
out:
+ if (!ibmr->ic)
+ up_read(&rds_ibdev->fastreg_lock);
return ret;
}
if (ret)
goto out;
- ret = rds_ib_rdma_build_fastreg(ibmr);
+ ret = rds_ib_rdma_build_fastreg(rds_ibdev, ibmr);
if (ret)
goto out;
struct ib_send_wr s_wr, *failed_wr;
int ret = 0;
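+ /* Serialize against rds_ib_reset_fastreg(): hold fastreg_lock
+ * shared while posting on the device fastreg qp.
+ */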
+ down_read(&ibmr->device->fastreg_lock);
+
if (ibmr->fr_state != MR_IS_VALID)
goto out;
s_wr.send_flags = IB_SEND_SIGNALED;
failed_wr = &s_wr;
- ret = ib_post_send(ibmr->ic->i_cm_id->qp, &s_wr, &failed_wr);
+ ret = ib_post_send(ibmr->device->fastreg_qp, &s_wr, &failed_wr);
BUG_ON(failed_wr != &s_wr);
if (ret) {
ibmr->fr_state = MR_IS_STALE;
}
wait_for_completion(&ibmr->wr_comp);
-
out:
+ up_read(&ibmr->device->fastreg_lock);
return ret;
}
+void rds_ib_fcq_handler(struct rds_ib_device *rds_ibdev, struct ib_wc *wc)
+{
+ struct rds_ib_mr *ibmr = (struct rds_ib_mr *)wc->wr_id;
+ enum rds_ib_fr_state fr_state = ibmr->fr_state;
+
+ WARN_ON(fr_state == MR_IS_STALE);
+
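+ /* A flush or error completion means the fastreg qp has gone into
+ * the error state; mark the mr stale and have the worker rebuild
+ * the qp.
+ */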
+ if (wc->status != IB_WC_SUCCESS) {
+ pr_warn("RDS: IB: MR completion on fastreg qp status %u vendor_err %u\n",
+ wc->status, wc->vendor_err);
+ ibmr->fr_state = MR_IS_STALE;
+ queue_work(rds_wq, &rds_ibdev->fastreg_reset_w);
+ }
+
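+ /* Judge by the state sampled before the error handling above could
+ * flip it to MR_IS_STALE: a completed registration (mr valid)
+ * returns its wr to the fastreg_wrs budget, while an invalidation
+ * does not, since the single invalidate wr is the +1 on
+ * max_send_wr.
+ */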
+ if (fr_state == MR_IS_INVALID) {
+ complete(&ibmr->wr_comp);
+ } else if (fr_state == MR_IS_VALID) {
+ atomic_inc(&rds_ibdev->fastreg_wrs);
+ complete(&ibmr->wr_comp);
+ }
+}
+
void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
{
struct rds_ib_mr *ibmr = (struct rds_ib_mr *)wc->wr_id;