From 01f47882225a571c5ee8e7d018639a67c180baea Mon Sep 17 00:00:00 2001 From: =?utf8?q?H=C3=A5kon=20Bugge?= Date: Mon, 19 Jun 2017 12:23:03 +0200 Subject: [PATCH] IB/mlx4: Fix CM REQ retries in paravirt mode MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit CM REQs cannot be successfully retried, because a new pv_cm_id is created for each request, without checking if one already exists. This commit fixes this, by checking if an id exists before creating one. This bug can be provoked by running an RDMA CM user-land application, but inserting a five-second delay before the rdma_accept() call on the passive side. This delay is larger than the default CMA timeout, and triggers a retry from the active side. The retried REQ will use another pv_cm_id (the cm_id on the wire). This confuses the CM protocol and two REJs are sent from the passive side. This commit is required to achieve the reduced HA Brownout time, needed by Exadata. The Brownout issue is tracked by orabug 25521901. 
Orabug: 26287667 Suggested-by: Venkat Venkatsubra Signed-off-by: Håkon Bugge Reported-by: Wei Lin Guay Tested-by: Wei Lin Guay Reviewed-by: Yuval Shaia --- drivers/infiniband/hw/mlx4/cm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c index a21b5ddcbaf6..160c1326f547 100644 --- a/drivers/infiniband/hw/mlx4/cm.c +++ b/drivers/infiniband/hw/mlx4/cm.c @@ -325,6 +325,9 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id mad->mad_hdr.attr_id == CM_REP_ATTR_ID || mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { sl_cm_id = get_local_comm_id(mad); + id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id); + if (id) + goto cont; id = id_map_alloc(ibdev, slave_id, sl_cm_id); if (IS_ERR(id)) { mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n", @@ -345,6 +348,7 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id return -EINVAL; } +cont: set_local_comm_id(mad, id->pv_cm_id); if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) -- 2.50.1