From: Wei Lin Guay
Date: Fri, 25 Aug 2017 09:13:51 +0000 (+0200)
Subject: net/rds: Avoid stalled connection due to CM REQ retries
X-Git-Tag: v4.1.12-124.31.3~716
X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=8456dc853376a9254b325db2933c25d3ec9e8532;p=users%2Fjedix%2Flinux-maple.git

net/rds: Avoid stalled connection due to CM REQ retries

RDS drops a connection and destroys its cm_id once a CM REJ is sent.
In a congested fabric, there is a race where a remote node receives a
CM REJ only after the CM has already retried the CM REQ.  In this
scenario, the cm_id that sent the retried CM REQ no longer exists, yet
the remote end may answer it with a CM REP and then wait for a CM RTU
that will never arrive.  Connection establishment stalls until the
connection is destroyed on CM timeout, which leads to a very long
brownout time.  This patch therefore adds a mechanism to detect a
rejected CM REQ and to reject all subsequent CM REQs retried by the CM.

Orabug: 28068627

Signed-off-by: Wei Lin Guay
Tested-by: Dib Chatterjee

(cherry picked from commit c5c4f1472bc788ddc69af713f975ad92bdefe206
 repo https://linux-git.us.oracle.com/UEK/linux-wguay-public)

Conflict:
	net/rds/ib_cm.c

Made it checkpatch clean.

v1->v2: Added Shannon's recommendations

Signed-off-by: Håkon Bugge
Reviewed-by: Shannon Nelson
Reviewed-by: Zhu Yanjun
Signed-off-by: Brian Maly
---

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 2445c4c0c9b43..189b04ef4cd89 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -55,6 +55,8 @@
 #define RDS_IB_CLEAN_CACHE		1
 #define RDS_IB_DEFAULT_FREG_PORT_NUM	1
 
+#define RDS_CM_RETRY_SEQ_EN	BIT(7)
+#define RDS_CM_REQ_SEQ_SZ	(RDS_CM_RETRY_SEQ_EN - 1)
 
 extern struct rw_semaphore rds_ib_devices_lock;
 extern struct list_head rds_ib_devices;
@@ -92,7 +94,7 @@ struct rds_ib_conn_priv_cmn {
 	u8	ricpc_protocol_minor;
 	__be16	ricpc_protocol_minor_mask; /* bitmask */
 	u8	ricpc_tos;
-	u8	ricpc_reserved1;
+	u8	ricpc_cm_seq;
 	__be16	ricpc_frag_sz;
 	__be64	ricpc_ack_seq;
 	__be32	ricpc_credit;	/* non-zero enables flow ctl */
@@ -116,7 +118,7 @@ struct rds6_ib_connect_private {
 #define dp_protocol_minor	dp_cmn.ricpc_protocol_minor
 #define dp_protocol_minor_mask	dp_cmn.ricpc_protocol_minor_mask
 #define dp_tos			dp_cmn.ricpc_tos
-#define dp_reserved1		dp_cmn.ricpc_reserved1
+#define dp_cm_seq		dp_cmn.ricpc_cm_seq
 #define dp_frag_sz		dp_cmn.ricpc_frag_sz
 #define dp_ack_seq		dp_cmn.ricpc_ack_seq
 #define dp_credit		dp_cmn.ricpc_credit
@@ -271,6 +273,9 @@ struct rds_ib_connection {
 	unsigned int	i_rx_wait_for_handler;
 	atomic_t	i_worker_has_rx;
 	atomic_t	i_cq_quiesce;
+	u8		i_req_sequence;
+	u8		i_prev_seq;
+	u8		i_last_rej_seq;
 };
 
 /* This assumes that atomic_t is at least 32 bits */
@@ -278,6 +283,7 @@ struct rds_ib_connection {
 #define IB_GET_POST_CREDITS(v)	((v) >> 16)
 #define IB_SET_SEND_CREDITS(v)	((v) & 0xffff)
 #define IB_SET_POST_CREDITS(v)	((v) << 16)
+#define IB_GET_CM_SEQ_NUM(v)	((v) & RDS_CM_REQ_SEQ_SZ)
 
 struct rds_ib_ipaddr {
 	struct list_head	list;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 603b295987bd3..54a5d3d584619 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -367,7 +367,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 				      u32 protocol_version,
 				      u32 max_responder_resources,
 				      u32 max_initiator_depth, u16 frag,
-				      bool isv6)
+				      bool isv6, u8 seq)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
 	struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
@@ -397,6 +397,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 			cpu_to_be64(rds_ib_piggyb_ack(ic));
 		dp->ricp_v6.dp_tos = conn->c_tos;
 		dp->ricp_v6.dp_frag_sz = cpu_to_be16(frag);
+		dp->ricp_v6.dp_cm_seq = seq;
 
 		conn_param->private_data = &dp->ricp_v6;
 		conn_param->private_data_len = sizeof(dp->ricp_v6);
@@ -413,6 +414,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 			cpu_to_be64(rds_ib_piggyb_ack(ic));
 		dp->ricp_v4.dp_tos = conn->c_tos;
 		dp->ricp_v4.dp_frag_sz = cpu_to_be16(frag);
+		dp->ricp_v4.dp_cm_seq = seq;
 
 		conn_param->private_data = &dp->ricp_v4;
 		conn_param->private_data_len = sizeof(dp->ricp_v4);
@@ -983,6 +985,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 	u32 version;
 	int err = 1;
 	u16 frag;
+	u8 cm_req_seq = 0;
+	bool cm_seq_check_enable = false;
 
 	/* Check whether the remote protocol version matches ours. */
 	version = rds_ib_protocol_compatible(event, isv6);
@@ -1007,12 +1011,16 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 				goto out;
 			}
 		}
+		cm_seq_check_enable = dp->ricp_v6.dp_cm_seq & RDS_CM_RETRY_SEQ_EN;
+		cm_req_seq = IB_GET_CM_SEQ_NUM(dp->ricp_v6.dp_cm_seq);
 	} else {
 		dp_cmn = &dp->ricp_v4.dp_cmn;
 		ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr);
 		ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr);
 		saddr6 = &s_mapped_addr;
 		daddr6 = &d_mapped_addr;
+		cm_seq_check_enable = dp->ricp_v4.dp_cm_seq & RDS_CM_RETRY_SEQ_EN;
+		cm_req_seq = IB_GET_CM_SEQ_NUM(dp->ricp_v4.dp_cm_seq);
 	}
 
 	rds_rtd_ptr(RDS_RTD_CM,
@@ -1056,12 +1064,27 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 	 * see the comment above rds_queue_reconnect()
 	 */
 	mutex_lock(&conn->c_cm_lock);
+	ic = conn->c_transport_data;
+
+	if (ic && cm_seq_check_enable) {
+		if (cm_req_seq != ic->i_prev_seq) {
+			rds_rtd(RDS_RTD_CM_EXT_P,
+				"cm_id %p conn %p updating ic->i_prev_seq %d cm_req_seq %d\n",
+				cm_id, conn, ic->i_prev_seq, cm_req_seq);
+			ic->i_prev_seq = cm_req_seq;
+		} else if (cm_req_seq == ic->i_prev_seq && ic->i_last_rej_seq == cm_req_seq) {
+			rds_rtd(RDS_RTD_CM_EXT_P,
+				"duplicated REQ cm_id %p conn %p reject! ic->i_last_rej_seq %d cm_req_seq %d\n",
+				cm_id, conn, ic->i_last_rej_seq, cm_req_seq);
+			goto out;
+		}
+	}
+
 	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
 		/*
 		 * in both of the cases below, the conn is half setup.
 		 * we need to make sure the lower layers don't destroy it
 		 */
-		ic = conn->c_transport_data;
 		if (ic && ic->i_cm_id == cm_id)
 			destroy = 0;
 		if (rds_conn_state(conn) == RDS_CONN_UP) {
@@ -1105,6 +1128,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 				rds_ib_stats_inc(s_ib_connect_raced);
 			}
 		}
+		if (ic && cm_seq_check_enable)
+			ic->i_last_rej_seq = cm_req_seq;
 		goto out;
 	}
 
@@ -1152,7 +1177,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
 				  event->param.conn.responder_resources,
 				  event->param.conn.initiator_depth,
-				  frag, isv6);
+				  frag, isv6, cm_req_seq);
 
 	/* rdma_accept() calls rdma_reject() internally if it fails */
 	err = rdma_accept(cm_id, &conn_param);
@@ -1204,6 +1229,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
 	union rds_ib_conn_priv dp;
 	u16 frag;
 	int ret;
+	u8 seq;
 
 	ret = rds_ib_match_acl(ic->i_cm_id, &conn->c_faddr);
 	if (ret < 0) {
@@ -1237,9 +1263,11 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
 		goto out;
 	}
 	frag = rds_ib_set_frag_size(conn, ib_init_frag_size);
+	ic->i_req_sequence = IB_GET_CM_SEQ_NUM(ic->i_req_sequence + 1);
+	seq = RDS_CM_RETRY_SEQ_EN | ic->i_req_sequence;
 	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
 				  conn->c_proposed_version, UINT_MAX, UINT_MAX,
-				  frag, isv6);
+				  frag, isv6, seq);
 	ret = rdma_connect(cm_id, &conn_param);
 	if (ret) {
 		pr_warn("RDS/IB: rdma_connect failed (%d)\n", ret);
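
For reference, below is a minimal userspace sketch (not kernel code) of the
sequence scheme the patch carries in the formerly reserved private-data byte:
bit 7 (RDS_CM_RETRY_SEQ_EN) tells the passive side that the peer tags its
REQs, and the low seven bits carry a wrapping sequence number.  The three
macros mirror the ones added to net/rds/ib.h; struct peer_state,
next_req_seq(), accept_cm_req() and the main() demo are illustrative
stand-ins, not part of the patch, for the i_req_sequence / i_prev_seq /
i_last_rej_seq handling in net/rds/ib_cm.c.

/*
 * Userspace sketch of the CM REQ sequence check; assumes the same bit
 * layout as the patch (flag in bit 7, 7-bit sequence in the low bits).
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RDS_CM_RETRY_SEQ_EN	(1u << 7)	/* BIT(7) in the patch */
#define RDS_CM_REQ_SEQ_SZ	(RDS_CM_RETRY_SEQ_EN - 1)
#define IB_GET_CM_SEQ_NUM(v)	((v) & RDS_CM_REQ_SEQ_SZ)

struct peer_state {		/* mirrors i_prev_seq / i_last_rej_seq */
	uint8_t prev_seq;
	uint8_t last_rej_seq;
};

/* Active side: build the private-data byte for the next outgoing REQ. */
static uint8_t next_req_seq(uint8_t *req_sequence)
{
	*req_sequence = IB_GET_CM_SEQ_NUM(*req_sequence + 1);
	return RDS_CM_RETRY_SEQ_EN | *req_sequence;
}

/*
 * Passive side: return true if the REQ should be processed, false if it
 * is a CM retry of a REQ that was already rejected and must not be
 * answered with a REP that can never complete.
 */
static bool accept_cm_req(struct peer_state *ps, uint8_t dp_cm_seq)
{
	bool seq_check = dp_cm_seq & RDS_CM_RETRY_SEQ_EN;
	uint8_t seq = IB_GET_CM_SEQ_NUM(dp_cm_seq);

	if (!seq_check)			/* old peer: no sequence info */
		return true;

	if (seq != ps->prev_seq) {	/* genuinely new REQ */
		ps->prev_seq = seq;
		return true;
	}
	/* same sequence as the REQ we last rejected: a stale retry */
	return seq != ps->last_rej_seq;
}

int main(void)
{
	struct peer_state ps = { 0 };
	uint8_t tx_seq = 0;
	uint8_t wire;

	wire = next_req_seq(&tx_seq);			/* first REQ, seq 1 */
	printf("REQ seq 1 accepted: %d\n", accept_cm_req(&ps, wire));

	ps.last_rej_seq = IB_GET_CM_SEQ_NUM(wire);	/* pretend we sent a REJ */
	printf("retried REQ seq 1 accepted: %d\n", accept_cm_req(&ps, wire));

	wire = next_req_seq(&tx_seq);			/* fresh REQ, seq 2 */
	printf("REQ seq 2 accepted: %d\n", accept_cm_req(&ps, wire));
	return 0;
}

Because the flag lives in a byte that older peers leave as zero, the check is
simply skipped when RDS_CM_RETRY_SEQ_EN is not set, so the scheme remains
interoperable with peers that do not tag their CM REQs.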