www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
net/rds: Avoid stalled connection due to CM REQ retries
author Wei Lin Guay <wei.lin.guay@oracle.com>
Fri, 25 Aug 2017 09:13:51 +0000 (11:13 +0200)
committer Brian Maly <brian.maly@oracle.com>
Tue, 12 Jun 2018 00:37:25 +0000 (20:37 -0400)
RDS drops a connection and destroys its cm_id once a CM REJ is sent. In a
congested fabric, there is a race where a remote node receives a CM REJ
after CM has retried another CM REQ. In this scenario, the cm_id that sent
the CM REQ no longer exists, even though the remote end might respond
with a CM REP and wait for an incoming CM RTU. RDS connection
establishment is then stuck until the connection is destroyed after the CM
timeout. As a result, this leads to a very long brownout time. Thus, this
patch adds a mechanism to detect a rejected CM REQ and reject all
subsequent CM REQs that are retried by the CM.

Orabug: 28068627

Signed-off-by: Wei Lin Guay <wei.lin.guay@oracle.com>
Tested-by: Dib Chatterjee <dib.chatterjee@oracle.com>
(cherry picked from commit c5c4f1472bc788ddc69af713f975ad92bdefe206
repo https://linux-git.us.oracle.com/UEK/linux-wguay-public)

Conflict:
net/rds/ib_cm.c

Made it checkpatch clean.

v1->v2:
Added Shannon's recommendations

Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nelson@oracle.com>
Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Signed-off-by: Brian Maly <brian.maly@oracle.com>
net/rds/ib.h
net/rds/ib_cm.c

index 2445c4c0c9b438ef18d6b7701bacd46dcacfbd52..189b04ef4cd892c6796196b15647275b088d760f 100644 (file)
@@ -55,6 +55,8 @@
 #define        RDS_IB_CLEAN_CACHE      1
 
 #define RDS_IB_DEFAULT_FREG_PORT_NUM   1
+#define RDS_CM_RETRY_SEQ_EN    BIT(7)
+#define RDS_CM_REQ_SEQ_SZ      (RDS_CM_RETRY_SEQ_EN - 1)
 
 extern struct rw_semaphore rds_ib_devices_lock;
 extern struct list_head rds_ib_devices;
@@ -92,7 +94,7 @@ struct rds_ib_conn_priv_cmn {
        u8                      ricpc_protocol_minor;
        __be16                  ricpc_protocol_minor_mask;      /* bitmask */
        u8                      ricpc_tos;
-       u8                      ricpc_reserved1;
+       u8                      ricpc_cm_seq;
        __be16                  ricpc_frag_sz;
        __be64                  ricpc_ack_seq;
        __be32                  ricpc_credit;   /* non-zero enables flow ctl */
@@ -116,7 +118,7 @@ struct rds6_ib_connect_private {
 #define dp_protocol_minor      dp_cmn.ricpc_protocol_minor
 #define dp_protocol_minor_mask dp_cmn.ricpc_protocol_minor_mask
 #define dp_tos                 dp_cmn.ricpc_tos
-#define dp_reserved1           dp_cmn.ricpc_reserved1
+#define dp_cm_seq              dp_cmn.ricpc_cm_seq
 #define dp_frag_sz             dp_cmn.ricpc_frag_sz
 #define dp_ack_seq             dp_cmn.ricpc_ack_seq
 #define dp_credit              dp_cmn.ricpc_credit
@@ -271,6 +273,9 @@ struct rds_ib_connection {
        unsigned int            i_rx_wait_for_handler;
        atomic_t                i_worker_has_rx;
        atomic_t                i_cq_quiesce;
+       u8                      i_req_sequence;
+       u8                      i_prev_seq;
+       u8                      i_last_rej_seq;
 };
 
 /* This assumes that atomic_t is at least 32 bits */
@@ -278,6 +283,7 @@ struct rds_ib_connection {
 #define IB_GET_POST_CREDITS(v) ((v) >> 16)
 #define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
 #define IB_SET_POST_CREDITS(v) ((v) << 16)
+#define IB_GET_CM_SEQ_NUM(v)   ((v) & RDS_CM_REQ_SEQ_SZ)
 
 struct rds_ib_ipaddr {
        struct list_head        list;
index 603b295987bd35667168172cb275702dbb9b23c8..54a5d3d58461961c891970a52b0af2e1d7a766ae 100644 (file)
@@ -367,7 +367,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
                                      u32 protocol_version,
                                      u32 max_responder_resources,
                                      u32 max_initiator_depth, u16 frag,
-                                     bool isv6)
+                                     bool isv6, u8 seq)
 {
        struct rds_ib_connection *ic = conn->c_transport_data;
        struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
@@ -397,6 +397,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
                            cpu_to_be64(rds_ib_piggyb_ack(ic));
                        dp->ricp_v6.dp_tos = conn->c_tos;
                        dp->ricp_v6.dp_frag_sz = cpu_to_be16(frag);
+                       dp->ricp_v6.dp_cm_seq = seq;
 
                        conn_param->private_data = &dp->ricp_v6;
                        conn_param->private_data_len = sizeof(dp->ricp_v6);
@@ -413,6 +414,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
                            cpu_to_be64(rds_ib_piggyb_ack(ic));
                        dp->ricp_v4.dp_tos = conn->c_tos;
                        dp->ricp_v4.dp_frag_sz = cpu_to_be16(frag);
+                       dp->ricp_v4.dp_cm_seq = seq;
 
                        conn_param->private_data = &dp->ricp_v4;
                        conn_param->private_data_len = sizeof(dp->ricp_v4);
@@ -983,6 +985,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
        u32 version;
        int err = 1;
        u16 frag;
+       u8 cm_req_seq = 0;
+       bool cm_seq_check_enable = false;
 
        /* Check whether the remote protocol version matches ours. */
        version = rds_ib_protocol_compatible(event, isv6);
@@ -1007,12 +1011,16 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
                                goto out;
                        }
                }
+               cm_seq_check_enable = dp->ricp_v6.dp_cm_seq & RDS_CM_RETRY_SEQ_EN;
+               cm_req_seq = IB_GET_CM_SEQ_NUM(dp->ricp_v6.dp_cm_seq);
        } else {
                dp_cmn = &dp->ricp_v4.dp_cmn;
                ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr);
                ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr);
                saddr6 = &s_mapped_addr;
                daddr6 = &d_mapped_addr;
+               cm_seq_check_enable = dp->ricp_v4.dp_cm_seq & RDS_CM_RETRY_SEQ_EN;
+               cm_req_seq = IB_GET_CM_SEQ_NUM(dp->ricp_v4.dp_cm_seq);
        }
 
        rds_rtd_ptr(RDS_RTD_CM,
@@ -1056,12 +1064,27 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
         * see the comment above rds_queue_reconnect()
         */
        mutex_lock(&conn->c_cm_lock);
+       ic = conn->c_transport_data;
+
+       if (ic && cm_seq_check_enable) {
+               if (cm_req_seq != ic->i_prev_seq) {
+                       rds_rtd(RDS_RTD_CM_EXT_P,
+                               "cm_id %p conn %p updating ic->i_prev_seq %d cm_req_seq %d\n",
+                               cm_id, conn, ic->i_prev_seq, cm_req_seq);
+                       ic->i_prev_seq = cm_req_seq;
+               } else if (cm_req_seq == ic->i_prev_seq && ic->i_last_rej_seq == cm_req_seq) {
+                       rds_rtd(RDS_RTD_CM_EXT_P,
+                               "duplicated REQ cm_id %p conn %p reject! ic->i_last_rej_seq %d cm_req_seq %d\n",
+                               cm_id, conn, ic->i_last_rej_seq, cm_req_seq);
+                       goto out;
+               }
+       }
+
        if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
                /*
                 * in both of the cases below, the conn is half setup.
                 * we need to make sure the lower layers don't destroy it
                 */
-               ic = conn->c_transport_data;
                if (ic && ic->i_cm_id == cm_id)
                        destroy = 0;
                if (rds_conn_state(conn) == RDS_CONN_UP) {
@@ -1105,6 +1128,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
                                rds_ib_stats_inc(s_ib_connect_raced);
                        }
                }
+               if (ic && cm_seq_check_enable)
+                       ic->i_last_rej_seq = cm_req_seq;
                goto out;
        }
 
@@ -1152,7 +1177,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
        rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
                                  event->param.conn.responder_resources,
                                  event->param.conn.initiator_depth,
-                                 frag, isv6);
+                                 frag, isv6, cm_req_seq);
 
        /* rdma_accept() calls rdma_reject() internally if it fails */
        err = rdma_accept(cm_id, &conn_param);
@@ -1204,6 +1229,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
        union rds_ib_conn_priv dp;
        u16 frag;
        int ret;
+       u8 seq;
 
        ret = rds_ib_match_acl(ic->i_cm_id, &conn->c_faddr);
        if (ret < 0) {
@@ -1237,9 +1263,11 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
                goto out;
        }
        frag = rds_ib_set_frag_size(conn, ib_init_frag_size);
+       ic->i_req_sequence = IB_GET_CM_SEQ_NUM(ic->i_req_sequence + 1);
+       seq = RDS_CM_RETRY_SEQ_EN | ic->i_req_sequence;
        rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
                                  conn->c_proposed_version, UINT_MAX, UINT_MAX,
-                                 frag, isv6);
+                                 frag, isv6, seq);
        ret = rdma_connect(cm_id, &conn_param);
        if (ret) {
                pr_warn("RDS/IB: rdma_connect failed (%d)\n", ret);