From: Chris Mason Date: Fri, 3 Feb 2012 16:07:54 +0000 (-0500) Subject: RDS: give up on half formed connections after 15s X-Git-Tag: v4.1.12-92~319^2^2~2^2~52 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=d783cad5b5f03d845788e04114df93ba7cda42a9;p=users%2Fjedix%2Flinux-maple.git RDS: give up on half formed connections after 15s RDS relies on events to transition connections through a few different states, but sometimes we get stuck and end up with a half formed connection that is never able to finish. The other end has either wandered off or there are bugs in other layers, and we end up with any future attempts from the other end rejected because we're already working on a connection attempt. This patch changes things to give up on half formed connections after 15 seconds. Signed-off-by: Chris Mason Signed-off-by: Bang Nguyen --- diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 709ce2e5d8665..306a42928edec 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -576,20 +576,49 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, */ mutex_lock(&conn->c_cm_lock); if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + /* + * in both of the cases below, the conn is half setup. + * we need to make sure the lower layers don't destroy it + */ + ic = conn->c_transport_data; + if (ic && ic->i_cm_id == cm_id) + destroy = 0; if (rds_conn_state(conn) == RDS_CONN_UP) { rdsdebug("incoming connect while connecting\n"); rds_conn_drop(conn); rds_ib_stats_inc(s_ib_listen_closed_stale); - } else - if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { - /* Wait and see - our connect may still be succeeding */ - rds_ib_stats_inc(s_ib_connect_raced); + } else if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { + unsigned long now = get_seconds(); + + /* + * after 15 seconds, give up on existing connection + * attempts and make them try again. 
At this point + * it's no longer a race but something has gone + * horribly wrong + */ + if (now > conn->c_connection_start && + now - conn->c_connection_start > 15) { + printk(KERN_CRIT "rds connection racing for 15s, forcing reset " + "connection %u.%u.%u.%u->%u.%u.%u.%u\n", + NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); + rds_conn_drop(conn); + rds_ib_stats_inc(s_ib_listen_closed_stale); + } else { + /* Wait and see - our connect may still be succeeding */ + rds_ib_stats_inc(s_ib_connect_raced); + } } goto out; } ic = conn->c_transport_data; + /* + * record the time we started trying to connect so that we can + * drop the connection if it doesn't work out after a while + */ + conn->c_connection_start = get_seconds(); + rds_ib_set_protocol(conn, version); rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); diff --git a/net/rds/rds.h b/net/rds/rds.h index 3a19aa9948a6d..c033c63609225 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -123,6 +123,7 @@ struct rds_connection { struct list_head c_map_item; unsigned long c_map_queued; + unsigned long c_connection_start; /* when was this connection started */ unsigned int c_unacked_packets; unsigned int c_unacked_bytes; diff --git a/net/rds/threads.c b/net/rds/threads.c index 0fd90f8c5f59c..21aeecca48316 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -89,6 +89,7 @@ void rds_connect_complete(struct rds_connection *conn) set_bit(0, &conn->c_map_queued); queue_delayed_work(rds_wq, &conn->c_send_w, 0); queue_delayed_work(rds_wq, &conn->c_recv_w, 0); + conn->c_connection_start = get_seconds(); } EXPORT_SYMBOL_GPL(rds_connect_complete); @@ -143,6 +144,12 @@ void rds_connect_worker(struct work_struct *work) clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags); if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + /* + * record the time we started trying to connect so that we can + * drop the connection if it doesn't work out after a while + */ + conn->c_connection_start = get_seconds(); + ret 
= conn->c_trans->conn_connect(conn); rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", conn, &conn->c_laddr, &conn->c_faddr, ret);