www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
RDS: give up on half formed connections after 15s
author: Chris Mason <chris.mason@oracle.com>
Fri, 3 Feb 2012 16:07:54 +0000 (11:07 -0500)
committer: Mukesh Kacker <mukesh.kacker@oracle.com>
Tue, 7 Jul 2015 23:41:29 +0000 (16:41 -0700)
RDS relies on events to transition connections through a few
different states, but sometimes we get stuck and end up with
a half-formed connection that is never able to finish.

Either the other end has wandered off or there are bugs in
other layers, and any future connection attempts from the
other end are rejected because we're already working on a
connection attempt.

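This patch changes things to give up on half-formed connections
after 15 seconds: if an incoming connect finds the connection still
in RDS_CONN_CONNECTING more than 15 seconds after the attempt
started, the connection is dropped so that it can be retried.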

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Bang Nguyen <bang.nguyen@oracle.com>
net/rds/ib_cm.c
net/rds/rds.h
net/rds/threads.c

index 709ce2e5d8665ed3025d12890dc56253713e02f5..306a42928edecafeff9fb7086dfcfc339531de5f 100644 (file)
@@ -576,20 +576,49 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
         */
        mutex_lock(&conn->c_cm_lock);
        if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+               /*
+                * in both of the cases below, the conn is half setup.
+                * we need to make sure the lower layers don't destroy it
+                */
+               ic = conn->c_transport_data;
+               if (ic && ic->i_cm_id == cm_id)
+                       destroy = 0;
                if (rds_conn_state(conn) == RDS_CONN_UP) {
                        rdsdebug("incoming connect while connecting\n");
                        rds_conn_drop(conn);
                        rds_ib_stats_inc(s_ib_listen_closed_stale);
-               } else
-               if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
-                       /* Wait and see - our connect may still be succeeding */
-                       rds_ib_stats_inc(s_ib_connect_raced);
+               } else if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
+                       unsigned long now = get_seconds();
+
+                       /*
+                        * after 15 seconds, give up on existing connection
+                        * attempts and make them try again.  At this point
+                        * it's no longer a race but something has gone
+                        * horribly wrong
+                        */
+                       if (now > conn->c_connection_start &&
+                           now - conn->c_connection_start > 15) {
+                               printk(KERN_CRIT "rds connection racing for 15s, forcing reset "
+                                        "connection %u.%u.%u.%u->%u.%u.%u.%u\n",
+                                        NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));
+                               rds_conn_drop(conn);
+                               rds_ib_stats_inc(s_ib_listen_closed_stale);
+                       } else {
+                               /* Wait and see - our connect may still be succeeding */
+                               rds_ib_stats_inc(s_ib_connect_raced);
+                       }
                }
                goto out;
        }
 
        ic = conn->c_transport_data;
 
+       /*
+        * record the time we started trying to connect so that we can
+        * drop the connection if it doesn't work out after a while
+        */
+       conn->c_connection_start = get_seconds();
+
        rds_ib_set_protocol(conn, version);
        rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
 
index 3a19aa9948a6d4822f6d64412a6d45619f2ff165..c033c63609225652f90cb29800b65eb84928b46e 100644 (file)
@@ -123,6 +123,7 @@ struct rds_connection {
 
        struct list_head        c_map_item;
        unsigned long           c_map_queued;
+       unsigned long           c_connection_start; /* when was this connection started */
 
        unsigned int            c_unacked_packets;
        unsigned int            c_unacked_bytes;
index 0fd90f8c5f59c75c18c244701f3230f93c408d85..21aeecca483167b56aa618c9082243052ae2fcd0 100644 (file)
@@ -89,6 +89,7 @@ void rds_connect_complete(struct rds_connection *conn)
        set_bit(0, &conn->c_map_queued);
        queue_delayed_work(rds_wq, &conn->c_send_w, 0);
        queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+       conn->c_connection_start = get_seconds();
 }
 EXPORT_SYMBOL_GPL(rds_connect_complete);
 
@@ -143,6 +144,12 @@ void rds_connect_worker(struct work_struct *work)
 
        clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
        if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+               /*
+                * record the time we started trying to connect so that we can
+                * drop the connection if it doesn't work out after a while
+                */
+               conn->c_connection_start = get_seconds();
+
                ret = conn->c_trans->conn_connect(conn);
                rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
                        conn, &conn->c_laddr, &conn->c_faddr, ret);