RDS: Idle QoS connections during remote peer reboot causing application brownout

author Chien-Hua Yen <chien.yen@oracle.com>

Tue, 18 Mar 2014 21:46:49 +0000 (14:46 -0700)

committer Mukesh Kacker <mukesh.kacker@oracle.com>

Wed, 8 Jul 2015 20:59:51 +0000 (13:59 -0700)
author Chien-Hua Yen <chien.yen@oracle.com>
Tue, 18 Mar 2014 21:46:49 +0000 (14:46 -0700)
committer Mukesh Kacker <mukesh.kacker@oracle.com>
Wed, 8 Jul 2015 20:59:51 +0000 (13:59 -0700)
diff --git a/net/rds/connection.c b/net/rds/connection.c

index e5337aef1c8d449812c2c494e762b85584ffe1b3..3067c6be7bf0d16c62bd0b04f9fc9779ffd9d9fa 100644 (file)
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -580,6 +580,32 @@ void rds_conn_exit(void)
                                  rds_conn_message_info_retrans);
  }
  
+/*
+ * An idle QoS lane (c_tos != 0) may not get a disconnect event when the
+ * remote peer reboots, which delays its reconnect and causes an application
+ * brownout once the peer is back online.  To avoid that, proactively drop
+ * every QoS lane in conn's hash bucket that has the same local address,
+ * peer address and transport as conn.  This is called while the base lane
+ * (c_tos == 0) is being dropped, so that all lanes reconnect together.
+ */
+static void rds_conn_shutdown_lanes(struct rds_connection *conn)
+{
+       struct hlist_head *head =
+               rds_conn_bucket(conn->c_laddr, conn->c_faddr);
+       struct rds_connection *tmp;
+
+       rcu_read_lock();
+       hlist_for_each_entry_rcu(tmp, head, c_hash_node) {
+               if (tmp->c_faddr == conn->c_faddr &&
+                       tmp->c_laddr == conn->c_laddr &&
+                       tmp->c_tos != 0 &&
+                       tmp->c_trans == conn->c_trans) {
+                               rds_conn_drop(tmp);
+               }
+       }
+       rcu_read_unlock();
+}
+
  /*
   * Force a disconnect
   */
@@ -608,6 +634,10 @@ void rds_conn_drop(struct rds_connection *conn)
                         conn->c_reconnect_drops,
                         conn->c_reconnect_err);
                 conn->c_reconnect_warn = 0;
+
+               /* see comment for rds_conn_shutdown_lanes() */
+               if (conn->c_tos == 0)
+                       rds_conn_shutdown_lanes(conn);
         }
         conn->c_reconnect_drops++;
  
diff --git a/net/rds/ib.c b/net/rds/ib.c

index 442cbbb55b1dd46a35f559041bf9e8cdef642657..948a3b3c036352d95bbfceec3d9a1ce837b06437 100644 (file)
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -772,7 +772,7 @@ static int rds_ib_move_ip(char                      *from_dev,
  
                                         work->conn = (struct rds_ib_connection *)ic->conn;
                                         INIT_DELAYED_WORK(&work->work, rds_ib_conn_drop);
-                                       queue_delayed_work(rds_wq, &work->work,
+                                       queue_delayed_work(rds_aux_wq, &work->work,
                                                 msecs_to_jiffies(1000 * rds_ib_active_bonding_reconnect_delay));
                                 } else
                                         rds_conn_drop(ic->conn);
author	Chien-Hua Yen <chien.yen@oracle.com>
	Tue, 18 Mar 2014 21:46:49 +0000 (14:46 -0700)
committer	Mukesh Kacker <mukesh.kacker@oracle.com>
	Wed, 8 Jul 2015 20:59:51 +0000 (13:59 -0700)
net/rds/connection.c		patch \| blob \| history
net/rds/ib.c		patch \| blob \| history