www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
RDS: SA query optimization
authorBang Nguyen <bang.nguyen@oracle.com>
Wed, 16 Apr 2014 20:56:02 +0000 (13:56 -0700)
committerMukesh Kacker <mukesh.kacker@oracle.com>
Wed, 8 Jul 2015 20:59:57 +0000 (13:59 -0700)
SA query optimization
The fact is that all QoS lanes share the same physical path
between an IP pair. The only difference is the service level
that affects the quality of service for each lane. With
that, we have the following optimization:

1. Lane 0 to issue SA query request to the SM. All other
lanes will wait for lane 0 to finish route resolution,
then copy in the resolved path and fill in its service
level.

2. One-sided reconnect to reduce reconnect racing, thus
further reducing the number of SA queries to the SM.

Reducing brownout for non-zero lanes
In some cases, RDMA CM is delaying the disconnect event
after switch/node failure and this is causing extra
brownout for RDS reconnection. The workaround is to have
lane 0 probe other lanes by sending an HB msg. If the lane
is down, this will cause a send completion error and an
immediate reconnect.

Orabug: 18801977

Signed-off-by: Bang Nguyen <bang.nguyen@oracle.com>
Signed-off-by: Mukesh Kacker <mukesh.kacker@oracle.com>
(cherry picked from commit 8f84b1ff46e449e99c5fcf4d4f94dc2e8ea82cd7)
Signed-off-by: Jerry Snitselaar <jerry.snitselaar@oracle.com>
(cherry picked from commit 8991a87c6c3fc8b17383a140bd6f15a958e31298)

net/rds/connection.c
net/rds/ib.c
net/rds/ib.h
net/rds/ib_cm.c
net/rds/ib_sysctl.c
net/rds/rdma_transport.c
net/rds/rds.h
net/rds/threads.c

index 3067c6be7bf0d16c62bd0b04f9fc9779ffd9d9fa..c57e2d953f8bd3cefb8101b0e0e86021b09df567 100644 (file)
@@ -214,6 +214,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
        INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
        INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker);
        INIT_DELAYED_WORK(&conn->c_hb_w, rds_hb_worker);
+       INIT_DELAYED_WORK(&conn->c_reconn_w, rds_reconnect_timeout);
        INIT_DELAYED_WORK(&conn->c_reject_w, rds_reject_worker);
        INIT_WORK(&conn->c_down_w, rds_shutdown_worker);
        mutex_init(&conn->c_cm_lock);
@@ -580,15 +581,7 @@ void rds_conn_exit(void)
                                 rds_conn_message_info_retrans);
 }
 
-/*
- * Drop connections when the idled QoS connection not getting
- * disconnect event when the remote peer reboots.  This is causing
- * delayed reconnect, hence application brownout when the peer comes online.
- * The fix was to proactively drop and reconnect them when the base lane is
- * going through the reconnect to the reboot peer, in effect forcing all
- * the lanes to go through the reconnect at the same time.
- */
-static void rds_conn_shutdown_lanes(struct rds_connection *conn)
+static void rds_conn_probe_lanes(struct rds_connection *conn)
 {
        struct hlist_head *head =
                rds_conn_bucket(conn->c_laddr, conn->c_faddr);
@@ -600,7 +593,8 @@ static void rds_conn_shutdown_lanes(struct rds_connection *conn)
                        tmp->c_laddr == conn->c_laddr &&
                        tmp->c_tos != 0 &&
                        tmp->c_trans == conn->c_trans) {
-                               rds_conn_drop(tmp);
+                       if (rds_conn_up(tmp))
+                               rds_send_hb(tmp, 0);
                }
        }
        rcu_read_unlock();
@@ -618,11 +612,16 @@ void rds_conn_drop(struct rds_connection *conn)
                conn->c_reconnect_warn = 1;
                conn->c_reconnect_drops = 0;
                conn->c_reconnect_err = 0;
+               conn->c_reconnect_racing = 0;
                printk(KERN_INFO "RDS/IB: connection "
                        "<%u.%u.%u.%u,%u.%u.%u.%u,%d> dropped\n",
                        NIPQUAD(conn->c_laddr),
                        NIPQUAD(conn->c_faddr),
                        conn->c_tos);
+
+               if (conn->c_tos == 0)
+                       rds_conn_probe_lanes(conn);
+
        } else if ((conn->c_reconnect_warn) &&
                   (now - conn->c_reconnect_start > 60)) {
                printk(KERN_INFO "RDS/IB: re-connect "
@@ -635,9 +634,8 @@ void rds_conn_drop(struct rds_connection *conn)
                        conn->c_reconnect_err);
                conn->c_reconnect_warn = 0;
 
-               /* see comment for rds_conn_shutdown_lanes() */
                if (conn->c_tos == 0)
-                       rds_conn_shutdown_lanes(conn);
+                       rds_conn_probe_lanes(conn);
        }
        conn->c_reconnect_drops++;
 
index 3fd8457b474d2a28d72bed5395ee7e093db0c515..f75a95c4fc7c12abc5a5aec327ee81c438327f4d 100644 (file)
@@ -427,7 +427,7 @@ static void rds_ib_send_gratuitous_arp(struct net_device    *out_dev,
 
        /* Send multiple ARPs to improve reliability */
        for (i = 0; i < rds_ib_active_bonding_arps; i++) {
-               arp_send(ARPOP_REQUEST, ETH_P_ARP,
+               arp_send(ARPOP_REPLY, ETH_P_ARP,
                        ip_addr, out_dev,
                        ip_addr, NULL,
                        dev_addr, NULL);
@@ -1736,8 +1736,14 @@ static int rds_ib_netdev_callback(struct notifier_block *self, unsigned long eve
 
                break;
        case NETDEV_DOWN:
-               INIT_DELAYED_WORK(&work->work, rds_ib_failover);
-               queue_delayed_work(rds_wq, &work->work, 0);
+               if (rds_ib_sysctl_active_bonding) {
+                       INIT_DELAYED_WORK(&work->work, rds_ib_failover);
+                       queue_delayed_work(rds_wq, &work->work, 0);
+               } else {
+                       ip_config[port].port_state = RDS_IB_PORT_INIT;
+                       ip_config[port].ip_active_port = port;
+                       kfree(work);
+               }
                break;
        }
 
index d8c5996282ba1487a4aafb9e5d0511b0cff631e8..da0798a8cf645c79e53eca2ea58ded4cccd2c3b4 100644 (file)
@@ -600,5 +600,6 @@ extern unsigned long rds_ib_sysctl_max_unsig_wrs;
 extern unsigned long rds_ib_sysctl_max_unsig_bytes;
 extern unsigned long rds_ib_sysctl_max_recv_allocation;
 extern unsigned int rds_ib_sysctl_flow_control;
+extern unsigned int rds_ib_sysctl_active_bonding;
 
 #endif
index b125997d08edc573221c94b7b1d1a2e2eb782bd6..f577779e6111e93849c33de17cbaf84603eac100 100644 (file)
@@ -426,7 +426,7 @@ static void rds_ib_rx(struct rds_ib_connection *ic)
        if (ic->i_rx_poll_cq >= RDS_IB_RX_LIMIT) {
                ic->i_rx_w.ic = ic;
                /* Delay 10 msecs until the RX worker starts reaping again */
-               queue_delayed_work(rds_aux_wq, &ic->i_rx_w,
+               queue_delayed_work(rds_aux_wq, &ic->i_rx_w.work,
                                        msecs_to_jiffies(10));
                ic->i_rx_wait_for_handler = 1;
        }
@@ -823,6 +823,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
                } else if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
                        unsigned long now = get_seconds();
 
+                       conn->c_reconnect_racing++;
+
                        /*
                         * after 15 seconds, give up on existing connection
                         * attempts and make them try again.  At this point
index f5467cc13dc32e4780825b1061b8e0e5740826c2..9428276168166b152f93ab7aa1c56a1d03fb93c7 100644 (file)
@@ -61,6 +61,7 @@ static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
  */
 
 unsigned int rds_ib_sysctl_flow_control = 0;
+unsigned int rds_ib_sysctl_active_bonding = 1;
 
 static struct ctl_table rds_ib_sysctl_table[] = {
        {
@@ -104,6 +105,13 @@ static struct ctl_table rds_ib_sysctl_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+       {
+               .procname       = "active_bonding",
+               .data           = &rds_ib_sysctl_active_bonding,
+               .maxlen         = sizeof(rds_ib_sysctl_active_bonding),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
        { }
 };
 
index 70019e1947e7a5c416bd224a3593cd172a7402dd..00d88a5696dff0341fa71fdaa22228a92b4d416c 100644 (file)
@@ -31,6 +31,7 @@
  *
  */
 #include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
 
 #include "rdma_transport.h"
 #include "ib.h"
@@ -96,6 +97,40 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                        rdma_set_timeout(cm_id, rds_ib_apm_timeout);
 #endif
 
+               if (conn->c_tos && conn->c_reconnect) {
+                       struct rds_ib_connection *base_ic =
+                               conn->c_base_conn->c_transport_data;
+
+                       mutex_lock(&conn->c_base_conn->c_cm_lock);
+                       if (rds_conn_transition(conn->c_base_conn, RDS_CONN_UP,
+                                               RDS_CONN_UP)) {
+                               ret = rdma_set_ib_paths(cm_id,
+                                       base_ic->i_cm_id->route.path_rec,
+                                       base_ic->i_cm_id->route.num_paths);
+                               if (!ret) {
+                                       struct rds_ib_connection *ic =
+                                               conn->c_transport_data;
+
+                                       cm_id->route.path_rec[0].sl =
+                                               ic->i_sl;
+                                       cm_id->route.path_rec[0].qos_class =
+                                               conn->c_tos;
+                                       ret = trans->cm_initiate_connect(cm_id);
+                               }
+                       } else {
+                               ret = 1;
+                       }
+                       mutex_unlock(&conn->c_base_conn->c_cm_lock);
+
+                       if (ret) {
+                               rds_conn_drop(conn);
+                               ret = 0;
+                       }
+
+                       break;
+               }
+
+
                /* XXX do we need to clean up if this fails? */
                ret = rdma_resolve_route(cm_id,
                                rds_rdma_resolve_to_ms[conn->c_to_index]);
@@ -176,6 +211,10 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                break;
 
        case RDMA_CM_EVENT_ADDR_ERROR:
+               if (conn)
+                       rds_conn_drop(conn);
+               break;
+
        case RDMA_CM_EVENT_CONNECT_ERROR:
        case RDMA_CM_EVENT_UNREACHABLE:
        case RDMA_CM_EVENT_DEVICE_REMOVAL:
@@ -185,8 +224,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 
        case RDMA_CM_EVENT_REJECTED:
                err = (int *)event->param.conn.private_data;
+
+               if (conn && event->status == RDS_REJ_CONSUMER_DEFINED &&
+                   *err <= 1)
+                       conn->c_reconnect_racing++;
+
                if (conn) {
-                       if (event->status == RDS_REJ_CONSUMER_DEFINED && (*err) == 0) {
+                       if (event->status == RDS_REJ_CONSUMER_DEFINED &&
+                           (*err) == 0) {
                                /* Rejection from RDSV3.1 */
                                if (!conn->c_tos) {
                                        conn->c_proposed_version =
index 55ca1f7a1fb96b25969d854906ec3dfd553f78e3..9f6d29fbb30f89c66661d158006ff049c6a90e2d 100644 (file)
@@ -92,6 +92,7 @@ enum {
 #define RDS_RECV_REFILL                3
 
 #define RDS_RDMA_RESOLVE_TO_MAX_INDEX   5
+#define RDS_ADDR_RES_TM_INDEX_MAX 5
 
 struct rds_connection {
        struct hlist_node       c_hash_node;
@@ -130,6 +131,7 @@ struct rds_connection {
        struct delayed_work     c_conn_w;
        struct delayed_work     c_reject_w;
        struct delayed_work     c_hb_w;
+       struct delayed_work     c_reconn_w;
        struct work_struct      c_down_w;
        struct mutex            c_cm_lock;      /* protect conn state & cm */
        wait_queue_head_t       c_waitq;
@@ -166,6 +168,8 @@ struct rds_connection {
        unsigned int            c_route_to_base;
 
        unsigned int            c_rdsinfo_pending;
+
+       unsigned int            c_reconnect_racing;
 };
 
 #define RDS_FLAG_CONG_BITMAP   0x01
@@ -869,6 +873,7 @@ void rds_send_worker(struct work_struct *);
 void rds_reject_worker(struct work_struct *);
 void rds_recv_worker(struct work_struct *);
 void rds_hb_worker(struct work_struct *);
+void rds_reconnect_timeout(struct work_struct *);
 void rds_connect_complete(struct rds_connection *conn);
 
 /* transport.c */
index 8935cc13c91e83b33cce12a6a7718df7750db75e..c911cdc701a8f81553b7d22c35e06ff7b1b3eb0d 100644 (file)
@@ -264,11 +264,35 @@ void rds_hb_worker(struct work_struct *work)
        }
 }
 
+void rds_reconnect_timeout(struct work_struct *work)
+{
+       struct rds_connection *conn =
+               container_of(work, struct rds_connection, c_reconn_w.work);
+
+       /* if the higher IP has not reconnected, reset back to two-sided
+        * reconnect.
+        */
+       if (!rds_conn_up(conn)) {
+               rds_conn_drop(conn);
+               conn->c_reconnect_racing = 0;
+       }
+}
+
 void rds_shutdown_worker(struct work_struct *work)
 {
        struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
 
-       rds_conn_shutdown(conn, 1);
+
+       /* if racing is detected, lower IP backs off and let the higher IP
+        * drives the reconnect (one-sided reconnect)
+        */
+       if (conn->c_laddr < conn->c_faddr && conn->c_reconnect_racing) {
+               rds_conn_shutdown(conn, 0);
+               queue_delayed_work(rds_wq, &conn->c_reconn_w,
+                               msecs_to_jiffies(5000));
+       } else
+               rds_conn_shutdown(conn, 1);
+
 }
 
 void rds_threads_exit(void)