SA query optimization
All QoS lanes share the same physical path between an IP pair;
the lanes differ only in their service level, which determines
the quality of service each lane receives. Given that, we make
the following optimizations:
1. Only lane 0 issues an SA query request to the SM. All other
lanes wait for lane 0 to finish route resolution, then copy the
resolved path and fill in their own service level (sketched
below).
2. One-sided reconnect to reduce reconnect racing, which further
reduces the number of SA queries to the SM.
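
For illustration only (not part of the patch): a condensed sketch
of optimization 1, distilled from the RDMA_CM_EVENT_ADDR_RESOLVED
hunk below. The helper name lane_copy_base_path() is invented for
the sketch; rdma_set_ib_paths() is the kernel API the patch
actually calls:

	/* Reuse the path records that lane 0 (the base connection)
	 * already resolved via its SA query, then override the
	 * per-lane fields.
	 */
	static int lane_copy_base_path(struct rdma_cm_id *cm_id,
				       struct rds_ib_connection *base_ic,
				       u8 sl, u8 tos)
	{
		int ret;

		ret = rdma_set_ib_paths(cm_id,
					base_ic->i_cm_id->route.path_rec,
					base_ic->i_cm_id->route.num_paths);
		if (ret)
			return ret;

		/* only the SL and qos_class differ between lanes */
		cm_id->route.path_rec[0].sl = sl;
		cm_id->route.path_rec[0].qos_class = tos;
		return 0;
	}

For optimization 2, see the rds_shutdown_worker() and
rds_reconnect_timeout() hunks below: when racing is detected, the
lane with the lower IP backs off and lets the lane with the
higher IP drive the reconnect.
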
Reducing brownout for non-zero lanes
In some cases, RDMA CM delays the disconnect event after a
switch/node failure, causing extra brownout during RDS
reconnection. The workaround is to have lane 0 probe the other
lanes by sending a heartbeat (HB) message on each of them. If a
lane is down, the send fails with a completion error and
triggers an immediate reconnect, as sketched below.
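
For illustration only (not part of the patch): a condensed sketch
of the probe, following the rds_conn_probe_lanes() hunk below. It
assumes the three-argument form of hlist_for_each_entry_rcu();
the diff elides the iteration itself:

	static void rds_conn_probe_lanes(struct rds_connection *conn)
	{
		struct hlist_head *head =
			rds_conn_bucket(conn->c_laddr, conn->c_faddr);
		struct rds_connection *tmp;

		rcu_read_lock();
		hlist_for_each_entry_rcu(tmp, head, c_hash_node) {
			if (tmp->c_faddr == conn->c_faddr &&
			    tmp->c_laddr == conn->c_laddr &&
			    tmp->c_tos != 0 &&
			    tmp->c_trans == conn->c_trans) {
				/* a dead lane fails this send with a
				 * completion error, triggering an
				 * immediate reconnect
				 */
				if (rds_conn_up(tmp))
					rds_send_hb(tmp, 0);
			}
		}
		rcu_read_unlock();
	}
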
Orabug: 18801977
Signed-off-by: Bang Nguyen <bang.nguyen@oracle.com>
Signed-off-by: Mukesh Kacker <mukesh.kacker@oracle.com>
(cherry picked from commit 8f84b1ff46e449e99c5fcf4d4f94dc2e8ea82cd7)
Signed-off-by: Jerry Snitselaar <jerry.snitselaar@oracle.com>
(cherry picked from commit 8991a87c6c3fc8b17383a140bd6f15a958e31298)
INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker);
INIT_DELAYED_WORK(&conn->c_hb_w, rds_hb_worker);
+ INIT_DELAYED_WORK(&conn->c_reconn_w, rds_reconnect_timeout);
INIT_DELAYED_WORK(&conn->c_reject_w, rds_reject_worker);
INIT_WORK(&conn->c_down_w, rds_shutdown_worker);
mutex_init(&conn->c_cm_lock);
rds_conn_message_info_retrans);
}
-/*
- * Drop connections when the idled QoS connection not getting
- * disconnect event when the remote peer reboots. This is causing
- * delayed reconnect, hence application brownout when the peer comes online.
- * The fix was to proactively drop and reconnect them when the base lane is
- * going through the reconnect to the reboot peer, in effect forcing all
- * the lanes to go through the reconnect at the same time.
- */
-static void rds_conn_shutdown_lanes(struct rds_connection *conn)
+static void rds_conn_probe_lanes(struct rds_connection *conn)
{
struct hlist_head *head =
rds_conn_bucket(conn->c_laddr, conn->c_faddr);
tmp->c_laddr == conn->c_laddr &&
tmp->c_tos != 0 &&
tmp->c_trans == conn->c_trans) {
- rds_conn_drop(tmp);
+ if (rds_conn_up(tmp))
+ rds_send_hb(tmp, 0);
}
}
rcu_read_unlock();
conn->c_reconnect_warn = 1;
conn->c_reconnect_drops = 0;
conn->c_reconnect_err = 0;
+ conn->c_reconnect_racing = 0;
printk(KERN_INFO "RDS/IB: connection "
"<%u.%u.%u.%u,%u.%u.%u.%u,%d> dropped\n",
NIPQUAD(conn->c_laddr),
NIPQUAD(conn->c_faddr),
conn->c_tos);
+
+ if (conn->c_tos == 0)
+ rds_conn_probe_lanes(conn);
+
} else if ((conn->c_reconnect_warn) &&
(now - conn->c_reconnect_start > 60)) {
printk(KERN_INFO "RDS/IB: re-connect "
conn->c_reconnect_err);
conn->c_reconnect_warn = 0;
- /* see comment for rds_conn_shutdown_lanes() */
if (conn->c_tos == 0)
- rds_conn_shutdown_lanes(conn);
+ rds_conn_probe_lanes(conn);
}
conn->c_reconnect_drops++;
/* Send multiple ARPs to improve reliability */
for (i = 0; i < rds_ib_active_bonding_arps; i++) {
- arp_send(ARPOP_REQUEST, ETH_P_ARP,
+ arp_send(ARPOP_REPLY, ETH_P_ARP,
ip_addr, out_dev,
ip_addr, NULL,
dev_addr, NULL);
break;
case NETDEV_DOWN:
- INIT_DELAYED_WORK(&work->work, rds_ib_failover);
- queue_delayed_work(rds_wq, &work->work, 0);
+ if (rds_ib_sysctl_active_bonding) {
+ INIT_DELAYED_WORK(&work->work, rds_ib_failover);
+ queue_delayed_work(rds_wq, &work->work, 0);
+ } else {
+ ip_config[port].port_state = RDS_IB_PORT_INIT;
+ ip_config[port].ip_active_port = port;
+ kfree(work);
+ }
break;
}
extern unsigned long rds_ib_sysctl_max_unsig_bytes;
extern unsigned long rds_ib_sysctl_max_recv_allocation;
extern unsigned int rds_ib_sysctl_flow_control;
+extern unsigned int rds_ib_sysctl_active_bonding;
#endif
if (ic->i_rx_poll_cq >= RDS_IB_RX_LIMIT) {
ic->i_rx_w.ic = ic;
/* Delay 10 msecs until the RX worker starts reaping again */
- queue_delayed_work(rds_aux_wq, &ic->i_rx_w,
+ queue_delayed_work(rds_aux_wq, &ic->i_rx_w.work,
msecs_to_jiffies(10));
ic->i_rx_wait_for_handler = 1;
}
} else if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
unsigned long now = get_seconds();
+ conn->c_reconnect_racing++;
+
/*
* after 15 seconds, give up on existing connection
* attempts and make them try again. At this point
*/
unsigned int rds_ib_sysctl_flow_control = 0;
+unsigned int rds_ib_sysctl_active_bonding = 1;
static struct ctl_table rds_ib_sysctl_table[] = {
{
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "active_bonding",
+ .data = &rds_ib_sysctl_active_bonding,
+ .maxlen = sizeof(rds_ib_sysctl_active_bonding),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
{ }
};
*
*/
#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
#include "rdma_transport.h"
#include "ib.h"
rdma_set_timeout(cm_id, rds_ib_apm_timeout);
#endif
+ if (conn->c_tos && conn->c_reconnect) {
+ struct rds_ib_connection *base_ic =
+ conn->c_base_conn->c_transport_data;
+
+ mutex_lock(&conn->c_base_conn->c_cm_lock);
+ if (rds_conn_transition(conn->c_base_conn, RDS_CONN_UP,
+ RDS_CONN_UP)) {
+ ret = rdma_set_ib_paths(cm_id,
+ base_ic->i_cm_id->route.path_rec,
+ base_ic->i_cm_id->route.num_paths);
+ if (!ret) {
+ struct rds_ib_connection *ic =
+ conn->c_transport_data;
+
+ cm_id->route.path_rec[0].sl =
+ ic->i_sl;
+ cm_id->route.path_rec[0].qos_class =
+ conn->c_tos;
+ ret = trans->cm_initiate_connect(cm_id);
+ }
+ } else {
+ ret = 1;
+ }
+ mutex_unlock(&conn->c_base_conn->c_cm_lock);
+
+ if (ret) {
+ rds_conn_drop(conn);
+ ret = 0;
+ }
+
+ break;
+ }
+
/* XXX do we need to clean up if this fails? */
ret = rdma_resolve_route(cm_id,
rds_rdma_resolve_to_ms[conn->c_to_index]);
break;
case RDMA_CM_EVENT_ADDR_ERROR:
+ if (conn)
+ rds_conn_drop(conn);
+ break;
+
case RDMA_CM_EVENT_CONNECT_ERROR:
case RDMA_CM_EVENT_UNREACHABLE:
case RDMA_CM_EVENT_DEVICE_REMOVAL:
case RDMA_CM_EVENT_REJECTED:
err = (int *)event->param.conn.private_data;
+
+ if (conn && event->status == RDS_REJ_CONSUMER_DEFINED &&
+ *err <= 1)
+ conn->c_reconnect_racing++;
+
if (conn) {
- if (event->status == RDS_REJ_CONSUMER_DEFINED && (*err) == 0) {
+ if (event->status == RDS_REJ_CONSUMER_DEFINED &&
+     (*err) == 0) {
/* Rejection from RDSV3.1 */
if (!conn->c_tos) {
conn->c_proposed_version =
#define RDS_RECV_REFILL 3
#define RDS_RDMA_RESOLVE_TO_MAX_INDEX 5
+#define RDS_ADDR_RES_TM_INDEX_MAX 5
struct rds_connection {
struct hlist_node c_hash_node;
struct delayed_work c_conn_w;
struct delayed_work c_reject_w;
struct delayed_work c_hb_w;
+ struct delayed_work c_reconn_w;
struct work_struct c_down_w;
struct mutex c_cm_lock; /* protect conn state & cm */
wait_queue_head_t c_waitq;
unsigned int c_route_to_base;
unsigned int c_rdsinfo_pending;
+
+ unsigned int c_reconnect_racing;
};
#define RDS_FLAG_CONG_BITMAP 0x01
void rds_reject_worker(struct work_struct *);
void rds_recv_worker(struct work_struct *);
void rds_hb_worker(struct work_struct *);
+void rds_reconnect_timeout(struct work_struct *);
void rds_connect_complete(struct rds_connection *conn);
/* transport.c */
}
}
+void rds_reconnect_timeout(struct work_struct *work)
+{
+ struct rds_connection *conn =
+ container_of(work, struct rds_connection, c_reconn_w.work);
+
+ /* if the peer with the higher IP has not reconnected, fall back
+  * to two-sided reconnect.
+  */
+ if (!rds_conn_up(conn)) {
+ rds_conn_drop(conn);
+ conn->c_reconnect_racing = 0;
+ }
+}
+
void rds_shutdown_worker(struct work_struct *work)
{
struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
- rds_conn_shutdown(conn, 1);
+
+ /* if racing is detected, the lower IP backs off and lets the
+  * higher IP drive the reconnect (one-sided reconnect)
+  */
+ if (conn->c_laddr < conn->c_faddr && conn->c_reconnect_racing) {
+         rds_conn_shutdown(conn, 0);
+         queue_delayed_work(rds_wq, &conn->c_reconn_w,
+                            msecs_to_jiffies(5000));
+ } else
+         rds_conn_shutdown(conn, 1);
+
}
void rds_threads_exit(void)