From: Bang Nguyen Date: Wed, 16 Apr 2014 20:56:02 +0000 (-0700) Subject: RDS: SA query optimization X-Git-Tag: v4.1.12-92~293^2^2~33 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=b12826152417844ef5c4fb12f7c6eff2a62ba52f;p=users%2Fjedix%2Flinux-maple.git RDS: SA query optimization SA query optimization The fact is that all QoS lanes share the same physical path between an IP pair. The only difference is the service level, which affects the quality of service for each lane. With that, we have the following optimizations: 1. Lane 0 issues the SA query request to the SM. All other lanes wait for lane 0 to finish route resolution, then copy in the resolved path and fill in their own service level. 2. One-sided reconnect to reduce reconnect racing, thus further reducing the number of SA queries to the SM. Reducing brownout for non-zero lanes In some cases, RDMA CM delays the disconnect event after a switch/node failure, and this causes extra brownout for RDS reconnection. The workaround is to have lane 0 probe the other lanes by sending an HB msg. If a lane is down, this will cause a send completion error and an immediate reconnect. 
Orabug: 18801977 Signed-off-by: Bang Nguyen Signed-off-by: Mukesh Kacker (cherry picked from commit 8f84b1ff46e449e99c5fcf4d4f94dc2e8ea82cd7) Signed-off-by: Jerry Snitselaar (cherry picked from commit 8991a87c6c3fc8b17383a140bd6f15a958e31298) --- diff --git a/net/rds/connection.c b/net/rds/connection.c index 3067c6be7bf0d..c57e2d953f8bd 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -214,6 +214,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker); INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker); INIT_DELAYED_WORK(&conn->c_hb_w, rds_hb_worker); + INIT_DELAYED_WORK(&conn->c_reconn_w, rds_reconnect_timeout); INIT_DELAYED_WORK(&conn->c_reject_w, rds_reject_worker); INIT_WORK(&conn->c_down_w, rds_shutdown_worker); mutex_init(&conn->c_cm_lock); @@ -580,15 +581,7 @@ void rds_conn_exit(void) rds_conn_message_info_retrans); } -/* - * Drop connections when the idled QoS connection not getting - * disconnect event when the remote peer reboots. This is causing - * delayed reconnect, hence application brownout when the peer comes online. - * The fix was to proactively drop and reconnect them when the base lane is - * going through the reconnect to the reboot peer, in effect forcing all - * the lanes to go through the reconnect at the same time. 
- */ -static void rds_conn_shutdown_lanes(struct rds_connection *conn) +static void rds_conn_probe_lanes(struct rds_connection *conn) { struct hlist_head *head = rds_conn_bucket(conn->c_laddr, conn->c_faddr); @@ -600,7 +593,8 @@ static void rds_conn_shutdown_lanes(struct rds_connection *conn) tmp->c_laddr == conn->c_laddr && tmp->c_tos != 0 && tmp->c_trans == conn->c_trans) { - rds_conn_drop(tmp); + if (rds_conn_up(tmp)) + rds_send_hb(tmp, 0); } } rcu_read_unlock(); @@ -618,11 +612,16 @@ void rds_conn_drop(struct rds_connection *conn) conn->c_reconnect_warn = 1; conn->c_reconnect_drops = 0; conn->c_reconnect_err = 0; + conn->c_reconnect_racing = 0; printk(KERN_INFO "RDS/IB: connection " "<%u.%u.%u.%u,%u.%u.%u.%u,%d> dropped\n", NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr), conn->c_tos); + + if (conn->c_tos == 0) + rds_conn_probe_lanes(conn); + } else if ((conn->c_reconnect_warn) && (now - conn->c_reconnect_start > 60)) { printk(KERN_INFO "RDS/IB: re-connect " @@ -635,9 +634,8 @@ void rds_conn_drop(struct rds_connection *conn) conn->c_reconnect_err); conn->c_reconnect_warn = 0; - /* see comment for rds_conn_shutdown_lanes() */ if (conn->c_tos == 0) - rds_conn_shutdown_lanes(conn); + rds_conn_probe_lanes(conn); } conn->c_reconnect_drops++; diff --git a/net/rds/ib.c b/net/rds/ib.c index 3fd8457b474d2..f75a95c4fc7c1 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -427,7 +427,7 @@ static void rds_ib_send_gratuitous_arp(struct net_device *out_dev, /* Send multiple ARPs to improve reliability */ for (i = 0; i < rds_ib_active_bonding_arps; i++) { - arp_send(ARPOP_REQUEST, ETH_P_ARP, + arp_send(ARPOP_REPLY, ETH_P_ARP, ip_addr, out_dev, ip_addr, NULL, dev_addr, NULL); @@ -1736,8 +1736,14 @@ static int rds_ib_netdev_callback(struct notifier_block *self, unsigned long eve break; case NETDEV_DOWN: - INIT_DELAYED_WORK(&work->work, rds_ib_failover); - queue_delayed_work(rds_wq, &work->work, 0); + if (rds_ib_sysctl_active_bonding) { + INIT_DELAYED_WORK(&work->work, 
rds_ib_failover); + queue_delayed_work(rds_wq, &work->work, 0); + } else { + ip_config[port].port_state = RDS_IB_PORT_INIT; + ip_config[port].ip_active_port = port; + kfree(work); + } break; } diff --git a/net/rds/ib.h b/net/rds/ib.h index d8c5996282ba1..da0798a8cf645 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -600,5 +600,6 @@ extern unsigned long rds_ib_sysctl_max_unsig_wrs; extern unsigned long rds_ib_sysctl_max_unsig_bytes; extern unsigned long rds_ib_sysctl_max_recv_allocation; extern unsigned int rds_ib_sysctl_flow_control; +extern unsigned int rds_ib_sysctl_active_bonding; #endif diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index b125997d08edc..f577779e6111e 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -426,7 +426,7 @@ static void rds_ib_rx(struct rds_ib_connection *ic) if (ic->i_rx_poll_cq >= RDS_IB_RX_LIMIT) { ic->i_rx_w.ic = ic; /* Delay 10 msecs until the RX worker starts reaping again */ - queue_delayed_work(rds_aux_wq, &ic->i_rx_w, + queue_delayed_work(rds_aux_wq, &ic->i_rx_w.work, msecs_to_jiffies(10)); ic->i_rx_wait_for_handler = 1; } @@ -823,6 +823,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, } else if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { unsigned long now = get_seconds(); + conn->c_reconnect_racing++; + /* * after 15 seconds, give up on existing connection * attempts and make them try again. 
At this point diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c index f5467cc13dc32..9428276168166 100644 --- a/net/rds/ib_sysctl.c +++ b/net/rds/ib_sysctl.c @@ -61,6 +61,7 @@ static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; */ unsigned int rds_ib_sysctl_flow_control = 0; +unsigned int rds_ib_sysctl_active_bonding = 1; static struct ctl_table rds_ib_sysctl_table[] = { { @@ -104,6 +105,13 @@ static struct ctl_table rds_ib_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "active_bonding", + .data = &rds_ib_sysctl_active_bonding, + .maxlen = sizeof(rds_ib_sysctl_active_bonding), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { } }; diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 70019e1947e7a..00d88a5696dff 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -31,6 +31,7 @@ * */ #include +#include #include "rdma_transport.h" #include "ib.h" @@ -96,6 +97,40 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, rdma_set_timeout(cm_id, rds_ib_apm_timeout); #endif + if (conn->c_tos && conn->c_reconnect) { + struct rds_ib_connection *base_ic = + conn->c_base_conn->c_transport_data; + + mutex_lock(&conn->c_base_conn->c_cm_lock); + if (rds_conn_transition(conn->c_base_conn, RDS_CONN_UP, + RDS_CONN_UP)) { + ret = rdma_set_ib_paths(cm_id, + base_ic->i_cm_id->route.path_rec, + base_ic->i_cm_id->route.num_paths); + if (!ret) { + struct rds_ib_connection *ic = + conn->c_transport_data; + + cm_id->route.path_rec[0].sl = + ic->i_sl; + cm_id->route.path_rec[0].qos_class = + conn->c_tos; + ret = trans->cm_initiate_connect(cm_id); + } + } else { + ret = 1; + } + mutex_unlock(&conn->c_base_conn->c_cm_lock); + + if (ret) { + rds_conn_drop(conn); + ret = 0; + } + + break; + } + + /* XXX do we need to clean up if this fails? 
*/ ret = rdma_resolve_route(cm_id, rds_rdma_resolve_to_ms[conn->c_to_index]); @@ -176,6 +211,10 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, break; case RDMA_CM_EVENT_ADDR_ERROR: + if (conn) + rds_conn_drop(conn); + break; + case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_DEVICE_REMOVAL: @@ -185,8 +224,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_REJECTED: err = (int *)event->param.conn.private_data; + + if (conn && event->status == RDS_REJ_CONSUMER_DEFINED && + *err <= 1) + conn->c_reconnect_racing++; + if (conn) { - if (event->status == RDS_REJ_CONSUMER_DEFINED && (*err) == 0) { + if (event->status == RDS_REJ_CONSUMER_DEFINED && + (*err) == 0) { /* Rejection from RDSV3.1 */ if (!conn->c_tos) { conn->c_proposed_version = diff --git a/net/rds/rds.h b/net/rds/rds.h index 55ca1f7a1fb96..9f6d29fbb30f8 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -92,6 +92,7 @@ enum { #define RDS_RECV_REFILL 3 #define RDS_RDMA_RESOLVE_TO_MAX_INDEX 5 +#define RDS_ADDR_RES_TM_INDEX_MAX 5 struct rds_connection { struct hlist_node c_hash_node; @@ -130,6 +131,7 @@ struct rds_connection { struct delayed_work c_conn_w; struct delayed_work c_reject_w; struct delayed_work c_hb_w; + struct delayed_work c_reconn_w; struct work_struct c_down_w; struct mutex c_cm_lock; /* protect conn state & cm */ wait_queue_head_t c_waitq; @@ -166,6 +168,8 @@ struct rds_connection { unsigned int c_route_to_base; unsigned int c_rdsinfo_pending; + + unsigned int c_reconnect_racing; }; #define RDS_FLAG_CONG_BITMAP 0x01 @@ -869,6 +873,7 @@ void rds_send_worker(struct work_struct *); void rds_reject_worker(struct work_struct *); void rds_recv_worker(struct work_struct *); void rds_hb_worker(struct work_struct *); +void rds_reconnect_timeout(struct work_struct *); void rds_connect_complete(struct rds_connection *conn); /* transport.c */ diff --git a/net/rds/threads.c b/net/rds/threads.c index 8935cc13c91e8..c911cdc701a8f 100644 
--- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -264,11 +264,35 @@ void rds_hb_worker(struct work_struct *work) } } +void rds_reconnect_timeout(struct work_struct *work) +{ + struct rds_connection *conn = + container_of(work, struct rds_connection, c_reconn_w.work); + + /* if the higher IP has not reconnected, reset back to two-sided + * reconnect. + */ + if (!rds_conn_up(conn)) { + rds_conn_drop(conn); + conn->c_reconnect_racing = 0; + } +} + void rds_shutdown_worker(struct work_struct *work) { struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); - rds_conn_shutdown(conn, 1); + + /* if racing is detected, lower IP backs off and let the higher IP + * drives the reconnect (one-sided reconnect) + */ + if (conn->c_laddr < conn->c_faddr && conn->c_reconnect_racing) { + rds_conn_shutdown(conn, 0); + queue_delayed_work(rds_wq, &conn->c_reconn_w, + msecs_to_jiffies(5000)); + } else + rds_conn_shutdown(conn, 1); + } void rds_threads_exit(void)