From 5fe5f2d6e883c3fd3eee5fea3aa196f144332caa Mon Sep 17 00:00:00 2001 From: Ajaykumar Hotchandani Date: Tue, 1 Jul 2014 01:57:55 -0700 Subject: [PATCH] RDS: Ensure non-zero SL uses correct path before lane 0 connection is dropped There is an issue with the following scenario: * A non-zero lane goes down first with send completion error 12. * Before the lane 0 connection goes down, the peer initiates a connection request on the non-zero lane. * This non-zero lane connection request may be using old ARP entries of lane 0. This also fixes a race condition between connection establishment and connection drop in the following scenario: * The non-zero lane connection is dropped. * The non-zero lane connection is initiated again; this time it finds the proper route and the connection request goes through. * Before the non-zero lane connection is established at the RDS layer, the zero lane connection is dropped. * Dropping the zero lane connection now drops the non-zero lane connection as well (on the assumption that the non-zero lane did not find a proper route). * When the non-zero lane connection establishment event (REP packet) is received, there is a race between the connection establishment event on one CPU and the connection drop on another CPU. 
Orabug: 19133664 Signed-off-by: Ajaykumar Hotchandani Signed-off-by: Chien-Hua Yen Reviewed-by: Bang Nguyen Signed-off-by: Guangyu Sun (cherry picked from commit 47d8e78f82872bbb9d709a0743ea2bdb2e9f6cbb) --- net/rds/connection.c | 11 +++++++++++ net/rds/ib_cm.c | 10 ++++++++++ net/rds/rds.h | 1 + 3 files changed, 22 insertions(+) diff --git a/net/rds/connection.c b/net/rds/connection.c index b29d7754e5aeb..7a53a8eaae86e 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -209,6 +209,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, conn->c_reconnect_drops = 0; conn->c_reconnect_err = 0; conn->c_proposed_version = RDS_PROTOCOL_VERSION; + conn->c_route_resolved = 1; INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker); INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker); @@ -595,6 +596,16 @@ static void rds_conn_probe_lanes(struct rds_connection *conn) tmp->c_trans == conn->c_trans) { if (rds_conn_up(tmp)) rds_send_hb(tmp, 0); + else if (rds_conn_connecting(tmp) && (tmp->c_route_resolved == 0)) { + printk(KERN_INFO "RDS/IB: connection " + "<%u.%u.%u.%u,%u.%u.%u.%u,%d> " + "connecting, force reset ", + NIPQUAD(tmp->c_laddr), + NIPQUAD(tmp->c_faddr), + tmp->c_tos); + + rds_conn_drop(tmp); + } } } rcu_read_unlock(); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index f577779e6111e..4f3b52cdd95ac 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -132,6 +132,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even struct ib_qp_attr qp_attr; int err; + if (conn->c_route_resolved == 0) + conn->c_route_resolved = 1; if (event->param.conn.private_data_len >= sizeof(*dp)) { dp = event->param.conn.private_data; @@ -176,6 +178,13 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even return; } + /* Drop connection if connection state is not CONNECTING. 
+ Potentially connection drop from some other place like rds_conn_probe_lanes() */ + if (!rds_conn_connecting(conn)) { + rds_conn_drop(conn); + return; + } + ic->i_sl = ic->i_cm_id->route.path_rec->sl; /* @@ -1060,6 +1069,7 @@ int rds_ib_conn_connect(struct rds_connection *conn) struct sockaddr_in src, dest; int ret; + conn->c_route_resolved = 0; /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, diff --git a/net/rds/rds.h b/net/rds/rds.h index 0295ee34f354c..104a0ee06e0f7 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -170,6 +170,7 @@ struct rds_connection { unsigned int c_rdsinfo_pending; unsigned int c_reconnect_racing; + unsigned int c_route_resolved; }; #define RDS_FLAG_CONG_BITMAP 0x01 -- 2.50.1