From: Bang Nguyen Date: Tue, 20 Aug 2013 14:27:21 +0000 (-0700) Subject: RDS: double free rdma_cm_id X-Git-Tag: v4.1.12-92~293^2^2~46 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=1163b541915baa3f6f5926bfef8f3282a5873d3b;p=users%2Fjedix%2Flinux-maple.git RDS: double free rdma_cm_id RDS currently offloads rdma_destroy_id() to an aux thread as part of the connection shutdown. This was to work around a bug in which rdma_destroy_id() could block and cause RDS reconnect to hang. By queuing the rdma_destroy_id() work, we unfortunately open up a timing window in which the pending CMA_ADDR_QUERY request might not get canceled right away and race with rdma_destroy_id(). In this case, rdma_destroy_id() gets called and frees the cm id. Then, CMA_ADDR_QUERY completes and calls the RDS event handler, which calls rds_resolve_route on the destroyed cm id. The event handler returns failure, which causes RDMA CM to call rdma_destroy_id() again on the same cm id! Hence the problem. Since the rdma_destroy_id() bug has been fixed by MLX to offload the blocking operation to the worker thread, RDS no longer needs to queue up rdma_destroy_id(). This closes up the window above and fixes the problem. 
Orabug: 17192816 Signed-off-by: Richard Frank (cherry picked from commit 3fec98717bf926d869d049e17baad849d1ba7d78) --- diff --git a/net/rds/ib.h b/net/rds/ib.h index e3a6c422cbc9..823ec806b48f 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -137,11 +137,6 @@ struct rds_ib_path { union ib_gid p_dgid; }; -struct rds_ib_destroy_id_work { - struct delayed_work work; - struct rdma_cm_id *cm_id; -}; - struct rds_ib_migrate_work { struct delayed_work work; struct rds_ib_connection *ic; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 11e2a5f831fa..8be0141b4b56 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -1002,22 +1002,10 @@ void rds_ib_check_migration(struct rds_connection *conn, } #endif -static void rds_ib_destroy_id(struct work_struct *_work) -{ - struct rds_ib_destroy_id_work *work = - container_of(_work, struct rds_ib_destroy_id_work, work.work); - struct rdma_cm_id *cm_id = work->cm_id; - - rdma_destroy_id(cm_id); - - kfree(work); -} - int rds_ib_conn_connect(struct rds_connection *conn) { struct rds_ib_connection *ic = conn->c_transport_data; struct sockaddr_in src, dest; - struct rds_ib_destroy_id_work *work; int ret; /* XXX I wonder what affect the port space has */ @@ -1047,13 +1035,7 @@ int rds_ib_conn_connect(struct rds_connection *conn) if (ret) { rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, ret); - work = kzalloc(sizeof *work, GFP_KERNEL); - if (work) { - work->cm_id = ic->i_cm_id; - INIT_DELAYED_WORK(&work->work, rds_ib_destroy_id); - queue_delayed_work(rds_aux_wq, &work->work, 0); - } else - rdma_destroy_id(ic->i_cm_id); + rdma_destroy_id(ic->i_cm_id); ic->i_cm_id = NULL; } @@ -1070,7 +1052,6 @@ out: void rds_ib_conn_shutdown(struct rds_connection *conn) { struct rds_ib_connection *ic = conn->c_transport_data; - struct rds_ib_destroy_id_work *work; int err = 0; rdsdebug("cm %p pd %p cq %p qp %p\n", ic->i_cm_id, @@ -1149,17 +1130,7 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) if (ic->i_recvs) 
rds_ib_recv_clear_ring(ic); - /* - * rdma_destroy_id may block so offload it to the aux - * thread for processing. - */ - work = kzalloc(sizeof *work, GFP_KERNEL); - if (work) { - work->cm_id = ic->i_cm_id; - INIT_DELAYED_WORK(&work->work, rds_ib_destroy_id); - queue_delayed_work(rds_aux_wq, &work->work, 0); - } else - rdma_destroy_id(ic->i_cm_id); + rdma_destroy_id(ic->i_cm_id); /* * Move connection back to the nodev list.