www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
net/rds: Fix kernel panic caused by a race between setup/teardown
author: Hans Westgaard Ry <hans.westgaard.ry@oracle.com>
Tue, 10 Jul 2018 11:14:37 +0000 (13:14 +0200)
committer: Brian Maly <brian.maly@oracle.com>
Tue, 24 Jul 2018 19:49:20 +0000 (15:49 -0400)
Running rds-stress with the --reset option in a tight loop provokes a
"NULL pointer dereference" in rds_ib_recv_refill().

 IP: [<..fa04fefa1>] rds_ib_recv_refill+0x2a1/0x630 [rds_rdma]
 Call Trace:
  [<..fa04fc140>] rds_ib_cm_connect_complete+0x2f0/0x360 [rds_rdma]
  [<..f810ba337>] ? wake_up_process+0x27/0x50
  [<..f810a1b54>] ? wake_up_worker+0x24/0x30
  [<..f810a2902>] ? insert_work+0x62/0xa0
  [<..fa04f4405>] rds_rdma_cm_event_handler_cmn+0x405/0x8c0 [rds_rdma]
  [<..f810a2d82>] ? __queue_delayed_work+0xb2/0x1a0
  [<..fa04f48d0>] rds_rdma_cm_event_handler+0x10/0x20 [rds_rdma]
  [<..fa03bf85f>] cma_ib_handler+0x10f/0x280 [rdma_cm]
  [<..fa03b1f5b>] cm_process_work+0x2b/0x140 [ib_cm]
  [<..fa03b414b>] cm_work_handler+0x99b/0x1630 [ib_cm]
  [<..f810a5375>] process_one_work+0x165/0x470
  [<..f810a5b82>] worker_thread+0x112/0x540
  [<..f810a5a70>] ? rescuer_thread+0x3f0/0x3f0
  [<..f810ab51a>] kthread+0xda/0xf0
  [<..f810ab440>] ? kthread_create_on_node+0x1b0/0x1b0
  [<..f817521b8>] ret_from_fork+0x58/0x90
  [<..f810ab440>] ? kthread_create_on_node+0x1b0/0x1b0

This is due to a race condition between rds_ib_cm_connect_complete(),
which tries to post a recv (using the QP), and
rds_ib_conn_path_shutdown(), which destroys the QP and zeroes it. By
waiting for and then reserving the RDS_RECV_REFILL bit before calling
shutdown, we avoid the problem.

Also add the path transition from CONNECTING to DISCONNECTING for
completeness.

Orabug: 28326553

Signed-off-by: Hans Westgaard Ry <hans.westgaard.ry@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nelson@oracle.com>
Signed-off-by: Brian Maly <brian.maly@oracle.com>
net/rds/connection.c

index 5dc20884a9e338350207e4b75930c052ac7cf503..be0dacd6cd77b2f620629ada6ee0e2a71542c223 100644 (file)
@@ -400,6 +400,8 @@ void rds_conn_shutdown(struct rds_conn_path *cp, int restart)
                mutex_lock(&cp->cp_cm_lock);
                if (!rds_conn_path_transition(cp, RDS_CONN_UP,
                                              RDS_CONN_DISCONNECTING) &&
+                   !rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
+                                             RDS_CONN_DISCONNECTING) &&
                    !rds_conn_path_transition(cp, RDS_CONN_ERROR,
                                              RDS_CONN_DISCONNECTING)) {
                        rds_conn_path_drop(cp, DR_INV_CONN_STATE);
@@ -411,11 +413,12 @@ void rds_conn_shutdown(struct rds_conn_path *cp, int restart)
                wait_event(cp->cp_waitq,
                           !test_bit(RDS_IN_XMIT, &cp->cp_flags));
                wait_event(cp->cp_waitq,
-                          !test_bit(RDS_RECV_REFILL, &cp->cp_flags));
+                          !test_and_set_bit(RDS_RECV_REFILL, &cp->cp_flags));
                wait_event(cp->cp_waitq,
                           (atomic_read(&cp->cp_rdma_map_pending) == 0));
 
                conn->c_trans->conn_path_shutdown(cp);
+               clear_bit(RDS_RECV_REFILL, &cp->cp_flags);
                rds_conn_path_reset(cp);
 
                if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING,