www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
RDS: avoid double destroy of cm_id when rdma_resolve_route fails
author Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
Fri, 3 Feb 2012 16:09:07 +0000 (11:09 -0500)
committer Mukesh Kacker <mukesh.kacker@oracle.com>
Tue, 7 Jul 2015 23:41:33 +0000 (16:41 -0700)
The kernel crashes in rds_ib_conn_shutdown because it uses a cm_id that has
already been freed.  The cm_id was actually freed much earlier (more than 15
seconds before), during a previous connect attempt.

This was the sequence of the earlier connect attempt: rds_ib_conn_connect calls
rdma_resolve_addr.  The synchronous part of rdma_resolve_addr succeeds, but the
asynchronous processing that follows fails later on.  First, the RDMA Connection
Manager delivers the event RDMA_CM_EVENT_ADDR_RESOLVED, so address resolution
itself succeeds.  Next, RDS calls rdma_resolve_route from
rds_rdma_cm_event_handler, and this call fails.  We return this error to the
RDMA CM addr_handler, which destroys the cm_id as follows (addr_handler in
cma.c):

static void addr_handler(int status, struct sockaddr *src_addr,
                         struct rdma_dev_addr *dev_addr, void *context)
{
     .....
        if (id_priv->id.event_handler(&id_priv->id, &event)) {
                cma_exch(id_priv, CMA_DESTROYING);
                mutex_unlock(&id_priv->handler_mutex);
                cma_deref_id(id_priv);
                rdma_destroy_id(&id_priv->id);    <----  here
                return;
        }

RDS continues to point to this freed cm_id.

Later, when a new connect request comes in from the remote side, we shut down
this cm_id and try to reconnect:
  /*
   * after 15 seconds, give up on existing connection
   * attempts and make them try again.  At this point
   * it's no longer a race but something has gone
   * horribly wrong
   */
   if (now > conn->c_connection_start &&
           now - conn->c_connection_start > 5) {
              printk(KERN_CRIT "rds connection racing for 15s, forcing reset "
                        "connection %u.%u.%u.%u->%u.%u.%u.%u\n",
                        NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));
       rds_conn_drop(conn);
          ....
We crash during the shutdown.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Bang Nguyen <bang.nguyen@oracle.com>
net/rds/rdma_transport.c

index a029ba8ff44e4412a31afd3083ab88fc3909e281..ed82a8d47cc93c2722a773bd88a92fd2bf796e6f 100644 (file)
@@ -33,6 +33,7 @@
 #include <rdma/rdma_cm.h>
 
 #include "rdma_transport.h"
+#include "ib.h"
 
 static struct rdma_cm_id *rds_iw_listen_id;
 
@@ -78,6 +79,22 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                /* XXX do we need to clean up if this fails? */
                ret = rdma_resolve_route(cm_id,
                                         RDS_RDMA_RESOLVE_TIMEOUT_MS);
+               if (ret) {
+                       /*
+                        * The cm_id will get destroyed by addr_handler
+                        * in RDMA CM when we return from here.
+                        */
+                       if (conn) {
+                               struct rds_ib_connection *ibic;
+
+                               printk(KERN_CRIT "rds dropping connection after rdma_resolve_route failure"
+                                      "connection %u.%u.%u.%u->%u.%u.%u.%u\n", NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));
+                               ibic = conn->c_transport_data;
+                               if (ibic && ibic->i_cm_id == cm_id) 
+                                       ibic->i_cm_id = NULL;
+                               rds_conn_drop(conn);
+                       }
+               }
                break;
 
        case RDMA_CM_EVENT_ROUTE_RESOLVED: