rds: free ib_device related resource
author    Wengang Wang <wen.gang.wang@oracle.com>
Mon, 7 Sep 2015 07:35:29 +0000 (15:35 +0800)
committer Mukesh Kacker <mukesh.kacker@oracle.com>
Thu, 17 Sep 2015 00:07:09 +0000 (17:07 -0700)
There is a (rare) case where an ib_device gets removed (driver unload)
while the upper layer (RDS) still holds references to resources
allocated from that ib_device.

The result is either a memory leak or a crash when the freed memory is
accessed.

The resources are mainly rds_ib_mr objects; the in-use rds_ib_mr (rds_mr)
objects are stored in rds_sock.rs_rdma_keys.

The fix (illustrated by the sketch below) is to
1) link all in-use rds_ib_mr objects to the pool
2) link the rds_sock to the rds_ib_mr
3) have the destroy of the rds_ib_mr_pool take care of freeing the
   rds_ib_mrs by calling rds_rdma_drop_keys()
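
For illustration only, a minimal userspace C sketch of the same tracking
scheme. The list helpers mimic the kernel's list_head API; mr, mr_pool,
sock, mr_track() and pool_destroy() are hypothetical stand-ins for this
sketch, not the structures the patch touches:

  #include <pthread.h>
  #include <stddef.h>

  struct list_head { struct list_head *next, *prev; };

  #define container_of(ptr, type, member) \
          ((type *)((char *)(ptr) - offsetof(type, member)))

  static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

  static void list_add(struct list_head *n, struct list_head *h)
  {
          n->next = h->next;
          n->prev = h;
          h->next->prev = n;
          h->next = n;
  }

  static void list_del_init(struct list_head *n)
  {
          n->prev->next = n->next;
          n->next->prev = n->prev;
          INIT_LIST_HEAD(n);
  }

  static int list_empty(const struct list_head *h) { return h->next == h; }

  struct sock;      /* stands in for rds_sock */

  struct mr {       /* stands in for rds_ib_mr */
          struct sock *rs;             /* step 2: owner, set at map time */
          struct list_head pool_list;  /* step 1: link into busy_list */
  };

  struct mr_pool {  /* stands in for rds_ib_mr_pool */
          pthread_mutex_t busy_lock;   /* protects busy_list */
          struct list_head busy_list;  /* every in-use MR of this pool */
  };

  /* map path: record the owning socket and link onto the busy list */
  static void mr_track(struct mr_pool *pool, struct mr *m, struct sock *rs)
  {
          m->rs = rs;
          pthread_mutex_lock(&pool->busy_lock);
          list_add(&m->pool_list, &pool->busy_list);
          pthread_mutex_unlock(&pool->busy_lock);
  }

  /* step 3, destroy path: splice the busy list onto a local list, then
   * drop each owner's keys; in the kernel that frees the MRs through
   * the normal rds_ib_free_mr() path, which takes busy_lock and does
   * list_del_init() itself.
   */
  static void pool_destroy(struct mr_pool *pool,
                           void (*drop_keys)(struct sock *rs))
  {
          struct list_head drp_list;

          INIT_LIST_HEAD(&drp_list);

          pthread_mutex_lock(&pool->busy_lock);
          if (!list_empty(&pool->busy_list)) {
                  /* open-coded list_splice_init() */
                  drp_list.next = pool->busy_list.next;
                  drp_list.prev = pool->busy_list.prev;
                  drp_list.next->prev = &drp_list;
                  drp_list.prev->next = &drp_list;
                  INIT_LIST_HEAD(&pool->busy_list);
          }
          pthread_mutex_unlock(&pool->busy_lock);

          while (!list_empty(&drp_list)) {
                  struct mr *m = container_of(drp_list.next,
                                              struct mr, pool_list);

                  list_del_init(&m->pool_list);
                  if (m->rs)
                          drop_keys(m->rs);  /* rds_rdma_drop_keys() */
          }
  }

The actual patch does the same splice-then-drop with list_splice_init()
and list_first_entry() under the pool's busy_lock spinlock, as shown in
the rds_ib_destroy_mr_pool() hunk below.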

Orabug: 21795824

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Acked-by: Chien Yen <chien.yen@oracle.com>
Signed-off-by: Guangyu Sun <guangyu.sun@oracle.com>
Signed-off-by: Mukesh Kacker <mukesh.kacker@oracle.com>
net/rds/ib.c
net/rds/ib.h
net/rds/ib_rdma.c
net/rds/rdma.c

index cc29ab75b78007d000db97f8e1ffaf2228639ea9..9291e86d27f52a2214d1a4648cb9e4164e57437c 100644 (file)
@@ -201,27 +201,42 @@ void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
  * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
  * from interrupt context so we push freeing off into a work struct in krdsd.
  */
-static void rds_ib_dev_free(struct work_struct *work)
+
+/* free up rds_ibdev->dev related resources. We have to wait until the
+ * freeing work is done to avoid racing with the freeing in the
+ * mlx4_remove_one -> mlx4_cleanup_mr_table path.
+ */
+static void rds_ib_dev_free_dev(struct rds_ib_device *rds_ibdev)
 {
-       struct rds_ib_ipaddr *i_ipaddr, *i_next;
-       struct rds_ib_device *rds_ibdev = container_of(work,
-                                       struct rds_ib_device, free_work);
+       mutex_lock(&rds_ibdev->free_dev_lock);
+       if (!atomic_dec_and_test(&rds_ibdev->free_dev))
+               goto out;
 
-       if (rds_ibdev->dev) {
-               if (rds_ibdev->mr_8k_pool)
-                       rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool);
-               if (rds_ibdev->mr_1m_pool)
-                       rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool);
-               if (rds_ibdev->mr)
-                       ib_dereg_mr(rds_ibdev->mr);
-               if (rds_ibdev->pd)
-                       ib_dealloc_pd(rds_ibdev->pd);
-       }
        if (rds_ibdev->srq) {
                rds_ib_srq_exit(rds_ibdev);
                kfree(rds_ibdev->srq);
        }
 
+       if (rds_ibdev->mr_8k_pool)
+               rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool);
+       if (rds_ibdev->mr_1m_pool)
+               rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool);
+       if (rds_ibdev->mr)
+               ib_dereg_mr(rds_ibdev->mr);
+       if (rds_ibdev->pd)
+               ib_dealloc_pd(rds_ibdev->pd);
+out:
+       mutex_unlock(&rds_ibdev->free_dev_lock);
+}
+
+static void rds_ib_dev_free(struct work_struct *work)
+{
+       struct rds_ib_ipaddr *i_ipaddr, *i_next;
+       struct rds_ib_device *rds_ibdev = container_of(work,
+                                       struct rds_ib_device, free_work);
+
+       rds_ib_dev_free_dev(rds_ibdev);
+
        list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
                list_del(&i_ipaddr->list);
                kfree(i_ipaddr);
@@ -230,13 +245,7 @@ static void rds_ib_dev_free(struct work_struct *work)
        if (rds_ibdev->vector_load)
                kfree(rds_ibdev->vector_load);
 
-       if (rds_ibdev->dev) {
-               WARN_ON(!waitqueue_active(&rds_ibdev->wait));
-
-               rds_ibdev->done = 1;
-               wake_up(&rds_ibdev->wait);
-       } else
-               kfree(rds_ibdev);
+       kfree(rds_ibdev);
 }
 
 void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
@@ -314,22 +323,9 @@ void rds_ib_remove_one(struct ib_device *device)
         */
        synchronize_rcu();
        rds_ib_dev_put(rds_ibdev);
+       /* free up lower-layer resources since this may be the last chance */
+       rds_ib_dev_free_dev(rds_ibdev);
        rds_ib_dev_put(rds_ibdev);
-
-       if (!wait_event_timeout(rds_ibdev->wait, rds_ibdev->done, 30*HZ)) {
-               printk(KERN_WARNING "RDS/IB: device cleanup timed out after "
-                       " 30 secs (refcount=%d)\n",
-                        atomic_read(&rds_ibdev->refcount));
-               /* when rds_ib_remove_one return, driver's mlx4_ib_removeone
-                * function destroy ib_device, so we must clear rds_ibdev->dev
-                * to NULL, or will cause crash when rds connection be released,
-                * at the moment rds_ib_dev_free through ib_device
-                * .i.e rds_ibdev->dev to release mr and fmr, reusing the
-                * released ib_device will cause crash.
-                */
-               rds_ibdev->dev = NULL;
-       } else
-               kfree(rds_ibdev);
 }
 
 struct ib_client rds_ib_client = {
@@ -2105,11 +2101,11 @@ void rds_ib_add_one(struct ib_device *device)
        if (!rds_ibdev)
                goto free_attr;
 
+       atomic_set(&rds_ibdev->free_dev, 1);
+       mutex_init(&rds_ibdev->free_dev_lock);
        spin_lock_init(&rds_ibdev->spinlock);
        atomic_set(&rds_ibdev->refcount, 1);
        INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
-       init_waitqueue_head(&rds_ibdev->wait);
-       rds_ibdev->done = 0;
 
        rds_ibdev->max_wrs = dev_attr->max_qp_wr;
        rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
index 92dfe07f84ac5f503bfc38498e054594de5291a9..476efc726eea21989a56781e81978a69a17e40f9 100644 (file)
@@ -433,8 +433,13 @@ struct rds_ib_device {
        struct rds_ib_port      *ports;
        struct ib_event_handler event_handler;
        int                     *vector_load;
-       wait_queue_head_t       wait;
-       int                     done;
+
+       /* flag indicating the ib_device is being freed or already freed,
+        * making the race between rds_ib_remove_one() and rds_release() safe.
+        */
+       atomic_t                free_dev;
+       /* wait until freeing work is done */
+       struct mutex            free_dev_lock;
 };
 
 #define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus)
@@ -485,6 +490,8 @@ struct rds_ib_statistics {
        uint64_t        s_ib_rdma_mr_1m_pool_flush;
        uint64_t        s_ib_rdma_mr_1m_pool_wait;
        uint64_t        s_ib_rdma_mr_1m_pool_depleted;
+       uint64_t        s_ib_rdma_mr_1m_pool_reuse;
+       uint64_t        s_ib_rdma_mr_8k_pool_reuse;
        uint64_t        s_ib_atomic_cswp;
        uint64_t        s_ib_atomic_fadd;
        uint64_t        s_ib_srq_lows;
index 426d64366d4843418abc48af1fc30080e4329714..ce4a7c8e4ef96bcf0c4f5cbcf7c9ee42e7eb9155 100644 (file)
@@ -60,6 +60,9 @@ struct rds_ib_mr {
        unsigned int            sg_len;
        u64                     *dma;
        int                     sg_dma_len;
+
+       struct rds_sock         *rs;
+       struct list_head        pool_list;
 };
 
 /*
@@ -83,6 +86,13 @@ struct rds_ib_mr_pool {
        atomic_t                max_items_soft;
        unsigned long           max_free_pinned;
        struct ib_fmr_attr      fmr_attr;
+
+       spinlock_t              busy_lock; /* protect ops on 'busy_list' */
+       /* All in-use MRs allocated from this pool are listed here. This list
+        * helps free in-use MRs when an ib_device is removed while its
+        * resources are still in use in the RDS layer. Protected by busy_lock.
+        */
+       struct list_head        busy_list;
 };
 
 static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
@@ -232,6 +242,9 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
        if (!pool)
                return ERR_PTR(-ENOMEM);
 
+       spin_lock_init(&pool->busy_lock);
+       INIT_LIST_HEAD(&pool->busy_list);
+
        pool->pool_type = pool_type;
        INIT_XLIST_HEAD(&pool->free_list);
        INIT_XLIST_HEAD(&pool->drop_list);
@@ -266,6 +279,22 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
 
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
 {
+       struct rds_ib_mr *ibmr;
+       LIST_HEAD(drp_list);
+
+       /* move the MRs on the in-use list to a local drop list */
+       spin_lock(&pool->busy_lock);
+       list_splice_init(&pool->busy_list, &drp_list);
+       spin_unlock(&pool->busy_lock);
+
+       /* rds_rdma_drop_keys() may drop more than one MR per iteration */
+       while (!list_empty(&drp_list)) {
+               ibmr = list_first_entry(&drp_list, struct rds_ib_mr, pool_list);
+               list_del_init(&ibmr->pool_list);
+               if (ibmr->rs)
+                       rds_rdma_drop_keys(ibmr->rs);
+       }
+
        cancel_delayed_work_sync(&pool->flush_worker);
        rds_ib_flush_mr_pool(pool, 1, NULL);
        WARN_ON(atomic_read(&pool->item_count));
@@ -296,6 +325,15 @@ static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
 
        clear_bit(CLEAN_LIST_BUSY_BIT, flag);
        preempt_enable();
+       if (ibmr) {
+               if (pool->pool_type == RDS_IB_MR_8K_POOL)
+                       rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_reuse);
+               else
+                       rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_reuse);
+               spin_lock(&pool->busy_lock);
+               list_add(&ibmr->pool_list, &pool->busy_list);
+               spin_unlock(&pool->busy_lock);
+       }
        return ibmr;
 }
 
@@ -373,6 +411,11 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
 
        memset(ibmr, 0, sizeof(*ibmr));
 
+       INIT_LIST_HEAD(&ibmr->pool_list);
+       spin_lock(&pool->busy_lock);
+       list_add(&ibmr->pool_list, &pool->busy_list);
+       spin_unlock(&pool->busy_lock);
+
        ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
                        (IB_ACCESS_LOCAL_WRITE |
                         IB_ACCESS_REMOTE_READ |
@@ -812,6 +855,13 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 
        rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
 
+       ibmr->rs = NULL;
+
+       /* remove from pool->busy_list or a temporary list (destroy path) */
+       spin_lock(&pool->busy_lock);
+       list_del_init(&ibmr->pool_list);
+       spin_unlock(&pool->busy_lock);
+
        /* Return it to the pool's free list */
        if (ibmr->remap_count >= pool->fmr_attr.max_maps)
                xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list);
@@ -886,6 +936,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
        else
                printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
 
+       ibmr->rs = rs;
        ibmr->device = rds_ibdev;
        rds_ibdev = NULL;
 
index fd1f0f71dd44663ab1b0a286a21739b4c50a0495..e49d4b1cc8f4951878ed53709dd298340288f6cd 100644 (file)
@@ -149,6 +149,7 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
        if (rs->rs_transport && rs->rs_transport->flush_mrs)
                rs->rs_transport->flush_mrs();
 }
+EXPORT_SYMBOL_GPL(rds_rdma_drop_keys);
 
 /*
  * Helper function to pin user pages.