From: Wengang Wang Date: Mon, 7 Sep 2015 07:35:29 +0000 (+0800) Subject: rds: free ib_device related resource X-Git-Tag: v4.1.12-92~277^2^2~2 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=ceb99ba579a769f4e02375a3d52e36f44ae5f27f;p=users%2Fjedix%2Flinux-maple.git rds: free ib_device related resource There is a (rare) case where an ib_device gets removed (driver unload) while the upper layer (RDS) still holds references to the resources allocated from this ib_device. The result is either a memory leak or a crash when accessing the freed memory. The resources are mainly rds_ib_mr objects; in-use rds_ib_mr (rds_mr) objects are stored in rds_sock.rs_rdma_keys. The fix is to 1) link up all in-use rds_ib_mr objects to the pool 2) link the rds_sock to rds_ib_mr 3) the destroy of the rds_ib_mr_pool takes care of freeing rds_ib_mrs by calling rds_rdma_drop_keys() Orabug: 21795824 Signed-off-by: Wengang Wang Acked-by: Chien Yen Signed-off-by: Guangyu Sun Signed-off-by: Mukesh Kacker --- diff --git a/net/rds/ib.c b/net/rds/ib.c index cc29ab75b7800..9291e86d27f52 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -201,27 +201,42 @@ void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references * from interrupt context so we push freing off into a work struct in krdsd. */ -static void rds_ib_dev_free(struct work_struct *work) + +/* free up rds_ibdev->dev related resource. 
We have to wait until freeing + * work is done to avoid the racing with freeing in mlx4_remove_one -> + * mlx4_cleanup_mr_table path + */ +static void rds_ib_dev_free_dev(struct rds_ib_device *rds_ibdev) { - struct rds_ib_ipaddr *i_ipaddr, *i_next; - struct rds_ib_device *rds_ibdev = container_of(work, - struct rds_ib_device, free_work); + mutex_lock(&rds_ibdev->free_dev_lock); + if (!atomic_dec_and_test(&rds_ibdev->free_dev)) + goto out; - if (rds_ibdev->dev) { - if (rds_ibdev->mr_8k_pool) - rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool); - if (rds_ibdev->mr_1m_pool) - rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool); - if (rds_ibdev->mr) - ib_dereg_mr(rds_ibdev->mr); - if (rds_ibdev->pd) - ib_dealloc_pd(rds_ibdev->pd); - } if (rds_ibdev->srq) { rds_ib_srq_exit(rds_ibdev); kfree(rds_ibdev->srq); } + if (rds_ibdev->mr_8k_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool); + if (rds_ibdev->mr_1m_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool); + if (rds_ibdev->mr) + ib_dereg_mr(rds_ibdev->mr); + if (rds_ibdev->pd) + ib_dealloc_pd(rds_ibdev->pd); +out: + mutex_unlock(&rds_ibdev->free_dev_lock); +} + +static void rds_ib_dev_free(struct work_struct *work) +{ + struct rds_ib_ipaddr *i_ipaddr, *i_next; + struct rds_ib_device *rds_ibdev = container_of(work, + struct rds_ib_device, free_work); + + rds_ib_dev_free_dev(rds_ibdev); + list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { list_del(&i_ipaddr->list); kfree(i_ipaddr); @@ -230,13 +245,7 @@ static void rds_ib_dev_free(struct work_struct *work) if (rds_ibdev->vector_load) kfree(rds_ibdev->vector_load); - if (rds_ibdev->dev) { - WARN_ON(!waitqueue_active(&rds_ibdev->wait)); - - rds_ibdev->done = 1; - wake_up(&rds_ibdev->wait); - } else - kfree(rds_ibdev); + kfree(rds_ibdev); } void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) @@ -314,22 +323,9 @@ void rds_ib_remove_one(struct ib_device *device) */ synchronize_rcu(); rds_ib_dev_put(rds_ibdev); + /* free up lower layer resource since it may 
be the last chance */ + rds_ib_dev_free_dev(rds_ibdev); rds_ib_dev_put(rds_ibdev); - - if (!wait_event_timeout(rds_ibdev->wait, rds_ibdev->done, 30*HZ)) { - printk(KERN_WARNING "RDS/IB: device cleanup timed out after " - " 30 secs (refcount=%d)\n", - atomic_read(&rds_ibdev->refcount)); - /* when rds_ib_remove_one return, driver's mlx4_ib_removeone - * function destroy ib_device, so we must clear rds_ibdev->dev - * to NULL, or will cause crash when rds connection be released, - * at the moment rds_ib_dev_free through ib_device - * .i.e rds_ibdev->dev to release mr and fmr, reusing the - * released ib_device will cause crash. - */ - rds_ibdev->dev = NULL; - } else - kfree(rds_ibdev); } struct ib_client rds_ib_client = { @@ -2105,11 +2101,11 @@ void rds_ib_add_one(struct ib_device *device) if (!rds_ibdev) goto free_attr; + atomic_set(&rds_ibdev->free_dev, 1); + mutex_init(&rds_ibdev->free_dev_lock); spin_lock_init(&rds_ibdev->spinlock); atomic_set(&rds_ibdev->refcount, 1); INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); - init_waitqueue_head(&rds_ibdev->wait); - rds_ibdev->done = 0; rds_ibdev->max_wrs = dev_attr->max_qp_wr; rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); diff --git a/net/rds/ib.h b/net/rds/ib.h index 92dfe07f84ac5..476efc726eea2 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -433,8 +433,13 @@ struct rds_ib_device { struct rds_ib_port *ports; struct ib_event_handler event_handler; int *vector_load; - wait_queue_head_t wait; - int done; + + /* flag indicating ib_device is under freeing up or is freed up to make + * the race between rds_ib_remove_one() and rds_release() safe. 
+ */ + atomic_t free_dev; + /* wait until freeing work is done */ + struct mutex free_dev_lock; }; #define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus) @@ -485,6 +490,8 @@ struct rds_ib_statistics { uint64_t s_ib_rdma_mr_1m_pool_flush; uint64_t s_ib_rdma_mr_1m_pool_wait; uint64_t s_ib_rdma_mr_1m_pool_depleted; + uint64_t s_ib_rdma_mr_1m_pool_reuse; + uint64_t s_ib_rdma_mr_8k_pool_reuse; uint64_t s_ib_atomic_cswp; uint64_t s_ib_atomic_fadd; uint64_t s_ib_srq_lows; diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 426d64366d484..ce4a7c8e4ef96 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -60,6 +60,9 @@ struct rds_ib_mr { unsigned int sg_len; u64 *dma; int sg_dma_len; + + struct rds_sock *rs; + struct list_head pool_list; }; /* @@ -83,6 +86,13 @@ struct rds_ib_mr_pool { atomic_t max_items_soft; unsigned long max_free_pinned; struct ib_fmr_attr fmr_attr; + + spinlock_t busy_lock; /* protect ops on 'busy_list' */ + /* All in use MRs allocated from this pool are listed here. This list + * helps freeing up in use MRs when a ib_device got removed while it's + * resources are still in use in RDS layer. Protected by busy_lock. 
+ */ + struct list_head busy_list; }; static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **); @@ -232,6 +242,9 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, if (!pool) return ERR_PTR(-ENOMEM); + spin_lock_init(&pool->busy_lock); + INIT_LIST_HEAD(&pool->busy_list); + pool->pool_type = pool_type; INIT_XLIST_HEAD(&pool->free_list); INIT_XLIST_HEAD(&pool->drop_list); @@ -266,6 +279,22 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) { + struct rds_ib_mr *ibmr; + LIST_HEAD(drp_list); + + /* move MRs in in-use list to drop or free list */ + spin_lock(&pool->busy_lock); + list_splice_init(&pool->busy_list, &drp_list); + spin_unlock(&pool->busy_lock); + + /* rds_rdma_drop_keys may drops more than one MRs in one iteration */ + while (!list_empty(&drp_list)) { + ibmr = list_first_entry(&drp_list, struct rds_ib_mr, pool_list); + list_del_init(&ibmr->pool_list); + if (ibmr->rs) + rds_rdma_drop_keys(ibmr->rs); + } + cancel_delayed_work_sync(&pool->flush_worker); rds_ib_flush_mr_pool(pool, 1, NULL); WARN_ON(atomic_read(&pool->item_count)); @@ -296,6 +325,15 @@ static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) clear_bit(CLEAN_LIST_BUSY_BIT, flag); preempt_enable(); + if (ibmr) { + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_reuse); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_reuse); + spin_lock(&pool->busy_lock); + list_add(&ibmr->pool_list, &pool->busy_list); + spin_unlock(&pool->busy_lock); + } return ibmr; } @@ -373,6 +411,11 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, memset(ibmr, 0, sizeof(*ibmr)); + INIT_LIST_HEAD(&ibmr->pool_list); + spin_lock(&pool->busy_lock); + list_add(&ibmr->pool_list, &pool->busy_list); + spin_unlock(&pool->busy_lock); + ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, (IB_ACCESS_LOCAL_WRITE | 
IB_ACCESS_REMOTE_READ | @@ -812,6 +855,13 @@ void rds_ib_free_mr(void *trans_private, int invalidate) rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); + ibmr->rs = NULL; + + /* remove from pool->busy_list or a tmp list(destroy path) */ + spin_lock(&pool->busy_lock); + list_del_init(&ibmr->pool_list); + spin_unlock(&pool->busy_lock); + /* Return it to the pool's free list */ if (ibmr->remap_count >= pool->fmr_attr.max_maps) xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list); @@ -886,6 +936,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, else printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); + ibmr->rs = rs; ibmr->device = rds_ibdev; rds_ibdev = NULL; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index fd1f0f71dd446..e49d4b1cc8f49 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -149,6 +149,7 @@ void rds_rdma_drop_keys(struct rds_sock *rs) if (rs->rs_transport && rs->rs_transport->flush_mrs) rs->rs_transport->flush_mrs(); } +EXPORT_SYMBOL_GPL(rds_rdma_drop_keys); /* * Helper function to pin user pages.