* rds_ib_destroy_mr_pool() blocks on a few things and MRs drop references
* from interrupt context, so we push freeing off into a work struct in krdsd.
*/
-static void rds_ib_dev_free(struct work_struct *work)
+
+/* Free up the rds_ibdev->dev related resources. We have to wait until the
+ * freeing work is done, to avoid racing with the freeing done in the
+ * mlx4_remove_one -> mlx4_cleanup_mr_table path.
+ */
+static void rds_ib_dev_free_dev(struct rds_ib_device *rds_ibdev)
{
- struct rds_ib_ipaddr *i_ipaddr, *i_next;
- struct rds_ib_device *rds_ibdev = container_of(work,
- struct rds_ib_device, free_work);
+ mutex_lock(&rds_ibdev->free_dev_lock);
+ if (!atomic_dec_and_test(&rds_ibdev->free_dev))
+ goto out;
- if (rds_ibdev->dev) {
- if (rds_ibdev->mr_8k_pool)
- rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool);
- if (rds_ibdev->mr_1m_pool)
- rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool);
- if (rds_ibdev->mr)
- ib_dereg_mr(rds_ibdev->mr);
- if (rds_ibdev->pd)
- ib_dealloc_pd(rds_ibdev->pd);
- }
if (rds_ibdev->srq) {
rds_ib_srq_exit(rds_ibdev);
kfree(rds_ibdev->srq);
}
+ if (rds_ibdev->mr_8k_pool)
+ rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool);
+ if (rds_ibdev->mr_1m_pool)
+ rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool);
+ if (rds_ibdev->mr)
+ ib_dereg_mr(rds_ibdev->mr);
+ if (rds_ibdev->pd)
+ ib_dealloc_pd(rds_ibdev->pd);
+out:
+ mutex_unlock(&rds_ibdev->free_dev_lock);
+}
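
/*
 * A minimal standalone sketch (not part of the patch) of the free-once
 * pattern used by rds_ib_dev_free_dev() above; the names here are
 * hypothetical stand-ins for rds_ibdev->free_dev and
 * rds_ibdev->free_dev_lock. The counter starts at 1, so of two racing
 * callers only the first sees atomic_dec_and_test() return true and does
 * the teardown, while the mutex makes the loser wait until that teardown
 * has finished before it returns.
 */
static atomic_t freed_once = ATOMIC_INIT(1);
static DEFINE_MUTEX(freed_once_lock);

static void free_dev_once(void)
{
	mutex_lock(&freed_once_lock);
	if (atomic_dec_and_test(&freed_once)) {
		/* winner: 1 -> 0; release the device resources here */
	}
	/* loser: 0 -> -1; skips the teardown but has waited for it */
	mutex_unlock(&freed_once_lock);
}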
+
+static void rds_ib_dev_free(struct work_struct *work)
+{
+ struct rds_ib_ipaddr *i_ipaddr, *i_next;
+ struct rds_ib_device *rds_ibdev = container_of(work,
+ struct rds_ib_device, free_work);
+
+ rds_ib_dev_free_dev(rds_ibdev);
+
list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
list_del(&i_ipaddr->list);
kfree(i_ipaddr);
}

if (rds_ibdev->vector_load)
kfree(rds_ibdev->vector_load);
- if (rds_ibdev->dev) {
- WARN_ON(!waitqueue_active(&rds_ibdev->wait));
-
- rds_ibdev->done = 1;
- wake_up(&rds_ibdev->wait);
- } else
- kfree(rds_ibdev);
+ kfree(rds_ibdev);
}
void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
*/
synchronize_rcu();
rds_ib_dev_put(rds_ibdev);
+ /* Free up the lower layer resources now; this may be the last chance to
+ * do so before mlx4_remove_one() -> mlx4_cleanup_mr_table() tears down
+ * the HCA state they depend on.
+ */
+ rds_ib_dev_free_dev(rds_ibdev);
rds_ib_dev_put(rds_ibdev);
-
- if (!wait_event_timeout(rds_ibdev->wait, rds_ibdev->done, 30*HZ)) {
- printk(KERN_WARNING "RDS/IB: device cleanup timed out after "
- " 30 secs (refcount=%d)\n",
- atomic_read(&rds_ibdev->refcount));
- /* when rds_ib_remove_one return, driver's mlx4_ib_removeone
- * function destroy ib_device, so we must clear rds_ibdev->dev
- * to NULL, or will cause crash when rds connection be released,
- * at the moment rds_ib_dev_free through ib_device
- * .i.e rds_ibdev->dev to release mr and fmr, reusing the
- * released ib_device will cause crash.
- */
- rds_ibdev->dev = NULL;
- } else
- kfree(rds_ibdev);
}
struct ib_client rds_ib_client = {
if (!rds_ibdev)
goto free_attr;
+ atomic_set(&rds_ibdev->free_dev, 1);
+ mutex_init(&rds_ibdev->free_dev_lock);
spin_lock_init(&rds_ibdev->spinlock);
atomic_set(&rds_ibdev->refcount, 1);
INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
- init_waitqueue_head(&rds_ibdev->wait);
- rds_ibdev->done = 0;
rds_ibdev->max_wrs = dev_attr->max_qp_wr;
rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
struct rds_ib_port *ports;
struct ib_event_handler event_handler;
int *vector_load;
- wait_queue_head_t wait;
- int done;
+
+ /* Flag indicating that the ib_device is being freed or has already been
+ * freed; used to make the race between rds_ib_remove_one() and
+ * rds_release() safe.
+ */
+ atomic_t free_dev;
+ /* lets racing callers wait until the freeing work is done */
+ struct mutex free_dev_lock;
};
#define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus)
uint64_t s_ib_rdma_mr_1m_pool_flush;
uint64_t s_ib_rdma_mr_1m_pool_wait;
uint64_t s_ib_rdma_mr_1m_pool_depleted;
+ uint64_t s_ib_rdma_mr_1m_pool_reuse;
+ uint64_t s_ib_rdma_mr_8k_pool_reuse;
uint64_t s_ib_atomic_cswp;
uint64_t s_ib_atomic_fadd;
uint64_t s_ib_srq_lows;
unsigned int sg_len;
u64 *dma;
int sg_dma_len;
+
+ struct rds_sock *rs; /* owning socket; passed to rds_rdma_drop_keys() */
+ struct list_head pool_list; /* entry on the pool's busy_list */
};
/*
atomic_t max_items_soft;
unsigned long max_free_pinned;
struct ib_fmr_attr fmr_attr;
+
+ spinlock_t busy_lock; /* protects ops on 'busy_list' */
+ /* All in-use MRs allocated from this pool are listed here. This list
+ * helps free up in-use MRs when an ib_device gets removed while its
+ * resources are still in use in the RDS layer. Protected by busy_lock.
+ */
+ struct list_head busy_list;
};
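
/*
 * A condensed sketch of the busy_list lifecycle described above; the
 * helper names are hypothetical, and the patch open-codes these sequences
 * in the MR reuse, allocation and free paths below. An MR stays on
 * busy_list from allocation (or reuse) until it is freed, so
 * rds_ib_destroy_mr_pool() can always find the MRs still in use.
 */
static void busy_list_track(struct rds_ib_mr_pool *pool,
			    struct rds_ib_mr *ibmr)
{
	spin_lock(&pool->busy_lock);
	list_add(&ibmr->pool_list, &pool->busy_list);
	spin_unlock(&pool->busy_lock);
}

static void busy_list_untrack(struct rds_ib_mr_pool *pool,
			      struct rds_ib_mr *ibmr)
{
	spin_lock(&pool->busy_lock);
	list_del_init(&ibmr->pool_list);
	spin_unlock(&pool->busy_lock);
}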
static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
if (!pool)
return ERR_PTR(-ENOMEM);
+ spin_lock_init(&pool->busy_lock);
+ INIT_LIST_HEAD(&pool->busy_list);
+
pool->pool_type = pool_type;
INIT_XLIST_HEAD(&pool->free_list);
INIT_XLIST_HEAD(&pool->drop_list);
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
{
+ struct rds_ib_mr *ibmr;
+ LIST_HEAD(drp_list);
+
+ /* Pull the in-use MRs onto a private list; dropping their keys below
+ * returns them to the pool's drop or free list.
+ */
+ spin_lock(&pool->busy_lock);
+ list_splice_init(&pool->busy_list, &drp_list);
+ spin_unlock(&pool->busy_lock);
+
+ /* rds_rdma_drop_keys() may drop more than one MR per iteration, so a
+ * list_for_each_entry_safe() walk is not safe here; always restart from
+ * the head of the list instead.
+ */
+ while (!list_empty(&drp_list)) {
+ ibmr = list_first_entry(&drp_list, struct rds_ib_mr, pool_list);
+ list_del_init(&ibmr->pool_list);
+ if (ibmr->rs)
+ rds_rdma_drop_keys(ibmr->rs);
+ }
+
cancel_delayed_work_sync(&pool->flush_worker);
rds_ib_flush_mr_pool(pool, 1, NULL);
WARN_ON(atomic_read(&pool->item_count));
clear_bit(CLEAN_LIST_BUSY_BIT, flag);
preempt_enable();
+ if (ibmr) {
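+ /* account the reuse, and put the MR back on the busy list so the
+ * destroy path can still find it while it is in use
+ */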
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_reuse);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_reuse);
+ spin_lock(&pool->busy_lock);
+ list_add(&ibmr->pool_list, &pool->busy_list);
+ spin_unlock(&pool->busy_lock);
+ }
return ibmr;
}
memset(ibmr, 0, sizeof(*ibmr));
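+ /* put the new MR on the pool's busy list right away, so that
+ * rds_ib_destroy_mr_pool() can revoke it while it is still in use
+ */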
+ INIT_LIST_HEAD(&ibmr->pool_list);
+ spin_lock(&pool->busy_lock);
+ list_add(&ibmr->pool_list, &pool->busy_list);
+ spin_unlock(&pool->busy_lock);
+
ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
(IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_READ |
rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
+ ibmr->rs = NULL;
+
+ /* remove from pool->busy_list, or from a temporary list (destroy path) */
+ spin_lock(&pool->busy_lock);
+ list_del_init(&ibmr->pool_list);
+ spin_unlock(&pool->busy_lock);
+
/* Return it to the pool's free list */
if (ibmr->remap_count >= pool->fmr_attr.max_maps)
xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list);
else
printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
+ ibmr->rs = rs;
ibmr->device = rds_ibdev;
rds_ibdev = NULL;
if (rs->rs_transport && rs->rs_transport->flush_mrs)
rs->rs_transport->flush_mrs();
}
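+/* Exported so that the IB transport's rds_ib_destroy_mr_pool() can call
+ * this from another module (module layout assumed here).
+ */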
+EXPORT_SYMBOL_GPL(rds_rdma_drop_keys);
/*
* Helper function to pin user pages.