From: Dotan Barak
Date: Wed, 22 Feb 2012 12:23:21 +0000 (+0200)
Subject: IB/core: Enable usermode FMR
X-Git-Tag: v4.1.12-92~281^2^2~5
X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=93a2a88242c4e4a4bc67559c8d2ecf994accf2ae;p=users%2Fjedix%2Flinux-maple.git

IB/core: Enable usermode FMR

Signed-off-by: Arun Kaimalettu
Signed-off-by: Dotan Barak
(Ported from UEK2/OFED 1.5.5)
Signed-off-by: Mukesh Kacker
---

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 18c1ece765f2c..6b604e0dd93da 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -293,6 +293,8 @@ int ib_register_device(struct ib_device *device,
 	INIT_LIST_HEAD(&device->client_data_list);
 	spin_lock_init(&device->event_handler_lock);
 	spin_lock_init(&device->client_data_lock);
+	device->relaxed_pd = NULL;
+	INIT_LIST_HEAD(&device->relaxed_pool_list);
 
 	ret = read_port_table_lengths(device);
 	if (ret) {
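The hunk above gives every ib_device a pointer to a dedicated "relaxed" PD and a list of relaxed FMR pools, sorted by pool page capacity. For illustration only, a minimal sketch of the lookup that later hunks of this patch open-code in ib_uverbs_reg_mr_relaxed() (the helper name find_relaxed_pool is hypothetical, not part of the patch):

	static struct ib_fmr_pool *find_relaxed_pool(struct ib_device *dev,
						     unsigned int npages,
						     u32 access)
	{
		struct ib_relaxed_pool_data *pos;

		/* list is kept sorted by max_pages, smallest first */
		list_for_each_entry(pos, &dev->relaxed_pool_list, pool_list) {
			if (access == pos->access_flags &&
			    npages <= pos->max_pages)
				return pos->fmr_pool;
		}
		return NULL;	/* caller creates new pools on demand */
	}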
diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c
index 9f5ad7cc33c89..dfadbc0c45dcc 100644
--- a/drivers/infiniband/core/fmr_pool.c
+++ b/drivers/infiniband/core/fmr_pool.c
@@ -80,8 +80,11 @@ enum {
  * pool_lock to maintain consistency.
  */
 
+#define FMR_SPLIT_COUNT 3
+
 struct ib_fmr_pool {
 	spinlock_t                pool_lock;
+	spinlock_t                used_pool_lock;
 
 	int                       pool_size;
 	int                       max_pages;
@@ -89,6 +92,7 @@ struct ib_fmr_pool {
 	int                       dirty_watermark;
 	int                       dirty_len;
 	struct list_head          free_list;
+	struct list_head          used_list;
 	struct list_head          dirty_list;
 	struct hlist_head        *cache_bucket;
@@ -102,6 +106,8 @@ struct ib_fmr_pool {
 	atomic_t                  flush_ser;
 
 	wait_queue_head_t         force_wait;
+	struct ib_pd             *pd;
+	int                       relaxed;
 };
 
 static inline u32 ib_fmr_hash(u64 first_page)
@@ -114,7 +120,8 @@ static inline u32 ib_fmr_hash(u64 first_page)
 static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool,
 						      u64 *page_list,
 						      int page_list_len,
-						      u64 io_virtual_address)
+						      u64 io_virtual_address,
+						      struct ib_pd *pd)
 {
 	struct hlist_head *bucket;
 	struct ib_pool_fmr *fmr;
@@ -127,6 +134,7 @@ static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool,
 	hlist_for_each_entry(fmr, bucket, cache_node)
 		if (io_virtual_address == fmr->io_virtual_address &&
 		    page_list_len == fmr->page_list_len &&
+		    pd == fmr->pd &&
 		    !memcmp(page_list, fmr->page_list,
 			    page_list_len * sizeof *page_list))
 			return fmr;
@@ -134,13 +142,97 @@ static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool,
 	return NULL;
 }
 
-static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
+
+static void fmr_teardown_mr(struct ib_pool_fmr *fmr)
+{
+
+	if (fmr->sg_len) {
+		ib_dma_unmap_sg(fmr->pd->device,
+				fmr->sg, fmr->sg_len,
+				DMA_BIDIRECTIONAL);
+	}
+
+	/* Release the s/g list */
+	if (fmr->sg_len) {
+		unsigned int i;
+
+		for (i = 0; i < fmr->sg_len; ++i) {
+			struct page *page = sg_page(&fmr->sg[i]);
+
+			/* FIXME we need a way to tell a r/w MR
+			 * from a r/o MR */
+			BUG_ON(irqs_disabled());
+			set_page_dirty(page);
+			put_page(page);
+		}
+		kfree(fmr->sg);
+
+		fmr->sg = NULL;
+		fmr->sg_len = 0;
+	}
+}
+
+static void ib_fmr_batch_release(struct ib_fmr_pool *pool, int unmap_usedonce)
 {
 	int                 ret;
 	struct ib_pool_fmr *fmr;
 	LIST_HEAD(unmap_list);
 	LIST_HEAD(fmr_list);
 
+	if (unmap_usedonce) {
+		/* force a flush */
+		struct ib_pool_fmr *fmr;
+		int already_split = 0;
+		int count = 0;
+		LIST_HEAD(temp_list);
+
+		spin_lock_irq(&pool->used_pool_lock);
+		list_splice_init(&pool->used_list, &temp_list);
+		spin_unlock_irq(&pool->used_pool_lock);
+		list_for_each_entry(fmr, &temp_list, list) {
+			/* find first fmr that is not mapped yet */
+			if (fmr->remap_count == 0 ||
+			    (count > (pool->pool_size / FMR_SPLIT_COUNT))) {
+				/* split the list in two */
+				list_cut_position(&unmap_list, &temp_list,
+						  &fmr->list);
+				spin_lock_irq(&pool->used_pool_lock);
+				list_splice(&temp_list, &pool->used_list);
+				spin_unlock_irq(&pool->used_pool_lock);
+				already_split = 1;
+				break;
+			} else {
+				hlist_del_init(&fmr->cache_node);
+				fmr->remap_count = 0;
+				list_add_tail(&fmr->fmr->list, &fmr_list);
+				count++;
+			}
+		}
+
+		if (!already_split) {
+			/* All are mapped once */
+			list_splice_tail(&temp_list, &unmap_list);
+		}
+		if (!list_empty(&unmap_list)) {
+			ret = ib_unmap_fmr(&fmr_list);
+			if (ret)
+				pr_warn(PFX "ib_unmap_fmr returned %d\n", ret);
+
+			if (pool->relaxed) {
+				list_for_each_entry(fmr, &unmap_list, list) {
+					fmr_teardown_mr(fmr);
+				}
+			}
+			spin_lock_irq(&pool->pool_lock);
+			list_splice(&unmap_list, &pool->free_list);
+			spin_unlock_irq(&pool->pool_lock);
+		}
+		INIT_LIST_HEAD(&unmap_list);
+		INIT_LIST_HEAD(&fmr_list);
+
+
+	}
+
 	spin_lock_irq(&pool->pool_lock);
 
 	list_for_each_entry(fmr, &pool->dirty_list, list) {
@@ -150,8 +242,8 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
 
 #ifdef DEBUG
 		if (fmr->ref_count !=0) {
-			printk(KERN_WARNING PFX "Unmapping FMR 0x%08x with ref count %d\n",
-			       fmr, fmr->ref_count);
+			pr_warn(PFX "Unmapping FMR 0x%08x with ref count %d\n",
+				fmr, fmr->ref_count);
 		}
 #endif
 	}
@@ -167,7 +259,13 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
 
 	ret = ib_unmap_fmr(&fmr_list);
 	if (ret)
-		printk(KERN_WARNING PFX "ib_unmap_fmr returned %d\n", ret);
+		pr_warn(PFX "ib_unmap_fmr returned %d\n", ret);
+
+	if (pool->relaxed) {
+		list_for_each_entry(fmr, &unmap_list, list) {
+			fmr_teardown_mr(fmr);
+		}
+	}
 
 	spin_lock_irq(&pool->pool_lock);
 	list_splice(&unmap_list, &pool->free_list);
@@ -177,10 +275,12 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
 static int ib_fmr_cleanup_thread(void *pool_ptr)
 {
 	struct ib_fmr_pool *pool = pool_ptr;
+	int time_left = 1;
 
 	do {
 		if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) {
-			ib_fmr_batch_release(pool);
+			ib_fmr_batch_release(pool, 0);
+			time_left = 1;
 
 			atomic_inc(&pool->flush_ser);
 			wake_up_interruptible(&pool->force_wait);
@@ -189,16 +289,26 @@ static int ib_fmr_cleanup_thread(void *pool_ptr)
 				pool->flush_function(pool, pool->flush_arg);
 		}
 
+		if (!time_left && pool->relaxed) {
+			ib_fmr_batch_release(pool, 1);
+
+			if (pool->flush_function)
+				pool->flush_function(pool, pool->flush_arg);
+		}
+
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 &&
-		    !kthread_should_stop())
-			schedule();
+		    !kthread_should_stop()) {
+			/* sleep with a timeout instead of indefinitely */
+			time_left = schedule_timeout((HZ/FMR_SPLIT_COUNT)/20);
+		}
 		__set_current_state(TASK_RUNNING);
 	} while (!kthread_should_stop());
 
 	return 0;
 }
 
+
 /**
  * ib_create_fmr_pool - Create an FMR pool
  * @pd:Protection domain for FMRs
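For readability, a condensed (not compilable as-is) view of the cleanup thread's control flow after this change; flush_requested() stands in for the req_ser/flush_ser comparison:

	for (;;) {
		if (flush_requested()) {
			ib_fmr_batch_release(pool, 0);	/* full dirty-list flush */
			time_left = 1;
		} else if (!time_left && pool->relaxed) {
			/* timeout expired with no flush request: age the
			 * used_list, releasing at most pool_size /
			 * FMR_SPLIT_COUNT once-mapped FMRs per pass */
			ib_fmr_batch_release(pool, 1);
		}
		/* returns 0 when the (HZ/FMR_SPLIT_COUNT)/20-tick
		 * timeout expires, i.e. roughly every HZ/60 ticks */
		time_left = schedule_timeout((HZ / FMR_SPLIT_COUNT) / 20);
	}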
@@ -220,23 +330,33 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
 	if (!params)
 		return ERR_PTR(-EINVAL);
 
+	if (params->cache && params->relaxed)
+		return ERR_PTR(-EINVAL);
+
 	device = pd->device;
 	if (!device->alloc_fmr    || !device->dealloc_fmr  ||
 	    !device->map_phys_fmr || !device->unmap_fmr) {
-		printk(KERN_INFO PFX "Device %s does not support FMRs\n",
-		       device->name);
+		pr_warn(PFX "Device %s does not support FMRs\n",
+			device->name);
+		return ERR_PTR(-ENOSYS);
+	}
+
+	if (params->relaxed && !device->set_fmr_pd) {
+		pr_warn(PFX "Device %s does not support relaxed FMRs\n",
+			device->name);
 		return ERR_PTR(-ENOSYS);
 	}
 
+
 	attr = kmalloc(sizeof *attr, GFP_KERNEL);
 	if (!attr) {
-		printk(KERN_WARNING PFX "couldn't allocate device attr struct\n");
+		pr_warn(PFX "couldn't allocate device attr struct\n");
 		return ERR_PTR(-ENOMEM);
 	}
 
 	ret = ib_query_device(device, attr);
 	if (ret) {
-		printk(KERN_WARNING PFX "couldn't query device: %d\n", ret);
+		pr_warn(PFX "couldn't query device: %d\n", ret);
 		kfree(attr);
 		return ERR_PTR(ret);
 	}
@@ -250,7 +370,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
 	pool = kmalloc(sizeof *pool, GFP_KERNEL);
 	if (!pool) {
-		printk(KERN_WARNING PFX "couldn't allocate pool struct\n");
+		pr_warn(PFX "couldn't allocate pool struct\n");
 		return ERR_PTR(-ENOMEM);
 	}
 
@@ -260,6 +380,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
 	pool->flush_arg       = params->flush_arg;
 
 	INIT_LIST_HEAD(&pool->free_list);
+	INIT_LIST_HEAD(&pool->used_list);
 	INIT_LIST_HEAD(&pool->dirty_list);
 
 	if (params->cache) {
@@ -267,7 +388,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
 			kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket,
 				GFP_KERNEL);
 		if (!pool->cache_bucket) {
-			printk(KERN_WARNING PFX "Failed to allocate cache in pool\n");
+			pr_warn(PFX "Failed to allocate cache in pool\n");
 			ret = -ENOMEM;
 			goto out_free_pool;
 		}
@@ -282,16 +403,19 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
 	pool->dirty_watermark = params->dirty_watermark;
 	pool->dirty_len       = 0;
 	spin_lock_init(&pool->pool_lock);
+	spin_lock_init(&pool->used_pool_lock);
 	atomic_set(&pool->req_ser,   0);
 	atomic_set(&pool->flush_ser, 0);
 	init_waitqueue_head(&pool->force_wait);
+	pool->pd = pd;
+	pool->relaxed = params->relaxed;
 
 	pool->thread = kthread_run(ib_fmr_cleanup_thread,
 				   pool,
 				   "ib_fmr(%s)",
 				   device->name);
 	if (IS_ERR(pool->thread)) {
-		printk(KERN_WARNING PFX "couldn't start cleanup thread\n");
+		pr_warn(PFX "couldn't start cleanup thread\n");
 		ret = PTR_ERR(pool->thread);
 		goto out_free_pool;
 	}
@@ -311,7 +435,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
 		for (i = 0; i < params->pool_size; ++i) {
 			fmr = kmalloc(bytes_per_fmr, GFP_KERNEL);
 			if (!fmr) {
-				printk(KERN_WARNING PFX "failed to allocate fmr "
+				pr_warn(PFX "failed to allocate fmr "
 					"struct for FMR %d\n", i);
 				goto out_fail;
 			}
@@ -319,11 +443,15 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
 			fmr->pool             = pool;
 			fmr->remap_count      = 0;
 			fmr->ref_count        = 0;
+			fmr->pd               = pd;
+			fmr->page_list_len    = 0;
+			fmr->sg               = NULL;
+			fmr->sg_len           = 0;
 			INIT_HLIST_NODE(&fmr->cache_node);
 
 			fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr);
 			if (IS_ERR(fmr->fmr)) {
-				printk(KERN_WARNING PFX "fmr_create failed "
+				pr_warn(PFX "fmr_create failed "
 					"for FMR %d\n", i);
 				kfree(fmr);
 				goto out_fail;
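A minimal sketch of creating a relaxed pool with the parameters this patch adds (values are illustrative; note that ib_create_fmr_pool() now rejects cache=1 together with relaxed=1, and relaxed pools require the device to implement set_fmr_pd):

	struct ib_fmr_pool_param param = {
		.max_pages_per_fmr = npages,
		.page_shift        = PAGE_SHIFT,
		.access            = IB_ACCESS_LOCAL_WRITE,
		.pool_size         = 1024,
		.dirty_watermark   = 512,
		.cache             = 0,	/* must be 0 for relaxed pools */
		.relaxed           = 1,
	};
	struct ib_fmr_pool *pool = ib_create_fmr_pool(pd, &param);

	if (IS_ERR(pool))
		return PTR_ERR(pool);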
registered\n", + pr_warn(PFX "pool still has %d regions registered\n", pool->pool_size - i); kfree(pool->cache_bucket); @@ -396,7 +534,6 @@ EXPORT_SYMBOL(ib_destroy_fmr_pool); int ib_flush_fmr_pool(struct ib_fmr_pool *pool) { int serial; - struct ib_pool_fmr *fmr, *next; /* * The free_list holds FMRs that may have been used @@ -404,12 +541,9 @@ int ib_flush_fmr_pool(struct ib_fmr_pool *pool) * Put them on the dirty list now so that the cleanup * thread will reap them too. */ - spin_lock_irq(&pool->pool_lock); - list_for_each_entry_safe(fmr, next, &pool->free_list, list) { - if (fmr->remap_count > 0) - list_move(&fmr->list, &pool->dirty_list); - } - spin_unlock_irq(&pool->pool_lock); + spin_lock_irq(&pool->used_pool_lock); + list_splice_init(&pool->used_list, &pool->dirty_list); + spin_unlock_irq(&pool->used_pool_lock); serial = atomic_inc_return(&pool->req_ser); wake_up_process(pool->thread); @@ -428,13 +562,15 @@ EXPORT_SYMBOL(ib_flush_fmr_pool); * @page_list:List of pages to map * @list_len:Number of pages in @page_list * @io_virtual_address:I/O virtual address for new FMR + * @rargs: argument sepecified when relaxed MR is used. * * Map an FMR from an FMR pool. */ struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, u64 *page_list, int list_len, - u64 io_virtual_address) + u64 io_virtual_address, + struct ib_fmr_args_relaxed *rargs) { struct ib_fmr_pool *pool = pool_handle; struct ib_pool_fmr *fmr; @@ -444,11 +580,15 @@ struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, if (list_len < 1 || list_len > pool->max_pages) return ERR_PTR(-EINVAL); + if (pool->relaxed && rargs == NULL) + return ERR_PTR(-EINVAL); + + spin_lock_irqsave(&pool->pool_lock, flags); fmr = ib_fmr_cache_lookup(pool, page_list, list_len, - io_virtual_address); + io_virtual_address, rargs ? 
@@ -428,13 +562,15 @@ EXPORT_SYMBOL(ib_flush_fmr_pool);
  * @page_list:List of pages to map
  * @list_len:Number of pages in @page_list
  * @io_virtual_address:I/O virtual address for new FMR
+ * @rargs: argument specified when relaxed MR is used.
  *
  * Map an FMR from an FMR pool.
  */
 struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
 					 u64                *page_list,
 					 int                 list_len,
-					 u64                 io_virtual_address)
+					 u64                 io_virtual_address,
+					 struct ib_fmr_args_relaxed *rargs)
 {
 	struct ib_fmr_pool *pool = pool_handle;
 	struct ib_pool_fmr *fmr;
@@ -444,11 +580,15 @@ struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
 	if (list_len < 1 || list_len > pool->max_pages)
 		return ERR_PTR(-EINVAL);
 
+	if (pool->relaxed && rargs == NULL)
+		return ERR_PTR(-EINVAL);
+
+
 	spin_lock_irqsave(&pool->pool_lock, flags);
 	fmr = ib_fmr_cache_lookup(pool,
 				  page_list,
 				  list_len,
-				  io_virtual_address);
+				  io_virtual_address,
+				  rargs ? rargs->pd : NULL);
 	if (fmr) {
 		/* found in cache */
 		++fmr->ref_count;
@@ -463,23 +603,46 @@ struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
 	if (list_empty(&pool->free_list)) {
 		spin_unlock_irqrestore(&pool->pool_lock, flags);
-		return ERR_PTR(-EAGAIN);
+		spin_lock_irqsave(&pool->used_pool_lock, flags);
+		if (list_empty(&pool->used_list)) {
+			spin_unlock_irqrestore(&pool->used_pool_lock, flags);
+			return ERR_PTR(-EAGAIN);
+		}
+		fmr = list_entry(pool->used_list.next, struct ib_pool_fmr,
+				 list);
+		list_del(&fmr->list);
+		hlist_del_init(&fmr->cache_node);
+		spin_unlock_irqrestore(&pool->used_pool_lock, flags);
+	} else {
+		fmr = list_entry(pool->free_list.next, struct ib_pool_fmr,
+				 list);
+		list_del(&fmr->list);
+		hlist_del_init(&fmr->cache_node);
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
 	}
 
-	fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list);
-	list_del(&fmr->list);
-	hlist_del_init(&fmr->cache_node);
-	spin_unlock_irqrestore(&pool->pool_lock, flags);
+	if (pool->relaxed && fmr->pd != rargs->pd) {
+		result = ib_set_fmr_pd(fmr->fmr, rargs->pd);
+		if (result) {
+			spin_lock_irqsave(&pool->used_pool_lock, flags);
+			list_add(&fmr->list, &pool->used_list);
+			spin_unlock_irqrestore(&pool->used_pool_lock, flags);
+
+			pr_warn(PFX "set_fmr_pd returns %d\n", result);
+
+			return ERR_PTR(result);
+		}
+	}
 
 	result = ib_map_phys_fmr(fmr->fmr, page_list, list_len,
 				 io_virtual_address);
 
 	if (result) {
-		spin_lock_irqsave(&pool->pool_lock, flags);
-		list_add(&fmr->list, &pool->free_list);
-		spin_unlock_irqrestore(&pool->pool_lock, flags);
+		spin_lock_irqsave(&pool->used_pool_lock, flags);
+		list_add(&fmr->list, &pool->used_list);
+		spin_unlock_irqrestore(&pool->used_pool_lock, flags);
 
-		printk(KERN_WARNING PFX "fmr_map returns %d\n", result);
+		pr_warn(PFX "fmr_map returns %d\n", result);
 
 		return ERR_PTR(result);
 	}
@@ -498,6 +661,16 @@ struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
 		spin_unlock_irqrestore(&pool->pool_lock, flags);
 	}
 
+	if (pool->relaxed) {
+		fmr->pd = rargs->pd;
+		/* if it was mapped earlier */
+		if (fmr->remap_count > 1)
+			fmr_teardown_mr(fmr);
+
+		fmr->sg = rargs->sg;
+		fmr->sg_len = rargs->sg_len;
+	}
+
 	return fmr;
 }
 EXPORT_SYMBOL(ib_fmr_pool_map_phys);
@@ -516,12 +689,12 @@ int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
 
 	pool = fmr->pool;
 
-	spin_lock_irqsave(&pool->pool_lock, flags);
+	spin_lock_irqsave(&pool->used_pool_lock, flags);
 
 	--fmr->ref_count;
 	if (!fmr->ref_count) {
 		if (fmr->remap_count < pool->max_remaps) {
-			list_add_tail(&fmr->list, &pool->free_list);
+			list_add_tail(&fmr->list, &pool->used_list);
 		} else {
 			list_add_tail(&fmr->list, &pool->dirty_list);
 			if (++pool->dirty_len >= pool->dirty_watermark) {
@@ -533,11 +706,11 @@ int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
 
 #ifdef DEBUG
 	if (fmr->ref_count < 0)
-		printk(KERN_WARNING PFX "FMR %p has ref count %d < 0\n",
+		pr_warn(PFX "FMR %p has ref count %d < 0\n",
 			fmr, fmr->ref_count);
 #endif
 
-	spin_unlock_irqrestore(&pool->pool_lock, flags);
+	spin_unlock_irqrestore(&pool->used_pool_lock, flags);
 
 	return 0;
 }
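Existing in-kernel users keep the old behaviour by passing NULL for the new rargs argument (this matches the iSER and SRP hunks further down). A sketch of the unchanged lifecycle:

	struct ib_pool_fmr *fmr;

	fmr = ib_fmr_pool_map_phys(pool, dma_pages, npages, io_addr, NULL);
	if (IS_ERR(fmr))
		return PTR_ERR(fmr);
	/* ... post work requests using fmr->fmr->lkey / fmr->fmr->rkey ... */
	ib_fmr_pool_unmap(fmr);	/* now lands on used_list, reaped lazily */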
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 55a0ac08b91e2..a579dca7d8f7f 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -46,6 +46,7 @@
 #include
 #include
 #include
+#include
 
 #define INIT_UDATA(udata, ibuf, obuf, ilen, olen)			\
 	do {								\
@@ -167,6 +168,7 @@ extern spinlock_t ib_uverbs_idr_lock;
 extern struct idr ib_uverbs_pd_idr;
 extern struct idr ib_uverbs_shpd_idr;
 extern struct idr ib_uverbs_mr_idr;
+extern struct idr ib_uverbs_fmr_idr;
 extern struct idr ib_uverbs_mw_idr;
 extern struct idr ib_uverbs_ah_idr;
 extern struct idr ib_uverbs_cq_idr;
@@ -253,6 +255,9 @@ IB_UVERBS_DECLARE_CMD(open_xrcd);
 IB_UVERBS_DECLARE_CMD(close_xrcd);
 IB_UVERBS_DECLARE_CMD(alloc_shpd);
 IB_UVERBS_DECLARE_CMD(share_pd);
+IB_UVERBS_DECLARE_CMD(reg_mr_relaxed);
+IB_UVERBS_DECLARE_CMD(dereg_mr_relaxed);
+IB_UVERBS_DECLARE_CMD(flush_relaxed_mr);
 
 #define IB_UVERBS_DECLARE_EX_CMD(name)				\
 	int ib_uverbs_ex_##name(struct ib_uverbs_file *file,	\
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index ccb9270f0557a..aa39c2ec531c0 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -51,6 +51,7 @@ struct uverbs_lock_class {
 static struct uverbs_lock_class pd_lock_class	= { .name = "PD-uobj" };
 static struct uverbs_lock_class shpd_lock_class = { .name = "SHPD-uobj" };
 static struct uverbs_lock_class mr_lock_class	= { .name = "MR-uobj" };
+static struct uverbs_lock_class fmr_lock_class	= { .name = "FMR-uobj"};
 static struct uverbs_lock_class mw_lock_class	= { .name = "MW-uobj" };
 static struct uverbs_lock_class cq_lock_class	= { .name = "CQ-uobj" };
 static struct uverbs_lock_class qp_lock_class	= { .name = "QP-uobj" };
@@ -287,6 +288,108 @@ static void put_xrcd_read(struct ib_uobject *uobj)
 	put_uobj_read(uobj);
 }
 
+/*
+ * get the number of pages by looking at the page indices that the start and
+ * end addresses fall in.
+ *
+ * Returns 0 if the vec is invalid.  It is invalid if the number of bytes
+ * causes the address to wrap or overflows an unsigned int.  This comes
+ * from being stored in the 'length' member of 'struct scatterlist'.
+ */
+static unsigned int get_pages_in_range(u64 addr, u64 bytes)
+{
+	if ((addr + bytes <= addr) ||
+	    (bytes > (u64)UINT_MAX))
+		return 0;
+
+	return ((addr + bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
+		(addr >> PAGE_SHIFT);
+}
+
+/* Pin user pages */
+static int fmr_pin_pages(unsigned long user_addr, unsigned int nr_pages,
+			 struct page **pages, int write)
+{
+	int ret;
+
+	down_read(&current->mm->mmap_sem);
+	ret = get_user_pages(current, current->mm, user_addr,
+			     nr_pages, write, 0, pages, NULL);
+	up_read(&current->mm->mmap_sem);
+
+	if (0 <= ret && (unsigned) ret < nr_pages) {
+		while (ret--)
+			put_page(pages[ret]);
+		ret = -EFAULT;
+	}
+
+	return ret;
+}
+
+static int create_fmr_pool(struct ib_pd *pd, int pages, u32 access)
+{
+
+	int ret = 0;
+	struct ib_fmr_pool_param fmr_param;
+	struct ib_fmr_pool *fmr_pool;
+	struct ib_relaxed_pool_data *pool_data;
+	struct ib_relaxed_pool_data *pos;
+	int found = 0;
+
+	/* create pools: 32k FMRs of 8k buffers, 8k FMRs of 1 MB buffers */
+	memset(&fmr_param, 0, sizeof(fmr_param));
+	fmr_param.pool_size = (pages > 20) ? 8 * 1024 : 32*1024;
+	fmr_param.dirty_watermark = 512;
+	fmr_param.cache = 0;
+	fmr_param.relaxed = 1;
+	fmr_param.max_pages_per_fmr = pages;
+	fmr_param.page_shift = PAGE_SHIFT;
+	fmr_param.access = access;
+
+	fmr_pool = ib_create_fmr_pool(pd, &fmr_param);
+
+	if (IS_ERR(fmr_pool)) {
+		ret = PTR_ERR(fmr_pool);
+		goto err_exit;
+	}
+
+	pool_data = kmalloc(sizeof(*pool_data), GFP_KERNEL);
+
+	if (!pool_data) {
+		ret = -ENOMEM;
+		(void)ib_destroy_fmr_pool(fmr_pool);
+		goto err_exit;
+	}
+
+	pool_data->fmr_pool = fmr_pool;
+	pool_data->access_flags = access;
+	pool_data->max_pages = pages;
+	list_for_each_entry(pos, &pd->device->relaxed_pool_list, pool_list) {
+		if (pages <= pos->max_pages) {
+			list_add_tail(&pool_data->pool_list, &pos->pool_list);
+			found = 1;
+			break;
+		}
+	}
+	if (!found)
+		list_add_tail(&pool_data->pool_list,
+			      &pd->device->relaxed_pool_list);
+
+#ifdef DEBUG
+	pr_info("FMR POOLS :\n");
+	list_for_each_entry(pos, &pd->device->relaxed_pool_list, pool_list) {
+		pr_info("\t pos -> %p, pages = %d, access = %x, pool = %p\n",
+			pos, pos->max_pages, pos->access_flags,
+			pos->fmr_pool);
+	}
+#endif
+
+	return 0;
+
+err_exit:
+	return ret;
+}
+
 ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
			      const char __user *buf,
			      int in_len, int out_len)
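A worked example for get_pages_in_range() above, assuming PAGE_SIZE == 4096:

	/*
	 * addr  = 0x1064 (100 bytes into the second page)
	 * bytes = 5000, so addr + bytes = 0x23EC (falls in the third page)
	 *
	 * ((0x23EC + 0xFFF) >> 12) - (0x1064 >> 12) = 3 - 1 = 2 pages
	 *
	 * A return of 0 flags an invalid vector: either the range wrapped
	 * around the address space or bytes exceeded UINT_MAX.
	 */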
@@ -328,6 +431,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 	ucontext->device = ibdev;
 	INIT_LIST_HEAD(&ucontext->pd_list);
 	INIT_LIST_HEAD(&ucontext->mr_list);
+	INIT_LIST_HEAD(&ucontext->fmr_list);
 	INIT_LIST_HEAD(&ucontext->mw_list);
 	INIT_LIST_HEAD(&ucontext->cq_list);
 	INIT_LIST_HEAD(&ucontext->qp_list);
@@ -828,6 +932,7 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
 	struct ib_uobject *shuobj = 0;
 	struct ib_pd *pd = NULL;
 	struct ib_shpd *shpd = NULL;
+	struct ib_relaxed_pool_data *pos;
 
 	if (copy_from_user(&cmd, buf, sizeof(cmd)))
 		return -EFAULT;
@@ -838,6 +943,11 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
 
 	pd = uobj->object;
 
+	/* flush all pd reference from HCA - relaxed FMR */
+	list_for_each_entry(pos, &pd->device->relaxed_pool_list, pool_list) {
+		ib_flush_fmr_pool(pos->fmr_pool);
+	}
+
 	/* is pd shared ?*/
 	if (pd->shpd) {
 		shpd = pd->shpd;
@@ -1386,6 +1496,284 @@ put_uobjs:
 	return ret;
 }
 
+ssize_t ib_uverbs_reg_mr_relaxed(struct ib_uverbs_file *file,
+				 const char __user *buf, int in_len,
+				 int out_len)
+{
+	struct ib_uverbs_reg_mr      cmd;
+	struct ib_uverbs_reg_mr_resp resp;
+	struct ib_udata              udata;
+	struct ib_uobject           *uobj;
+	struct ib_pd                *pd;
+	int                          ret;
+
+	struct ib_relaxed_pool_data *pos;
+	struct ib_fmr_args_relaxed rel_args;
+	unsigned int n;
+	int found = 0;
+	struct page **pages;
+	int page_cnt;
+	u64 *dma_pages;
+	struct scatterlist *sg;
+	struct ib_pool_fmr *fmr;
+	int fmr_mapped = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof(cmd)))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof(cmd),
+		   (unsigned long) cmd.response + sizeof(resp),
+		   in_len - sizeof(cmd), out_len - sizeof(resp));
+
+	if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
+		return -EINVAL;
+
+	/*
+	 * Local write permission is required if remote write or
+	 * remote atomic permission is also requested.
+	 */
+	if (cmd.access_flags &
+	    (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) &&
+	    !(cmd.access_flags & IB_ACCESS_LOCAL_WRITE))
+		return -EINVAL;
+
+	/* FMRs are limited to less than 1M for now */
+	if (cmd.length >= (1*1024*1024 + PAGE_SIZE - 1))
+		return -EINVAL;
+
+	uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
+	if (!uobj)
+		return -ENOMEM;
+
+	init_uobj(uobj, 0, file->ucontext, &fmr_lock_class);
+	down_write(&uobj->mutex);
+
+	pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+	if (!pd) {
+		ret = -EINVAL;
+		goto err_free;
+	}
+
+	/* Relaxed MR */
+	/* pd->device has a list of FMR pools, sorted by size & access_flags */
+	/* if pool is already available use that pool and map the address. if
+	   it is not available then allocate a new pool & allocate from there */
+	{
+
+		n = get_pages_in_range(cmd.start, cmd.length);
+		if (n == 0) {
+			ret = -EINVAL;
+			goto err_put;
+		}
+
+		found = 0;
+
+		list_for_each_entry(pos, &pd->device->relaxed_pool_list, pool_list) {
+			if (cmd.access_flags == pos->access_flags
+			    && n <= pos->max_pages) {
+				found = 1;
+				break;
+			}
+		}
+
+		if (!found) {
+			int pagesin8K = (8*1024 + PAGE_SIZE) >> PAGE_SHIFT;
+			int pagesin1M = (1024*1024 + PAGE_SIZE) >> PAGE_SHIFT;
+			struct ib_pd *pool_pd = file->device->ib_dev->relaxed_pd;
+
+			/* Create pool for 8kb buffers */
+			ret = create_fmr_pool(pool_pd, pagesin8K, cmd.access_flags);
+			if (ret < 0)
+				goto err_put;
+
+			/* Create pool for 1mb buffers */
+			ret = create_fmr_pool(pool_pd, pagesin1M, cmd.access_flags);
+			if (ret < 0)
+				goto err_put;
+
+			list_for_each_entry(pos, &pd->device->relaxed_pool_list,
+					    pool_list) {
+				if (cmd.access_flags == pos->access_flags
+				    && n <= pos->max_pages) {
+					found = 1;
+					break;
+				}
+			}
+			if (!found) {
+				ret = -EINVAL;
+				goto err_put;
+			}
+		}
+
+
+		pages = kcalloc(n, sizeof(struct page *), GFP_KERNEL);
+		if (!pages) {
+			ret = -ENOMEM;
+			goto err_put;
+		}
+
+		ret = fmr_pin_pages(cmd.start & PAGE_MASK, n, pages,
+				    cmd.access_flags & IB_ACCESS_LOCAL_WRITE ? 1 : 0);
+		if (ret < 0)
+			goto err_pages_alloc;
+
+
+		/* TODO: define following as a separate function */
+		if (1) {
+			u32 len = 0;
+			int sg_dma_len;
+			int i, j;
+
+			page_cnt = 0;
+
+			sg = kcalloc(n, sizeof(*sg), GFP_KERNEL);
+			if (sg == NULL) {
+				ret = -ENOMEM;
+				goto err_unpin;
+			}
+			sg_init_table(sg, n);
+			/* Stick all pages into the scatterlist */
+			for (i = 0 ; i < n; i++)
+				sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
+
+			sg_dma_len = ib_dma_map_sg(pd->device, sg, n,
+						   DMA_BIDIRECTIONAL);
+			if (unlikely(!sg_dma_len)) {
+				pr_warn("RFMR/IB: dma_map_sg failed!\n");
+				ret = -EBUSY;
+				goto err_free_sg;
+			}
+
+
+			for (i = 0; i < sg_dma_len; ++i) {
+				unsigned int dma_len = ib_sg_dma_len(pd->device,
+								     &sg[i]);
+				u64 dma_addr = ib_sg_dma_address(pd->device, &sg[i]);
+
+				if (dma_addr & ~PAGE_MASK) {
+					if (i > 0) {
+						ret = -EINVAL;
+						goto err_free_sg;
+					} else
+						++page_cnt;
+				}
+				if ((dma_addr + dma_len) & ~PAGE_MASK) {
+					if (i < sg_dma_len - 1) {
+						ret = -EINVAL;
+						goto err_free_sg;
+					} else
+						++page_cnt;
+				}
+
+				len += dma_len;
+			}
+
+			page_cnt += len >> PAGE_SHIFT;
+
+			dma_pages = kmalloc_array(page_cnt, sizeof(u64), GFP_ATOMIC);
+			if (!dma_pages) {
+				ret = -ENOMEM;
+				goto err_free_sg;
+			}
+
+			page_cnt = 0;
+			for (i = 0; i < sg_dma_len; ++i) {
+				unsigned int dma_len = ib_sg_dma_len(pd->device,
+								     &sg[i]);
+				u64 dma_addr = ib_sg_dma_address(pd->device, &sg[i]);
+
+				for (j = 0; j < dma_len; j += PAGE_SIZE) {
+					dma_pages[page_cnt++] =
+						(dma_addr & PAGE_MASK) + j;
+				}
+			}
+		}
+
+
+		rel_args.pd = pd;
+		rel_args.sg = sg;
+		rel_args.sg_len = n;
+
+		fmr = ib_fmr_pool_map_phys(pos->fmr_pool, dma_pages, page_cnt,
+					   cmd.hca_va & PAGE_MASK, &rel_args);
+
+		kfree(dma_pages);
+
+		if (IS_ERR(fmr)) {
+			ret = PTR_ERR(fmr);
+			goto err_free_sg;
+		}
+
+		fmr_mapped = 1;
+
+		kfree(pages);
+
+	}
+
+	fmr->fmr->device = pd->device;
+	fmr->fmr->pd = pd;
+	atomic_inc(&pd->usecnt);
+
+	uobj->object = fmr;
+	ret = idr_add_uobj(&ib_uverbs_fmr_idr, uobj);
+	if (ret)
+		goto err_unreg;
+
+	memset(&resp, 0, sizeof(resp));
+	resp.lkey      = fmr->fmr->lkey;
+	resp.rkey      = fmr->fmr->rkey;
+	resp.mr_handle = uobj->id;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof(resp))) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	put_pd_read(pd);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&uobj->list, &file->ucontext->fmr_list);
+	mutex_unlock(&file->mutex);
+
+	uobj->live = 1;
+
+	up_write(&uobj->mutex);
+
+	return in_len;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_fmr_idr, uobj);
+
+err_unreg:
+	ib_fmr_pool_unmap(fmr);
+
+err_free_sg:
+	/* if mapped already, this will be freed while flushing */
+	if (!fmr_mapped)
+		kfree(sg);
+
+err_unpin:
+	/* if mapped already, pages will be unpinned during flushing */
+	if (!fmr_mapped)
+		while (n--)
+			put_page(pages[n]);
+
+err_pages_alloc:
+	kfree(pages);
+
+
+err_put:
+	put_pd_read(pd);
+
+err_free:
+	put_uobj_write(uobj);
+	return ret;
+}
+
 ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
			   const char __user *buf, int in_len,
			   int out_len)
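The open-coded block above (marked TODO in the patch) flattens the DMA-mapped scatterlist into the u64 page array that ib_fmr_pool_map_phys() expects. A hypothetical standalone helper, simplified to assume fully page-aligned DMA segments (the in-line version additionally counts unaligned head and tail pages, and allocates with GFP_ATOMIC):

	static int build_dma_page_list(struct ib_device *dev,
				       struct scatterlist *sg, int sg_dma_len,
				       u64 **dma_pages_out)
	{
		u64 *dma_pages;
		unsigned int total = 0;
		int i, j, n = 0;

		for (i = 0; i < sg_dma_len; ++i)
			total += ib_sg_dma_len(dev, &sg[i]);

		dma_pages = kmalloc_array(total >> PAGE_SHIFT, sizeof(u64),
					  GFP_KERNEL);
		if (!dma_pages)
			return -ENOMEM;

		/* one entry per page covered by each DMA segment */
		for (i = 0; i < sg_dma_len; ++i) {
			unsigned int dma_len = ib_sg_dma_len(dev, &sg[i]);
			u64 dma_addr = ib_sg_dma_address(dev, &sg[i]);

			for (j = 0; j < dma_len; j += PAGE_SIZE)
				dma_pages[n++] = (dma_addr & PAGE_MASK) + j;
		}
		*dma_pages_out = dma_pages;
		return n;
	}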
@@ -1544,6 +1932,76 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
 	return in_len;
 }
 
+ssize_t ib_uverbs_dereg_mr_relaxed(struct ib_uverbs_file *file,
+				   const char __user *buf, int in_len,
+				   int out_len)
+{
+	struct ib_uverbs_dereg_mr cmd;
+	struct ib_uobject        *uobj;
+	int                       ret = -EINVAL;
+	struct ib_pool_fmr       *fmr;
+	struct ib_pd             *pd;
+
+	if (copy_from_user(&cmd, buf, sizeof(cmd)))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_fmr_idr, cmd.mr_handle,
+			      file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+
+	fmr = uobj->object;
+	pd = fmr->fmr->pd;
+
+	ret = ib_fmr_pool_unmap(fmr);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	if (ret)
+		return ret;
+
+	atomic_dec(&pd->usecnt);
+
+	idr_remove_uobj(&ib_uverbs_fmr_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(uobj);
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_flush_relaxed_mr(struct ib_uverbs_file *file,
+				   const char __user *buf,
+				   int in_len, int out_len)
+{
+	struct ib_uverbs_flush_relaxed_mr cmd;
+	struct ib_uobject		 *uobj;
+	struct ib_pd			 *pd;
+	struct ib_relaxed_pool_data	 *pos;
+
+	if (copy_from_user(&cmd, buf, sizeof(cmd)))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+
+	/* flush all the pools associated with the pd */
+	pd = uobj->object;
+	list_for_each_entry(pos, &pd->device->relaxed_pool_list, pool_list) {
+		ib_flush_fmr_pool(pos->fmr_pool);
+	}
+
+	put_uobj_write(uobj);
+
+	return in_len;
+}
+
 ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
				      const char __user *buf, int in_len,
				      int out_len)
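For context, a hypothetical userspace invocation of the new FLUSH_RELAXED_MR command; uverbs_fd and pd_handle are assumed to come from the usual libibverbs setup, and a real library wrapper would hide this plumbing:

	struct {
		struct ib_uverbs_cmd_hdr          hdr;
		struct ib_uverbs_flush_relaxed_mr cmd;
	} req = {
		.hdr = {
			.command   = IB_USER_VERBS_CMD_FLUSH_RELAXED_MR,
			.in_words  = sizeof(req) / 4,
			.out_words = 0,
		},
		.cmd = { .pd_handle = pd_handle },
	};

	if (write(uverbs_fd, &req, sizeof(req)) != sizeof(req))
		return -errno;	/* kernel flushes every relaxed pool on the PD's device */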
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index ca4e0cc234dea..fedf6008603b4 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -68,6 +68,7 @@ DEFINE_SPINLOCK(ib_uverbs_idr_lock);
 DEFINE_IDR(ib_uverbs_pd_idr);
 DEFINE_IDR(ib_uverbs_shpd_idr);
 DEFINE_IDR(ib_uverbs_mr_idr);
+DEFINE_IDR(ib_uverbs_fmr_idr);
 DEFINE_IDR(ib_uverbs_mw_idr);
 DEFINE_IDR(ib_uverbs_ah_idr);
 DEFINE_IDR(ib_uverbs_cq_idr);
@@ -124,6 +125,9 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
 	 */
 	[IB_USER_VERBS_CMD_ALLOC_SHPD]		= ib_uverbs_alloc_shpd,
 	[IB_USER_VERBS_CMD_SHARE_PD]		= ib_uverbs_share_pd,
+	[IB_USER_VERBS_CMD_REG_MR_RELAXED]	= ib_uverbs_reg_mr_relaxed,
+	[IB_USER_VERBS_CMD_DEREG_MR_RELAXED]	= ib_uverbs_dereg_mr_relaxed,
+	[IB_USER_VERBS_CMD_FLUSH_RELAXED_MR]	= ib_uverbs_flush_relaxed_mr,
 };
 
 static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
@@ -300,6 +304,14 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 		kfree(uobj);
 	}
 
+	list_for_each_entry_safe(uobj, tmp, &context->fmr_list, list) {
+		struct ib_pool_fmr *fmr = uobj->object;
+
+		idr_remove_uobj(&ib_uverbs_fmr_idr, uobj);
+		ib_fmr_pool_unmap(fmr);
+		kfree(uobj);
+	}
+
 	mutex_lock(&file->device->xrcd_tree_mutex);
 	list_for_each_entry_safe(uobj, tmp, &context->xrcd_list, list) {
 		struct ib_xrcd *xrcd = uobj->object;
@@ -316,6 +328,13 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 		struct ib_pd *pd = uobj->object;
 		struct ib_uobject *shuobj = NULL;
 		struct ib_shpd *shpd = NULL;
+		struct ib_relaxed_pool_data *pos;
+
+		/* flush fmr pool associated with this pd */
+		list_for_each_entry(pos, &pd->device->relaxed_pool_list,
+				    pool_list) {
+			ib_flush_fmr_pool(pos->fmr_pool);
+		}
 
 		idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
@@ -1005,6 +1024,12 @@ static void ib_uverbs_add_one(struct ib_device *device)
 	if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version))
 		goto err_class;
 
+	device->relaxed_pd = ib_alloc_pd(device);
+	if (IS_ERR(device->relaxed_pd)) {
+		device->relaxed_pd = NULL;
+		goto err_class;
+	}
+
 	ib_set_client_data(device, &uverbs_client, uverbs_dev);
 
 	return;
@@ -1029,9 +1054,21 @@ err:
 static void ib_uverbs_remove_one(struct ib_device *device)
 {
 	struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client);
+	struct ib_relaxed_pool_data *pos;
+	struct ib_relaxed_pool_data *tmp;
+	int ret = 0;
 
 	if (!uverbs_dev)
 		return;
 
+	list_for_each_entry_safe(pos, tmp, &device->relaxed_pool_list,
+				 pool_list) {
+		ib_destroy_fmr_pool(pos->fmr_pool);
+		list_del(&pos->pool_list);
+		kfree(pos);
+	}
+
+	ret = ib_dealloc_pd(device->relaxed_pd);
+	device->relaxed_pd = NULL;
 
 	dev_set_drvdata(uverbs_dev->dev, NULL);
 	device_destroy(uverbs_class, uverbs_dev->cdev.dev);
@@ -1108,6 +1145,7 @@ static void __exit ib_uverbs_cleanup(void)
 	idr_destroy(&ib_uverbs_pd_idr);
 	idr_destroy(&ib_uverbs_shpd_idr);
 	idr_destroy(&ib_uverbs_mr_idr);
+	idr_destroy(&ib_uverbs_fmr_idr);
 	idr_destroy(&ib_uverbs_mw_idr);
 	idr_destroy(&ib_uverbs_ah_idr);
 	idr_destroy(&ib_uverbs_cq_idr);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 3c0549449cac0..6656c2aa1e0d4 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1322,6 +1322,22 @@ struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
 }
 EXPORT_SYMBOL(ib_alloc_fmr);
 
+int ib_set_fmr_pd(struct ib_fmr *fmr, struct ib_pd *pd)
+{
+	int ret = 0;
+
+	if (fmr->device->set_fmr_pd) {
+		ret = fmr->device->set_fmr_pd(fmr, pd);
+		if (!ret)
+			fmr->pd = pd;
+
+		return ret;
+	} else
+		return -ENOSYS;
+}
+EXPORT_SYMBOL(ib_set_fmr_pd);
+
+
 int ib_unmap_fmr(struct list_head *fmr_list)
 {
 	struct ib_fmr *fmr;
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index f0cdc961eb11b..2d5a9571b5e00 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -464,7 +464,8 @@ int iser_reg_page_vec(struct iscsi_iser_task *iser_task,
 	fmr  = ib_fmr_pool_map_phys(ib_conn->fmr.pool,
 				    page_vec->pages,
 				    page_vec->length,
-				    page_vec->pages[0]);
+				    page_vec->pages[0],
+				    NULL);
 	if (IS_ERR(fmr)) {
 		ret = PTR_ERR(fmr);
 		iser_err("ib_fmr_pool_map_phys failed: %d\n", ret);
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 918814cd0f806..98ec08a300fdd 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1282,7 +1282,7 @@ static int srp_map_finish_fmr(struct srp_map_state *state,
 	u64 io_addr = 0;
 
 	fmr = ib_fmr_pool_map_phys(ch->fmr_pool, state->pages,
-				   state->npages, io_addr);
+				   state->npages, io_addr, NULL);
 	if (IS_ERR(fmr))
 		return PTR_ERR(fmr);
diff --git a/include/rdma/ib_fmr_pool.h b/include/rdma/ib_fmr_pool.h
index f62b842e65961..102ed105a10c0 100644
--- a/include/rdma/ib_fmr_pool.h
+++ b/include/rdma/ib_fmr_pool.h
@@ -62,6 +62,7 @@ struct ib_fmr_pool_param {
 				       void              *arg);
 	void                     *flush_arg;
 	unsigned                  cache:1;
+	unsigned                  relaxed:1;
 };
 
 struct ib_pool_fmr {
@@ -72,10 +73,20 @@ struct ib_pool_fmr {
 	int                 ref_count;
 	int                 remap_count;
 	u64                 io_virtual_address;
+	struct ib_pd       *pd;
+	int                 list_id;
+	struct scatterlist *sg;
+	int                 sg_len;
 	int                 page_list_len;
 	u64                 page_list[0];
 };
 
+struct ib_fmr_args_relaxed {
+	struct ib_pd       *pd;
+	struct scatterlist *sg;
+	int                 sg_len;
+};
+
 struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd             *pd,
				       struct ib_fmr_pool_param *params);
@@ -86,8 +97,10 @@ int ib_flush_fmr_pool(struct ib_fmr_pool *pool);
 struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
					 u64                *page_list,
					 int                 list_len,
-					 u64                 io_virtual_address);
+					 u64                 io_virtual_address,
+					 struct ib_fmr_args_relaxed *rargs);
 
 int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr);
 
+
 #endif /* IB_FMR_POOL_H */
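A minimal sketch of a relaxed mapping using the ib_fmr_args_relaxed argument introduced above (all names illustrative; the scatterlist must already be pinned and DMA-mapped, exactly as ib_uverbs_reg_mr_relaxed() arranges):

	struct ib_fmr_args_relaxed rargs = {
		.pd     = user_pd,	/* PD the mapping must end up in */
		.sg     = sg,		/* pinned, DMA-mapped scatterlist */
		.sg_len = nents,
	};
	struct ib_pool_fmr *fmr;

	fmr = ib_fmr_pool_map_phys(pool, dma_pages, page_cnt, iova, &rargs);
	if (IS_ERR(fmr))
		return PTR_ERR(fmr);
	/* on teardown the pool itself unmaps and unpins the scatterlist
	 * via fmr_teardown_mr() */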
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 25c43efb607e9..f4264cce20779 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1151,6 +1151,7 @@ struct ib_ucontext {
 	struct ib_device       *device;
 	struct list_head	pd_list;
 	struct list_head	mr_list;
+	struct list_head	fmr_list;
 	struct list_head	mw_list;
 	struct list_head	cq_list;
 	struct list_head	qp_list;
@@ -1676,6 +1677,8 @@ struct ib_device {
 	int		           (*remove_shpd)(struct ib_device *ibdev,
						   struct ib_shpd *shpd,
						   int atinit);
+	int		           (*set_fmr_pd)(struct ib_fmr *fmr,
						 struct ib_pd *pd);
 
 	struct ib_dma_mapping_ops   *dma_ops;
@@ -1699,6 +1702,15 @@ struct ib_device {
 	u32			     local_dma_lkey;
 	u8                           node_type;
 	u8                           phys_port_cnt;
+	struct ib_pd		    *relaxed_pd;
+	struct list_head	     relaxed_pool_list;
+};
+
+struct ib_relaxed_pool_data {
+	struct ib_fmr_pool *fmr_pool;
+	u32		    access_flags;
+	int		    max_pages;
+	struct list_head    pool_list;
 };
 
 struct ib_client {
@@ -2600,6 +2612,13 @@ struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
			    int mr_access_flags,
			    struct ib_fmr_attr *fmr_attr);
 
+/**
+ * ib_set_fmr_pd - set new PD for an FMR
+ * @fmr: The fast memory region to associate with the pd.
+ * @pd: new pd.
+ */
+int ib_set_fmr_pd(struct ib_fmr *fmr, struct ib_pd *pd);
+
 /**
  * ib_map_phys_fmr - Maps a list of physical pages to a fast memory region.
  * @fmr: The fast memory region to associate with the pages.
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index 016b14164468d..832761e6da983 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -104,6 +104,9 @@ enum {
 	IB_USER_VERBS_CMD_ALLOC_SHPD = IB_USER_VERBS_CMD_ORACLE_ADDS_START, /* =46 */
 	IB_USER_VERBS_CMD_SHARE_PD, /* =47 */
+	IB_USER_VERBS_CMD_REG_MR_RELAXED, /* =48 */
+	IB_USER_VERBS_CMD_DEREG_MR_RELAXED, /* =49 */
+	IB_USER_VERBS_CMD_FLUSH_RELAXED_MR, /* =50 */
 };
 
 enum {
@@ -373,6 +376,11 @@ struct ib_uverbs_dealloc_mw {
 	__u32 mw_handle;
 };
 
+struct ib_uverbs_flush_relaxed_mr {
+	__u32 pd_handle;
+};
+
+
 struct ib_uverbs_create_comp_channel {
 	__u64 response;
 };
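The three new commands extend the Oracle vendor-specific range contiguously after SHARE_PD, so existing command numbers stay stable; a consumer could assert that relationship at build time:

	/* FLUSH_RELAXED_MR is the fourth command after ORACLE_ADDS_START (=46) */
	BUILD_BUG_ON(IB_USER_VERBS_CMD_FLUSH_RELAXED_MR !=
		     IB_USER_VERBS_CMD_ORACLE_ADDS_START + 4);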