From: Knut Omang Date: Wed, 25 May 2016 09:01:11 +0000 (+0200) Subject: sif driver initial commit part 3 X-Git-Tag: v4.1.12-92~148^2~18 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=7e0b60f6dc9f1cd5d22d10bcc75d6c6521b8a36e;p=users%2Fjedix%2Flinux-maple.git sif driver initial commit part 3 sif_pt.c: SIF (private) page table management sif_pt.h: SIF (private) page table management. sif_qp.c: Implementation of IB queue pair logic for sif sif_qp.h: Interface to internal IB queue pair logic for sif sif_query.c: SIF implementation of some of IB query APIs sif_query.h: SIF implementation of some of IB query APIs sif_r3.c: Special handling specific for psif revision 3 and earlier sif_r3.h: Special handling specific for psif revision 3 and earlier sif_rq.c: Implementation of sif receive queues sif_rq.h: Interface to sif receive queues sif_sndrcv.c: Implementation of post send/recv logic for SIF sif_sndrcv.h: Interface to IB send/receive, MAD packet recv and sif_spt.c: Experimental implementation of shared use of the OS's page tables. sif_spt.h: Experimental (still unsafe) sif_sq.c: Implementation of the send queue side of an IB queue pair sif_sq.h: Implementation of the send queue side of an IB queue pair sif_srq.c: Interface to shared receive queues for SIF sif_srq.h: Interface to internal Shared receive queue logic for SIF sif_tqp.c: Implementation of EPSA tunneling QP for SIF Signed-off-by: Knut Omang --- diff --git a/drivers/infiniband/hw/sif/sif_pt.c b/drivers/infiniband/hw/sif/sif_pt.c new file mode 100644 index 000000000000..e6f314a9ba7c --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_pt.c @@ -0,0 +1,1408 @@ +/* + * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_pt.c: SIF (private) page table management + */ + +#include +#include +#include +#include "sif_dev.h" +#include "sif_mr.h" +#include "sif_mem.h" +#include "sif_pt.h" +#include "sif_base.h" + +/* A kmem_cache to allocate the nodes in the rb_trees */ +static struct kmem_cache *pt_page_cache; + +static inline void *sif_pt_cache_alloc(struct sif_dev *sdev, gfp_t flags) +{ +#ifdef CONFIG_NUMA + void *n; + + n = kmem_cache_alloc_node(pt_page_cache, flags, sdev->pdev->dev.numa_node); + if (n) + return n; + + sif_log(sdev, SIF_INFO, "Warning: unable to allocate mem on numa node %d", + sdev->pdev->dev.numa_node); +#endif + return kmem_cache_alloc(pt_page_cache, flags); +} + + +/* Declared below */ +static int init_top(struct sif_pt *pt, u64 vstart, int npages); + + +int sif_pt_init(void) +{ + pt_page_cache = KMEM_CACHE(sif_pt_page, 0); + if (!pt_page_cache) + return -ENOMEM; + sif_log0(SIF_INFO, "order PAGE_SIZE = %d", order_base_2(PAGE_SIZE)); + return 0; +} + +void sif_pt_exit(void) +{ + kmem_cache_destroy(pt_page_cache); +} + +/* some utilities: */ + +/* Find the optimal page size (represented by the leaf level) + * to use based on device capabilities, configuration and a max_shift + * value (typically based on continuousness of memory. + * The result is adjusted with the address pair of a corresponding virtual + * address and dma address to ensure that it is possible to create a mapping at that + * level. 
pte_ext_shift is set to the number bits to shift increment between + * each valid pte (For the odd sized leaf pages) + * Assumes vaddr and dma_add. + */ +int find_optimal_leaf_level(struct sif_dev *sdev, u32 max_shift, + u64 vaddr, u64 dma_addr, u64 size, + u8 *leaf_level, u8 *pte_ext_shift) +{ + u32 shift, adj_page_shift, page_shift; + unsigned long smallest_misalign; + u32 bits = sizeof(dma_addr_t) << 3; + + /* Page size not supported by device configuration + * TBD: Remove (Should not happen unless a programming error) + */ + if (sdev->mi.page_shift > max_shift) { + sif_log(sdev, SIF_INFO, + "Failed to find a valid leaf level (page_shift %d, max_shift %d)", + sdev->mi.page_shift, max_shift); + return -EINVAL; + } + + *leaf_level = 0; + *pte_ext_shift = 0; + shift = sdev->mi.page_shift; + + switch (shift) { + case 12: + /* Device configured for Intel page sizes: + * In x86 mode for PSIF 2.1 only 4K base page size is supported + */ + if (max_shift < 21) + break; + *leaf_level = 1; + if (max_shift < 30) + break; + *leaf_level = 2; + break; + case 13: /* Device configured for Sparc page sizes */ + if (max_shift < 16) + break; + *pte_ext_shift = 3; /* 64K base page - only populate every 8th leaf entry */ + if (max_shift < 19) + break; + *pte_ext_shift = 6; /* 512K base page - only populate every 64th leaf entry */ + if (max_shift < 22) + break; + *leaf_level = 1; + *pte_ext_shift = 0; + if (max_shift < 25) + break; + *pte_ext_shift = 3; /* Fits 32M pages at level 1 - every 8th 4M entry */ + if (max_shift < 28) + break; + *pte_ext_shift = 6; /* Fits 256M pages at level 1 - every 64th 4M entry */ + if (max_shift < 31) + break; + *leaf_level = 2; + *pte_ext_shift = 0; /* Fits 2GB pages at level 2 */ + if (max_shift < 34) + break; + *pte_ext_shift = 3; /* Fits 16GB pages at level 2 - every 8th 2GB entry */ + if (max_shift < 37) + break; + break; + default: + BUG(); + } + if (*leaf_level) { + page_shift = shift + (*leaf_level * sdev->mi.level_shift); + smallest_misalign = (dma_addr ^ vaddr) & ((1 << page_shift) - 1); + if (smallest_misalign & ~PAGE_MASK) { + sif_log(sdev, SIF_INFO, + "Failed to create page table: misaligned VA/DMA (0x%lx) dma 0x%llx vaddr 0x%llx", + smallest_misalign, dma_addr, vaddr); + return -EINVAL; + } + + if (smallest_misalign) { + adj_page_shift = find_first_bit(&smallest_misalign, bits); + *leaf_level = (adj_page_shift - shift) / sdev->mi.level_shift; + sif_log(sdev, SIF_PT, + "misaligned VA/DMA adj: leaf_level %d, page_shift %d, smallest_misalign 0x%lx, adj_page_shift %d", + *leaf_level, + page_shift, smallest_misalign, adj_page_shift); + page_shift = adj_page_shift; + } + /* TBD: Remove - just for debugging */ + if (*leaf_level > 3) { + sif_log(sdev, SIF_INFO, + "haywire leaf level %d - should not be possible - setting safe value 0", + *leaf_level); + *leaf_level = 0; + return -EINVAL; + } + if (*leaf_level) { + /* Check if we can do equally well with a lower level pointer */ + int size_order = order_base_2(size); + int size_shift = page_shift - size_order; + + if (size_shift < 0) + goto out; + sif_log(sdev, SIF_PT, "order %d page_shift %d size_shift %d", + size_order, page_shift, size_shift); + if (size_shift > 0) { + u32 new_leaf_level = + ((page_shift - size_shift + sdev->mi.level_shift - 1 - shift) + / sdev->mi.level_shift); + sif_log(sdev, SIF_PT, "new_leaf_level %d", new_leaf_level); + if (new_leaf_level < *leaf_level) { + *leaf_level = new_leaf_level; + sif_log(sdev, SIF_PT, + "size_shift %d, size adjusted leaf_level %d", + size_shift, *leaf_level); + } + } + } + 
} +out: + sif_log(sdev, SIF_PT, "shift %d leaf_level %d", shift, *leaf_level); + return 0; +} + +/* Find the aligned size of a region within a certain page alignment size + * (eg. the number of pages of size @alignment needed to address (start,len)) + */ +u64 aligned_size(u64 start, u64 len, u64 alignment) +{ + u64 mask = alignment - 1; + u64 aligned_start = start & ~mask; + u64 aligned_end = (start + len + mask) & ~mask; + + return aligned_end - aligned_start; +} + +/* Find the union of the two ranges including non-overlapped parts */ +static u64 merge_ranges(u64 start1, u64 size1, u64 start2, u64 size2, u64 *new_size) +{ + u64 new_start = min(start1, start2); + u64 new_end = max(start1 + size1, start2 + size2); + *new_size = new_end - new_start; + return new_start; +} + +static u32 level_to_pageshift(struct sif_pt *pt, int level) +{ + struct sif_mem_info *mi = &pt->sdev->mi; + + level++; + if (level < 0 || level > 4) + sif_log(pt->sdev, SIF_INFO, "level %d", level); + BUG_ON(level < 0 || level > 4); + return mi->page_shift + mi->level_shift * level; +} + +static u64 level_to_pagesize(struct sif_pt *pt, int level) +{ + return (1ull << level_to_pageshift(pt, level)); +} + +static u64 level_to_pagemask(struct sif_pt *pt, int level) +{ + return (level_to_pagesize(pt, level) - 1); +} + + +u32 sif_pt_page_shift(struct sif_pt *pt) +{ + return level_to_pageshift(pt, pt->leaf_level - 1); +} + +/* Find the required page table memory need in number of + * pt->page_table_page sized pages + * If pt->fixed_top, calculate space for a final page for each of the levels + * even if only one entry is necessary. + * + * NB! Sets pt->top_level as a side effect + */ +static u32 table_mem_need(struct sif_pt *pt, u64 vstart, u64 mapsize) +{ + u64 aligned_size_pte; + u64 aligned_size_pmd; + u64 aligned_size_pud; + u64 aligned_size_pgd; + u64 aligned_size_pml4; + u64 psz; + int nptes, npmds, npuds, npgds, pte_pages; + int pshift; + /* If we need to guarantee that the top node remains the same, we must build + * a max level page table + */ + int single = pt->fixed_top ? 1 : 0; + struct sif_dev *sdev = pt->sdev; + + /* Determine what setup to use for the kmem object based on the initial mapsize: + * We use 4K pages for now, and set sg_size to the number of pages needed to + * support mapsize + the full chain of pages if we need a 4-level table: + */ + psz = sdev->mi.page_size; + aligned_size_pte = aligned_size(vstart, mapsize, psz); + psz <<= sdev->mi.level_shift; + aligned_size_pmd = aligned_size(vstart, mapsize, psz); + psz <<= sdev->mi.level_shift; + aligned_size_pud = aligned_size(vstart, mapsize, psz); + psz <<= sdev->mi.level_shift; + aligned_size_pgd = aligned_size(vstart, mapsize, psz); + psz <<= sdev->mi.level_shift; + aligned_size_pml4 = aligned_size(vstart, mapsize, psz); + + sif_log(pt->sdev, SIF_MMU, "aligned lengths: pte %llx pmd %llx pud %llx pgd %llx pml4 %llx", + aligned_size_pte, aligned_size_pmd, aligned_size_pud, + aligned_size_pgd, aligned_size_pml4); + + pshift = sdev->mi.page_shift + sdev->mi.level_shift; + nptes = aligned_size_pmd >> pshift; + pshift += sdev->mi.level_shift; + npmds = nptes > 1 ? aligned_size_pud >> pshift : single; + pshift += sdev->mi.level_shift; + npuds = npmds > 1 ? aligned_size_pgd >> pshift : single; + pshift += sdev->mi.level_shift; + npgds = npuds > 1 ? aligned_size_pml4 >> pshift : single; + + pte_pages = pt->leaf_level ? 
0 : nptes; + + sif_log(pt->sdev, SIF_MMU, "npgds %d, npuds %d, npmds: %d, pte_pages %d", + npgds, npuds, npmds, pte_pages); + + pt->top_level = single ? 3 : (npgds ? 3 : (npuds ? 2 : (npmds ? 1 : 0))); + return pte_pages + npmds + npuds + npgds; +} + +/* Find page table entry index for the pte referring + * the page starting at vaddr at level @level + */ +static inline int sif_pte_index(struct sif_dev *sdev, u64 vaddr, u64 page_shift) +{ + return (vaddr >> page_shift) & (sdev->mi.ptes_per_page - 1); +} + + + + +static void pt_free_page(struct sif_pt *pt, struct sif_pt_page *n) +{ + list_add_tail(&n->list, &pt->freelist); + n->parent = NULL; + n->vaddr = 0; +} + + +/* Destructor callback for kref */ +static void sif_pt_release(struct kref *kref) +{ + struct sif_pt *pt = container_of(kref, struct sif_pt, refcnt); + struct list_head *np; + struct list_head *npp; + struct sif_pt_page *n; + + sif_log(pt->sdev, SIF_MMU_V, "at %p", pt); + + if (pt->top) + pt_free_page(pt, pt->top); + + /* Actual cleanup */ + list_for_each_safe(np, npp, &pt->freelist) { + n = list_entry(np, struct sif_pt_page, list); + kfree(n); + } + if (pt->m.sg_size) + sif_kmem_free(pt->sdev, &pt->m); + kfree(pt); +} + + +/* Create a sif_page_table object and if mapsize > 0, + * map the range starting at @sg to a map with start at virtual + * address @vstart and size @mapsize and the number of bits to use in each page + * in page_shift. The object can later be resized using sif_pt_extend/sif_pt_shrink: + * Set @modifiable to allow the table to be extended and shrinked + * Set @fixed_top to have pt guarantee that the top node remains constant + * in which case it will always be a level 4 tree. + */ +struct sif_pt *sif_pt_create(struct sif_dev *sdev, struct scatterlist *sg, + u64 vstart, size_t size, u32 page_shift, + bool modifiable, bool fixed_top) +{ + int ret = 0; + int i; + dma_addr_t dma_start = sg ? sg_dma_address(sg) : 0; + struct sif_pt *pt = sif_kmalloc(sdev, sizeof(*pt), GFP_KERNEL | __GFP_ZERO); + + if (!pt) + return NULL; + + /* sub-page misalignment in vstart must correspond with + * misalignment in dma address but sg entries are page aligned: + */ + dma_start += vstart & ~PAGE_MASK; + + sif_log(sdev, SIF_MMU, "vstart %llx, size %lx, page_shift %d%s", vstart, size, + page_shift, (modifiable ? " (modifiable)" : "")); + pt->sdev = sdev; + pt->fixed_top = fixed_top; + pt->modifiable = modifiable; + + ret = find_optimal_leaf_level(sdev, page_shift, + vstart, dma_start, size, + &pt->leaf_level, &pt->pte_ext_shift); + if (ret) + goto extend_failed; + + pt->page_shift = sdev->mi.page_shift + pt->leaf_level * sdev->mi.level_shift; + pt->ptes_per_page = 1 << sdev->mi.level_shift; + + for (i = 0; i < PT_LEVELS; i++) + pt->pmd[i] = RB_ROOT; + kref_init(&pt->refcnt); + mutex_init(&pt->lock); + INIT_LIST_HEAD(&pt->freelist); + + ret = sif_pt_extend(pt, sg, vstart, size); + if (ret < 0) + goto extend_failed; + return pt; + +extend_failed: + kfree(pt); + return NULL; +} + + +struct sif_pt *sif_pt_create_for_mem(struct sif_mem *mem, + u64 vstart, u32 page_shift, bool modifiable, bool fixed_top) +{ + int ret = 0; + int i; + struct sif_dev *sdev = mem->sdev; + struct sif_pt *pt = sif_kmalloc(sdev, sizeof(*pt), GFP_KERNEL | __GFP_ZERO); + size_t size = mem->size; + + if (!pt) + return NULL; + + sif_log(sdev, SIF_MMU, "vstart %llx, size %lx, page_shift %d%s", vstart, size, + page_shift, (modifiable ? 
" (modifiable)" : "")); + pt->sdev = sdev; + pt->fixed_top = fixed_top; + pt->modifiable = modifiable; + ret = find_optimal_leaf_level(sdev, page_shift, + vstart, sif_mem_dma(mem, 0), size, + &pt->leaf_level, &pt->pte_ext_shift); + if (ret) + goto extend_failed; + + pt->page_shift = sdev->mi.page_shift + pt->leaf_level * sdev->mi.level_shift; + pt->ptes_per_page = 1 << sdev->mi.level_shift; + + for (i = 0; i < PT_LEVELS; i++) + pt->pmd[i] = RB_ROOT; + kref_init(&pt->refcnt); + mutex_init(&pt->lock); + INIT_LIST_HEAD(&pt->freelist); + + ret = sif_pt_extend_with_mem(pt, mem, vstart); + if (ret < 0) + goto extend_failed; + return pt; + +extend_failed: + kfree(pt); + return NULL; +} + + +/* Create an empty, extendable sif page table object */ +struct sif_pt *sif_pt_create_empty(struct sif_dev *sdev, u64 vstart, enum sif_mem_type map_mt) +{ + u32 page_shift = sdev->mi.page_shift; + struct sif_pt *pt; + int ret; + + if (map_mt == SIFMT_2M) + page_shift += sdev->mi.level_shift; + + pt = sif_pt_create(sdev, NULL, vstart, 0, page_shift, true, map_mt == SIFMT_CS); + if (!pt) + return NULL; + + if (map_mt == SIFMT_CS) { + /* Allocate an empty top page table page to get an address to send to PSIF: */ + pt->top_level = 3; + ret = init_top(pt, 0, 1); + if (ret) { + sif_kmem_free(pt->sdev, &pt->m); + return NULL; + } + } + return pt; +} + + +/* DMA address of root pointer of page table */ +dma_addr_t sif_pt_dma_root(struct sif_pt *pt) +{ + return pt->top ? sg_dma_address(pt->top->page) : 0; +} + +/* SIF level of root pointer */ +u8 sif_pt_root_table_level(struct sif_pt *pt) +{ + return pt->top_level + 1; +} + + +/* Create sif_pt_page objects for @npages new pages for the page list in @sgl + * and insert them into the freelist: + */ +static int add_pages_to_freelist(struct sif_pt *pt, struct scatterlist *sgl, int npages) +{ + struct scatterlist *sg; + struct sif_pt_page *n; + int i; + + for_each_sg(sgl, sg, npages, i) { + n = sif_pt_cache_alloc(pt->sdev, GFP_KERNEL | __GFP_ZERO); + if (!n) + return -ENOMEM; + sif_log(pt->sdev, SIF_MMU_V, "i = %d: sg %p", i, sg); + n->page = sg; + list_add_tail(&n->list, &pt->freelist); + } + return 0; +} + + +/* TBD: Consider allocating more than a single page at a time from @m object + * as sif_kmem_find_sg_list is O(n) where n is the number of sg arrays in @m. 
+ */ +static struct sif_pt_page *pt_alloc_page(struct sif_pt *pt, u64 vaddr) +{ + int ret; + struct scatterlist *sg; + struct sif_pt_page *n; + + if (list_empty(&pt->freelist)) { + ret = sif_kmem_extend(pt->sdev, &pt->m, PAGE_SIZE, GFP_KERNEL); + if (ret < 0) + goto failed; + sg = sif_kmem_find_sg_idx(&pt->m, ret); + ret = add_pages_to_freelist(pt, sg, 1); + if (ret < 0) + goto failed; + } + + n = list_first_entry(&pt->freelist, struct sif_pt_page, list); + list_del(&n->list); + n->vaddr = vaddr; + return n; +failed: + return ERR_PTR(ret); +} + + + +static struct sif_pt_page *replace_top(struct sif_pt *pt, u64 vaddr) +{ + /* insert a new top node, put the old one into the + * empty rbtree for this level, and link the old top node from + * the new top: + */ + u64 aligned_vaddr, top_pagesize; + u64 pt_shift, ptv; + u64 *pmd; + int i; + struct sif_pt_page *ep; + struct sif_dev *sdev = pt->sdev; + + if (pt->top->usecnt == 1) { + /* Top node not used, just reuse with different va */ + pt->top->vaddr = vaddr; + return pt->top; + } + + pt->top->usecnt--; + /* Loop until we have a top node that spans vaddr */ + do { + int level = pt->top_level; + struct rb_root *root = &pt->pmd[level]; + struct rb_node **np = &root->rb_node; + + top_pagesize = level_to_pagesize(pt, ++pt->top_level); + aligned_vaddr = pt->top->vaddr & ~(top_pagesize - 1); + + rb_link_node(&pt->top->node, NULL, np); + rb_insert_color(&pt->top->node, root); + ep = pt->top; + pt->top = pt_alloc_page(pt, aligned_vaddr); + if (IS_ERR(pt->top)) { + ep = pt->top; + pt->top = NULL; + return ep; + } + + ep->parent = pt->top; + pmd = sg_virt(pt->top->page); + pt_shift = level_to_pageshift(pt, level); + i = sif_pte_index(sdev, ep->vaddr, pt_shift); + ptv = sg_dma_address(ep->page) | PT_PAGE_PRESENT; + sif_log(sdev, SIF_MMU_V, "level %d: pmd[%d](%p) = %llx", level, i, &pmd[i], ptv); + BUG_ON(pmd[i] != 0); + pmd[i] = ptv; + pt->top->usecnt++; + + sif_log(sdev, SIF_MMU, + "New top node at dma addr %pad level %d - aligned at %llx, page sz. %llx", + &sg_dma_address(pt->top->page), pt->top_level, aligned_vaddr, top_pagesize); + } while (vaddr < aligned_vaddr || vaddr >= aligned_vaddr + top_pagesize); + + return NULL; +} + + + +/* Find the page table page at level whose first entry references the sif virtual address @vaddr + * @vaddr assumed to be aligned to the appropriate alignment for the level. 
+ * If the page does not exist, allocate a new one and add it: + */ +static struct sif_pt_page *find_insert_page(struct sif_pt *pt, u8 level, u64 vaddr) +{ + struct rb_root *root = &pt->pmd[level]; + struct rb_node **np = &root->rb_node; + struct rb_node *parent = NULL; + struct sif_pt_page *ep; + struct sif_dev *sdev = pt->sdev; + + sif_log(sdev, SIF_MMU, "level %d vaddr %llx", level, vaddr); + if (level == pt->top_level) { + if (likely(vaddr == pt->top->vaddr)) + return pt->top; + + /* (possibly recursively) build up a new top node that spans both + * the old tree and the new subtree: + */ + ep = replace_top(pt, vaddr); + if (ep) + return ep; + } + + while (*np) { + ep = container_of(*np, struct sif_pt_page, node); + parent = *np; + if (vaddr < ep->vaddr) + np = &((*np)->rb_left); + else if (vaddr > ep->vaddr) + np = &((*np)->rb_right); + else { + sif_log(sdev, SIF_PT, + "Level %d: Found page at vaddr %llx with dma addr %pad", + level, ep->vaddr, &sg_dma_address(ep->page)); + return ep; + } + } + + /* Allocate and insert a new node into the tree */ + ep = pt_alloc_page(pt, vaddr); + if (IS_ERR(ep)) + return ep; + + sif_log(sdev, SIF_PT, "Allocated new pt page for vaddr %llx with dma addr %pad", + vaddr, &sg_dma_address(ep->page)); + + rb_link_node(&ep->node, parent, np); + rb_insert_color(&ep->node, root); + return ep; +} + + +/* Find an element in the tree for the given level, return NULL if it does not + * exist: + */ +static struct sif_pt_page *find_page(struct sif_pt *pt, u8 level, u64 vaddr) +{ + struct rb_root *root; + struct rb_node *n; + struct rb_node *parent = NULL; + struct sif_pt_page *ep; + + if (level == pt->top_level) + return pt->top; + + root = &pt->pmd[level]; + n = root->rb_node; + + sif_log(pt->sdev, SIF_MMU_V, "level %d vaddr %llx", level, vaddr); + while (n) { + ep = container_of(n, struct sif_pt_page, node); + parent = n; + if (vaddr < ep->vaddr) + n = n->rb_left; + else if (vaddr > ep->vaddr) + n = n->rb_right; + else + return ep; + } + return NULL; +} + + +static inline struct sif_pt_page *next_page(struct sif_pt_page *p) +{ + struct rb_node *node = rb_next(&p->node); + + if (node) + return container_of(node, struct sif_pt_page, node); + else + return NULL; +} + +static inline struct sif_pt_page *prev_page(struct sif_pt_page *p) +{ + struct rb_node *node = rb_prev(&p->node); + + if (node) + return container_of(node, struct sif_pt_page, node); + else + return NULL; +} + +static inline struct sif_pt_page *first_page(struct sif_pt *pt, int level) +{ + struct rb_node *node = rb_first(&pt->pmd[level]); + + if (node) + return container_of(node, struct sif_pt_page, node); + else + return NULL; +} + +static inline struct sif_pt_page *last_page(struct sif_pt *pt, int level) +{ + struct rb_node *node = rb_last(&pt->pmd[level]); + + if (node) + return container_of(node, struct sif_pt_page, node); + else + return NULL; +} + + +/* Create the page table tree from the given vaddr upwards, until + * we reach an existsting node or find the top node. 
Update use counts on the + * involved nodes: + */ +static struct sif_pt_page *find_next(struct sif_pt *pt, u8 level, u64 vaddr) +{ + u64 vaddr_up = 0; + struct sif_pt_page *pt_page_start = find_insert_page(pt, level, vaddr); + struct sif_pt_page *pt_page; + struct sif_pt_page *pt_parent; + struct sif_dev *sdev = pt->sdev; + int i; + + if (pt_page_start == pt->top || IS_ERR(pt_page_start)) + return pt_page_start; + + sif_log(sdev, SIF_MMU_V, "level %d vaddr %llx", level, vaddr); + + pt_page = pt_page_start; + for (;;) { + u64 pt_shift, ptv; + u64 *pmd; + + pt_shift = level_to_pageshift(pt, level); + pt_parent = pt_page->parent; + level++; + if (pt_parent) { + /* We found an existing node - rest of the tree upwards is ok */ + break; + } + vaddr_up = vaddr & ~level_to_pagemask(pt, level); + if (level == pt->top_level && vaddr_up == pt->top->vaddr) { + sif_log(sdev, SIF_PT, "found top at level %d", level); + pt_parent = pt->top; + } else { + sif_log(sdev, SIF_PT, "searching at level %d/%d from vaddr %llx", + level, pt->top_level, vaddr_up); + pt_parent = find_insert_page(pt, level, vaddr_up); + } + + if (IS_ERR(pt_parent)) + return pt_parent; + + pt_page->parent = pt_parent; + + /* Set page pointer in parent */ + pmd = sg_virt(pt_parent->page); + i = sif_pte_index(sdev, vaddr, pt_shift); + ptv = sg_dma_address(pt_page->page) | PT_PAGE_PRESENT; + sif_log(sdev, SIF_MMU_V, "level %d: pmd[%d](%p) = %llx", level, i, &pmd[i], ptv); + WARN_ON(pmd[i] != 0); + pmd[i] = ptv; + + pt_parent->usecnt++; + if (pt_parent == pt->top || pt_parent->usecnt > 1) + break; + pt_page = pt_parent; + vaddr = vaddr_up; + } + return pt_page_start; +} + + +static int populate_pt(struct sif_pt *pt, struct scatterlist *sg, + u64 vstart, size_t size) +{ + int level = pt->leaf_level; + u64 va, vend, incr; + u64 pt_shift = level_to_pageshift(pt, level-1); /* page shift for the level below us */ + u64 page_flags = PT_PAGE_PRESENT; + struct sif_dev *sdev = pt->sdev; + u64 small_page_misalign; + u64 large_page_misalign = 0; + off_t sg_offset; /* Running page aligned offset within the current sg */ + + /* If level > 0 we must set the PS bit to indicate that this is a leaf node + * We also have two levels of alignment to consider: + */ + if (level > 0) { + small_page_misalign = vstart & level_to_pagemask(pt, level - 2); + large_page_misalign = (vstart & level_to_pagemask(pt, level - 1)) - small_page_misalign; + page_flags |= PT_PAGE_PS; + } else + small_page_misalign = (vstart & level_to_pagemask(pt, level - 1)); + + + /* Populate the table at level @level - assuming no overlap */ + vend = vstart + size; + va = vstart & ~level_to_pagemask(pt, level - 1); + + /* Depending on alignment we might need to point to a DMA address + * way ahead of the first sg, but aligned to the first small page size: + */ + sg_offset = -large_page_misalign; + incr = level_to_pagesize(pt, level - 1) << pt->pte_ext_shift; + + sif_log(sdev, SIF_PT, + "level %d mis (0x%llx,0x%llx) vstart %llx -> %llx size %lx pte_ext_shift %d, incr 0x%llx sg_offset %#lx", + level, small_page_misalign, large_page_misalign, vstart, va, size, + pt->pte_ext_shift, incr, sg_offset); + + while (va < vend) { + struct sif_pt_page *pt_page; + u64 *pmd; + int i; + u64 va_up = va & ~level_to_pagemask(pt, level); + + pt_page = find_next(pt, level, va_up); + if (IS_ERR(pt_page)) + return PTR_ERR(pt_page); + + pmd = sg_virt(pt_page->page); + i = sif_pte_index(sdev, va, pt_shift); + for (; i < sdev->mi.ptes_per_page && va < vend; i++) { + u64 ptv; + + if (!sg) { + sif_log(sdev, SIF_INFO, + 
"##### pt at %p: level %d: failed to find next sg at va %llx (vstart,size) = (%llx,%lx))", + pt, level, va, vstart, size); + return -EIO; + } + ptv = (sg_dma_address(sg) + sg_offset) | page_flags; + WARN_ON(pmd[i] != 0); + sif_log(sdev, SIF_PT_V, "va %llx: level %d: pmd[%d](%p) = %llx", + va, level, i, &pmd[i], ptv); + pmd[i] = ptv; + pt_page->usecnt++; + va += incr; + sg_offset += incr; + /* At this point size might be the end aligned size at this level so + * make sure to terminate at the end of the sg list: + */ + while (sg && sg_offset >= sg_dma_len(sg)) { + if (incr > sdev->mi.page_size) + sif_log(sdev, SIF_PT_VV, + "sg_offset %#lx sg->length %x sg_dma_len(sg) %x", + sg_offset, sg->length, sg_dma_len(sg)); + sg_offset -= sg_dma_len(sg); + sg = sg_next(sg); + } + /* Note that we must handle both small incr in large pages and opposite! */ + if (unlikely(sg_offset && sg_offset < incr)) + return 0; /* We're done - vend in the middle of a higher level page */ + } + } + + return 0; +} + + +/* sif_mem iterator based page table population - needed for special types */ +static int populate_pt_from_mem(struct sif_pt *pt, struct sif_mem *mem, u64 vstart, bool fast_path) +{ + u8 level = pt->leaf_level; + u64 va, vend, incr; + u64 pt_shift = level_to_pageshift(pt, level-1); /* page shift for the level below us */ + u64 page_flags = PT_PAGE_PRESENT; + struct sif_mem_iter mi; + struct sif_dev *sdev = pt->sdev; + u64 small_page_misalign; + u64 large_page_misalign = 0; + off_t sg_offset; /* Running page aligned offset within the current sg */ + + /* If level > 0 we must set the PS bit to indicate that this is a leaf node + * We also have two levels of alignment to consider: + */ + if (level > 0) { + small_page_misalign = vstart & level_to_pagemask(pt, level - 2); + large_page_misalign = (vstart & level_to_pagemask(pt, level - 1)) - small_page_misalign; + page_flags |= PT_PAGE_PS; + } else + small_page_misalign = (vstart & level_to_pagemask(pt, level - 1)); + + /* Populate the table at level @level - assuming no overlap */ + vend = vstart + mem->size; + va = vstart & ~level_to_pagemask(pt, level - 1); + + /* Depending on alignment we might need to point to a DMA address + * way ahead of the first sg, but aligned to the first small page size: + */ + sg_offset = -large_page_misalign; + incr = level_to_pagesize(pt, level - 1) << pt->pte_ext_shift; + sif_mem_iter_init(mem, &mi); + + sif_log(sdev, SIF_PT, + "level %d mis (0x%llx,0x%llx) vstart %llx -> %llx size %llx pte_ext_shift %d, incr 0x%llx sg_offset %#lx", + level, small_page_misalign, large_page_misalign, vstart, va, mem->size, + pt->pte_ext_shift, incr, sg_offset); + + while (va < vend) { + struct sif_pt_page *pt_page; + u64 *pmd; + int i; + u64 va_up = va & ~level_to_pagemask(pt, level); + + pt_page = find_next(pt, level, va_up); + if (IS_ERR(pt_page)) + return PTR_ERR(pt_page); + + pmd = sg_virt(pt_page->page); + i = sif_pte_index(sdev, va, pt_shift); + for (; i < sdev->mi.ptes_per_page && va < vend; i++) { + u64 ptv; + + ptv = (sif_mem_iter_dma(&mi) + sg_offset) | page_flags; + BUG_ON(!(ptv & ~0x81)); + sif_log(sdev, SIF_PT_V, "level %d: pmd[%d](%p) = %llx", level, i, &pmd[i], ptv); + pmd[i] = ptv; + if (!fast_path) + pt_page->usecnt++; + va += incr; + sg_offset += incr; + if (va < vend) { + int ret = sif_mem_iter_advance(&mi, sg_offset); + + if (ret) { + sif_log(sdev, SIF_MMU_V, "No page for vaddr %llx", va); + return ret; + } + sg_offset = 0; + } + } + } + + return 0; +} + + +/* (safe) observe leaf node of page table at @vaddr */ +int 
sif_pt_entry(struct sif_pt *pt, u64 vaddr, dma_addr_t *entry, dma_addr_t *val) +{ + int ret = 0; + struct sif_pt_page *p; + struct sif_dev *sdev = pt->sdev; + u64 *pmd; + u64 pt_shift; + u64 va_up; + u8 level; + int i, ip; + + mutex_lock(&pt->lock); + level = pt->leaf_level; + va_up = vaddr & ~level_to_pagemask(pt, level); + pt_shift = level_to_pageshift(pt, level-1); + p = find_page(pt, level, va_up); + if (p) { + pmd = sg_virt(p->page); + i = sif_pte_index(sdev, vaddr, pt_shift); + *val = pmd[i]; + pmd = sg_virt(p->parent->page); + ip = sif_pte_index(sdev, va_up, level_to_pageshift(pt, level)); + *entry = pmd[ip]; + sif_log(sdev, SIF_MMU_V, + "Page at vaddr %llx, lookup vaddr %llx at index %d: entry(idx = %d): %pad, value: %pad", + va_up, vaddr, i, ip, entry, val); + } else { + sif_log(sdev, SIF_MMU_V, "Page at vaddr %llx not found", va_up); + ret = -EINVAL; + } + mutex_unlock(&pt->lock); + return ret; +} + + +/* Remove a reference to the given remove_addr from page @p, + * if refcnt == 0, return page to freelist + * and (if at leaf level) return the next page in the rb_tree, otherwise return + * the same page. + * + */ +static struct sif_pt_page *remove_page_ref(struct sif_pt *pt, struct sif_pt_page *p, + u64 remove_addr, u8 level) +{ + struct sif_pt_page *np = p; + u64 *pmd = sg_virt(p->page); + int index = sif_pte_index(pt->sdev, remove_addr, level_to_pageshift(pt, level-1)); + u64 dma_addr = sg_dma_address(p->page); + + BUG_ON(p->usecnt < 1); + pmd[index] = 0; + + p->usecnt--; + sif_log(pt->sdev, SIF_PT_VV, + "level %d: index = %d ps = %d, page - dma at 0x%llx - use count %d", + level, index, level_to_pageshift(pt, level-1), dma_addr, p->usecnt); + if (!p->usecnt) { + if (p->parent) + remove_page_ref(pt, p->parent, p->vaddr, level + 1); + else + BUG_ON(p != pt->top); + if (level == pt->leaf_level) + np = next_page(p); + if (pt->top != p) /* We dont use the rbtree for the top node */ + rb_erase(&p->node, &pt->pmd[level]); + else + pt->top = NULL; /* So we can check if removal is needed in sif_pt_release() */ + pt_free_page(pt, p); + } + return np; +} + +/* size of each sg list used to maintain page table pages + * when fixed_top is set (currently only used by the sq_cmpl table) + * We want it reasonably large as we index in constant time into the list + * but use a linear scan to navigate the chain of lists + */ +#define FIXED_TOP_SG_SIZE 0x1000 + +static int init_top(struct sif_pt *pt, u64 vstart, int npages) +{ + u64 aligned_vaddr = vstart & ~(level_to_pagesize(pt, pt->top_level) - 1); + int ret; + size_t sg_size = pt->fixed_top ? 
FIXED_TOP_SG_SIZE : max(npages, 1); + + /* Single pte table necessary for WA for Bug #4096 */ + if (pt->top_level < pt->leaf_level) { + sif_log(pt->sdev, SIF_PT_V, "Adjusting top level %d -> %d", + pt->top_level, pt->leaf_level); + pt->top_level = pt->leaf_level; + } + + ret = sif_kmem_init(pt->sdev, &pt->m, sg_size, (u64)npages << PAGE_SHIFT, + PAGE_SHIFT, GFP_KERNEL, DMA_TO_DEVICE); + if (ret < 0) + return ret; + + if (add_pages_to_freelist(pt, pt->m.sg, pt->m.sg_max)) + return ret; + + /* Create the top node of the page table: */ + pt->top = pt_alloc_page(pt, aligned_vaddr); + if (unlikely(IS_ERR(pt->top))) { + int ret = PTR_ERR(pt->top); + + pt->top = NULL; + return ret; + } + sif_log(pt->sdev, SIF_PT_V, + "Created top node at kva %p, dma addr %pad level %d for vstart %llx - aligned at %llx", + sg_virt(pt->top->page), &sg_dma_address(pt->top->page), + pt->top_level, vstart, aligned_vaddr); + + if (pt->modifiable) { + /* avoid that this node gets freed if all mappings are removed */ + pt->top->usecnt++; + } + return 0; +} + + +inline void reinit_top(struct sif_pt *pt, u64 vstart) +{ + u64 aligned_vaddr = vstart & ~(level_to_pagesize(pt, pt->top_level) - 1); + + sif_log(pt->sdev, SIF_PT_V, + "Reused top node at dma addr %pad level %d for vstart %llx - aligned at %llx", + &sg_dma_address(pt->top->page), pt->top_level, vstart, aligned_vaddr); + pt->top->vaddr = aligned_vaddr; +} + + +static u64 recalc_vstart(struct sif_pt *pt) +{ + struct sif_dev *sdev = pt->sdev; + struct sif_pt_page *p = first_page(pt, pt->leaf_level); + u64 page_shift = level_to_pageshift(pt, pt->leaf_level - 1); + int i; + + if (p) { + u64 *pmd = sg_virt(p->page); + + for (i = 0; i < sdev->mi.ptes_per_page; i++) + if (pmd[i]) { + u64 nvaddr = p->vaddr + (i << page_shift); + u64 delta_sz = nvaddr - pt->vstart; + + sif_log(sdev, SIF_PT_V, "vstart %llx -> %llx (vsize %llx -> %llx)", + pt->vstart, nvaddr, pt->vsize, pt->vsize - delta_sz); + pt->vsize -= delta_sz; + return nvaddr; + } + } + pt->vsize = 0; + pt->vstart = 0; + return 0; +} + +static u64 recalc_size(struct sif_pt *pt) +{ + struct sif_dev *sdev = pt->sdev; + struct sif_pt_page *p = last_page(pt, pt->leaf_level); + u64 page_shift = level_to_pageshift(pt, pt->leaf_level - 1); + int i; + + if (p) { + u64 *pmd = sg_virt(p->page); + + for (i = sdev->mi.ptes_per_page - 1; i >= 0; i--) + if (pmd[i]) { + u64 nend = p->vaddr + ((i+1) << page_shift); + u64 nvsize = nend - pt->vstart; + + sif_log(sdev, SIF_MMU_V, "vstart at %llx, size %llx -> %llx", + pt->vstart, pt->vsize, nvsize); + return nvsize; + } + } + pt->vsize = 0; + pt->vstart = 0; + return 0; +} + + + +/* Extend a page table at DMA address @vstart with the list starting at @sg with size @size */ +int sif_pt_extend(struct sif_pt *pt, struct scatterlist *sg, u64 vstart, size_t size) +{ + int ret = 0; + u32 npages; + u64 page_mask = level_to_pagesize(pt, pt->leaf_level - 1) - 1; + u64 new_start; + u64 new_size; + + if (!size) + return 0; + + sif_log(pt->sdev, SIF_MMU, "** vstart %llx size %lx page size %llx leaf_level %d **", + vstart, size, page_mask + 1, pt->leaf_level); + mutex_lock(&pt->lock); + + /* Calculate a good size of each sg table in the kmem object: */ + if (!pt->top) { + /* This is a blank pt - allocate and set up the initial structures */ + npages = table_mem_need(pt, vstart, size); + + ret = init_top(pt, vstart, npages); + if (ret) + goto kmem_ext_failed; + + new_start = vstart; + new_size = size; + } else if (pt->vsize == 0) { + new_start = vstart; + new_size = size; + reinit_top(pt, vstart); + } 
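+	/* Table already has mappings: only a modifiable page table may be extended;
+	 * the old and new ranges are merged below.
+	 */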
else { + if (!pt->modifiable) { + sif_log(pt->sdev, SIF_INFO, "error: Attempt to modify an unmodifiable page table"); + return -EINVAL; + } + new_start = merge_ranges(pt->vstart, pt->vsize, vstart, size, &new_size); + sif_log(pt->sdev, SIF_MMU_V, "new_start %llx new_size %llx **", + new_start, new_size); + } + + kref_get(&pt->refcnt); + + ret = populate_pt(pt, sg, vstart, size); + if (ret) + goto populate_failed; + + /* sync the whole table memory to make sure the changes are reflected: + * TBD: Optimize to only sync the parts that have actually been modified. + * With this code we will potentially sync a long page freelist as well: + */ + dma_sync_sg_for_device(pt->sdev->ib_dev.dma_device, pt->m.sg, pt->m.sg_max, DMA_TO_DEVICE); + + pt->vstart = new_start; + pt->vsize = new_size; + mutex_unlock(&pt->lock); + return ret; +populate_failed: + kref_put(&pt->refcnt, sif_pt_release); +kmem_ext_failed: + sif_kmem_free(pt->sdev, &pt->m); + mutex_unlock(&pt->lock); + return ret; +} + + + +/* Extend a page table at DMA address @vstart with the contents of @mem */ +int sif_pt_extend_with_mem(struct sif_pt *pt, struct sif_mem *mem, u64 vstart) +{ + int ret = 0; + u32 npages; + u64 page_mask = level_to_pagesize(pt, pt->leaf_level - 1) - 1; + u64 new_start; + u64 new_size; + size_t size = mem->size; + + if (!size) + return 0; + + sif_log(pt->sdev, SIF_MMU, "** vstart %llx size %lx page size %llx leaf level %d **", + vstart, size, page_mask + 1, pt->leaf_level); + mutex_lock(&pt->lock); + + /* Calculate a good size of each sg table in the kmem object: */ + if (!pt->top) { + /* This is a blank pt - allocate and set up the initial structures */ + npages = table_mem_need(pt, vstart, size); + + ret = init_top(pt, vstart, npages); + if (ret) + goto kmem_ext_failed; + + new_start = vstart; + new_size = size; + } else if (!pt->modifiable) { + sif_log(pt->sdev, SIF_INFO, "error: Attempt to modify an unmodifiable page table"); + return -EINVAL; + } else if (pt->vsize == 0) { + new_start = vstart; + new_size = size; + reinit_top(pt, vstart); + } else { + new_start = merge_ranges(pt->vstart, pt->vsize, vstart, size, &new_size); + sif_log(pt->sdev, SIF_MMU_V, "new_start %llx new_size %llx **", + new_start, new_size); + } + + kref_get(&pt->refcnt); + + ret = populate_pt_from_mem(pt, mem, vstart, false); + + /* sync the whole table memory to make sure the changes are reflected: + * TBD: Optimize to only sync the parts that have actually been modified. 
+ * With this code we will potentially sync a long page freelist as well: + */ + dma_sync_sg_for_device(pt->sdev->ib_dev.dma_device, pt->m.sg, pt->m.sg_max, DMA_TO_DEVICE); + + pt->vstart = new_start; + pt->vsize = new_size; + mutex_unlock(&pt->lock); + return ret; + +kmem_ext_failed: + sif_kmem_free(pt->sdev, &pt->m); + mutex_unlock(&pt->lock); + return ret; +} + + +/* Shrink a page table to no longer contain DMA address start @sg and size @size */ +int sif_pt_free_part(struct sif_pt *pt, u64 vstart, size_t size) +{ + struct sif_pt_page *p; + int level = pt->leaf_level; + u64 va = vstart & ~level_to_pagemask(pt, level - 1); + u64 va_up = va & ~level_to_pagemask(pt, level); + u64 vend = vstart + size; + u64 page_size; + int ret = 0; + + sif_log(pt->sdev, SIF_PT_V, "** vstart %llx -> %llx, size %lx **", vstart, va, size); + + page_size = level_to_pagesize(pt, level - 1); + mutex_lock(&pt->lock); + p = find_page(pt, level, va_up); + if (!p) { + sif_log(pt->sdev, SIF_INFO, "vaddr %llx not found at level %d", + va_up, level); + ret = -EINVAL; /* va not mapped */ + goto failed; + } + + while (va < vend && p) { + p = remove_page_ref(pt, p, va, level); + if (!p) + break; + if (va < p->vaddr) + va = p->vaddr; + else + va += page_size; + } + if (vstart == pt->vstart) { + pt->vsize -= size; + pt->vstart += size; + if (size == pt->vsize) + pt->vstart = pt->vsize = 0; + else + pt->vstart = recalc_vstart(pt); + } + if (vend == pt->vstart + pt->vsize) { + pt->vsize -= size; + if (size == pt->vsize) + pt->vstart = pt->vsize = 0; + else + pt->vsize = recalc_size(pt); + } + + /* sync the whole table memory to make sure the changes are reflected: + * TBD: Optimize to only sync the parts that have actually been modified. + * With this code we will potentially sync a long page freelist as well: + */ + dma_sync_sg_for_device(pt->sdev->ib_dev.dma_device, pt->m.sg, pt->m.sg_max, DMA_TO_DEVICE); + + mutex_unlock(&pt->lock); + return kref_put(&pt->refcnt, sif_pt_release); + +failed: + mutex_unlock(&pt->lock); + return ret; +} + +/* Free remaining mappings */ +int sif_pt_free(struct sif_pt *pt) +{ + int ret = 0; + + if (pt->vsize) { + int ref = atomic_read(&pt->refcnt.refcount); + + if (ref == 2) + ret = sif_pt_free_part(pt, pt->vstart, pt->vsize); + else { + sif_log(pt->sdev, SIF_MMU_V, "failed - vstart %llx, sz %llx, refcnt %d", + pt->vstart, pt->vsize, ref); + return -EBUSY; + } + } + if (!ret) { + sif_log(pt->sdev, SIF_MMU_V, "refcnt %d", atomic_read(&pt->refcnt.refcount) - 1); + ret = kref_put(&pt->refcnt, sif_pt_release); + if (!ret) + return -EBUSY; + ret = 0; + } + return ret; +} + + + +/* Remap the (remappable) page table to be used starting at vstart for the range of mem */ +int sif_pt_remap_for_mem(struct sif_pt *pt, struct sif_mem *mem, u32 page_shift, + u64 vstart) +{ + /* We optimize the case where @vstart is aligned in a way that allows + * the page table to be reused directly. For now we just handle the case where + * the old and new vaddr and the size is the same, which is the case for RDS, + * our main use case for FMR at this stage. 
+ * For all other cases, we just do a full cycle of free/extend_with_mem: + */ + int ret = 0; + + if (pt->vstart != vstart || pt->vsize != mem->size || pt->page_shift != page_shift) { + ret = sif_pt_free_part(pt, pt->vstart, pt->vsize); + if (ret) + return ret; + ret = sif_pt_extend_with_mem(pt, mem, vstart); + return ret; + } + + sif_log(pt->sdev, SIF_MMU_V, "** vstart %llx size %llx **", vstart, mem->size); + mutex_lock(&pt->lock); + + /* Fast path: Repopulate ptes directly - all ref.cnts are kept as is: */ + + ret = populate_pt_from_mem(pt, mem, vstart, true); + + /* sync the whole table memory to make sure the changes are reflected: + * TBD: Optimize to only sync the parts that have actually been modified. + * With this code we will potentially sync a long page freelist as well: + */ + if (!ret) + dma_sync_sg_for_device(pt->sdev->ib_dev.dma_device, pt->m.sg, pt->m.sg_max, DMA_TO_DEVICE); + mutex_unlock(&pt->lock); + return ret; +} + + +/* Called from debugfs key file - caller assumes this function will + * finish the line in the file: + */ +void sif_pt_dfs_print(struct seq_file *s, struct sif_dev *sdev, loff_t pos) +{ + /* First figure out if a pt object exists for this key, + * we only care about MR keys here yet: + */ + struct sif_pt *pt; + struct sif_mr *mr = safe_get_sif_mr(sdev, pos); + + pt = mr ? mr->mmu_ctx.pt : NULL; + if (!pt) { + seq_puts(s, "\n"); + return; + } + + seq_printf(s, " %3d %3d %4lld\n", + pt->top_level, pt->leaf_level, pt->m.size >> pt->m.page_shift); +} diff --git a/drivers/infiniband/hw/sif/sif_pt.h b/drivers/infiniband/hw/sif/sif_pt.h new file mode 100644 index 000000000000..e62a91e9fb14 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_pt.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_pt.h: SIF (private) page table management. + * API for managing a sif specific page table which can be referenced from + * multiple MMU contexts. + */ + +#ifndef _SIF_PT_H +#define _SIF_PT_H +#include +#include +#include "sif_mem.h" + +struct seq_file; + +/* rb_tree entries to track virtual addresses + * in this page table. + */ +struct sif_pt_page { + struct rb_node node; /* Linkage for pt->pmd */ + struct list_head list; /* Linkage for freelist */ + struct scatterlist *page; /* Pointer to info on the used page within pt->m */ + struct sif_pt_page *parent; /* Pointer to the parent page in the page table */ + u64 vaddr; /* Virtual address mapped by the page table page */ + u32 usecnt; /* Number of entries in use in the referred pt page */ +}; + + +/* Number of page table page levels we support: + * This level uses 0 = pte pages, 1 = pmd pages, 2 = pud pages, 3 = pgdir pages + * This equals psif_table_level - 1 as we do not represent the pages themselves. + * + * Example: Corresponding page_shift will then eg be 12 (4K pages) for level -1 and 21 (2M) + * for level 1 for the default x86 case. For Sparc, several level 0 page sizes are + * supported, which gives multiple alternatives for the lowest level. 
+ */ +#define PT_LEVELS 4 + +/* Lower bits with special meaning + * from the Intel page table spec + */ +#define PT_PAGE_PRESENT 0x1 /* Page is present */ +#define PT_PAGE_PS 0x80 /* If set (at level >= 0) page is a leaf pointer even at level > 0 */ +#define PT_PAGE_SHIFT 12 /* Number of insignificant bits in a sif page table pointer */ + +/* SIF driver representation of a generic + * driver maintained page table. + * + * Note that the base leaf page size is + * based on the "theoretical" smallest page, eg with 2M pages it will be 4K = shift 12. + * Whether that size is actually used is then determined by leaf_level. + */ +struct sif_pt { + struct sif_dev *sdev; /* Device this mapping is valid for */ + bool fixed_top; /* If set, pt guarantees that the top node remains constant */ + bool modifiable; /* Set if this page table should support modification */ + u8 top_level; /* Page table level of top node, 0 means no table */ + u8 leaf_level; /* Page table level of leaf node */ + u8 pte_ext_shift; /* Only populate every (1 << pte_ext_shift) pte */ + u16 ptes_per_page; /* #ptes per page table page - also defines size of the pt page */ + u32 page_shift; /* Base leaf page shift in use for this table */ + u64 vstart; /* Start of the mapping in VA as seen from SIF */ + u64 vsize; /* Extent of the mapping (including any holes) */ + struct sif_pt_page *top;/* Top level page table page exposed to sif */ + struct mutex lock; /* Protects modifications to the page table data structure */ + struct kref refcnt; /* Keep track of users of this page table */ + struct sif_kmem m; /* DMA mapped store for page table memory */ + struct rb_root pmd[PT_LEVELS];/* Pr.level lookup table from offset to page table page */ + struct list_head freelist; /* list of DMA mapped pt pages not currently in use */ +}; + + +/* Called from sif_init/exit to set up/clean up global data structures */ +int sif_pt_init(void); +void sif_pt_exit(void); + +/* Called from debugfs key file */ +void sif_pt_dfs_print(struct seq_file *s, struct sif_dev *sdev, loff_t pos); + +/* Create a referenced sif page table object with an empty top level page */ +struct sif_pt *sif_pt_create_empty(struct sif_dev *sdev, u64 vstart, enum sif_mem_type map_mt); + +/* Create a sif page table object of size @mapsize using memory referenced by @sg + * with SIF virtual address starting at @vstart, which must be aligned at a page + * size boundary compatible with page sizes used by the memory type used by the backing store + * @map_mt. Assuming sg is a valid (possibly chained) scatterlist long enough to provide + * backing for @mapsize. + * Set @modifiable to allow the table to be extended and shrinked + * Set @fixed_top to have pt guarantee that the top node remains constant + * in which case it will always be a level 4 tree. + */ +struct sif_pt *sif_pt_create(struct sif_dev *sdev, struct scatterlist *sg, + u64 vstart, size_t mapsize, + u32 page_shift, bool modifiable, bool fixed_top); + +/* Create a sif page table from a mem object: + * Set @fixed_top to prepare for a table where the top node is fixed: + * (will always be a level 4 tree) + */ +struct sif_pt *sif_pt_create_for_mem(struct sif_mem *mem, u64 vstart, + u32 page_shift, bool modifiable, bool fixed_top); + +/* Remap the (remappable) page table to be used starting at vstart for the range of mem + * eg. replace the current mapping with a new one, preserving the top node + * (but possibly reuse at a different level!) 
+ */ +int sif_pt_remap_for_mem(struct sif_pt *pt, struct sif_mem *mem, + u32 page_shift, u64 vstart); + +/* Extend a page table at DMA address @vstart with the list starting at @sg with size @size */ +int sif_pt_extend(struct sif_pt *pt, struct scatterlist *sg, u64 vstart, size_t size); + +/* Extend a page table at DMA address @vstart with the contents of @mem */ +int sif_pt_extend_with_mem(struct sif_pt *pt, struct sif_mem *mem, u64 vstart); + +/* DMA address of root pointer of page table */ +dma_addr_t sif_pt_dma_root(struct sif_pt *pt); + +/* SIF level of root pointer */ +u8 sif_pt_root_table_level(struct sif_pt *pt); + +/* Leaf page shift (number of bits within page) of this page table */ +u32 sif_pt_page_shift(struct sif_pt *pt); + +/* Observe leaf node of page table at @vaddr */ +int sif_pt_entry(struct sif_pt *pt, u64 vaddr, dma_addr_t *entry, dma_addr_t *val); + +/* free a part of the page table and dereference */ +int sif_pt_free_part(struct sif_pt *pt, u64 vstart, size_t size); + +/* Free this page table. If more than one reference has been created (using sif_pt_extend) + * return -EBUSY, e.g. this call can be used parenthetic with sif_pt_create, but not if + * mapping has been referenced more than once, in which case sif_pt_free_part must be called + * with identical start, size as with extend to clean up properly before a final sif_pt_free: + */ +int sif_pt_free(struct sif_pt *pt); + +/* Div. utilities: */ + +/* Find the aligned size of a region within a certain page alignment size + * (eg. the number of pages of size @alignment needed to address (start,len)) + */ +u64 aligned_size(u64 start, u64 len, u64 alignment); + +/* Find the optimal page size (represented by leaf level) + * to use based on device capabilities, configuration and a max_shift + * value (typically based on continuousness of memory: + * The result is adjusted with the address pair of a corresponding virtual + * address and dma address to ensure that it is possible to create a mapping at that + * level. pte_extent is set to the number bits to shift increment between + * each valid pte (For the odd sized leaf pages) + */ +int find_optimal_leaf_level(struct sif_dev *sdev, u32 max_shift, + u64 vaddr, u64 dma_addr, u64 size, + u8 *leaf_level, u8 *pte_ext_shift); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_qp.c b/drivers/infiniband/hw/sif/sif_qp.c new file mode 100644 index 000000000000..7c293d426ee8 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_qp.c @@ -0,0 +1,2441 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_qp.c: Implementation of IB queue pair logic for sif + */ + +#include +#include +#include "sif_dev.h" +#include "sif_defs.h" +#include "sif_qp.h" +#include "sif_ah.h" +#include "sif_sq.h" +#include "sif_pqp.h" +#include "sif_dma.h" +#include "sif_user.h" +#include "sif_base.h" +#include "sif_mr.h" +#include "sif_xrc.h" +#include "sif_query.h" +#include "sif_hwi.h" +#include "sif_user.h" +#include "psif_hw_data.h" +#include "psif_hw_setget.h" +#include "psif_hw_csr.h" +#include "sif_ibcq.h" +#include "sif_sndrcv.h" +#include +#include + +/* Work-around for bz 3646 */ +static unsigned char bug_3646_conv_table[32] = { + 0, + 18, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, +}; + +static int reset_qp(struct sif_dev *sdev, struct sif_qp *qp); + +static int sif_create_pma_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct sif_qp_init_attr sif_attr); + +static int poll_wait_for_qp_writeback(struct sif_dev *sdev, struct sif_qp *qp) +{ + unsigned long timeout = sdev->min_resp_ticks; + unsigned long timeout_real = jiffies + timeout; + enum psif_qp_state state = PSIF_QP_STATE_INIT; + + sif_log(sdev, SIF_QP, "enter qp %d", qp->qp_idx); + do { + /* Make sure the update from hw is observed in correct order */ + smp_rmb(); + state = get_psif_qp_core__state(&qp->d.state); + + if (state == PSIF_QP_STATE_RESET) + break; + + if (time_is_before_jiffies(timeout_real)) + cond_resched(); + else { + sif_log(sdev, SIF_INFO, + "Timeout waiting for write back for QP %d - last state %s", + qp->qp_idx, string_enum_psif_qp_state(state)); + + if (unlikely(sif_debug_mask & SIF_QP_V)) { + struct psif_query_qp lqqp; + int ret; + + ret = epsc_query_qp(qp, &lqqp); + if (ret) + sif_log(sdev, SIF_QP_V, + "Unable to retrieve qp state for qp %d from epsc, status %d", + qp->qp_idx, ret); + else + sif_logs(SIF_QP_V, write_struct_psif_query_qp(NULL, 0, &lqqp)); + } + + return -ETIMEDOUT; + } + } while (true); + + sif_log(sdev, SIF_QP, "exit - write-back observed on qp %d", qp->qp_idx); + return 0; +} + +static int send_epsa_proxy_qp_sq_key(struct sif_dev *sdev, u32 lkey, + int qpnum, + enum psif_mbox_type eps_num) +{ + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + int ret; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_A_COMMAND; + req.u.epsa_cmd.cmd = EPSA_GET_PROXY_QP_SQ_KEY; + req.u.epsa_cmd.key = lkey; + req.u.epsa_cmd.qpnum = qpnum; + ret = sif_eps_wr(sdev, eps_num, &req, &rsp); + + return ret; +} + +struct sif_qp *create_qp(struct sif_dev *sdev, + struct ib_qp_init_attr *init_attr, + struct sif_qp_init_attr *sif_attr) +{ + struct sif_qp *qp, *rqp = NULL; + struct sif_sq *sq; + struct psif_qp qpi; + struct sif_rq *rq = NULL; + struct sif_pd *pd = sif_attr->pd; + + int ret = 0; + int rq_idx = -1; + int request_qpn = -1; + int index; + bool mark_dirty = false; + struct sif_cq *send_cq = NULL; + struct sif_cq *recv_cq = NULL; + u32 flags = init_attr->create_flags; + u32 max_sge; + int min_tso_inline; + + if (init_attr->send_cq) + send_cq = to_scq(init_attr->send_cq); + if (init_attr->recv_cq) + recv_cq = to_scq(init_attr->recv_cq); + + /* Software need to support more than max hw send sge for UD - see #1883 */ + max_sge = + sif_attr->qp_type == PSIF_QP_TRANSPORT_UD ? 
SIF_SW_MAX_UD_SEND_SGE : SIF_HW_MAX_SEND_SGE; + + if (init_attr->cap.max_send_sge > max_sge) { + sif_log(sdev, SIF_INFO, "illegal max send sge %d, SIF only supports %d", + init_attr->cap.max_send_sge, max_sge); + return ERR_PTR(-EINVAL); + } + + if (init_attr->cap.max_inline_data > sif_max_inline) { + sif_log(sdev, SIF_INFO, + "%d bytes of inline data requested - supported max %u - this limit is defined by module parameter max_inline", + init_attr->cap.max_inline_data, sif_max_inline); + return ERR_PTR(-EINVAL); + } + + if (init_attr->qp_type <= IB_QPT_GSI) { + /* IB verbs port numbers start at 1 while psif starts w/port 0 */ + int qpn = init_attr->qp_type + ((init_attr->port_num - 1) << 1); + int ok = atomic_add_unless(&sdev->sqp_usecnt[qpn], 1, 1); + + if (!ok) { + sif_log(sdev, SIF_INFO, + "Attempt to create QP %d for port %d more than once", + init_attr->qp_type, init_attr->port_num); + return ERR_PTR(-EBUSY); + } + request_qpn = qpn; + sif_log(sdev, SIF_QP, "Requested qp %d, port %d", + init_attr->qp_type, init_attr->port_num); + } + + /* Allow allocation of qp 0/1 */ + index = request_qpn >= 0 ? request_qpn : sif_alloc_qp_idx(pd); + if (index < 0) { + rqp = ERR_PTR(-ENOMEM); + sif_log(sdev, SIF_QP, "sif_alloc_qp_idx failed"); + goto err_alloc_index; + } + qp = get_sif_qp(sdev, index); + + /* Set this temporarily - needed by reporting of qp write-back check */ + qp->qp_idx = index; + /* + * We add a sge (with the stencil) when sending with TSO. The stencil is stored at + * the beginning of the inline-area. TSO implies checksumming which again has + * a requirement that no inline can be used. It is therefore necessary to check that we have at least + * 64 bytes of inline-buffering. + */ + min_tso_inline = 64; + if ((flags & IB_QP_CREATE_IPOIB_UD_LSO) && + init_attr->cap.max_inline_data < min_tso_inline) { + sif_log(sdev, SIF_INFO, + "Create LSO QP; qp_%d max_sge %d inline_size %d qp_type %d; modifing max_inline_size to %d", + index, init_attr->cap.max_send_sge, init_attr->cap.max_inline_data, + init_attr->qp_type, min_tso_inline); + init_attr->cap.max_inline_data = min_tso_inline; + } + + if (init_attr->qp_type == IB_QPT_RC || init_attr->qp_type == IB_QPT_XRC_INI) { + /* Required in anticipation of Atomics use */ + init_attr->cap.max_inline_data = max(init_attr->cap.max_inline_data, 16U); + } + + /* Now, before we can write the QP state - we must ensure that any previous usage + * has been completed (the writeback after modify_qp to RESET happens asynchronously + * after the modify_qp request completes. 
+ */ + ret = poll_wait_for_qp_writeback(sdev, qp); + if (ret) { + /* Dont release this desc as it is probably not safe to use anymore */ + mark_dirty = true; + rqp = ERR_PTR(ret); + goto err_lazy_wb; + } + + memset(qp, 0, sizeof(struct sif_qp)); + qp->qp_idx = index; + qp->ulp_type = sif_attr->ulp_type; + + if (qp->ulp_type == RDS_ULP) { + int new_max_inline = CB_LENGTH; /* collectbuffer_length is max 256 */ + + sif_log(sdev, SIF_QP, + "Create QP; qp_%d max_sge %d inline_size %d qp_type %d; modifing max_inline_size to %d", + index, init_attr->cap.max_send_sge, init_attr->cap.max_inline_data, + init_attr->qp_type, new_max_inline); + init_attr->cap.max_inline_data = new_max_inline; + } + + if (init_attr->qp_type <= IB_QPT_GSI) { + qp->port = init_attr->port_num; + if (init_attr->qp_type == IB_QPT_SMI) + qp->flags |= SIF_QPF_SMI; + else if (init_attr->qp_type == IB_QPT_GSI) + qp->flags |= SIF_QPF_GSI; + } else { + /* Let port 1 be default: init_attr->port_num is only valid for qp 0/1 */ + qp->port = 1; + } + + qp->last_set_state = IB_QPS_RESET; + qp->tracked_state = IB_QPS_RESET; + qp->mtu = IB_MTU_4096; + qp->type = sif_attr->qp_type; + + /* TBD: Optimize this log to a single stmt */ + if (send_cq) + sif_log(sdev, SIF_QP, "qpn %d, qp 0x%p send cq %d (type %s) port %d, pd %d", + index, qp, send_cq->index, string_enum_psif_qp_trans(qp->type), + qp->port, pd->idx); + else + sif_log(sdev, SIF_QP, "qpn %d, qp 0x%p [no send cq] (type %s) port %d, pd %d", + index, qp, string_enum_psif_qp_trans(qp->type), qp->port, pd->idx); + + /* The PQP does not have any receive queue, neither does the XRC qp + * where RQs are selected per work request via wr.xrc_hdr.xrqd_id + */ + if (is_regular_qp(qp)) { + if (init_attr->srq) { + rq = to_srq(init_attr->srq); + if (atomic_add_unless(&rq->refcnt, 1, 0)) { + rq_idx = rq->index; + sif_log(sdev, SIF_QP, "Connected qp %d to SRQ %d", + index, rq_idx); + } else { + sif_log(sdev, SIF_INFO, + "failed to connect qp %d to SRQ %d, rq invalid", + index, rq_idx); + rqp = ERR_PTR(-ENODEV); + goto err_rq_fail; + } + } else { + rq_idx = alloc_rq(sdev, pd, init_attr->cap.max_recv_wr, + init_attr->cap.max_recv_sge, NULL, + sif_attr->user_mode); + if (rq_idx >= 0) + rq = get_sif_rq(sdev, rq_idx); + } + if (rq_idx < 0) { + rqp = ERR_PTR(rq_idx); + goto err_rq_fail; + } + + /* Adjust requested values based on what we got: */ + init_attr->cap.max_recv_wr = rq->entries_user; + } + qp->rq_idx = rq_idx; + + if (rq && !init_attr->srq) { + /* Check/update max sge cap: */ + if (rq->sg_entries > init_attr->cap.max_recv_sge) { + sif_log(sdev, SIF_QP, "recv sge adjusted (%d -> %d)", + init_attr->cap.max_recv_sge, rq->sg_entries); + init_attr->cap.max_recv_sge = rq->sg_entries; + } + + /* Store cq reference for cleanup purposes */ + if (recv_cq) + rq->cq_idx = recv_cq->index; + } + + + /* sq always gets same index as QP.. */ + ret = sif_alloc_sq(sdev, pd, qp, &init_attr->cap, + sif_attr->user_mode, sif_attr->sq_hdl_sz); + if (ret < 0) { + rqp = ERR_PTR(ret); + goto err_sq_fail; + } + + /* Store send completion queue index default since + * for psif send cq number is a parameter in the work request + */ + sq = get_sif_sq(sdev, qp->qp_idx); + sq->cq_idx = send_cq ? send_cq->index : (u32)-1; /* XRC recv only */ + sq->complete_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR ? 
1 : 0; + + /* Adjust requested values based on what we got: */ + init_attr->cap.max_send_wr = sq->entries; + + /* Initialization of qp state via local copy */ + memset(&qpi, 0, sizeof(struct psif_qp)); + + if (multipacket_qp(qp->type)) { + qpi.state.sq_clog2_extent = order_base_2(sq->extent); + qpi.state.sq_clog2_size = order_base_2(sq->entries); + } + qpi.state.retry_sq_seq = 0; + qpi.state.state = ib2sif_qp_state(IB_QPS_RESET); + qpi.state.pd = pd->idx; + if (!sif_feature(zero_magic)) { + qp->magic = prandom_u32(); + qpi.state.magic = qp->magic; + } + qpi.state.transport_type = qp->type; + if (qp->type == PSIF_QP_TRANSPORT_XRC && init_attr->xrcd) + qpi.state.xrc_domain = to_sxrcd(init_attr->xrcd)->index; + qpi.state.rq_indx = rq_idx; + qpi.state.rq_is_srq = !!init_attr->srq || (init_attr->qp_type == IB_QPT_XRC_TGT); + qpi.state.send_cq_indx = send_cq ? send_cq->index : (u32)-1; + qpi.state.rcv_cq_indx = recv_cq ? recv_cq->index : (u32)-1; + + qpi.state.mstate = APM_MIGRATED; + qpi.state.path_mtu = ib2sif_path_mtu(qp->mtu); + /* Last acked psn must be initialized to one less than xmit_psn + * and it is a 24 bit value. See issue #1011 + */ + qpi.state.xmit_psn = 0; + qpi.state.last_acked_psn = 0xffffff; + qpi.state.qosl = qp->qosl = sif_attr->qosl; + + /* See #2402/#2770 */ + if (sif_feature(infinite_rnr)) { + qpi.state.rnr_retry_init = 7; + qpi.state.rnr_retry_count = 7; + qpi.state.min_rnr_nak_time = 26; /* Bug 3646, this is about 160 us */ + } + + if (flags & IB_QP_NO_CSUM) + qpi.state.no_checksum = 1; + + if (sif_attr->proxy != SIFPX_OFF) { + /* This is a proxy QP */ + qpi.state.proxy_qp_enable = 1; + qp->eps_tag |= EPS_TAG_FROM_HOST; + ret = send_epsa_proxy_qp_sq_key(sdev, sq->sg_mr->index, + qp->qp_idx, + proxy_to_mbox(sif_attr->proxy)); + if (ret) + sif_log(sdev, SIF_QP, "send_epsa_proxy_qp_sq_key failed"); + } + + if (sif_attr->user_mode) + qp->flags |= SIF_QPF_USER_MODE; + + if (flags & IB_QP_CREATE_IPOIB_UD_LSO) { + qp->flags |= SIF_QPF_IPOIB; + qpi.state.ipoib_enable = 1; + qpi.state.ipoib = 1; + } + + /* PSIF extensions */ + if (flags & IB_QP_CREATE_EOIB) { + qp->flags |= SIF_QPF_EOIB; + qpi.state.eoib_enable = 1; + qpi.state.eoib = 1; + qpi.state.eoib_type = EOIB_QKEY_ONLY; + } + if (flags & IB_QP_CREATE_RSS) + qpi.state.rss_enable = 1; + if (flags & IB_QP_CREATE_HDR_SPLIT) + qpi.state.hdr_split_enable = 1; + if (flags & IB_QP_CREATE_RCV_DYNAMIC_MTU) + qpi.state.rcv_dynamic_mtu_enable = 1; + if (flags & IB_QP_CREATE_SND_DYNAMIC_MTU) + qpi.state.send_dynamic_mtu_enable = 1; + + /* according to ib_verbs.h init_attr->port_num is only valid for QP0/1 */ + if (init_attr->qp_type <= IB_QPT_GSI) + qpi.path_a.port = init_attr->port_num - 1; + + sif_log(sdev, SIF_QP, "qp %d path_a.port = %d", qp->qp_idx, qpi.path_a.port); + + /* Write composed entry to shared area */ + copy_conv_to_hw(&qp->d, &qpi, sizeof(struct psif_qp)); + + mutex_init(&qp->lock); /* TBD: Sync scheme! */ + + /* Users should see qp 0/1 even though qp 0/1 is mapped to qp 2/3 for + * port 2 + */ + qp->ibqp.qp_num = qp->qp_idx > 3 ? qp->qp_idx : (qp->qp_idx & 0x1); + + /* For the priv. 
QP types we need to set some other elements in the + * ib verbs struct as well + */ + if (qp->type == PSIF_QP_TRANSPORT_MANSP1) { + qp->ibqp.device = &sdev->ib_dev; + qp->ibqp.qp_num = qp->qp_idx; + qp->ibqp.qp_type = IB_QPT_UD; + } + + qp->flush_sq_done_wa4074 = false; + + ret = sif_dfs_add_qp(sdev, qp); + if (ret) + goto err_dfs_qp; + /* initialize the sychronization between destroy qp and event handling.*/ + init_completion(&qp->can_destroy); + + /* a qp can only be destroyed if refcnt == 0.*/ + atomic_set(&qp->refcnt, 1); + + return qp; + +err_dfs_qp: + sif_free_sq(sdev, qp); +err_sq_fail: + if (rq && !rq->is_srq) + free_rq(sdev, rq_idx); +err_rq_fail: +err_lazy_wb: + if (!mark_dirty) + sif_free_qp_idx(pd, qp->qp_idx); +err_alloc_index: + return rqp; +} + +/* PMA proxy QP */ +static int sif_create_pma_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct sif_qp_init_attr sif_attr) +{ + struct ib_qp *ret = NULL; + struct sif_dev *sdev; + struct sif_pd *pd; + struct sif_qp *qp; + + sdev = to_sdev(ibpd->device); + pd = to_spd(ibpd); + /* Let's override IB_QPT_GSI by IB_QPT_UD*/ + init_attr->qp_type = IB_QPT_UD; + + qp = create_qp(sdev, init_attr, &sif_attr); + + if (IS_ERR(qp)) { + /* Convert interior error to right type: */ + ret = (struct ib_qp *)qp; + goto err_create_qp; + } + qp->flags |= SIF_QPF_PMA_PXY; + qp->port = init_attr->port_num; + sdev->pma_qp_idxs[qp->port - 1] = qp->qp_idx; + + /* Make dfs and query_qp happy: */ + qp->ibqp.device = &sdev->ib_dev; + qp->ibqp.pd = &sdev->pd->ibpd; + + /* Set back IB_QPT_GSI */ + init_attr->qp_type = IB_QPT_GSI; + + sif_log(sdev, SIF_QP, "Exit: success 0x%p proxy qp %d - real qp %d", + &qp->ibqp, qp->ibqp.qp_num, qp->qp_idx); + return qp->qp_idx; + +err_create_qp: + sif_log(sdev, SIF_QP, "Exit: failed"); + return 0; +} + +struct ib_qp *sif_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct sif_dev *sdev; + struct sif_qp *qp; + struct sif_pd *pd; + struct sif_xrcd *xrcd = NULL; + struct ib_qp *ret = NULL; + enum ib_qp_create_flags flags = init_attr->create_flags; + ulong user_flags = 0; + + struct sif_qp_init_attr sif_attr = { + .qp_type = ib2sif_qp_type(init_attr->qp_type), + .user_mode = udata != NULL, + .sq_hdl_sz = sizeof(struct sif_sq_hdl), + }; + + + /* First we need to locate the device pointer - + * if this is an XRC QP ibpd will be NULL: + */ + if (init_attr->qp_type == IB_QPT_XRC_TGT) { + if (!init_attr->xrcd) { + sif_log0(SIF_INFO, "Error: missing XRC domain for XRC qp"); + return ERR_PTR(-EINVAL); + } + + xrcd = to_sxrcd(init_attr->xrcd); + sdev = to_sdev(init_attr->xrcd->device); + + pd = xrcd->pd; + } else { + sdev = to_sdev(ibpd->device); + pd = to_spd(ibpd); + } + + sif_attr.pd = pd; + + sif_log(sdev, SIF_QP, "Enter qp_type %d%s", init_attr->qp_type, + (udata ? " (user call)" : "")); + + /* TBD: How to handle this? */ + if (flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + sif_log(sdev, SIF_QP, "flag IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK set (ignored)"); + + if (flags & IB_QP_CREATE_PROXY) { + /* We don't know the actual EPSA to use here but QPs dont care */ + sif_attr.proxy = SIFPX_EPSA_1; + } + + /* TBD: Verify that user params such as the send cq are authorized?? 
*/ + if (!xrcd && !init_attr->send_cq) { + sif_log(sdev, SIF_INFO, "No send completion queue specified"); + ret = ERR_PTR(-EINVAL); + goto err_create_qp; + } + + if (!xrcd && !init_attr->recv_cq) { + sif_log(sdev, SIF_INFO, "No receive completion queue specified"); + ret = ERR_PTR(-EINVAL); + goto err_create_qp; + } + + if (udata && init_attr->qp_type <= IB_QPT_GSI) { + sif_log(sdev, SIF_INFO, "Attempt to create SMI/GSI QP %d from user space", + init_attr->qp_type); + return ERR_PTR(-EINVAL); + } + + if (udata) { + struct sif_create_qp_ext cmd; + int rv = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + + if (rv) { + ret = ERR_PTR(rv); + goto err_create_qp; + } + user_flags = cmd.flags; + if (sif_vendor_enable(proxy_mode, user_flags)) + sif_attr.proxy = cmd.proxy; + + if (sif_vendor_enable(SVF_kernel_mode, user_flags)) + sif_attr.user_mode = false; + + if (sif_vendor_enable(tsu_qosl, user_flags)) + sif_attr.qosl = QOSL_LOW_LATENCY; + + if (sif_vendor_enable(no_checksum, user_flags)) { + /* update the init_attr->create_flags directly. + * This will allow the same code path if umem can pass this as a + * create_qp flag via struct ibv_qp_init_attr_ex in the future: + */ + init_attr->create_flags |= IB_QP_NO_CSUM; + } + } + + /* TBD: check init_attr params against device cap-limits */ + /* TBD update ib_qp_cap? */ + if (sif_vendor_enable(dynamic_mtu, user_flags)) { + /* TBD - check the device capabilities to determine whether to + * create qp with the support of send/receive dynamic MTU. + */ + init_attr->create_flags |= IB_QP_CREATE_RCV_DYNAMIC_MTU; + init_attr->create_flags |= IB_QP_CREATE_SND_DYNAMIC_MTU; + } + + /* best effort to determine the ULP caller. */ + if (!sif_attr.user_mode) + sif_attr.ulp_type = sif_find_kernel_ulp_caller(); + + qp = create_qp(sdev, init_attr, &sif_attr); + + if (IS_ERR(qp)) { + /* Convert interior error to right type: */ + ret = (struct ib_qp *)qp; + goto err_create_qp; + } else { + sif_log(sdev, SIF_QP, "Exit: success 0x%p ib qp %d - real qp %d%s", + &qp->ibqp, qp->ibqp.qp_num, qp->qp_idx, + (sif_attr.user_mode ? " (user mode)" : "")); + } + + qp->qosl = sif_attr.qosl; + qp->nocsum = init_attr->create_flags & IB_QP_NO_CSUM; + + + + if (sif_vendor_enable(dynamic_mtu, user_flags)) { + /* TBD - dynamic mtu flag should only be set during modify_qp in CM + * or OOB establishment. It is only set if remote dynamic_mtu_supported && + * local dynamic_send_mtu_supported. As create_qp should not be in + * the critical path, split this code from the setting of + * IB_QP_CREATE_RCV_DYNAMIC_MTU and IB_QP_CREATE_SND_DYNAMIC_MTU flags + * to remind ourself that this need to be implemented separately. + */ + sif_log(sdev, SIF_QP, "Enabling forced dynamic MTU for qp %d", qp->qp_idx); + qp->flags |= SIF_QPF_DYNAMIC_MTU; + } + + if (sif_vendor_enable(SQ_mode, user_flags)) { + sif_log(sdev, SIF_QP, "Enabling forced SQ mode for qp %d", qp->qp_idx); + qp->flags |= SIF_QPF_FORCE_SQ_MODE; + } + + if (udata) { + struct sif_create_qp_resp_ext resp; + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + int rv; + + memset(&resp, 0, sizeof(resp)); + resp.qp_idx = qp->qp_idx; + resp.sq_extent = sq->extent; + resp.sq_sgl_offset = sq->sgl_offset; + resp.sq_mr_idx = sq->sg_mr ? 
sq->sg_mr->index : 0; + resp.sq_dma_handle = sif_mem_dma(sq->mem, 0); + if (init_attr->qp_type != IB_QPT_XRC_INI && init_attr->qp_type != IB_QPT_XRC_TGT) { + /* XRC qps do not have any rq */ + struct sif_rq *rq = get_sif_rq(sdev, qp->rq_idx); + + resp.rq_idx = qp->rq_idx; + resp.rq_extent = rq->extent; + } + + resp.magic = get_psif_qp_core__magic(&qp->d.state); + rv = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rv) { + ret = ERR_PTR(rv); + goto err_udata; + } + } + /* Support for PMA_PXY QP bug #3357 */ + if (init_attr->qp_type == IB_QPT_GSI + && eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 57)) { + int pma_qp_idx = sif_create_pma_qp(ibpd, init_attr, sif_attr); + + if (!pma_qp_idx) + sif_log(sdev, SIF_INFO, "Create PMA_PXY qp %d port %d failed", + qp->qp_idx, init_attr->port_num); + } + + return &qp->ibqp; +err_udata: + destroy_qp(sdev, qp); +err_create_qp: + sif_log(sdev, SIF_QP, "Exit: failed"); + return ret; +} + + +/* Modify qp implementation related: */ + + +enum sif_mqp_type sif_modify_qp_is_ok(struct sif_qp *qp, enum ib_qp_state cur_state, + enum ib_qp_state next_state, enum ib_qp_attr_mask mask) +{ + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + enum ib_qp_type type = qp->ibqp.qp_type; + int ret; + enum rdma_link_layer ll = IB_LINK_LAYER_INFINIBAND; + + /* PSIF treats XRC just as any other RC QP */ + if (type == IB_QPT_XRC_INI || type == IB_QPT_XRC_TGT) + type = IB_QPT_RC; + ret = ((qp->type == PSIF_QP_TRANSPORT_MANSP1 || is_epsa_tunneling_qp(type)) ? 1 : + ib_modify_qp_is_ok(cur_state, next_state, type, mask, ll)); + if (!ret) + return SIF_MQP_ERR; + switch (cur_state) { + case IB_QPS_RESET: + if (qp->tracked_state == IB_QPS_SQD) + qp->tracked_state = IB_QPS_RESET; + return SIF_MQP_SW; + case IB_QPS_INIT: + if (next_state == IB_QPS_INIT || next_state == IB_QPS_RESET || + next_state == IB_QPS_ERR) + return SIF_MQP_SW; + /* else fall-through */ + case IB_QPS_RTS: + /* TBD: Elim.hack to behave like mlx on this: */ + if (unlikely(qp->tracked_state == IB_QPS_SQD && + next_state != IB_QPS_RESET && next_state != IB_QPS_ERR)) + return SIF_MQP_ERR; + if (unlikely(next_state == IB_QPS_SQD)) { + qp->tracked_state = next_state; /* To fail on future transitions */ + return SIF_MQP_IGN; /* Allow, but ignore as MLX does */ + } + /* else fall-through */ + case IB_QPS_RTR: + if (unlikely(next_state == IB_QPS_SQD)) + return SIF_MQP_ERR; + return SIF_MQP_HW; + case IB_QPS_SQE: + return SIF_MQP_HW; + case IB_QPS_ERR: + /* Bug #3933 WA for HW bug 3928 + * For this specific transition, modify qp must be done based + * on current qp ownership (towards HW only if HW owned) + */ + return (PSIF_REVISION(sdev) <= 3) + && !(qp->flags & SIF_QPF_HW_OWNED) ? 
+ SIF_MQP_SW : SIF_MQP_HW; + default: + return SIF_MQP_IGN; + } +} + + + +static int modify_qp_sw(struct sif_dev *sdev, struct sif_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask); +static int modify_qp_hw(struct sif_dev *sdev, struct sif_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask); + + + +int modify_qp_hw_wa_qp_retry(struct sif_dev *sdev, struct sif_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask) +{ + struct ib_qp_attr mod_attr = { + .qp_state = IB_QPS_ERR + }; + + bool need_wa_3713 = PSIF_REVISION(sdev) <= 3 + && IS_PSIF(sdev) + && qp_attr_mask & IB_QP_STATE && qp_attr->qp_state == IB_QPS_RESET; + + /* WA for duplicate CQEs */ + bool need_wa_4074 = PSIF_REVISION(sdev) <= 3 + && (qp->type != PSIF_QP_TRANSPORT_MANSP1) + && qp_attr_mask & IB_QP_STATE && qp_attr->qp_state == IB_QPS_ERR + && IS_PSIF(sdev); + + int ret = 0; + + if (need_wa_3713 || need_wa_4074) { + if (qp->type != PSIF_QP_TRANSPORT_MANSP1) + ret = pre_process_wa4074(sdev, qp); + + if (ret) { + if (ret != -1) + sif_log(sdev, SIF_INFO, "Failed to pre-process WA4074, ret - %d", ret); + } + } + + if (need_wa_3713) { + /* Workaround for bug #3713 part 2 - see #3714 */ + ret = modify_qp_hw(sdev, qp, &mod_attr, IB_QP_STATE); + if (ret) + sif_log(sdev, SIF_INFO, "implicit modify qp %d to ERR failed - ignoring", + qp->qp_idx); + } + + ret = modify_qp_hw(sdev, qp, qp_attr, qp_attr_mask); + + if (need_wa_3713 || need_wa_4074) { + struct ib_qp_attr attr = { + .qp_state = IB_QPS_RESET + }; + + if (need_wa_4074) { + ret = modify_qp_hw(sdev, qp, &attr, IB_QP_STATE); + if (ret) { + sif_log(sdev, SIF_INFO, "qp %d RESET failed, ret %d", qp->qp_idx, ret); + goto err_modify_qp_wa; + } + /* Restore QP SW state to ERROR */ + qp->last_set_state = qp->tracked_state = IB_QPS_ERR; + } + + qp->flags &= ~SIF_QPF_HW_OWNED; + + if (qp->type != PSIF_QP_TRANSPORT_MANSP1) + ret = post_process_wa4074(sdev, qp); + + if (ret) + sif_log(sdev, SIF_INFO, "Failed to post-process WA #4074 %d", ret); + } +err_modify_qp_wa: + + return ret; +} + +int notify_epsc_pma_qp(struct sif_dev *sdev, int qp_idx, short port) +{ + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + int ret = -1; + + if (eps_version_ge(es, 0, 57)) { + memset(&req, 0, sizeof(req)); + memset(&rsp, 0, sizeof(rsp)); + req.opcode = EPSC_SET; + req.u.set.data.op = EPSC_QUERY_PMA_REDIRECT_QP; + req.u.set.data.index = port; + req.u.set.data.value = qp_idx; + + ret = sif_epsc_wr_poll(sdev, &req, &rsp); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to configure epsc PMA_PXY QP\n"); + return ret; + } + return ret; + } else + return -EINVAL; +} + +int sif_modify_qp(struct ib_qp *ibqp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_udata *udata) +{ + struct sif_qp *qp = to_sqp(ibqp); + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_qp *pma_qp = NULL; + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + int ret = 0; + bool need_pma_pxy_qp = eps_version_ge(es, 0, 57) + && (qp_attr->qp_state != IB_QPS_RTS) + && (qp->qp_idx == 1 || qp->qp_idx == 3); + + if (need_pma_pxy_qp) { + pma_qp = get_sif_qp(sdev, sdev->pma_qp_idxs[!!(qp->qp_idx & 2)]); + ret = modify_qp(sdev, pma_qp, qp_attr, qp_attr_mask, true, udata); + if (ret) + sif_log(sdev, SIF_INFO, "Modify PMA_PXY QP %d failed", + pma_qp->qp_idx); + else if (qp_attr->qp_state == IB_QPS_RTR) { + ret = notify_epsc_pma_qp(sdev, pma_qp->qp_idx, pma_qp->port); + if (ret) + sif_log(sdev, SIF_INFO, "Notify epsc PMA_PXY QP %d failed", + pma_qp->qp_idx); + } + } 
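+ /* Any PMA proxy QP transition has been handled above; now apply the requested state change to the QP the caller asked to modify. */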
+ + return modify_qp(sdev, qp, qp_attr, qp_attr_mask, + true, udata); +} + + +int modify_qp(struct sif_dev *sdev, struct sif_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask, + bool fail_on_same_state, struct ib_udata *udata) +{ + int ret = 0; + struct ib_qp *ibqp = &qp->ibqp; + enum ib_qp_state cur_state, new_state; + enum sif_mqp_type mqp_type = SIF_MQP_IGN; + + sif_log(sdev, SIF_QP, "Enter: qpn %d qp_idx %d mask 0x%x", + ibqp->qp_num, qp->qp_idx, qp_attr_mask); + + /* WA #622, RQ flush from error completion in userspace */ + if (udata && is_regular_qp(qp)) { + struct sif_modify_qp_ext cmd; + struct sif_rq *rq = get_sif_rq(sdev, qp->rq_idx); + + ret = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + if (ret) { + sif_log(sdev, SIF_INFO, "ib_copy_from_udata failed, sts %d, qp %d, size %ld", + ret, qp->qp_idx, sizeof(cmd)); + return ret; + } + + switch (cmd.flush) { + case FLUSH_RQ: + ret = sif_flush_rq(sdev, rq, qp, rq->entries); + if (ret) + sif_log(sdev, SIF_INFO, "failed to flush RQ %d", + rq->index); + return ret; + case FLUSH_SQ: + ret = post_process_wa4074(sdev, qp); + if (ret) + sif_log(sdev, SIF_INFO, "failed to flush SQ %d", qp->qp_idx); + return ret; + default: + break; + } + } + + mutex_lock(&qp->lock); + + cur_state = qp_attr_mask & IB_QP_CUR_STATE ? + qp_attr->cur_qp_state : qp->last_set_state; + + new_state = qp_attr_mask & IB_QP_STATE ? qp_attr->qp_state : cur_state; + + if (!fail_on_same_state && cur_state == qp_attr->qp_state) { + /* Silently ignore.. (used at destroy time) */ + goto sif_mqp_ret; + } + + mqp_type = sif_modify_qp_is_ok(qp, cur_state, new_state, qp_attr_mask); + switch (mqp_type) { + case SIF_MQP_SW: + ret = modify_qp_sw(sdev, qp, qp_attr, qp_attr_mask); + break; + case SIF_MQP_HW: + ret = modify_qp_hw_wa_qp_retry(sdev, qp, qp_attr, qp_attr_mask); + break; + case SIF_MQP_IGN: + break; + case SIF_MQP_ERR: + default: + sif_log(sdev, SIF_INFO, "illegal state change from %d to %d for qp %d", + cur_state, new_state, qp->qp_idx); + ret = -EINVAL; + } + +sif_mqp_ret: + if (!ret && !(mqp_type == SIF_MQP_IGN)) { + /* TBD: Is this needed? 
*/ + qp_attr->cur_qp_state = new_state; + } + + /* QP ownership flag must be updated before releasing + * the lock in order to avoid race conditions + */ + switch (new_state) { + case IB_QPS_RESET: + qp->flags &= ~SIF_QPF_HW_OWNED; + break; + case IB_QPS_RTR: + qp->flags |= SIF_QPF_HW_OWNED; + break; + default: + /* No extra actions needed */ + break; + } + + mutex_unlock(&qp->lock); + + if (ret) + return ret; + + /* Bug #3933 - WA for HW bug 3928 + * enable/disable the HW ownership QP flag + */ + switch (new_state) { + case IB_QPS_ERR: + if (is_regular_qp(qp)) { + struct sif_rq *rq = get_sif_rq(sdev, qp->rq_idx); + + /* WA #3850: if SRQ, generate LAST_WQE event */ + if (rq->is_srq && qp->ibqp.event_handler) { + struct ib_event ibe = { + .device = &sdev->ib_dev, + .event = IB_EVENT_QP_LAST_WQE_REACHED, + .element.qp = &qp->ibqp + }; + + qp->ibqp.event_handler(&ibe, qp->ibqp.qp_context); + } else if (rq && !rq->is_srq) { + /* WA #622: if regular RQ, flush */ + ret = sif_flush_rq(sdev, rq, qp, rq->entries); + if (ret) { + sif_log(sdev, SIF_INFO, "failed to flush RQ %d", + rq->index); + return ret; + } + } + } + break; + case IB_QPS_RESET: + /* clean all state associated with this QP */ + ret = reset_qp(sdev, qp); + break; + default: + /* No extra actions needed */ + break; + } + return ret; +} + + +static void set_qp_path_hw(struct sif_qp *qp, struct psif_epsc_csr_modify_qp *mct, + struct ib_qp_attr *qp_attr, int qp_attr_mask, bool alternate) +{ + struct psif_qp_path *path; + struct ib_ah_attr *ah_attr; + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + struct psif_csr_modify_qp_ctrl *ctrl_attr = &mct->ctrl; + u8 ipd = 0; + + /* IBV_QP_ALT_PATH: Set the alternative path via + * alt_ah_attr, alt_pkey_index, alt_port_num and + * alt_timeout. + */ + if (alternate) { + ctrl_attr->alt_path = 1; + path = &mct->data.alternate_path; + ah_attr = &qp_attr->alt_ah_attr; + path->pkey_indx = qp_attr->alt_pkey_index; + path->local_ack_timeout = qp_attr->alt_timeout; + path->port = qp_attr->alt_port_num - 1; + sif_log(sdev, SIF_QP, "Alternate pkey_indx %d local_ack_timeout %d, port %d", + qp_attr->alt_pkey_index, qp_attr->alt_timeout, qp_attr->alt_port_num + 1); + } else { + ctrl_attr->prim_path = 1; + /* TBD: Does this belong here? */ + ctrl_attr->pkey_index = 1; + path = &mct->data.primary_path; + ah_attr = &qp_attr->ah_attr; + path->pkey_indx = qp->pkey_index; + /* Use the value set by IB_QP_PORT: */ + path->port = qp->port - 1; + sif_log(sdev, SIF_QP, "Primary pkey_indx %d local_ack_timeout %d, port %d", + qp_attr->pkey_index, qp_attr->timeout, qp_attr->port_num + 1); + } + path->sl = ah_attr->sl; + path->remote_lid = ah_attr->dlid; + path->local_lid_path = ah_attr->src_path_bits; + + path->loopback = + (sdev->port[path->port].lid | path->local_lid_path) == ah_attr->dlid ? + LOOPBACK : NO_LOOPBACK; + + /* sif_calc_ipd does not set ipd if it fails. In that case, ipd = 0. */ + sif_calc_ipd(sdev, qp->port, (enum ib_rate) ah_attr->static_rate, &ipd); + path->ipd = ipd; + + if (ah_attr->ah_flags & IB_AH_GRH) { + path->use_grh = USE_GRH; + path->remote_gid_0 = cpu_to_be64(ah_attr->grh.dgid.global.subnet_prefix); + path->remote_gid_1 = cpu_to_be64(ah_attr->grh.dgid.global.interface_id); + path->flowlabel = ah_attr->grh.flow_label; + path->hoplmt = ah_attr->grh.hop_limit; + /* TBD: ah_attr->grh.sgid_index?
*/ + + sif_log(sdev, SIF_QP, " - with grh dgid %llx.%llx", + ah_attr->grh.dgid.global.subnet_prefix, + ah_attr->grh.dgid.global.interface_id); + } + + if (qp_attr_mask & IB_QP_TIMEOUT) { + path->local_ack_timeout = qp_attr->timeout; + sif_log(sdev, SIF_QP, " - with timeout %d", qp_attr->timeout); + } + + sif_log(sdev, SIF_QP, "local_lid_path %d, remote_lid %d %s, QP(ipd):%d %s", + path->local_lid_path, path->remote_lid, (path->loopback ? "(loopback)" : ""), + path->ipd, (alternate ? "(alternate)" : "")); +} + +static int modify_qp_hw(struct sif_dev *sdev, struct sif_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask) +{ + struct psif_epsc_csr_rsp resp; + struct psif_epsc_csr_req req; + struct psif_epsc_csr_modify_qp *mct = &req.u.modify_qp; + struct psif_csr_modify_qp_ctrl *ctrl_attr = &mct->ctrl; + struct psif_csr_modify_qp_ctrl *cmd = &mct->ctrl; + int ret = 0; + + memset(&req, 0, sizeof(req)); + + req.opcode = EPSC_MODIFY_QP; + + cmd->cmd = QP_CMD_MODIFY; + + if (qp->qp_idx <= 3) { + /* sif requires "real" QP numbers in modify_qp */ + cmd->qp_num = qp->qp_idx & 1; + cmd->port_num = qp->qp_idx >> 1; + } else + cmd->qp_num = qp->qp_idx; + + if (qp_attr_mask & IB_QP_STATE) { + ctrl_attr->qp_state = 1; + mct->data.state = ib2sif_qp_state(qp_attr->qp_state); + } + + if (qp->last_set_state == IB_QPS_INIT && qp_attr->qp_state == IB_QPS_RTR) { + /* Bug #3933 - WA for HW bug 3928 + * QP hw state must be set to INIT before modify_qp_hw to RTR + */ + volatile struct psif_qp *qps; + + qps = &qp->d; + set_psif_qp_core__state(&qps->state, PSIF_QP_STATE_INIT); + + /* For INIT -> RTR the rest of the attrs are set directly in the descriptor: */ + ret = modify_qp_sw(sdev, qp, qp_attr, qp_attr_mask & ~IB_QP_STATE); + + /* Flag to the FW that this is the PQP */ + if (qp->type == PSIF_QP_TRANSPORT_MANSP1) + req.flags |= EPSC_FL_PQP; + if (ret) + goto err_modify_qp; + else + goto ok_modify_qp_sw; + } + + if (qp_attr_mask & IB_QP_CUR_STATE) { + ctrl_attr->use_current_state = 1; + cmd->current_state = ib2sif_qp_state(qp_attr->cur_qp_state); + + /* TBD: Remove this sanity check later: */ + if (qp_attr->cur_qp_state != qp->last_set_state) + sif_log(sdev, SIF_QP, + "** WARNING: possible state inconsistency (user %d, driver %d)", + qp->last_set_state, qp_attr->cur_qp_state); + } + + if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) { + /* TBD: Needed? */ + sif_log(sdev, SIF_QP, + "IB_QP_EN_SQD_ASYNC_NOTIFY needed!"); + goto err_modify_qp; + } + + if (qp_attr_mask & IB_QP_ACCESS_FLAGS) { + /* TBD: qp_rcv_cap must be set and the whole struct psif_qp_rcv_cap + * must be set if any of it's values are modified.. + * - must keep driver copies of this + */ + + /* TBD: (qp_attr->qp_access_flags & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; ? */ + mct->data.rdma_rd_enable = + (qp_attr->qp_access_flags & IB_ACCESS_REMOTE_READ) ? 1 : 0; + mct->data.rdma_wr_enable = + (qp_attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + mct->data.atomic_enable = + (qp_attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; + /* IB_ACCESS_MW_BIND not supported (?) */ + } + + /* This section must be before IB_QP_AV */ + if (qp_attr_mask & IB_QP_PKEY_INDEX) { + /* TBD: Argument check on index value ? 
*/ + qp->pkey_index = qp_attr->pkey_index; + } + + /* This section must be before IB_QP_AV */ + if (qp_attr_mask & IB_QP_PORT) { + if (qp_attr->port_num < 1 || qp_attr->port_num > 2) { + sif_log(sdev, SIF_INFO, "Modify port: Illegal port %d specified for qp %d", + qp_attr->port_num, qp->qp_idx); + ret = -EINVAL; + goto err_modify_qp; + } + sif_log(sdev, SIF_QP, "Modify port to %d for qp %d", + qp_attr->port_num, qp->qp_idx); + qp->port = qp_attr->port_num; + } + + if (qp_attr_mask & IB_QP_QKEY) { + ctrl_attr->qkey = 1; + mct->data.rx_qkey = qp_attr->qkey; + + sif_log(sdev, SIF_QP, "Assign QKEY 0x%x for qp %d", + qp_attr->qkey, qp->qp_idx); + + } + + if (qp_attr_mask & IB_QP_AV) + set_qp_path_hw(qp, mct, qp_attr, qp_attr_mask, false); + + if (qp_attr_mask & IB_QP_PATH_MTU) { + if (!ib_legal_path_mtu(qp_attr->path_mtu)) { + sif_log(sdev, SIF_INFO, "Illegal MTU encoding %d", qp_attr->path_mtu); + ret = EINVAL; + goto err_modify_qp; + } + ctrl_attr->path_mtu = 1; + if ((qp->type == PSIF_QP_TRANSPORT_RC) && sif_feature(force_rc_2048_mtu)) { + if (qp_attr->path_mtu > IB_MTU_2048) + qp_attr->path_mtu = IB_MTU_2048; + } + mct->data.path_mtu = ib2sif_path_mtu(qp_attr->path_mtu); + qp->mtu = qp_attr->path_mtu; + } + + if (qp_attr_mask & IB_QP_TIMEOUT) { + ctrl_attr->local_ack_timeout = 1; + if (!(qp_attr_mask & (IB_QP_AV|IB_QP_ALT_PATH))) + mct->data.primary_path.local_ack_timeout = qp_attr->timeout; + } + + if (qp_attr_mask & IB_QP_RETRY_CNT) { + ctrl_attr->error_retry_count = 1; + mct->data.error_retry_count = qp_attr->retry_cnt; + } + + if (qp_attr_mask & IB_QP_RNR_RETRY) { + ctrl_attr->rnr_retry_count = 1; + mct->data.rnr_retry_count = qp_attr->rnr_retry; + } + + if (qp_attr_mask & IB_QP_RQ_PSN) { + /* expected receive PSN */ + ctrl_attr->expected_psn = 1; + mct->data.expected_psn = qp_attr->rq_psn; + } + + if (qp_attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + /* This is the sending side */ + ctrl_attr->max_outstanding = 1; + if (qp_attr->max_rd_atomic == 0) { + sif_log(sdev, SIF_QP, + "IB_QP_MAX_QP_RD_ATOMIC value 0 incrementing to 1"); + qp_attr->max_rd_atomic = 1; + } + if (qp_attr->max_rd_atomic > 16 || qp_attr->max_rd_atomic < 0) { + /* As per IBTA 9.4.4 & 11.2.4.2 */ + sif_log(sdev, SIF_INFO, + "IB_QP_MAX_QP_RD_ATOMIC value %u out of range", + qp_attr->max_rd_atomic); + ret = -EINVAL; + goto err_modify_qp; + } + mct->data.max_outstanding = qp_attr->max_rd_atomic; + } + + if (qp_attr_mask & IB_QP_ALT_PATH) { + if (qp_attr->alt_port_num < 1 || qp_attr->alt_port_num > 2) { + sif_log(sdev, SIF_INFO, "Illegal alternate port %d specified for qp %d", + qp_attr->alt_port_num, qp->qp_idx); + ret = -EINVAL; + goto err_modify_qp; + } + set_qp_path_hw(qp, mct, qp_attr, qp_attr_mask, true); + } + + if (qp_attr_mask & IB_QP_MIN_RNR_TIMER) { + ctrl_attr->min_rnr_nak_time = 1; + mct->data.min_rnr_nak_time = sif_feature(force_wa_3646) ? 
+ bug_3646_conv_table[qp_attr->min_rnr_timer & 0x1F] : + qp_attr->min_rnr_timer & 0x1F; + } + + if (qp_attr_mask & IB_QP_SQ_PSN) { + /* Send packet sequence number */ + ctrl_attr->xmit_psn = 1; + mct->data.xmit_psn = qp_attr->sq_psn; + } + + if (qp_attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + /* Currently hard coded to 16 in psif */ + if (unlikely(qp_attr->max_dest_rd_atomic > 16)) { + sif_log(sdev, SIF_QP, + "IB_QP_MAX_DEST_RD_ATOMIC value %u out of range - psif supports 16 as a hard coded value", + qp_attr->max_dest_rd_atomic); + goto err_modify_qp; + } else if (qp_attr->max_dest_rd_atomic < 16) { + sif_log(sdev, SIF_QP, + "IB_QP_MAX_DEST_RD_ATOMIC value %u ignored - psif supports 16 as a hard coded value", + qp_attr->max_dest_rd_atomic); + } + } + + if (qp_attr_mask & IB_QP_PATH_MIG_STATE) { + ctrl_attr->mig_state = 1; + mct->data.mstate = ib2sif_mig_state(qp_attr->path_mig_state); + } + + if (qp_attr_mask & IB_QP_CAP) { + sif_log(sdev, SIF_QP, "IB_QP_CAP not supported by PSIF"); + goto err_modify_qp; + } + + if (qp_attr_mask & IB_QP_DEST_QPN) { + /* Since this is only valid from the init state which is + * owned by software anyway, we set it directly from software + * (see issues #929, #1027) + */ + qp->remote_qp = qp_attr->dest_qp_num; + set_psif_qp_core__remote_qp(&qp->d.state, qp_attr->dest_qp_num); + sif_log(sdev, SIF_QP, "Modified remote qp (hw), qp_idx: %d, value %d\n", + qp->qp_idx, qp_attr->dest_qp_num); + } + +ok_modify_qp_sw: + + /* + * On modify to RTR, we set the TSU SL (tsl), because we have + * port # and sl present in the QP state at this point. + */ + if ((qp_attr_mask & IB_QP_STATE) && (qp_attr->qp_state == IB_QPS_RTR)) { + int sl = get_psif_qp_path__sl(&qp->d.path_a); + int port = qp->port - 1; + enum psif_tsu_qos qosl = qp->qosl; + + if (cmd->qp_num == 0) + qp->tsl = sdev->qp0_tsl[qp->port - 1]; + else if (qp->type == PSIF_QP_TRANSPORT_MANSP1) + qp->tsl = sdev->pqp_rcn_tsl[qp->port - 1]; + else + qp->tsl = sdev->sl2tsl[sl][port][(int)qosl]; + + set_psif_qp_core__tsl(&qp->d.state, qp->tsl); + + /* Tell user-lib about tsl to use */ + if (qp->flags & SIF_QPF_USER_MODE) { + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx); + + sq_sw->tsl = qp->tsl; + } + + sif_log(sdev, SIF_TSL, + "%s qp_idx: %d with sl: %d, port: %d, qosl: %s tsl: %d", + qp->type == PSIF_QP_TRANSPORT_MANSP1 ? 
"privileged" : "regular", + qp->qp_idx, sl, qp->port, string_enum_psif_tsu_qos(qosl) + 5, qp->tsl); + } + + { + struct sif_eps_cqe lcqe; + u16 seq_num; + + lcqe.rsp = &resp; + init_completion(&lcqe.cmpl); + + ret = sif_post_epsc_wr(sdev, &req, &seq_num, &lcqe, true); + if (ret) + goto err_modify_qp; + + if (reliable_qp(qp->type) + && (qp_attr_mask & IB_QP_STATE)) { + if ((qp->last_set_state == IB_QPS_INIT) + && (qp_attr->qp_state == IB_QPS_RTR)) { + /* Map the new send queue into the global sq_cmpl PSIF + * only address map, see #944 + */ + ret = sif_sq_cmpl_map_sq(sdev, get_sif_sq(sdev, qp->qp_idx)); + if (ret) + goto err_modify_qp; + + qp->sq_cmpl_map_valid = true; + + } else if ((qp->sq_cmpl_map_valid) + && (qp_attr->qp_state == IB_QPS_RESET)) { + /* Unmap the send queue from the global sq_cmpl PSIF */ + ret = sif_sq_cmpl_unmap_sq(sdev, get_sif_sq(sdev, qp->qp_idx)); + if (ret) + goto err_modify_qp; + + qp->sq_cmpl_map_valid = false; + } + } + + ret = sif_epsc_waitfor(sdev, seq_num, &lcqe); + if (ret) + goto err_modify_qp; + } + + if (resp.status != EPSC_SUCCESS) { + sif_log(sdev, SIF_INFO, "qp %d failed with status %s", + qp->qp_idx, string_enum_psif_epsc_csr_status(resp.status)); + goto err_modify_qp; + } + + /* sif_logs(SIF_DUMP, write_struct_psif_qp(0, 1, (const struct psif_qp *)&qp->d)); */ + sif_log(sdev, SIF_QP, "qp %d done QP state %d -> %d", + qp->qp_idx, qp->last_set_state, + (qp_attr_mask & IB_QP_STATE ? qp_attr->qp_state : qp->last_set_state)); + + if (qp_attr_mask & IB_QP_STATE) + qp->last_set_state = qp_attr->qp_state; + + return ret; + +err_modify_qp: + if (resp.status == EPSC_MODIFY_INVALID_QP_STATE) + ret = -ESPIPE; + + if (!ret) + ret = -EINVAL; + if (qp_attr_mask & IB_QP_STATE) + sif_log(sdev, SIF_QPE, + "qp %d failed - mask 0x%x cur.state %d, requested state %d, ret %d", + qp->qp_idx, qp_attr_mask, qp->last_set_state, + qp_attr->qp_state, + ret); + else + sif_log(sdev, SIF_QPE, "qp %d failed - mask 0x%x no state trans requested, ret %d", + qp->qp_idx, qp_attr_mask, ret); + + sif_logs(SIF_DUMP, write_struct_psif_qp(NULL, 1, (const struct psif_qp *)&qp->d)); + return ret; +} + + +static void set_qp_path_sw(struct sif_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, bool alternate) +{ + volatile struct psif_qp_path *path; + struct ib_ah_attr *ah_attr; + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + unsigned int local_lid_path; + u8 psif_port; + u8 ipd = 0; + + if (alternate) { + path = &qp->d.path_b; + ah_attr = &qp_attr->alt_ah_attr; + set_psif_qp_path__pkey_indx(path, qp_attr->alt_pkey_index); + set_psif_qp_path__local_ack_timeout(path, qp_attr->alt_timeout); + set_psif_qp_path__port(path, qp_attr->alt_port_num - 1); + } else { + path = &qp->d.path_a; + ah_attr = &qp_attr->ah_attr; + set_psif_qp_path__pkey_indx(path, qp->pkey_index); + /* Use the value set by IB_QP_PORT: */ + set_psif_qp_path__port(path, qp->port - 1); + } + set_psif_qp_path__sl(path, ah_attr->sl); + + if (ah_attr->ah_flags & IB_AH_GRH) { + set_psif_qp_path__use_grh(path, USE_GRH); + set_psif_qp_path__remote_gid_0(path, cpu_to_be64(ah_attr->grh.dgid.global.subnet_prefix)); + set_psif_qp_path__remote_gid_1(path, cpu_to_be64(ah_attr->grh.dgid.global.interface_id)); + set_psif_qp_path__flowlabel(path, ah_attr->grh.flow_label); + set_psif_qp_path__hoplmt(path, ah_attr->grh.hop_limit); + /* TBD: ah_attr->grh.sgid_index? 
*/ + + sif_log(sdev, SIF_QP, " - with grh dgid %llx.%llx", + be64_to_cpu(path->remote_gid_0), + be64_to_cpu(path->remote_gid_1)); + } + + if (qp_attr_mask & IB_QP_TIMEOUT) { + set_psif_qp_path__local_ack_timeout(path, qp_attr->timeout); + sif_log(sdev, SIF_QP, " - with timeout %d", qp_attr->timeout); + } + + qp->remote_lid = ah_attr->dlid; + set_psif_qp_path__remote_lid(path, ah_attr->dlid); + local_lid_path = ah_attr->src_path_bits; + psif_port = get_psif_qp_path__port(path); + set_psif_qp_path__local_lid_path(path, local_lid_path); + set_psif_qp_path__loopback(path, + (sdev->port[psif_port].lid | local_lid_path) == ah_attr->dlid ? + LOOPBACK : NO_LOOPBACK); + + /* sif_calc_ipd do not set ipd if sif_calc_ipd failed. In that case, ipd = 0.*/ + sif_calc_ipd(sdev, qp->port, (enum ib_rate) ah_attr->static_rate, &ipd); + set_psif_qp_path__ipd(path, ipd); + + sif_log(sdev, SIF_QP, "port %d lid %d(%#x) local_lid_path %d(%#x) remote_lid %d(%#x)", + ah_attr->port_num, + sdev->port[psif_port].lid, + sdev->port[psif_port].lid, + ah_attr->src_path_bits, + ah_attr->src_path_bits, + ah_attr->dlid, + ah_attr->dlid); + + sif_log(sdev, SIF_QP, "(path_%c) psif_port %d, remote_lid %d(%#x) %s", + (alternate ? 'b' : 'a'), + psif_port, + get_psif_qp_path__remote_lid(path), get_psif_qp_path__remote_lid(path), + (get_psif_qp_path__loopback(path) == LOOPBACK ? "(loopback)" : "(not loopback)")); +} + +static int modify_qp_sw(struct sif_dev *sdev, struct sif_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask) +{ + int ret = 0; + volatile struct psif_qp *qps; + struct sif_rq *rq = NULL; + + if (qp->rq_idx >= 0) + rq = get_sif_rq(sdev, qp->rq_idx); + + qps = &qp->d; + + if ((qp_attr_mask & IB_QP_STATE) + && (qp->last_set_state == IB_QPS_RESET) + && (qp_attr->qp_state == IB_QPS_INIT)) { + set_psif_qp_core__bytes_received(&qps->state, 0); + set_psif_qp_core__committed_received_psn(&qps->state, 0); + set_psif_qp_core__expected_psn(&qps->state, 0); + set_psif_qp_core__last_committed_msn(&qps->state, 0); + set_psif_qp_core__last_received_outstanding_msn(&qps->state, 0); + set_psif_qp_core__msn(&qps->state, 0); /* According to Brian 11.9.2012 */ + set_psif_qp_core__scatter_indx(&qps->state, 0); + set_psif_qp_core__spin_hit(&qps->state, 0); + set_psif_qp_core__sq_seq(&qps->state, 1); + set_psif_qp_core__srq_pd(&qps->state, 0); + } + + if (qp_attr_mask & IB_QP_CUR_STATE && qp_attr->cur_qp_state != qp->last_set_state) { + sif_log(sdev, SIF_INFO, + "Error: current state %d - user expected %d", + qp->last_set_state, qp_attr->cur_qp_state); + ret = -EINVAL; + goto err_modify_qp; + } + + /* Bug #3933 - WA for HW bug 3928 + * ibv_query_qp might report wrong state when in state IBV_QPS_ERR + * QP hw state keeps in RESET for modify_qp_sw to INIT or ERR states + */ + if (qp_attr_mask & IB_QP_STATE) + if ((qp_attr->qp_state != IB_QPS_INIT && qp_attr->qp_state != IB_QPS_ERR) + || (PSIF_REVISION(sdev) > 3)) + set_psif_qp_core__state(&qps->state, ib2sif_qp_state(qp_attr->qp_state)); + + if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) { + sif_log(sdev, SIF_INFO, + "IB_QP_EN_SQD_ASYNC_NOTIFY needed!"); + ret = -EINVAL; + goto err_modify_qp; + } + + if (qp_attr_mask & IB_QP_ACCESS_FLAGS) { + + set_psif_qp_core__rdma_rd_enable(&qps->state, + ((qp_attr->qp_access_flags & IB_ACCESS_REMOTE_READ) + ? 1 : 0)); + set_psif_qp_core__rdma_wr_enable(&qps->state, + ((qp_attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) + ? 1 : 0)); + set_psif_qp_core__atomic_enable(&qps->state, + ((qp_attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) + ? 
1 : 0)); + } + + /* This section must be before IB_QP_AV */ + if (qp_attr_mask & IB_QP_PKEY_INDEX) { + volatile struct psif_qp_path *path = &qp->d.path_a; + + /* TBD: Argument check on index value ? */ + qp->pkey_index = qp_attr->pkey_index; + set_psif_qp_path__pkey_indx(path, qp->pkey_index); + sif_log(sdev, SIF_QP, "pkey_indx in primary path set to %d", qp->pkey_index); + + } + + /* This section must be before IB_QP_AV */ + if (qp_attr_mask & IB_QP_PORT) { + if (qp_attr->port_num < 1 || qp_attr->port_num > 2) { + sif_log(sdev, SIF_INFO, "Modify port: Illegal port %d specified for qp %d", + qp_attr->port_num, qp->qp_idx); + ret = -EINVAL; + goto err_modify_qp; + } + sif_log(sdev, SIF_QP, "Modify port to %d for qp %d", + qp_attr->port_num, qp->qp_idx); + qp->port = qp_attr->port_num; + } + + if (qp_attr_mask & IB_QP_QKEY) { + + /* Set the 'ipoib' and 'ipoib_enable' fields for UD QPs with the IPoIB QKey */ + /* TBD: The IPoIB QKEY value is hardcoded. We need to figured out how ask the + * driver to ask the FW for this value + */ + if (qp_attr->qkey == 0x00000b1b) { + set_psif_qp_core__ipoib(&qps->state, 1); + set_psif_qp_core__ipoib_enable(&qps->state, 1); + } + + set_psif_qp_core__qkey(&qps->state, qp_attr->qkey); + + sif_log(sdev, SIF_QP, "Assign QKEY 0x%x for qp %d", + qp_attr->qkey, qp->qp_idx); + } + + if (qp_attr_mask & IB_QP_AV) + set_qp_path_sw(qp, qp_attr, qp_attr_mask, false); + + if (qp_attr_mask & IB_QP_PATH_MTU) { + if (!ib_legal_path_mtu(qp_attr->path_mtu)) { + sif_log(sdev, SIF_INFO, "Illegal MTU encoding %d", qp_attr->path_mtu); + ret = EINVAL; + goto err_modify_qp; + } + if ((qp->type == PSIF_QP_TRANSPORT_RC) && sif_feature(force_rc_2048_mtu)) { + if (qp_attr->path_mtu > IB_MTU_2048) + qp_attr->path_mtu = IB_MTU_2048; + } + sif_log(sdev, SIF_QP, "Modify path_mtu to %d for qp %d", + qp_attr->path_mtu, qp->qp_idx); + set_psif_qp_core__path_mtu(&qps->state, + ib2sif_path_mtu(qp_attr->path_mtu)); + qp->mtu = qp_attr->path_mtu; + } + + if (!(qp_attr_mask & (IB_QP_AV|IB_QP_ALT_PATH))) { + /* Set these values also if a path does not get set */ + if (qp_attr_mask & IB_QP_TIMEOUT) + set_psif_qp_path__local_ack_timeout(&qps->path_a, qp_attr->timeout); + } + + if (qp_attr_mask & IB_QP_RETRY_CNT) { + set_psif_qp_core__error_retry_init(&qps->state, qp_attr->retry_cnt); + set_psif_qp_core__error_retry_count(&qps->state, qp_attr->retry_cnt); + } + + if (qp_attr_mask & IB_QP_RNR_RETRY) { + int rnr_value = qp_attr->retry_cnt; + + set_psif_qp_core__rnr_retry_init(&qps->state, rnr_value); + set_psif_qp_core__rnr_retry_count(&qps->state, qp_attr->rnr_retry); + } + + if (qp_attr_mask & IB_QP_RQ_PSN) + set_psif_qp_core__expected_psn(&qps->state, qp_attr->rq_psn); + + if (qp_attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + /* This is the sending side */ + if (unlikely(qp_attr->max_rd_atomic > 16)) { + sif_log(sdev, SIF_QP, + "IB_QP_MAX_QP_RD_ATOMIC value %u out of range - psif supports no more than 16", + qp_attr->max_rd_atomic); + qp_attr->max_rd_atomic = 16; + } + set_psif_qp_core__max_outstanding(&qps->state, qp_attr->max_rd_atomic); + } + + if (qp_attr_mask & IB_QP_ALT_PATH) { + if (qp_attr->alt_port_num < 1 || qp_attr->alt_port_num > 2) { + sif_log(sdev, SIF_INFO, "Illegal alternate port %d specified for qp %d", + qp_attr->alt_port_num, qp->qp_idx); + ret = -EINVAL; + goto err_modify_qp; + } + set_qp_path_sw(qp, qp_attr, qp_attr_mask, true); + } + + if (qp_attr_mask & IB_QP_MIN_RNR_TIMER) + set_psif_qp_core__min_rnr_nak_time(&qps->state, + bug_3646_conv_table[qp_attr->min_rnr_timer & 0x1F]); + + if 
(qp_attr_mask & IB_QP_SQ_PSN) { + /* last_acked_psn must be 1 less (modulo 24 bit) than xmit_psn + * (see issue #1011) + */ + u32 prev = qp_attr->sq_psn == 0 ? 0xFFFFFF : qp_attr->sq_psn - 1; + + set_psif_qp_core__xmit_psn(&qps->state, qp_attr->sq_psn); + set_psif_qp_core__last_acked_psn(&qps->state, prev); + } + + if (qp_attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + /* Currently hard coded to 16 in psif */ + if (unlikely(qp_attr->max_dest_rd_atomic > 16)) { + sif_log(sdev, SIF_INFO, + "IB_QP_MAX_DEST_RD_ATOMIC value %u out of range - psif supports 16 as a hard coded value", + qp_attr->max_dest_rd_atomic); + ret = -EINVAL; + goto err_modify_qp; + } else if (qp_attr->max_dest_rd_atomic < 16) { + sif_log(sdev, SIF_QP, + "IB_QP_MAX_DEST_RD_ATOMIC value %u ignored - psif supports 16 as a hard coded value", + qp_attr->max_dest_rd_atomic); + } + } + + if (qp_attr_mask & IB_QP_PATH_MIG_STATE) + set_psif_qp_core__mstate(&qps->state, + ib2sif_mig_state(qp_attr->path_mig_state)); + + if (qp_attr_mask & IB_QP_CAP) { + sif_log(sdev, SIF_INFO, "resizing QP not implemented"); + sif_log(sdev, SIF_INFO, "IB_QP_CAP needed!"); + ret = -EOPNOTSUPP; + goto err_modify_qp; + } + + if (qp_attr_mask & IB_QP_DEST_QPN) { + set_psif_qp_core__remote_qp(&qps->state, qp_attr->dest_qp_num); + sif_log(sdev, SIF_QP, "Modified remote qp (sw), local qp_idx: %d, remote_qp %d\n", + qp->qp_idx, qp_attr->dest_qp_num); + } + + /* Set the valid bit whenever we transition to INIT */ + if (rq && !rq->is_srq && qp_attr_mask & IB_QP_STATE && qp_attr->qp_state == IB_QPS_INIT) + set_psif_rq_hw__valid(&rq->d, 1); + + sif_log(sdev, SIF_QP, "qp %d done QP state %d -> %d", + qp->qp_idx, qp->last_set_state, + (qp_attr_mask & IB_QP_STATE ? qp_attr->qp_state : qp->last_set_state)); + + if (qp_attr_mask & IB_QP_STATE) + qp->last_set_state = qp_attr->qp_state; + + return ret; +err_modify_qp: + return ret; +} + + +static int sif_query_qp_sw(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); +static int sif_query_qp_hw(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); + +int sif_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + bool use_hw = false; + struct sif_qp *qp = to_sqp(ibqp); + struct sif_dev *sdev = to_sdev(ibqp->device); + + sif_logi(ibqp->device, SIF_QP, "last_set_state %d", qp->last_set_state); + + switch (qp->last_set_state) { + case IB_QPS_RESET: + case IB_QPS_INIT: + break; + default: + /* Bug #3933 - WA for HW bug 3928 + * ibv_query_qp might report wrong state when in state IBV_QPS_ERR + * Query must be done based on current ownership (towards HW only if HW owned) + */ + if (PSIF_REVISION(sdev) <= 3 || qp->flush_sq_done_wa4074) + use_hw = (qp->flags & SIF_QPF_HW_OWNED); + else + use_hw = true; + break; + } + + return use_hw ? 
+ sif_query_qp_hw(ibqp, qp_attr, qp_attr_mask, qp_init_attr) : + sif_query_qp_sw(ibqp, qp_attr, qp_attr_mask, qp_init_attr); +} + +enum ib_qp_state get_qp_state(struct sif_qp *qp) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct ib_qp_init_attr init_attr; + struct ib_qp_attr attr; + + memset(&attr, 0, sizeof(attr)); + memset(&init_attr, 0, sizeof(init_attr)); + + if (sif_query_qp(ibqp, &attr, IB_QP_STATE, &init_attr)) { + sif_logi(ibqp->device, SIF_INFO, + "query_qp failed for qp %d", ibqp->qp_num); + return -1; + } + return attr.qp_state; +} + +static void get_qp_path_sw(struct sif_qp *qp, struct ib_qp_attr *qp_attr, bool alternate) +{ + volatile struct psif_qp_path *path; + struct ib_ah_attr *ah_attr; + enum psif_use_grh use_grh; + volatile struct psif_qp_path *alt_path; + struct ib_ah_attr *alt_ah_attr; + + alt_path = &qp->d.path_b; + alt_ah_attr = &qp_attr->alt_ah_attr; + path = &qp->d.path_a; + ah_attr = &qp_attr->ah_attr; + + ah_attr->sl = get_psif_qp_path__sl(path); + use_grh = get_psif_qp_path__use_grh(path); + + if (use_grh == USE_GRH) { + ah_attr->ah_flags |= IB_AH_GRH; + ah_attr->grh.dgid.global.subnet_prefix = get_psif_qp_path__remote_gid_0(path); + ah_attr->grh.dgid.global.interface_id = get_psif_qp_path__remote_gid_1(path); + ah_attr->grh.flow_label = get_psif_qp_path__flowlabel(path); + ah_attr->grh.hop_limit = get_psif_qp_path__hoplmt(path); + /* TBD: ah_attr->grh.sgid_index? */ + } + + qp_attr->pkey_index = get_psif_qp_path__pkey_indx(path); + qp_attr->timeout = get_psif_qp_path__local_ack_timeout(path); + + ah_attr->port_num = get_psif_qp_path__port(path); + ah_attr->dlid = get_psif_qp_path__remote_lid(path); + ah_attr->src_path_bits = get_psif_qp_path__local_lid_path(path); + + alt_ah_attr->port_num = get_psif_qp_path__port(alt_path); + alt_ah_attr->dlid = get_psif_qp_path__remote_lid(alt_path); + alt_ah_attr->src_path_bits = get_psif_qp_path__local_lid_path(alt_path); +} + + + +static int sif_query_qp_sw(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_qp *qp = to_sqp(ibqp); + volatile struct psif_qp *qps = &qp->d; + struct sif_rq *rq = NULL; + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + int ret = 0; + + if (qp->type != PSIF_QP_TRANSPORT_XRC) + rq = get_sif_rq(sdev, qp->rq_idx); + + /* Mellanox almost completely ignores the mask on both + * input and output and reports all attributes regardlessly.. + * as opposed to what man ibv_query_qp indicates. + * Since this behavour is utilized by a.o. qperf + * we probably have no other meaningful choice than + * to report back everything even with mask 0. + */ + sif_log(sdev, SIF_QP, "qp_attr_mask 0x%x", qp_attr_mask); + + memset(qp_init_attr, 0, sizeof(struct ib_qp_init_attr)); + memset(qp_attr, 0, sizeof(struct ib_qp_attr)); + + qp_attr->qp_state = qp_attr->cur_qp_state = qp->last_set_state; + qp_attr->qp_access_flags |= + get_psif_qp_core__rdma_rd_enable(&qps->state) ? IB_ACCESS_REMOTE_READ : 0; + qp_attr->qp_access_flags |= + get_psif_qp_core__rdma_wr_enable(&qps->state) ? IB_ACCESS_REMOTE_WRITE : 0; + qp_attr->qp_access_flags |= + get_psif_qp_core__atomic_enable(&qps->state) ? 
IB_ACCESS_REMOTE_ATOMIC : 0; + + qp_attr->pkey_index = get_psif_qp_path__pkey_indx(&qps->path_a); + qp_attr->port_num = qp->port; + qp_attr->qkey = get_psif_qp_core__qkey(&qps->state); + get_qp_path_sw(qp, qp_attr, qp_attr_mask & IB_QP_ALT_PATH); + + qp_attr->path_mtu = sif2ib_path_mtu(get_psif_qp_core__path_mtu(&qps->state)); + qp_attr->timeout = get_psif_qp_path__local_ack_timeout(&qps->path_a); + qp_attr->retry_cnt = get_psif_qp_core__error_retry_count(&qps->state); + qp_attr->rnr_retry = get_psif_qp_core__rnr_retry_count(&qps->state); + qp_attr->rq_psn = get_psif_qp_core__expected_psn(&qps->state); + qp_attr->min_rnr_timer = get_psif_qp_core__min_rnr_nak_time(&qps->state); + qp_attr->sq_psn = get_psif_qp_core__xmit_psn(&qps->state); + qp_attr->path_mig_state = sif2ib_mig_state(get_psif_qp_core__mstate(&qps->state)); + qp_attr->dest_qp_num = get_psif_qp_core__remote_qp(&qps->state); + + /* TBD: Revisit this: This value is currently hard coded to 16 in psif */ + qp_attr->max_dest_rd_atomic = 16; + + qp_init_attr->port_num = qp->port; + if (rq) { + if (rq->is_srq) + qp_init_attr->srq = &rq->ibsrq; + qp_init_attr->cap.max_recv_wr = rq->entries_user; + qp_init_attr->cap.max_recv_sge = rq->sg_entries; + } + qp_init_attr->cap.max_send_wr = sq->entries; + qp_init_attr->cap.max_send_sge = sq->sg_entries; + qp_init_attr->cap.max_inline_data = qp->max_inline_data; + + /* TBD: What to do with this: + * IB_QP_MAX_QP_RD_ATOMIC = (1<<13), + */ + return ret; +} + +static void get_qp_path_hw(struct psif_query_qp *qqp, struct ib_qp_attr *qp_attr, bool alternate) +{ + struct psif_qp_path *path; + struct ib_ah_attr *ah_attr; + enum psif_use_grh use_grh; + struct psif_qp_path *alt_path; + struct ib_ah_attr *alt_ah_attr; + + alt_path = &qqp->alternate_path; + alt_ah_attr = &qp_attr->alt_ah_attr; + path = &qqp->primary_path; + ah_attr = &qp_attr->ah_attr; + + ah_attr->sl = path->sl; + use_grh = path->use_grh; + + if (use_grh == USE_GRH) { + ah_attr->ah_flags |= IB_AH_GRH; + ah_attr->grh.dgid.global.subnet_prefix = path->remote_gid_0; + ah_attr->grh.dgid.global.interface_id = path->remote_gid_1; + ah_attr->grh.flow_label = path->flowlabel; + ah_attr->grh.hop_limit = path->hoplmt; + /* TBD: ah_attr->grh.sgid_index? 
*/ + } + qp_attr->pkey_index = path->pkey_indx; + qp_attr->timeout = path->local_ack_timeout; + qp_attr->port_num = path->port + 1; + + qp_attr->alt_pkey_index = alt_path->pkey_indx; + qp_attr->alt_timeout = alt_path->local_ack_timeout; + qp_attr->alt_port_num = alt_path->port + 1; + + + + ah_attr->port_num = path->port + 1; + ah_attr->dlid = path->remote_lid; + ah_attr->src_path_bits = path->local_lid_path; + + alt_ah_attr->port_num = alt_path->port + 1; + alt_ah_attr->dlid = alt_path->remote_lid; + alt_ah_attr->src_path_bits = alt_path->local_lid_path; +} + +u64 sif_qqp_dma_addr(struct sif_dev *sdev, struct sif_qp *qps) +{ + struct sif_table *tp = &sdev->ba[qp]; + u64 offset = qps->qp_idx * tp->ext_sz + offsetof(struct sif_qp, qqp); + + if (tp->mmu_ctx.mt == SIFMT_BYPASS) + return sif_mem_dma(tp->mem, offset); + else if (!epsc_gva_permitted(sdev)) + return sif_mem_dma(tp->mem, offset); + else + return tp->mmu_ctx.base + offset; +} + +/* Internal query qp implementation - updates the local query qp state for this QP */ +int epsc_query_qp(struct sif_qp *sqp, struct psif_query_qp *lqqp) +{ + int ret; + struct psif_epsc_csr_rsp cqe; + struct psif_epsc_csr_req req; + struct psif_csr_modify_qp_ctrl *cmd = &req.u.query_qp.ctrl; + struct sif_dev *sdev = to_sdev(sqp->ibqp.device); + + /* This function can potentially use the same qqp data structure reentrant + * but we dont care as we know that EPSC operations gets sequenced + */ + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_QUERY_QP; + cmd->cmd = QP_CMD_QUERY; + if (sqp->qp_idx <= 3) { + cmd->qp_num = sqp->qp_idx & 1; + cmd->port_num = sqp->qp_idx >> 1; + } else + cmd->qp_num = sqp->qp_idx; + req.u.query_qp.address = sif_qqp_dma_addr(sdev, sqp); + + if (!epsc_gva_permitted(sdev)) + req.u.query_qp.mmu_cntx = sif_mmu_ctx_passthrough(true); + else + req.u.query_qp.mmu_cntx = sdev->ba[qp].mmu_ctx.mctx; + ret = sif_epsc_wr_poll(sdev, &req, &cqe); + + /* Copy data irrespective of how the EPSC operation went */ + if (eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 31)) + copy_conv_to_sw(lqqp, &sqp->qqp, sizeof(*lqqp)); + else + memcpy(lqqp, &sqp->qqp, sizeof(*lqqp)); + + return ret; +} + + +static int sif_query_qp_hw(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + int ret = 0; + struct sif_qp *qp = to_sqp(ibqp); + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_rq *rq = NULL; + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct psif_query_qp lqqp; + + /* Take QP lock to avoid any race condition on updates to last_set_state: */ + mutex_lock(&qp->lock); + + ret = epsc_query_qp(qp, &lqqp); + if (!ret) + qp->last_set_state = sif2ib_qp_state(lqqp.qp.state); + mutex_unlock(&qp->lock); + + if (ret) + return ret; + + if (qp->type != PSIF_QP_TRANSPORT_XRC) + rq = get_sif_rq(sdev, qp->rq_idx); + + /* Mellanox almost completely ignores the mask on both + * input and output and reports all attributes regardlessly.. + * as opposed to what man ibv_query_qp indicates. + * Since this behavour is utilized by a.o. qperf + * we probably have no other meaningful choice than + * to report back everything even with mask 0. 
+ */ + sif_log(sdev, SIF_QP|SIF_DUMP, "qp %d, qp_attr_mask 0x%x", qp->qp_idx, qp_attr_mask); + sif_logs(SIF_DUMP, write_struct_psif_query_qp(NULL, 0, &lqqp)); + + + memset(qp_init_attr, 0, sizeof(struct ib_qp_init_attr)); + memset(qp_attr, 0, sizeof(struct ib_qp_attr)); + + qp_attr->qp_state = qp_attr->cur_qp_state = qp->last_set_state; + qp_attr->qp_access_flags |= lqqp.qp.rdma_rd_enable ? IB_ACCESS_REMOTE_READ : 0; + qp_attr->qp_access_flags |= lqqp.qp.rdma_wr_enable ? IB_ACCESS_REMOTE_WRITE : 0; + qp_attr->qp_access_flags |= lqqp.qp.atomic_enable ? IB_ACCESS_REMOTE_ATOMIC : 0; + + qp_attr->pkey_index = lqqp.primary_path.pkey_indx; + qp_attr->port_num = lqqp.primary_path.port + 1; + qp_attr->qkey = lqqp.qp.qkey; + get_qp_path_hw(&lqqp, qp_attr, qp_attr_mask & IB_QP_ALT_PATH); + + qp_attr->path_mtu = sif2ib_path_mtu(lqqp.qp.path_mtu); + qp_attr->timeout = lqqp.primary_path.local_ack_timeout; + qp_attr->retry_cnt = lqqp.qp.error_retry_count; + qp_attr->rnr_retry = lqqp.qp.rnr_retry_count; + qp_attr->rq_psn = lqqp.qp.expected_psn; + qp_attr->min_rnr_timer = lqqp.qp.min_rnr_nak_time; + qp_attr->sq_psn = lqqp.qp.xmit_psn; + qp_attr->path_mig_state = sif2ib_mig_state(lqqp.qp.mstate); + qp_attr->dest_qp_num = lqqp.qp.remote_qp; + + /* TBD: Revisit this: This value is currently hard coded to 16 in psif */ + qp_attr->max_dest_rd_atomic = 16; + + qp_init_attr->port_num = qp->port; /* TBD: Use primary path info here as well? */ + + if (rq) { + if (rq->is_srq) + qp_init_attr->srq = &rq->ibsrq; + qp_init_attr->cap.max_recv_wr = rq->entries_user; + qp_init_attr->cap.max_recv_sge = rq->sg_entries; + } + qp_init_attr->cap.max_send_wr = sq->entries; + qp_init_attr->cap.max_send_sge = sq->sg_entries; + qp_init_attr->cap.max_inline_data = qp->max_inline_data; + + /* TBD: What to do with these.. + * IB_QP_MAX_QP_RD_ATOMIC = (1<<13), + */ + return ret; +} + + +int sif_destroy_qp(struct ib_qp *ibqp) +{ + struct sif_qp *qp = to_sqp(ibqp); + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + bool need_pma_pxy_qp = eps_version_ge(es, 0, 57) + && (qp->qp_idx == 1 || qp->qp_idx == 3); + + sif_log(sdev, SIF_QP, "qp_num %d", ibqp->qp_num); + + /* Destroy PMA_PXY QPs associated with QP1/3 */ + if (need_pma_pxy_qp) { + struct sif_qp *pma_qp = NULL; + int pma_qp_idx; + int ret; + + pma_qp_idx = sdev->pma_qp_idxs[!!(qp->qp_idx & 2)]; + pma_qp = get_sif_qp(sdev, pma_qp_idx); + + /* clearing epsc PMA_PXY QP redirection */ + ret = notify_epsc_pma_qp(sdev, -1, qp->port); + if (ret) + sif_log(sdev, SIF_QP, + "Failed to clear epsc PMA_PXY redirect for qp_num %d", pma_qp_idx); + destroy_qp(sdev, pma_qp); + } + + return destroy_qp(sdev, qp); +} + + +int destroy_qp(struct sif_dev *sdev, struct sif_qp *qp) +{ + int ret; + int index = qp->qp_idx; + struct sif_pd *pd = qp->ibqp.pd ?
to_spd(qp->ibqp.pd) : to_sxrcd(qp->ibqp.xrcd)->pd; + struct ib_qp_attr mod_attr = { + .qp_state = IB_QPS_RESET + }; + struct sif_rq *rq = NULL; + bool reuse_ok = true; + + /* See bug #3496 */ + if (sif_feature(no_multipacket_qp_reuse)) { + switch (qp->type) { + case PSIF_QP_TRANSPORT_UD: + case PSIF_QP_TRANSPORT_MANSP1: + reuse_ok = true; + break; + default: + reuse_ok = false; + break; + } + } + + sif_log(sdev, SIF_QP, "## Enter qp_idx %d", index); + + if (is_regular_qp(qp)) + rq = get_sif_rq(sdev, qp->rq_idx); + + /* make sure event handling is performed before reset the qp.*/ + if (atomic_dec_and_test(&qp->refcnt)) + complete(&qp->can_destroy); + wait_for_completion(&qp->can_destroy); + + /* Modify to reset causes an implicit reset_qp() if state is RESET */ + ret = modify_qp(sdev, qp, &mod_attr, IB_QP_STATE, false, NULL); + if (ret) + sif_log(sdev, SIF_INFO, "modify qp %d to RESET failed, sts %d", index, ret); + + if (!(qp->flags & SIF_QPF_USER_MODE)) { + int nfixup; + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + u32 cq_idx = get_psif_qp_core__rcv_cq_indx(&qp->d.state); + struct sif_cq *send_cq = (sq && sq->cq_idx >= 0) ? get_sif_cq(sdev, sq->cq_idx) : NULL; + struct sif_cq *recv_cq = rq ? get_sif_cq(sdev, cq_idx) : NULL; + + if (send_cq) { + nfixup = sif_fixup_cqes(send_cq, sq, qp); + if (nfixup < 0) { + sif_log(sdev, SIF_INFO, + "sif_fixup_cqes: on qp %d send cq %d failed with error %d", + qp->qp_idx, sq->cq_idx, nfixup); + goto fixup_failed; + } + sif_log(sdev, SIF_QP, "sif_fixup_cqes: fixed %d CQEs in sq.cq %d", + nfixup, sq->cq_idx); + } + if (recv_cq && recv_cq != send_cq) { + nfixup = sif_fixup_cqes(recv_cq, sq, qp); + if (nfixup < 0) { + sif_log(sdev, SIF_INFO, + "sif_fixup_cqes: on qp %d recv cq %d failed with error %d", + qp->qp_idx, cq_idx, nfixup); + goto fixup_failed; + } + sif_log(sdev, SIF_QP, "sif_fixup_cqes: fixed %d CQEs in rq.cq %d", + nfixup, cq_idx); + + } + } + +fixup_failed: + if (qp->qp_idx < 4) { + /* Special QP cleanup */ + int ok = atomic_add_unless(&sdev->sqp_usecnt[qp->qp_idx], -1, 0); + + if (!ok) { + sif_log(sdev, SIF_INFO, + "Attempt to destroy an uncreated QP %d", qp->qp_idx); + return -EINVAL; + } + } + + sif_dfs_remove_qp(qp); + + sif_free_sq(sdev, qp); + + if (rq) { + ret = free_rq(sdev, qp->rq_idx); + if (ret && (ret != -EBUSY || !rq->is_srq)) + return ret; + } + + if (index > 3 && reuse_ok) + sif_free_qp_idx(pd, index); + + sif_log(sdev, SIF_QP, "## Exit success (qp_idx %d)", index); + return 0; +} + +/* Set this QP back to the initial state + * (called by modify_qp after a successful modify to reset + */ +static int reset_qp(struct sif_dev *sdev, struct sif_qp *qp) +{ + volatile struct psif_qp *qps = &qp->d; + struct sif_rq *rq = NULL; + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + bool need_wa_3713 = 0; + + /* Bring down order needed by rev2 according to bug #3480 */ + int ret = poll_wait_for_qp_writeback(sdev, qp); + + if (ret) + goto failed; + + if (is_regular_qp(qp)) + rq = get_sif_rq(sdev, qp->rq_idx); + + /* WA 3713 special handling */ + need_wa_3713 = (PSIF_REVISION(sdev) <= 3) + && IS_PSIF(sdev) /* Next check if there is a retry outstanding */ + && !qp->flush_sq_done_wa4074 + && (get_psif_qp_core__retry_tag_committed(&qp->d.state) != + get_psif_qp_core__retry_tag_err(&qp->d.state)) + && (qp->qp_idx != sdev->flush_qp); + + if (need_wa_3713) { + ret = reset_qp_flush_retry(sdev); + if (ret < 0) + sif_log(sdev, SIF_INFO, "Flush_retry special handling failed with ret %d", ret); + + } + + + /* if the send queue scheduler is running, 
wait for + * it to terminate: + */ + ret = sif_flush_sqs(sdev, sq); + if (ret) + goto failed; + + sif_logs(SIF_DUMP, + write_struct_psif_qp(NULL, 1, (struct psif_qp *)&qp->d)); + +failed: + if (ret) { + /* TBD: Debug case - should never fail? */ + if (qp->type != PSIF_QP_TRANSPORT_MANSP1) + return ret; + } + + /* Reset the SQ pointers */ + if (!qp->ibqp.xrcd) { + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx); + + memset(sq_sw, 0, sizeof(*sq_sw)); + set_psif_sq_sw__tail_indx(&sq_sw->d, 0); + set_psif_sq_hw__last_seq(&sq->d, 0); + set_psif_sq_hw__destroyed(&sq->d, 0); + } + + /* Invalidate the RQ and set it in a consistent state for reuse */ + if (rq && !rq->is_srq) { + struct sif_rq_sw *rq_sw = get_sif_rq_sw(sdev, rq->index); + + if (!(test_bit(RQ_IS_INVALIDATED, &rq_sw->flags))) { + ret = sif_invalidate_rq_hw(sdev, rq->index, PCM_POST); + if (ret) { + sif_log(sdev, SIF_INFO, + "Invalidate rq_hw failed, status %d", ret); + return ret; + } + set_bit(RQ_IS_INVALIDATED, &rq_sw->flags); + } + + /* Make sure the RQ is sofware owned: */ + ret = poll_wait_for_rq_writeback(sdev, rq); + if (ret) + return ret; + + /* Reset pointers */ + memset(rq_sw, 0, sizeof(*rq_sw)); + set_psif_rq_hw__head_indx(&rq->d, 0); + } + + mb(); + + if (multipacket_qp(qp->type) && IS_PSIF(sdev) && PSIF_REVISION(sdev) > 2) { + int i; + int loop_count = 1; + + /* bz #3794: WA for HW bug 3198, VAL issuing read to uninitialized DMA VT entry */ + if (qp->type == PSIF_QP_TRANSPORT_UC && PSIF_REVISION(sdev) <= 3) + loop_count = 64; + + /* Invalidate the SGL cache (mapped to the qp type) + * TBD: We can consider a posted inv.req and check lazy upon reuse + */ + + for (i = 0; i < loop_count; ++i) { + ret = sif_invalidate_qp(sdev, qp->qp_idx, PCM_WAIT); + if (ret) { + sif_log(sdev, SIF_INFO, + "Invalidate SGL cache failed"); + return ret; + } + cpu_relax(); + } + } + + /* Reset counters to same values used at QP create + * Last acked psn must be initialized to one less than xmit_psn + * and it is a 24 bit value. 
See issue #1011 + */ + set_psif_qp_core__xmit_psn(&qps->state, 0); + set_psif_qp_core__last_acked_psn(&qps->state, 0xffffff); + qp->flush_sq_done_wa4074 = false; + + return ret; +} + + + +void sif_dfs_print_qp(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + struct sif_qp *qp; + volatile struct psif_qp *qps; + struct psif_qp lqps; + + if (unlikely(pos < 0)) { + seq_puts(s, "Index\tState\tRecvCQ\tSendCQ\tRQ\tRemQP\tType\n"); + return; + } + + qp = get_sif_qp(sdev, pos); + qps = &qp->d; + copy_conv_to_sw(&lqps, qps, sizeof(struct psif_qp)); + + if (pos <= 3 && atomic_read(&sdev->sqp_usecnt[pos]) != 1) + return; + + seq_printf(s, "%llu\t%d\t", pos, qp->last_set_state); + + if (qp->rq_idx == -1) + seq_puts(s, "[none]"); + else + seq_printf(s, "%u", lqps.state.rcv_cq_indx); + + seq_printf(s, "\t%u\t", lqps.state.send_cq_indx); + + if (qp->rq_idx == -1) + seq_puts(s, "[none]"); + else + seq_printf(s, "%u", lqps.state.rq_indx); + + seq_printf(s, "\t%u", lqps.state.remote_qp); + seq_printf(s, "\t%s", string_enum_psif_qp_trans(lqps.state.transport_type)+18); + if (lqps.state.proxy_qp_enable) + seq_puts(s, "\t[proxy]\n"); + else if (is_epsa_tunneling_qp(qp->ibqp.qp_type)) + seq_puts(s, "\t[EPSA tunneling]\n"); + else if (qp->ulp_type == RDS_ULP) + seq_puts(s, "\t[RDS]\n"); + else if (qp->ulp_type == IPOIB_CM_ULP) + seq_puts(s, "\t[IPOIB_CM]\n"); + else if (qp->flags & SIF_QPF_EOIB) + seq_puts(s, "\t[EoIB]\n"); + else if (qp->flags & SIF_QPF_IPOIB) + seq_puts(s, "\t[IPoIB]\n"); + else if (qp->flags & SIF_QPF_NO_EVICT) + seq_puts(s, "\t[no_evict]\n"); + else if (qp->flags & SIF_QPF_FLUSH_RETRY) + seq_puts(s, "\t[flush_retry]\n"); + else if (qp->flags & SIF_QPF_KI_STENCIL) + seq_puts(s, "\t[ki_stencil]\n"); + else if (qp->flags & SIF_QPF_PMA_PXY) + if (qp->port == 1) + seq_puts(s, "\t[PMA_PXY_QP_P1]\n"); + else + seq_puts(s, "\t[PMA_PXY_QP_P2]\n"); + else if (qp->flags & SIF_QPF_SMI) + if (qp->port == 1) + seq_puts(s, "\t[SMI_QP_P1]\n"); + else + seq_puts(s, "\t[SMI_QP_P2]\n"); + else if (qp->flags & SIF_QPF_GSI) + if (qp->port == 1) + seq_puts(s, "\t[GSI_QP_P1]\n"); + else + seq_puts(s, "\t[GSI_QP_P2]\n"); + else + seq_puts(s, "\n"); +} + +void sif_dfs_print_ipoffload(struct seq_file *s, struct sif_dev *sdev, loff_t pos) +{ + struct sif_qp *qp; + + if (unlikely(pos < 0)) { + seq_printf(s, "#%7s %10s %21s %21s %21s\n", + "", "TX csum", "---- RX l3_csum ----", "---- RX l4_csum ----", + "-------- LSO --------"); + seq_printf(s, "#%7s %10s %10s %10s %10s %10s %10s %10s\n", + "Index", "", "ok", "err", "ok", "err", "pkt", "bytes"); + return; + } + + qp = get_sif_qp(sdev, pos); + + if (qp->flags & SIF_QPF_IPOIB || qp->flags & SIF_QPF_EOIB) { + if (pos <= 3 && atomic_read(&sdev->sqp_usecnt[pos]) != 1) + return; + + seq_printf(s, "%8llu ", pos); + seq_printf(s, "%10llu ", + qp->ipoib_tx_csum_l3); + seq_printf(s, "%10llu %10llu ", + qp->ipoib_rx_csum_l3_ok, qp->ipoib_rx_csum_l3_err); + seq_printf(s, "%10llu %10llu ", + qp->ipoib_rx_csum_l4_ok, qp->ipoib_rx_csum_l4_err); + seq_printf(s, "%10llu %10llu\n", + qp->ipoib_tx_lso_pkt, qp->ipoib_tx_lso_bytes); + } +} + +bool has_srq(struct sif_dev *sdev, struct sif_qp *qp) +{ + struct sif_rq *rq = has_rq(qp) ? get_sif_rq(sdev, qp->rq_idx) : NULL; + + return rq && rq->is_srq; +} diff --git a/drivers/infiniband/hw/sif/sif_qp.h b/drivers/infiniband/hw/sif/sif_qp.h new file mode 100644 index 000000000000..0ab36abd3804 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_qp.h @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. 
All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_qp.h: Interface to internal IB queue pair logic for sif + */ + +#ifndef __SIF_QP_H +#define __SIF_QP_H +#include "psif_hw_data.h" +#include "sif_rq.h" +#include "sif_sq.h" +#include "sif_ibqp.h" + +struct sif_dev; +struct seq_file; +struct sif_sq; +struct sif_rq; + +#define CB_LENGTH 256 +#define CB_KICK_ALIGN 64 +#define CB_KICK_MASK (CB_KICK_ALIGN - 1) + +enum sif_qp_flags { + SIF_QPF_EOIB = 0x1, + SIF_QPF_IPOIB = 0x2, + SIF_QPF_FORCE_SQ_MODE = 0x1000, /* Set by vendor specific flag to enforce use of SQ mode */ + SIF_QPF_NO_EVICT = 0x2000, /* Special fake qp with do_not_evict set (see #3552) */ + SIF_QPF_KI_STENCIL = 0x4000, /* Special stencil qp set up for efficient key invalidates */ + SIF_QPF_DYNAMIC_MTU = 0x8000, /* Set by vendor specific flag to enforce use of dynamic MTU */ + SIF_QPF_FLUSH_RETRY = 0x10000, /* Special fake rc qp to flush retry (see #3714) */ + SIF_QPF_USER_MODE = 0x20000, /* User (udata != NULL) and not kernel verbs */ + SIF_QPF_PMA_PXY = 0x100000, /* Performance management interface QP type */ + SIF_QPF_SMI = 0x200000, /* Subnet management interface QP type */ + SIF_QPF_GSI = 0x400000, /* General services interface QP type */ + SIF_QPF_HW_OWNED = 0x1000000,/* Indicates HW ownership */ +}; + +struct dentry; + +/* + * TBD - not suitable for kernel.org: + * As for now, the stack unwind is done at sif_create_qp() within sif driver. + * Picking UEK version 4.1.12 as a starting point to have this, + * as UEK kernel has ib_create_qp->ib_create_qp_ex. + * Thus, set it to 4 based on what is implemented in Oracle Kernel + * to retrieve the ULP. +*/ +#define STACK_UNWIND_LEVEL 4 +/* + * sif_create_qp = __builtin_return_address(0) + * ib_create_qp = __builtin_return_address(1) + * ib_create_qp_ex = __builtin_return_address(2) + * if (rdma_cm) + * rdma_create_qp = __builtin_return_address(3) + * ULP = __builtin_return_address(4) +*/ + +/* The enum to determine what is the ULP caller + */ +enum kernel_ulp_type { + OTHER_ULP = 0, + RDS_ULP = 1, + IPOIB_CM_ULP = 2, + IPOIB_ULP = 3, +}; + +struct sif_qp_init_attr { + struct sif_pd *pd; + enum psif_qp_trans qp_type; + enum sif_proxy_type proxy; + enum psif_tsu_qos qosl; + enum kernel_ulp_type ulp_type; /* the ulp caller hint */ + bool user_mode; + int sq_hdl_sz; +}; + +struct sif_qp { + volatile struct psif_qp d; /* Hardware QPSC entry */ + struct ib_qp ibqp ____cacheline_internodealigned_in_smp; + + /* Data area for query_qp results: */ + struct psif_query_qp qqp ____cacheline_internodealigned_in_smp; + + /* Pack the members used in critical path in as few cache lines as possible */ + union { + u16 submask[2]; + u32 mask; + } traffic_patterns; /* heuristic mask to determine the traffic pattern */ + enum kernel_ulp_type ulp_type; /* the ulp caller hint */ + atomic_t refcnt; /* qp refcnt to sync between destroy qp and event handling. 
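+				 * destroy_qp() drops its reference and waits on can_destroy before tearing the QP down.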
*/ + struct completion can_destroy; /* use to synchronize destroy qp with event handling */ + struct mutex lock ____cacheline_internodealigned_in_smp; + int qp_idx; /* qp and sq index */ + int rq_idx; + u32 max_inline_data; /* Requested max inline for this QP */ + + /* Next 6 members are copy from the qp state */ + u32 remote_qp; + u32 magic; + bool nocsum; + enum psif_tsu_qos qosl; + u8 tsl; + u16 remote_lid; + + u16 eps_tag; /* Value to use for the eps_tag field (proxy_qp) */ + short port; /* IB port number (= sif port# + 1) */ + u32 flags; + enum ib_qp_state last_set_state; + enum psif_qp_trans type; /* PSIF transport type set up for this QP */ + + /* The following members are not used in critical path */ + u16 pkey_index; /* Default PKEY index as set by IB_QP_PKEY */ + enum ib_mtu mtu; /* Currently set mtu */ + enum ib_qp_state tracked_state; /* TBD: This is stupid: Make SQD fail as MLX for SQD */ + struct dentry *dfs_qp; /* Raw qp dump debugfs handle - used by sif_debug.c */ + bool sq_cmpl_map_valid; + + int srq_idx; /* WA #3952: Track SRQ for modify_srq(used only for pQP) */ + atomic64_t arm_srq_holdoff_time;/* Wait-time,if the pQP is held for a prev modify_srq */ + + bool flush_sq_done_wa4074; /* WA #4074: Track if QP state changes are already applied */ + + u64 ipoib_tx_csum_l3; + u64 ipoib_tx_csum_l4; + u64 ipoib_rx_csum_l3_ok; + u64 ipoib_rx_csum_l3_err; + u64 ipoib_rx_csum_l4_ok; + u64 ipoib_rx_csum_l4_err; + u64 ipoib_tx_lso_pkt; + u64 ipoib_tx_lso_bytes; +}; + + +/* Definition of PSIF EPSA tunneling QP using IB_QPT_RESERVED1 */ +#define IB_QPT_EPSA_TUNNELING IB_QPT_RESERVED1 + +/* Command used to invalidate a collect buffer by writing to offset 0xff8 */ +#define PSIF_WR_CANCEL_CMD_BE 0xff00000000000000ULL + +/* HEURISTIC BITS used for TX/RX direction. 
*/ +#define HEUR_RX_DIRECTION (~1ULL) +#define HEUR_TX_DIRECTION (1ULL) + +static inline bool supports_offload(struct sif_qp *qp) +{ + return qp->flags & (SIF_QPF_EOIB | SIF_QPF_IPOIB); +} + +static inline int psif_supported_trans(enum psif_qp_trans type) +{ + return type != PSIF_QP_TRANSPORT_RSVD1; +} + +static inline bool is_regular_qp(struct sif_qp *qp) +{ + return (qp->type != PSIF_QP_TRANSPORT_MANSP1 && + qp->type != PSIF_QP_TRANSPORT_XRC); +} + +static inline bool is_epsa_tunneling_qp(enum ib_qp_type type) +{ + return type == IB_QPT_EPSA_TUNNELING; +} + +static inline struct sif_qp *to_sqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct sif_qp, ibqp); +} + +struct sif_qp *create_qp(struct sif_dev *sdev, + struct ib_qp_init_attr *init_attr, + struct sif_qp_init_attr *sif_attr); + +int destroy_qp(struct sif_dev *sdev, struct sif_qp *qp); + + +int modify_qp(struct sif_dev *sdev, struct sif_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask, + bool fail_on_same_state, struct ib_udata *udata); + +enum ib_qp_state get_qp_state(struct sif_qp *qp); + +/* Line printers for debugfs files */ +void sif_dfs_print_qp(struct seq_file *s, struct sif_dev *sdev, loff_t pos); +void sif_dfs_print_ipoffload(struct seq_file *s, struct sif_dev *sdev, loff_t pos); + +/* SIF specific type of handling of a modify QP operation: + * + */ +enum sif_mqp_type { + SIF_MQP_ERR, /* Illegal transition */ + SIF_MQP_SW, /* Software handled transition */ + SIF_MQP_HW, /* Hardware handled transition */ + SIF_MQP_IGN, /* Silently ignored transition req */ + SIF_MQP_MAX +}; + +u64 sif_qqp_dma_addr(struct sif_dev *sdev, struct sif_qp *qps); + +/* Internal query qp implementation - stores a host order query qp state in lqqp */ +int epsc_query_qp(struct sif_qp *qp, struct psif_query_qp *lqqp); + +/* EPSC configuration to forward PMA responses to the remapped qp_idx */ +int notify_epsc_pma_qp(struct sif_dev *sdev, int qp_idx, short port); + +enum sif_mqp_type sif_modify_qp_is_ok( + struct sif_qp *qp, + enum ib_qp_state cur_state, + enum ib_qp_state next_state, + enum ib_qp_attr_mask mask +); + +static inline enum psif_mbox_type proxy_to_mbox(enum sif_proxy_type proxy) +{ + switch (proxy) { + case SIFPX_EPSA_1: + return MBOX_EPSA0; + case SIFPX_EPSA_2: + return MBOX_EPSA1; + case SIFPX_EPSA_3: + return MBOX_EPSA2; + case SIFPX_EPSA_4: + return MBOX_EPSA3; + default: + break; + } + return (enum psif_mbox_type) -1; +} + +int modify_qp_hw_wa_qp_retry(struct sif_dev *sdev, struct sif_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask); + +static inline bool has_rq(struct sif_qp *qp) +{ + return qp->rq_idx >= 0; +} + +bool has_srq(struct sif_dev *sdev, struct sif_qp *qp); + +static inline bool ib_legal_path_mtu(enum ib_mtu mtu) +{ + return (mtu >= IB_MTU_256) && (mtu <= IB_MTU_4096); +} + + +#endif diff --git a/drivers/infiniband/hw/sif/sif_query.c b/drivers/infiniband/hw/sif/sif_query.c new file mode 100644 index 000000000000..dcb03e39b0d5 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_query.c @@ -0,0 +1,384 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_query.c: SIF implementation of some of IB query APIs + */ +#include +#include +#include +#include "sif_dev.h" +#include "sif_query.h" +#include "sif_defs.h" +#include "sif_qp.h" + +int epsc_query_device(struct sif_dev *sdev, struct psif_epsc_device_attr *ldev) +{ + int ret; + struct psif_epsc_csr_rsp cqe; + struct psif_epsc_csr_req req; + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + + memset(&req, 0, sizeof(req)); + /* MMU context nil - passthrough */ + req.opcode = EPSC_QUERY_DEVICE; + req.u.query_hw.address = + (u64)es->data_dma_hdl + offsetof(struct sif_epsc_data, dev); + req.u.query_hw.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx; + ret = sif_epsc_wr(sdev, &req, &cqe); + + /* Copy data irrespective of how the EPSC operation went */ + if (eps_version_ge(es, 0, 31)) + copy_conv_to_sw(ldev, &es->data->dev, sizeof(*ldev)); + else + memcpy(ldev, &es->data->dev, sizeof(*ldev)); + + return ret; +} + +int sif_query_device(struct ib_device *ibdev, struct ib_device_attr *props) +{ + int ret; + struct sif_dev *sdev = to_sdev(ibdev); + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + struct psif_epsc_device_attr ldev; + + ret = epsc_query_device(sdev, &ldev); + if (ret) + return ret; + + memset(props, 0, sizeof(*props)); + /* TBD: x.y.z - 16 bit per sublevel - we use x.y.0 for now */ + props->fw_ver = (u64)es->ver.fw_major << 32 | (u64)es->ver.fw_minor << 16; + props->sys_image_guid = cpu_to_be64(ldev.sys_image_guid); + props->max_mr_size = ~0ull; + props->page_size_cap = 0xfffffe00; /* TBD: Sensible value? Use what Mellanox uses */ + props->vendor_id = ldev.vendor_id; + props->vendor_part_id = ldev.vendor_part_id; + props->hw_ver = ldev.hw_ver; + props->max_qp = sdev->ba[qp].entry_cnt; /* TBD: min(ldev.max_qp, sdev->ba[qp].entry_cnt) */ + props->max_qp_wr = min_t(u32, SIF_SW_MAX_SQE, ldev.max_srq_wr); /* Max on _any_ work queue */ + props->device_cap_flags = + IB_DEVICE_BAD_PKEY_CNTR | + IB_DEVICE_BAD_QKEY_CNTR | + IB_DEVICE_AUTO_PATH_MIG | + IB_DEVICE_CURR_QP_STATE_MOD | + IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_UD_IP_CSUM | + IB_DEVICE_UD_TSO | + IB_DEVICE_XRC | + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + + /* returns max_sge SIF_HW_MAX_SEND_SGE -1 for IPoIB connected mode. + */ + props->max_sge = (sif_find_kernel_ulp_caller() == IPOIB_CM_ULP) ? 
+ SIF_HW_MAX_SEND_SGE - 1 : SIF_HW_MAX_SEND_SGE; + props->max_sge_rd = ldev.max_sge_rd; + props->max_cq = sdev->ba[cq_sw].entry_cnt; + props->max_cqe = SIF_SW_MAX_CQE; + /* Make sure we never fill the CQ completely on rev 1-3 - Bug #3657 */ + if (PSIF_REVISION(sdev) <= 3) + props->max_cqe = SIF_SW_MAX_CQE - 1; + props->max_mr = sdev->ba[key].entry_cnt; + props->max_pd = SIF_MAX_PD_INDEX - 1; /* 0 not used, limited by hw field size */ + props->max_qp_rd_atom = ldev.max_qp_rd_atom; + props->max_ee_rd_atom = ldev.max_ee_rd_atom; + props->max_res_rd_atom = props->max_qp_rd_atom * sdev->ba[qp].entry_cnt; + props->max_qp_init_rd_atom = ldev.max_qp_init_rd_atom; + props->max_ee_init_rd_atom = ldev.max_ee_init_rd_atom; + props->atomic_cap = ldev.atomic_cap; + props->max_ee = ldev.max_ee; + props->max_rdd = ldev.max_rdd; + props->max_mw = ldev.max_mw; + props->max_raw_ipv6_qp = min_t(u32, ldev.max_raw_ipv6_qp, props->max_qp); + props->max_raw_ethy_qp = min_t(u32, ldev.max_raw_ethy_qp, props->max_qp); + props->max_mcast_grp = ldev.max_mcast_grp; + props->max_mcast_qp_attach = ldev.max_mcast_qp_attach; + props->max_total_mcast_qp_attach = ldev.max_total_mcast_qp_attach; + props->max_ah = sdev->ba[ah].entry_cnt; + props->max_fmr = props->max_mr; + props->max_map_per_fmr = 0x7ffff000; /* Should be props->max_mr_size but that breaks ibv_devinfo */ + props->max_srq = sdev->ba[rq_hw].entry_cnt; + props->max_srq_wr = ldev.max_srq_wr; + props->max_srq_sge = ldev.max_srq_sge; + props->max_pkeys = ldev.max_pkeys; + props->local_ca_ack_delay = ldev.local_ca_ack_delay; + return ret; +} + + + +static int epsc_query_port(struct sif_dev *sdev, u8 port, struct psif_epsc_port_attr *lpa) +{ + int ret; + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + struct psif_epsc_csr_rsp cqe; + struct psif_epsc_csr_req req; + const u8 psif_port = port - 1; /* sif port index starts at 0 */ + struct psif_epsc_port_attr *ps; + + if (port > 2) { + sif_log(sdev, SIF_INFO, "error: request for port %d while PSIF has only 2 ports", + port); + return -EINVAL; + } + + ps = &es->data->port[psif_port]; + + memset(&req, 0, sizeof(req)); + req.opcode = psif_port == PORT_1 ? 
EPSC_QUERY_PORT_1 : EPSC_QUERY_PORT_2; + req.u.query_hw.address = + (u64)es->data_dma_hdl + offsetof(struct sif_epsc_data, port[psif_port]); + req.u.query_hw.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx; + + ret = sif_epsc_wr(sdev, &req, &cqe); + + /* Copy data irrespective of how the EPSC operation went */ + if (eps_version_ge(es, 0, 31)) + copy_conv_to_sw(lpa, ps, sizeof(*lpa)); + else + memcpy(lpa, ps, sizeof(*lpa)); + + if (!ret) + sif_log(sdev, SIF_VERBS, "port %d lid %d sm_lid %d seq 0x%llx", + port, lpa->lid, lpa->sm_lid, cqe.seq_num); + else + sif_log(sdev, SIF_INFO, "error: port %d seq 0x%llx failed with status %s (ret = %d)", + port, cqe.seq_num, string_enum_psif_epsc_csr_status(cqe.status), + ret); + return ret; +} + +int sif_calc_ipd(struct sif_dev *sdev, u8 port, enum ib_rate static_rate, u8 *ipd) +{ + int path = ib_rate_to_mult(static_rate); + int link, ret; + struct ib_port_attr lpa; + + if (static_rate == IB_RATE_PORT_CURRENT) { + *ipd = 0; + return 0; + } + + if (unlikely(path < 0)) { + sif_log(sdev, SIF_INFO, " Invalid static rate = %x\n", + path); + return -EINVAL; + } + + ret = sif_query_port(&sdev->ib_dev, port, &lpa); + if (unlikely(ret != 0)) { + sif_log(sdev, SIF_INFO, "Failed to query port %u\n", port); + return ret; + } + /* 2^active_width * active_speed */ + link = (1 << lpa.active_width)*lpa.active_speed; + + if (path >= link) + *ipd = 0; + else + *ipd = (link/path)-1; + return 0; +} + + +int sif_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) +{ + int ret; + struct sif_dev *sdev = to_sdev(ibdev); + struct psif_epsc_port_attr lpa; + + ret = epsc_query_port(sdev, port, &lpa); + memset(props, 0, sizeof(*props)); + props->state = lpa.state; + props->max_mtu = IB_MTU_4096; + props->active_mtu = lpa.active_mtu; + props->gid_tbl_len = lpa.gid_tbl_len; + props->port_cap_flags = lpa.port_cap_flags; + props->max_msg_sz = lpa.max_msg_sz; + props->bad_pkey_cntr = lpa.bad_pkey_cntr; + props->qkey_viol_cntr = lpa.qkey_viol_cntr; + props->pkey_tbl_len = lpa.pkey_tbl_len; + props->lid = lpa.lid; + props->sm_lid = lpa.sm_lid; + props->lmc = lpa.lmc; + props->max_vl_num = lpa.max_vl_num; + props->sm_sl = lpa.sm_sl; + props->subnet_timeout = lpa.subnet_timeout; + props->init_type_reply = lpa.init_type_reply; + props->active_width = lpa.active_width; + props->active_speed = lpa.active_speed; + props->phys_state = lpa.phys_state; + + /* Cache values */ + sdev->port[port - 1] = *props; + return ret; +} + +int sif_query_gid(struct ib_device *ibdev, u8 port_num, int index, union ib_gid *gid) +{ + int ret = 0; + ulong log_class = SIF_VERBS; + struct sif_dev *sdev = to_sdev(ibdev); + struct psif_epsc_csr_rsp cqe; + struct psif_epsc_csr_req req; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_QUERY_GID; + req.u.query_table.port = port_num; + req.u.query_table.index = index; + ret = sif_epsc_wr(sdev, &req, &cqe); + if (ret) + return ret; + + /* Apparently clients expect to get GIDs in network byte order + * which requires an extra swap here: + */ + gid->global.subnet_prefix = be64_to_cpu(cqe.data); + gid->global.interface_id = be64_to_cpu(cqe.info); + + if (ret) + log_class = SIF_INFO; + sif_logi(ibdev, log_class, + " port_num %d, GID Table index %d - > %llx.%llx", + port_num, index, gid->global.subnet_prefix, gid->global.interface_id); + return ret; +} + + +int sif_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + int ret = 0; + struct sif_dev *sdev = to_sdev(ibdev); + struct psif_epsc_csr_rsp cqe; + struct psif_epsc_csr_req req; + 
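+	/* Query the EPSC for the pkey at (port, index); the key is returned in the low 16 bits of cqe.data */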
+ memset(&req, 0, sizeof(req)); + req.opcode = EPSC_QUERY_PKEY; + req.u.query_table.port = port; + req.u.query_table.index = index; + ret = sif_epsc_wr(sdev, &req, &cqe); + if (ret) { + sif_log(sdev, SIF_INFO, "port %u index %u: Failed with status %d", port, index, ret); + return ret; + } + *pkey = (u16)cqe.data; + sif_logi(ibdev, SIF_VERBS_V, "port %u index %u -> key 0x%x", + port, index, *pkey); + return ret; +} + + +/* Called from sif_modify_device when IB_DEVICE_MODIFY_EXTENDED is set + * PSIF specific extension bits defined in sif_verbs.h + */ +static int sif_modify_device_extended(struct sif_dev *sdev, struct ib_device_modify *device_modify, + struct psif_epsc_csr_req *req) +{ + struct sif_device_modify *dm = + container_of(device_modify, struct sif_device_modify, ib); + + /* TBD: Simplifying firmware support? */ + sif_log(sdev, SIF_INFO, "uf %d eoib_ctrl %x eoib_data %x (not implemented)", + dm->uf, dm->eoib_ctrl, dm->eoib_data); + return -EOPNOTSUPP; +} + + +int sif_modify_device(struct ib_device *ibdev, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + int ret = 0; + struct sif_dev *sdev = to_sdev(ibdev); + struct psif_epsc_csr_rsp cqe; + struct psif_epsc_csr_req req; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_MODIFY_DEVICE; + if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) { + req.u.device.modify_mask |= PSIF_DEVICE_MODIFY_SYS_IMAGE_GUID; + sif_logi(ibdev, SIF_VERBS, "sys_image_guid = 0x%llx", + device_modify->sys_image_guid); + req.u.device.sys_image_guid = device_modify->sys_image_guid; + } + if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) { + req.u.device.modify_mask |= PSIF_DEVICE_MODIFY_NODE_DESC; + sif_logi(ibdev, SIF_VERBS, "node_desc = %s", + device_modify->node_desc); + strncpy(req.u.device.node_desc, device_modify->node_desc, + ARRAY_SIZE(req.u.device.node_desc)-1); + strncpy(ibdev->node_desc, device_modify->node_desc, + ARRAY_SIZE(ibdev->node_desc)-1); + } + + /** PSIF specific extensions (sif_verbs.h) **/ + if (device_modify_mask & IB_DEVICE_MODIFY_EXTENDED) + ret = sif_modify_device_extended(sdev, device_modify, &req); + + ret = sif_epsc_wr(sdev, &req, &cqe); + if (ret) + sif_log(sdev, SIF_INFO, "Failed with status %d", ret); + return ret; +} + +int sif_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props) +{ + int ret = 0; + struct sif_dev *sdev = to_sdev(ibdev); + struct psif_epsc_csr_rsp cqe; + struct psif_epsc_csr_req req; + + sif_logi(ibdev, SIF_VERBS, + "via eps - port %d mask %x init_type %d, set mask %x, clr mask %x", + port, port_modify_mask, + props->init_type, + props->set_port_cap_mask, + props->clr_port_cap_mask); + + memset(&req, 0, sizeof(req)); + /* TBD: Why both port and different op for port 1 and 2? 
*/ + req.u.port.port = port; + if (port == 1) + req.opcode = EPSC_MODIFY_PORT_1; + else if (port == 2) + req.opcode = EPSC_MODIFY_PORT_2; + else { + /* No such port */ + ret = -EINVAL; + goto out; + } + + /* TBD: Check later on if we can let this mask straight through 1-1 */ + if (port_modify_mask & IB_PORT_SHUTDOWN) + req.u.port.modify_mask |= PSIF_PORT_SHUTDOWN; + if (port_modify_mask & IB_PORT_INIT_TYPE) { + req.u.port.modify_mask |= PSIF_PORT_INIT_TYPE; + req.u.port.init_type = props->init_type; + } + if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR) + req.u.port.modify_mask |= PSIF_PORT_RESET_QKEY_CNTR; + if (port_modify_mask & (1<<4)) + req.u.port.modify_mask |= PSIF_PORT_RESET_PKEY_CNTR; + req.u.port.set_port_cap_mask = props->set_port_cap_mask; + req.u.port.clr_port_cap_mask = props->clr_port_cap_mask; + ret = sif_epsc_wr(sdev, &req, &cqe); + if (ret) + sif_log(sdev, SIF_INFO, "Failed with status %d", ret); +out: + return ret; +} + + diff --git a/drivers/infiniband/hw/sif/sif_query.h b/drivers/infiniband/hw/sif/sif_query.h new file mode 100644 index 000000000000..fc1fe8766e79 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_query.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_query.h: SIF implementation of some of IB query APIs + */ + +#ifndef _SIF_QUERY_H +#define _SIF_QUERY_H +#include "psif_hw_data.h" +#include "sif_epsc.h" +#include "sif_fwa.h" + +/* Max size of firmware version info */ +#define MAX_FW_VERSION_INFO_SZ 4096 + +/* DMA mapped structure to receive query data in + * We only need one of these and we protect user access to + * it with sif_epsc->lock + */ + +struct sif_epsc_data { + struct psif_epsc_device_attr dev; + struct psif_epsc_port_attr port[2]; + struct psif_epsc_log_stat log; + + /* fixed buffer space for special FWA client needs */ + char fw_version[MAX_FW_VERSION_INFO_SZ]; /* Data area for firmware version info */ + char flash[MAX_FWA_NL_PAYLOAD]; /* Data area for flash support */ + char epsc_cli[MAX_FWA_NL_PAYLOAD]; /* Data area for EPSC CLI response*/ + char vimm_agent[MAX_FWA_NL_PAYLOAD]; /* Data area for VIMM agent */ + char log_data_area[0]; /* Data area will be allocated right after this struct */ +}; + +int sif_query_device(struct ib_device *ibdev, struct ib_device_attr *props); + +int sif_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props); +int sif_query_gid(struct ib_device *ibdev, u8 port_num, int index, union ib_gid *gid); +int sif_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey); + +int sif_calc_ipd(struct sif_dev *sdev, u8 port, enum ib_rate static_rate, + u8 *ipd); + +int sif_modify_device(struct ib_device *ibdev, + int device_modify_mask, + struct ib_device_modify *device_modify); + +int sif_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props); + +/* Populate ldev with host endian query_device info requested from the epsc */ +int epsc_query_device(struct sif_dev *sdev, struct psif_epsc_device_attr *ldev); + + +static inline bool epsc_gva_permitted(struct sif_dev *sdev) +{ + /* None of the planned SIBS versions supports GVA2GPA for EPSC mappings */ + return !IS_SIBS(sdev) && sdev->pdev->revision 
!= 2 && !sif_feature(passthrough_query_qp); +} + +static inline bool eps_version_ge(struct sif_eps *es, u16 major, u16 minor) +{ + return EPSC_API_VERSION(es->ver.epsc_major, es->ver.epsc_minor) >= + EPSC_API_VERSION(major, minor); +} + +static inline bool eps_fw_version_ge(struct sif_eps *es, u16 major, u16 minor) +{ + return EPSC_API_VERSION(es->ver.fw_major, es->ver.fw_minor) >= + EPSC_API_VERSION(major, minor); +} + +static inline bool eps_fw_version_lt(struct sif_eps *es, u16 major, u16 minor) +{ + return EPSC_API_VERSION(es->ver.fw_major, es->ver.fw_minor) < + EPSC_API_VERSION(major, minor); +} + + +#endif diff --git a/drivers/infiniband/hw/sif/sif_r3.c b/drivers/infiniband/hw/sif/sif_r3.c new file mode 100644 index 000000000000..0dcd7118b1f6 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_r3.c @@ -0,0 +1,880 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_r3.c: Special handling specific for psif revision 3 and earlier + */ +#include "sif_dev.h" +#include "sif_r3.h" +#include "sif_base.h" +#include "sif_query.h" +#include "sif_qp.h" +#include "sif_ibqp.h" +#include "sif_sndrcv.h" +#include "sif_ibcq.h" +#include "sif_defs.h" +#include "psif_hw_setget.h" + +/* Declared below: */ +static void sif_hw_free_flush_qp(struct sif_dev *sdev); +static int sif_hw_allocate_flush_qp(struct sif_dev *sdev); +static int sif_hw_allocate_dne_qp(struct sif_dev *sdev); +static void sif_hw_free_dne_qp(struct sif_dev *sdev); + +static int outstanding_wqes(struct sif_dev *sdev, struct sif_qp *qp, u16 *head); +static u16 cq_walk_wa4074(struct sif_dev *sdev, struct sif_qp *qp, bool *last_seq_set); +static u16 walk_and_update_cqes(struct sif_dev *sdev, struct sif_qp *qp, u16 head, u16 end); + +int sif_r3_init(struct sif_dev *sdev) +{ + int ret; + bool dne_qp_alloc = false; + + if (eps_fw_version_lt(&sdev->es[sdev->mbox_epsc], 0, 58)) { + ret = sif_hw_allocate_dne_qp(sdev); + if (ret) + return ret; + dne_qp_alloc = true; + } + + /* Init the flush_retry qp lock */ + mutex_init(&sdev->flush_lock); + ret = sif_hw_allocate_flush_qp(sdev); + if (ret) + goto flush_retry_failed; + + return 0; +flush_retry_failed: + if (dne_qp_alloc) + sif_hw_free_dne_qp(sdev); + return ret; +} + + +void sif_r3_deinit(struct sif_dev *sdev) +{ + sif_hw_free_flush_qp(sdev); + if (eps_fw_version_lt(&sdev->es[sdev->mbox_epsc], 0, 58)) + sif_hw_free_dne_qp(sdev); +} + + +static int sif_hw_allocate_dne_qp(struct sif_dev *sdev) +{ + int ret; + u32 idx = sif_alloc_qp_idx(sdev->pd); + struct sif_qp *qp; + struct psif_qp lqp; + struct psif_query_qp lqqp; + + if (idx < 0) { + sif_log(sdev, SIF_INFO, "Unable to reserve QP index for the do-not-evict qp"); + return -ENOMEM; + } + sdev->dne_qp = idx; + qp = get_sif_qp(sdev, idx); + /* Make dfs and query_qp happy: */ + qp->qp_idx = idx; + qp->ibqp.device = &sdev->ib_dev; + qp->ibqp.pd = &sdev->pd->ibpd; + qp->rq_idx = -1; + qp->last_set_state = IB_QPS_RTS; + qp->flags = SIF_QPF_NO_EVICT; + mutex_init(&qp->lock); + + memset(&lqp, 0, sizeof(struct psif_qp)); + + lqp.state.do_not_evict = 1; + lqp.state.timeout_time = 0xffffffffffffULL; /* 48 bits */ + lqp.state.state = PSIF_QP_STATE_RTS; + lqp.state.timer_running = 1; + lqp.state.transport_type = 
PSIF_QP_TRANSPORT_RC; + + /* Write composed entry to shared area */ + copy_conv_to_hw(&qp->d, &lqp, sizeof(struct psif_qp)); + + /* Do a query_qp to make PSIF fill it's cache with it + *- we dont care about the results from the query other than + * that the operation succeeds: + */ + ret = epsc_query_qp(qp, &lqqp); + if (ret) { + sif_log(sdev, SIF_INFO, "query_qp failed with status %d", ret); + return ret; + } + ret = sif_dfs_add_qp(sdev, qp); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to allocate do-not-evict qp, index %d", idx); + return ret; + } + sif_log(sdev, SIF_INFO, "Allocated do-not-evict qp, index %d", idx); + return 0; +} + + + +static void sif_hw_free_dne_qp(struct sif_dev *sdev) +{ + if (sdev->dne_qp) { + /* Modify it to reset via error to flush it out. + * We cannot use destroy_qp since it is not a "fully configured" QP: + */ + struct sif_qp *qp = get_sif_qp(sdev, sdev->dne_qp); + struct ib_qp_attr mod_attr = { + .qp_state = IB_QPS_RESET, + }; + modify_qp_hw_wa_qp_retry(sdev, qp, &mod_attr, IB_QP_STATE); + sif_dfs_remove_qp(qp); + sif_free_qp_idx(sdev->pd, sdev->dne_qp); + sdev->dne_qp = 0; + } +} + + +static int sif_hw_allocate_flush_qp(struct sif_dev *sdev) +{ + int ret = 0; + struct sif_qp *qp = NULL; + struct sif_cq *cq = NULL; + + struct ib_qp_init_attr init_attr = { + .event_handler = NULL, + .srq = NULL, + .cap = { + .max_send_wr = 64, + .max_recv_wr = 64, + .max_send_sge = 1, + .max_recv_sge = 1, + }, + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + }; + + struct sif_qp_init_attr sif_attr = { + .pd = sdev->pd, + .qp_type = ib2sif_qp_type(init_attr.qp_type), + .user_mode = NULL, + .sq_hdl_sz = sizeof(struct sif_sq_hdl), + .qosl = QOSL_LOW_LATENCY, + }; + + enum ib_qp_attr_mask qp_attr_mask = + IB_QP_STATE | + IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS; + + struct ib_qp_attr qp_attr = { + .qp_state = IB_QPS_INIT, + .pkey_index = 0, + .port_num = 1, + .qp_access_flags = + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_ATOMIC, + }; + + struct ib_port_attr lpa; + + /* No QPs when running in limited mode */ + if (sdev->limited_mode) + return 0; + + ret = sif_query_port(&sdev->ib_dev, 1, &lpa); + if (unlikely(ret)) { + sif_log(sdev, SIF_INFO, "Failed to query port 1"); + goto err_query_port; + } + + /* CQ */ + cq = create_cq(sdev->pd, + init_attr.cap.max_send_wr + init_attr.cap.max_recv_wr, + 1, SIFPX_OFF, false); + if (IS_ERR(cq)) { + sif_log(sdev, SIF_INFO, "Failed to create CQ for flush_retry QP"); + return -EINVAL; + } + init_attr.send_cq = &cq->ibcq; + init_attr.recv_cq = &cq->ibcq; + cq->ibcq.device = &sdev->ib_dev; /* Make destroy cq happy */ + + /* QP */ + qp = create_qp(sdev, &init_attr, &sif_attr); + if (IS_ERR(qp)) { + sif_log(sdev, SIF_INFO, "Failed to create flush_retry QP"); + ret = -EINVAL; + goto err_create_qp; + } + + sif_log(sdev, SIF_QP, "Exit: success flush_retry qp 0x%p ib qp %d - real qp %d", + &qp->ibqp, qp->ibqp.qp_num, qp->qp_idx); + + + /* Make query & modify qp happy */ + qp->ibqp.qp_num = qp->qp_idx; + qp->ibqp.device = &sdev->ib_dev; + qp->ibqp.pd = &sdev->pd->ibpd; + qp->ibqp.qp_type = init_attr.qp_type; + qp->type = sif_attr.qp_type; + qp->port = 1; + qp->flags = SIF_QPF_FLUSH_RETRY; + + ret = sif_modify_qp(&qp->ibqp, &qp_attr, qp_attr_mask, NULL); + if (ret) { + sif_log(sdev, SIF_INFO, "modify_qp to init failed with status %d", ret); + goto err_modify_qp; + } + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IB_QPS_RTR; + qp_attr.path_mtu = IB_MTU_2048; + qp_attr.dest_qp_num = 
qp->qp_idx; + qp_attr.rq_psn = 0; + qp_attr.max_dest_rd_atomic = 1; + qp_attr.min_rnr_timer = 1; + qp_attr.ah_attr.dlid = lpa.lid; + qp_attr.ah_attr.port_num = 1; + qp_attr_mask = + IB_QP_STATE | + IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER; + + ret = sif_modify_qp(&qp->ibqp, &qp_attr, qp_attr_mask, NULL); + if (ret) { + sif_log(sdev, SIF_INFO, "modify_qp to RTR failed with status %d", ret); + goto err_modify_qp; + } + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.sq_psn = 0; + qp_attr.timeout = 6; + qp_attr.retry_cnt = 7; + qp_attr.rnr_retry = 7; + qp_attr.max_rd_atomic = 1; + qp_attr_mask = + IB_QP_STATE | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_SQ_PSN | + IB_QP_MAX_QP_RD_ATOMIC; + + ret = sif_modify_qp(&qp->ibqp, &qp_attr, qp_attr_mask, NULL); + if (ret) { + sif_log(sdev, SIF_INFO, "modify_qp to RTS failed with status %d", ret); + goto err_modify_qp; + } + + sdev->flush_qp = qp->qp_idx; + sif_log(sdev, SIF_INFO, "Allocated flush-retry qp, index %d", sdev->flush_qp); + + return ret; + +err_modify_qp: + destroy_qp(sdev, qp); +err_create_qp: + destroy_cq(cq); +err_query_port: + sdev->flush_qp = 0; + sif_log(sdev, SIF_INFO, "Allocated flush-retry qp failed"); + + return ret; +} + +static void sif_hw_free_flush_qp(struct sif_dev *sdev) +{ + struct sif_qp *qp = NULL; + struct sif_sq *sq = NULL; + struct sif_cq *cq = NULL; + + if (sdev->flush_qp) { + qp = get_sif_qp(sdev, sdev->flush_qp); + sq = get_sif_sq(sdev, sdev->flush_qp); + cq = get_sif_cq(sdev, sq->cq_idx); + + destroy_qp(sdev, qp); + destroy_cq(cq); + sdev->flush_qp = 0; + + sif_log(sdev, SIF_QP, "destroy_qp %d success", qp->qp_idx); + } +} + +void sif_r3_recreate_flush_qp(struct sif_dev *sdev) +{ + /* For simplicity we just destroy the old + * and allocate a new flush_retry qp. 
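+	 * flush_lock serializes this with reset_qp_flush_retry().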
+ */ + mutex_lock(&sdev->flush_lock); + sif_hw_free_flush_qp(sdev); + sif_hw_allocate_flush_qp(sdev); + mutex_unlock(&sdev->flush_lock); +} + +int reset_qp_flush_retry(struct sif_dev *sdev) +{ + struct sif_qp *qp = NULL; + struct psif_query_qp lqqp; + + struct ib_send_wr *sbad_wr; + struct ib_send_wr snd_wr = { + .wr_id = 0x1, + .sg_list = NULL, + .opcode = IB_WR_SEND, + .num_sge = 0, /* ZERO byte */ + .next = NULL, + }; + struct ib_recv_wr *rbad_wr; + struct ib_recv_wr rcv_wr = { + .wr_id = 0x2, + .sg_list = NULL, + .next = NULL, + .num_sge = 0, + }; + + struct sif_rq *rq = NULL; + struct sif_cq *cq = NULL; + + int ret = 0; + int rte, rtc; + int count; + unsigned long timeout = sdev->min_resp_ticks; + unsigned long timeout_real; + + /* Get access to the flush_retry QP */ + mutex_lock(&sdev->flush_lock); + + if (!sdev->flush_qp) { + sif_log(sdev, SIF_INFO, "special handling WA_3713 failed: flush_qp does not exist"); + ret = -EINVAL; + goto err_flush_qp; + } + + qp = get_sif_qp(sdev, sdev->flush_qp); + + /* Query flush_retry QP */ + ret = epsc_query_qp(qp, &lqqp); + if (ret) { + sif_log(sdev, SIF_INFO, "epsc_query_qp failed with status %d", ret); + goto fail; + } + + /* Store retry_tag_err and retry_tag_committed */ + rte = lqqp.qp.retry_tag_err; + rtc = lqqp.qp.retry_tag_committed; + + /* Post one zero byte send */ + ret = sif_post_send(&qp->ibqp, &snd_wr, &sbad_wr); + if (ret) { + sif_log(sdev, SIF_INFO, "sif_post_send failed with status %d", ret); + goto fail; + } + + timeout_real = jiffies + timeout; + while (rte == lqqp.qp.retry_tag_err || rtc == lqqp.qp.retry_tag_committed) { + if (time_is_after_jiffies(timeout_real)) { + cond_resched(); + ret = epsc_query_qp(qp, &lqqp); + if (ret) { + sif_log(sdev, SIF_INFO, "epsc_query_qp failed with status %d", ret); + goto fail; + } + } else { + sif_log(sdev, SIF_INFO, "Timeout waiting for flush retry"); + ret = -ETIMEDOUT; + goto fail; + } + } + + /* Post an RQE to the RQ */ + ret = sif_post_recv(&qp->ibqp, &rcv_wr, &rbad_wr); + if (ret) { + sif_log(sdev, SIF_INFO, "sif_post_recv failed with status %d", ret); + goto fail; + } + + /* Poll out the completions of the CQ */ + rq = get_sif_rq(sdev, qp->rq_idx); + cq = get_sif_cq(sdev, rq->cq_idx); + + count = 0; + timeout_real = jiffies + timeout; + while (count < 2) { + struct ib_wc wcs[2]; + int sts = sif_poll_cq(&cq->ibcq, 2, wcs); + + if (sts < 0) { + sif_log(sdev, SIF_INFO, "sif_poll_cq failed with status %d", sts); + ret = sts; + goto fail; + } else + count += sts; + + if (time_is_after_jiffies(timeout_real)) + cond_resched(); + else { + sif_log(sdev, SIF_INFO, "Timeout waiting for completions"); + for (sts = 0; sts < count; sts++) + sif_log(sdev, SIF_INFO, "wr_id %lld status %d opcode %d", + wcs[sts].wr_id, wcs[sts].status, wcs[sts].opcode); + goto fail; + } + } + + mutex_unlock(&sdev->flush_lock); + return ret; +fail: + sif_hw_free_flush_qp(sdev); + sif_hw_allocate_flush_qp(sdev); + mutex_unlock(&sdev->flush_lock); + return ret; + +err_flush_qp: + mutex_unlock(&sdev->flush_lock); + return ret; +} + +static int outstanding_wqes(struct sif_dev *sdev, struct sif_qp *qp, u16 *head) +{ + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx); + struct psif_query_qp lqqp; + int ret = 0; + + ret = epsc_query_qp(qp, &lqqp); + if (ret) { + sif_log(sdev, SIF_INFO, "epsc_query_qp failed with status %d", ret); + return ret; + } + if (head) + *head = lqqp.qp.retry_sq_seq; + + return sq_length(sq, lqqp.qp.retry_sq_seq, sq_sw->last_seq); +} + +int 
pre_process_wa4074(struct sif_dev *sdev, struct sif_qp *qp) +{ + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct psif_sq_entry *sqe; + u16 head; + int len; + + len = outstanding_wqes(sdev, qp, &head); + if (len <= 0) + return -1; + + while (len) { + head++; + sqe = get_sq_entry(sq, head); + set_psif_wr__checksum(&sqe->wr, 0); + len--; + } + return 0; +} + +/* QP is in RESET state, its now safe to do a cq_walk and + * flush any completions. + */ +int post_process_wa4074(struct sif_dev *sdev, struct sif_qp *qp) +{ + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx); + struct psif_query_qp lqqp; + bool last_seq_set = false; + u16 last_seq, fence_seq; + DECLARE_SIF_CQE_POLL(sdev, lcqe); + int ret = 0; + bool need_gen_fence_completion = true; + struct sif_cq *cq = (sq && sq->cq_idx >= 0) ? get_sif_cq(sdev, sq->cq_idx) : NULL; + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index); + + + /* if flush SQ is in progress, set FLUSH_SQ_IN_FLIGHT. + */ + if (test_bit(FLUSH_SQ_IN_PROGRESS, &sq_sw->flags)) { + set_bit(FLUSH_SQ_IN_FLIGHT, &sq_sw->flags); + return ret; + } + + if (test_and_set_bit(FLUSH_SQ_IN_PROGRESS, &sq_sw->flags)) { + set_bit(FLUSH_SQ_IN_FLIGHT, &sq_sw->flags); + return ret; + } + + if ((sq_sw->last_seq - sq_sw->head_seq) == 0) + goto err_post_wa4074; + + /* if SQ has been flushed before, continue to generate + * the remaining completions. + */ + if (test_and_set_bit(FLUSH_SQ_FIRST_TIME, &sq_sw->flags)) { + sif_log(sdev, SIF_WCE_V, "flush sq not the first time"); + last_seq = sq_sw->trusted_seq; + goto flush_sq_again; + } + + ret = epsc_query_qp(qp, &lqqp); + if (ret) { + sif_log(sdev, SIF_INFO, "epsc_query_qp failed, ret %d", ret); + goto err_post_wa4074; + } + + last_seq = sq_sw->last_seq; + + set_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags); + + sif_log(sdev, SIF_WCE_V, "sq_retry_seq %x sq_seq %x last_seq %x head_seq %x", + lqqp.qp.retry_sq_seq, lqqp.qp.sq_seq, sq_sw->last_seq, sq_sw->head_seq); + + /* need_gen_fence_completion is used to flush any cqes in the pipeline. + * If this is a good case, no fence completion is needed. + * Proceed directly to walk and update the CQE. The good case + * is only true if retry_tag_committed == retry_tag_err && + * retry_sq_seq + 1 == sq_seq && !flush_started. + */ + + need_gen_fence_completion = ((lqqp.qp.retry_tag_committed != lqqp.qp.retry_tag_err) || + (lqqp.qp.retry_sq_seq + 1 != lqqp.qp.sq_seq) || + (lqqp.qp.flush_started)); + + if (need_gen_fence_completion) { + + /* This is just a sequence number that we use to flush any cqes in the pipeline. + * Before walking the CQ, we need to ensure that we receive a cqe with fence_seq. 
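+		 * The fence is generated as a flush CQE carrying head_seq + 1, posted via sif_gen_sq_flush_cqe().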
+ */ + fence_seq = sq_sw->head_seq + 1; + + sif_log(sdev, SIF_WCE_V, "fence_seq %x", + fence_seq); + + /* Completion fence, this also flushes any cqes in pipeline */ + ret = sif_gen_sq_flush_cqe(sdev, sq, fence_seq, qp->qp_idx, false); + if (ret) + sif_log(sdev, SIF_INFO, "sq %d, sif_gen_sq_flush_cqe returned %d", + sq->index, ret); + + if (ret == -EAGAIN) { + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) + goto err_post_wa4074; + + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) + goto err_post_wa4074; + + lcqe.written = false; + } + + /* Generate a sync.completion for us on the PQP */ + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "SQ %d, gen_pqp_cqe ret %d", sq->index, ret); + goto err_post_wa4074; + } + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "SQ %d, poll_cq_waitfor failed, ret %d", + sq->index, ret); + goto err_post_wa4074; + } + + last_seq = cq_walk_wa4074(sdev, qp, &last_seq_set); + + if (!last_seq_set) { + sif_log(sdev, SIF_INFO, "failed to generate a completion to cq"); + goto err_post_wa4074; + } + + if (last_seq != fence_seq) { + sif_log(sdev, SIF_INFO, "last seq (%x) is different than fenced completion (%x)!", + last_seq, fence_seq); + /* As the Fenced completion cannot be guaranteed to be the last, software still needs to + * walk and update the CQ to avoid unexpected completion/duplicated completion + * even thought the last completion is the CQ is not generated fenced completion. + */ + } + + sif_log(sdev, SIF_WCE_V, "after: sq_retry_seq %x sq_seq %x last_seq %x head_seq %x", + lqqp.qp.retry_sq_seq, lqqp.qp.sq_seq, sq_sw->last_seq, sq_sw->head_seq); + + } + last_seq = walk_and_update_cqes(sdev, qp, sq_sw->head_seq + 1, sq_sw->last_seq); + sq_sw->trusted_seq = last_seq; + + clear_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags); + + if (GREATER_16(last_seq, sq_sw->last_seq)) { + sif_log(sdev, SIF_WCE_V, "last seq %x > sq_sw->last_seq %x\n", last_seq, sq_sw->last_seq); + if (!(qp->flags & SIF_QPF_USER_MODE) && (cq->ibcq.comp_handler)) { + if (atomic_add_unless(&cq->refcnt, 1, 0)) { + sif_log(sdev, SIF_WCE_V, "need to generate an event to cq %d\n", cq->index); + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + if (atomic_dec_and_test(&cq->refcnt)) + complete(&cq->cleanup_ok); + } + } + goto check_in_flight_and_return; + } + + sif_log(sdev, SIF_WCE_V, "generate completion from %x to %x", + last_seq, sq_sw->last_seq); +flush_sq_again: + for (; (!GREATER_16(last_seq, sq_sw->last_seq)); ++last_seq) { + sif_log(sdev, SIF_WCE_V, "generate completion %x", + last_seq); + + ret = sif_gen_sq_flush_cqe(sdev, sq, last_seq, qp->qp_idx, true); + if (ret) + sif_log(sdev, SIF_INFO, + "sq %d, last_seq %x, sif_gen_sq_flush_cqe returned %d", + sq->index, last_seq, ret); + + if (ret == -EAGAIN) { + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) + goto err_post_wa4074; + + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) + goto err_post_wa4074; + + lcqe.written = false; + continue; + } + + if (ret < 0) + goto err_post_wa4074; + } + + /* Generate a sync.completion for us on the PQP itself + * to allow us to wait for the whole to complete: + */ + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "SQ %d, gen_pqp_cqe ret %d", sq->index, ret); + goto err_post_wa4074; + } + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "SQ %d, poll_cq_waitfor failed, ret %d", + sq->index, ret); + goto err_post_wa4074; + } + + sif_log(sdev, SIF_INFO_V, "SQ %d: recv'd completion on cq %d seq 0x%x - done, ret %d", + sq->index, sq->cq_idx, 
lcqe.cqe.seq_num, ret); + sq_sw->trusted_seq = last_seq; + +check_in_flight_and_return: + if (test_and_clear_bit(FLUSH_SQ_IN_FLIGHT, &sq_sw->flags)) { + sif_log(sdev, SIF_WCE_V, "in-flight:generate completion from %x to %x", + last_seq, sq_sw->last_seq); + goto flush_sq_again; + } + +err_post_wa4074: + clear_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags); + clear_bit(FLUSH_SQ_IN_FLIGHT, &sq_sw->flags); + clear_bit(FLUSH_SQ_IN_PROGRESS, &sq_sw->flags); + qp->flush_sq_done_wa4074 = true; + return ret = ret > 0 ? 0 : ret; +} + +/* This is called from teardown (user modify QP->ERR) as well as + * any subsequent WQEs posted to SQ. + */ +int sq_flush_wa4074(struct sif_dev *sdev, struct sif_qp *qp) +{ + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx); + struct sif_cq *cq = (sq && sq->cq_idx >= 0) ? get_sif_cq(sdev, sq->cq_idx) : NULL; + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index); + u16 last_seq; + int flushed = 0; + DECLARE_SIF_CQE_POLL(sdev, lcqe); + int ret = 0; + + sif_log(sdev, SIF_INFO_V, "last_seq %x head_seq %x", + sq_sw->last_seq, sq_sw->head_seq); + + set_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags); + + last_seq = walk_and_update_cqes(sdev, qp, sq_sw->head_seq + 1, sq_sw->last_seq); + + clear_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags); + + if (last_seq > sq_sw->last_seq) + goto err_sq_flush; + + for (; last_seq <= sq_sw->last_seq; ++last_seq) { + + ret = sif_gen_sq_flush_cqe(sdev, sq, last_seq, qp->qp_idx, true); + if (ret) + sif_log(sdev, SIF_INFO, + "sq %d, last_seq %x, sif_gen_sq_flush_cqe returned %d", + sq->index, last_seq, ret); + + if (ret == -EAGAIN) { + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) + goto err_sq_flush; + + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) + goto err_sq_flush; + + lcqe.written = false; + continue; + } + + if (ret < 0) + goto err_sq_flush; + ++flushed; + } + + /* Generate a sync.completion for us on the PQP itself + * to allow us to wait for the whole to complete: + */ + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "SQ %d, gen_pqp_cqe ret %d", sq->index, ret); + goto err_sq_flush; + } + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "SQ %d, poll_cq_waitfor failed, ret %d", + sq->index, ret); + goto err_sq_flush; + } + + sif_log(sdev, SIF_INFO_V, "SQ %d: recv'd completion on cq %d seq 0x%x - done, ret %d", + sq->index, sq->cq_idx, lcqe.cqe.seq_num, ret); + +err_sq_flush: + return ret = ret > 0 ? 0 : ret; +} + +/* Walk the CQ, update the cqe from head to end and return the last_seq */ +static u16 walk_and_update_cqes(struct sif_dev *sdev, struct sif_qp *qp, u16 head, u16 end) +{ + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct sif_cq *cq = (sq && sq->cq_idx >= 0) ? 
get_sif_cq(sdev, sq->cq_idx) : NULL; + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index); + volatile struct psif_cq_entry *cqe; + u16 last_seq = 0, updated_seq; + u32 seqno, polled_value; + unsigned long flags = 0; + int n = 0; + + updated_seq = head; + last_seq = head; + + spin_lock_irqsave(&cq->lock, flags); + + for (seqno = cq_sw->next_seq;; ++seqno) { + struct psif_cq_entry lcqe; + + cqe = get_cq_entry(cq, seqno); + polled_value = get_psif_cq_entry__seq_num(cqe); + + if (seqno != polled_value) + break; + + if (get_psif_cq_entry__qp(cqe) != qp->qp_idx) + continue; + + copy_conv_to_sw(&lcqe, cqe, sizeof(lcqe)); + + if (!(lcqe.opcode & IB_WC_RECV)) { + last_seq = lcqe.wc_id.sq_id.sq_seq_num; + sif_log(sdev, SIF_WCE_V, "last_seq %x updated_seq %x lcqe.seq_num %x", + last_seq, updated_seq, lcqe.seq_num); + if (last_seq != updated_seq) { + lcqe.wc_id.sq_id.sq_seq_num = updated_seq; + if (GREATER_16(updated_seq, end)) { + /* A scenario might be that an additional CQE + * must be generated to flush all the HW + * generated completions. Thus, igore the polling the cqe. + */ + lcqe.seq_num = ~lcqe.seq_num; + sif_log(sdev, SIF_WCE_V, "corrupt: lcqe.seq_num %x", + lcqe.seq_num); + set_bit(CQ_POLLING_IGNORED_SEQ, &cq_sw->flags); + } + copy_conv_to_hw(cqe, &lcqe, sizeof(lcqe)); + } + if (!GREATER_16(updated_seq, end)) + updated_seq++; + ++n; + } + } + sif_log(sdev, SIF_WCE_V, "sq/cq %d/%d: %d entries not being pulled yet", + sq->index, cq->index, n); + + spin_unlock_irqrestore(&cq->lock, flags); + return updated_seq; +} + +/* Walk the CQ and return the last completed sq_seq */ +static u16 cq_walk_wa4074(struct sif_dev *sdev, struct sif_qp *qp, bool *last_seq_set) +{ + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct sif_cq *cq = (sq && sq->cq_idx >= 0) ? get_sif_cq(sdev, sq->cq_idx) : NULL; + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index); + volatile struct psif_cq_entry *cqe; + u32 seqno, polled_value; + unsigned long flags = 0; + u16 last_seq = 0, prev_seq = 0; + bool prev_seq_set = false; + int n = 0; + + spin_lock_irqsave(&cq->lock, flags); + + for (seqno = cq_sw->next_seq;; ++seqno) { + struct psif_cq_entry lcqe; + + cqe = get_cq_entry(cq, seqno); + polled_value = get_psif_cq_entry__seq_num(cqe); + + if (seqno != polled_value) + break; + + if (get_psif_cq_entry__qp(cqe) != qp->qp_idx) + continue; + + copy_conv_to_sw(&lcqe, cqe, sizeof(lcqe)); + + if (!(lcqe.opcode & IB_WC_RECV)) { + last_seq = lcqe.wc_id.sq_id.sq_seq_num; + + if (!(*last_seq_set)) + *last_seq_set = true; + + if (unlikely(prev_seq_set && prev_seq >= last_seq)) + sif_log(sdev, SIF_INFO_V, + "sq/cq %d/%d: prev sq_seq (0x%x) >= curr sq_seq (0x%x)", + sq->index, cq->index, prev_seq, last_seq); + + prev_seq = last_seq; + if (!(prev_seq_set)) + prev_seq_set = true; + n++; + } + } + sif_log(sdev, SIF_WCE_V, "sq/cq %d/%d: %d entries not being pulled yet", + sq->index, cq->index, n); + + spin_unlock_irqrestore(&cq->lock, flags); + return last_seq; +} diff --git a/drivers/infiniband/hw/sif/sif_r3.h b/drivers/infiniband/hw/sif/sif_r3.h new file mode 100644 index 000000000000..6fffc755952f --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_r3.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_r3.h: Special handling specific for psif revision 3 and earlier + */ + +#ifndef _SIF_R3_H +#define _SIF_R3_H + +int sif_r3_init(struct sif_dev *sdev); +void sif_r3_deinit(struct sif_dev *sdev); + +/* WA for #3713 */ +int reset_qp_flush_retry(struct sif_dev *sdev); +void sif_r3_recreate_flush_qp(struct sif_dev *sdev); + +/* WA for #4074 */ +int pre_process_wa4074(struct sif_dev *sdev, struct sif_qp *qp); +int post_process_wa4074(struct sif_dev *sdev, struct sif_qp *qp); +int sq_flush_wa4074(struct sif_dev *sdev, struct sif_qp *qp); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_rq.c b/drivers/infiniband/hw/sif/sif_rq.c new file mode 100644 index 000000000000..da406db58711 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_rq.c @@ -0,0 +1,545 @@ +/* + * Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_rq.c: Implementation of sif receive queues + */ + +#include +#include "sif_dev.h" +#include "psif_hw_data.h" +#include "psif_hw_setget.h" +#include "sif_dma.h" +#include "sif_rq.h" +#include "sif_xrc.h" +#include "sif_base.h" +#include "sif_defs.h" +#include + +int poll_wait_for_rq_writeback(struct sif_dev *sdev, struct sif_rq *rq) +{ + unsigned long timeout = sdev->min_resp_ticks; + unsigned long timeout_real = jiffies + timeout; + u8 valid; + + sif_log(sdev, SIF_RQ, "enter rq %d", rq->index); + do { + /* Make sure the update from hw is observed in correct order */ + smp_rmb(); + valid = get_psif_rq_hw__valid(&rq->d); + + if (!valid) + break; + + if (time_is_after_jiffies(timeout_real)) + cpu_relax(); + else { + sif_log(sdev, SIF_INFO, + "Timeout waiting for write back for RQ %d - still valid", + rq->index); + return -ETIMEDOUT; + } + } while (true); + + sif_log(sdev, SIF_RQ, "exit - write-back observed on rq %d", rq->index); + return 0; +} + +int alloc_rq(struct sif_dev *sdev, struct sif_pd *pd, + u32 entries, u32 sg_entries, + struct ib_srq_init_attr *srq_init_attr, + bool user_mode) +{ + int ret = 0; + bool mark_dirty = false; + /* Access to receive queue descriptor elements */ + struct sif_rq *rq; + struct sif_rq_sw *rq_sw; + volatile struct psif_rq_hw *rq_hw_p; + struct psif_rq_sw lrq_sw; + struct psif_xrq_hw lrq_hw; + int extent_log2; + struct psif_rq_entry rqe; /* Receive queue element for size calc only */ + u32 max_entries; + u32 entries_log2; + int rq_idx; + u64 alloc_sz; + + max_entries = roundup_pow_of_two(entries); + entries_log2 = order_base_2(max_entries); + + /* Meaningless with 0 sge */ + if (!sg_entries) + sg_entries = 1; + if (sg_entries > 16) { + sif_log(sdev, SIF_INFO, + "requested %d but sif only supports 16 receive sg entries", + sg_entries); + return -ENOMEM; + } + + /* Max supporter nmbr of RQ WRs are 2^14 - 1 */ + if (entries > 0x3fff) { + sif_log(sdev, SIF_INFO, + "requested %d entries, but sif only supports %d", + entries, 0x3fff); + return -ENFILE; /* 4 bit size_log2 field in rqs but highest value not supported (#2965) */ + } + + rq_idx = sif_alloc_rq_hw_idx(pd); + + if (rq_idx < 0) { + sif_log(sdev, SIF_INFO, + "unable to allocate a receive queue, consider increasing rq_size"); + ret = -ENOMEM; + return ret; + } + rq = get_sif_rq(sdev, 
rq_idx); + + /* Make sure the RQ is sofware owned: */ + ret = poll_wait_for_rq_writeback(sdev, rq); + if (ret) { + mark_dirty = true; + goto err_alloc; + } + rq->index = rq_idx; + rq->pd = pd; + + rq_hw_p = &rq->d; + rq_sw = get_sif_rq_sw(sdev, rq_idx); + + /* Initialize driver/user space state within sw extent */ + atomic_set(&rq_sw->length, 0); + rq_sw->next_seq = 0; + + rq->entries = max_entries; + /* Ref. #2965 */ + rq->entries_user = (entries_log2 == 0xe ? max_entries - 1 : max_entries); + rq->mask = max_entries - 1; + rq->extent = + roundup_pow_of_two(sizeof(rqe.rqe_id) + + sizeof(struct psif_rq_scatter) * sg_entries); + + /* Now recalculate sge space from the extent to offer any extra room "for free" */ + sg_entries = min((rq->extent - sizeof(rqe.rqe_id)) / sizeof(struct psif_rq_scatter), 16UL); + extent_log2 = order_base_2(rq->extent); + alloc_sz = max_entries * rq->extent; + + /* Only whole pages must be exposed to user space */ + if (user_mode && (alloc_sz & ~PAGE_MASK)) + alloc_sz = (alloc_sz + PAGE_SIZE) & PAGE_MASK; + rq->user_mode = user_mode; + + sif_log(sdev, SIF_QP, "RQ:sw 0x%p, hw 0x%p entries %d index %d extent %d max sge %d", + rq_sw, rq_hw_p, rq->entries, rq_idx, rq->extent, sg_entries); + + if (alloc_sz <= SIF_MAX_CONT) + rq->mem = sif_mem_create_dmacont(sdev, alloc_sz, GFP_KERNEL | __GFP_ZERO, DMA_BIDIRECTIONAL); + else + rq->mem = sif_mem_create(sdev, alloc_sz >> PMD_SHIFT, + alloc_sz, SIFMT_2M, GFP_KERNEL | __GFP_ZERO, DMA_BIDIRECTIONAL); + if (!rq->mem) { + sif_log(sdev, SIF_INFO, "Failed RQ buffer pool allocation!"); + ret = -ENOMEM; + goto err_alloc; + } + + rq->sg_entries = sg_entries; + atomic_set(&rq->refcnt, 1); + + /* Initialize hw part of descriptor */ + memset(&lrq_hw, 0, sizeof(lrq_hw)); + + /* For normal RQs we use the valid bit as follows: + * + * - If the QP is in RESET state, the RQ is invalid. + * - The RQ is set to valid as part of transitioning to INIT. + * - The RQ is still valid when the QP is in ERROR state + * - A modify to RESET resets the valid bit again. + */ + + lrq_hw.size_log2 = entries_log2; + lrq_hw.prefetch_threshold_log2 = 1; + + /* scatter = 0 means a single entry etc. 
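+	 * For instance, an RQ configured with the maximum of sg_entries = 16
+	 * is encoded as scatter = 15 (see the assignment just below).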
*/ + lrq_hw.scatter = rq->sg_entries - 1; + lrq_hw.pd = pd->idx; + + lrq_hw.head_indx = 0; + lrq_hw.base_addr = sif_mem_dma(rq->mem, 0); + lrq_hw.extent_log2 = extent_log2; + + /* Allocate mmu context without wr_access set */ + ret = sif_map_ctx(sdev, &rq->mmu_ctx, rq->mem, lrq_hw.base_addr, + alloc_sz, false); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to set mmu context for rq %d", + rq->index); + goto err_map_ctx; + } + + if (srq_init_attr) { + /* Request for an SRQ */ + lrq_hw.valid = 1; /* SRQs are valid for their entire lifetime */ + lrq_hw.srq = 1; + lrq_hw.srq_lim = srq_init_attr->attr.srq_limit; + rq->is_srq = true; + + if (srq_init_attr->srq_type == IB_SRQT_XRC) { + struct sif_cq *cq = to_scq(srq_init_attr->ext.xrc.cq); + struct sif_xrcd *xrcd = to_sxrcd(srq_init_attr->ext.xrc.xrcd); + ulong flags; + + rq->cq_idx = cq->index; + rq->xrc_domain = lrq_hw.xrc_domain = xrcd->index; + lrq_hw.cqd_id = rq->cq_idx; + spin_lock_irqsave(&cq->lock, flags); + /* We only allow a CQ to be used for one single XSRQ + * This is a violation of the IB standard but one + * that probably should not have practical conseqences: + * See #3521 for details: + */ + if (cq->xsrq) { + sif_log(sdev, SIF_INFO, + "xsrq %d: cq %d already used with xsrq %d - please use another cq for this xsrq", + rq->index, cq->index, cq->xsrq->index); + ret = -EBUSY; + } else + cq->xsrq = rq; + spin_unlock_irqrestore(&cq->lock, flags); + if (ret) + goto err_map_ctx; + } + } + + /* Get the hw mmu context populated by sif_map_ctx */ + lrq_hw.mmu_cntx = rq->mmu_ctx.mctx; + + /* Write network byte order hw copy */ + copy_conv_to_hw(rq_hw_p, &lrq_hw, sizeof(lrq_hw)); + + /* Initialize sw part of descriptor */ + memset(&lrq_sw, 0, sizeof(lrq_sw)); + lrq_sw.tail_indx = rq_sw->next_seq; + + copy_conv_to_hw(&rq_sw->d, &lrq_sw, sizeof(lrq_sw)); + + spin_lock_init(&rq->lock); + + return rq_idx; + +err_map_ctx: + sif_mem_free(rq->mem); +err_alloc: + if (!mark_dirty) + sif_free_rq_hw_idx(pd, rq_idx); + return ret; +} + + +/* Invalidate the RQ cache and flush a desired amount of + * the remaining entries in the given receive queue. + * @target_qp indicates the value of the local_qp field in the generated + * completion. The qp itself would already have been modified to RESET + * to avoid any more traffic; + * + * Workaround #622: PSIF doesn't generate "FLUSHED IN ERROR" completions. + * In order to maintain OFED verbs-programming and IB spec. compatibility, + * RQEs needs to be "flushed in error" when + * - Verbs layer modifies QP to error + * - Hardware sends an async event, after setting the QP in error + * - Poll CQ on IB client(kernel/user) receives an error completion + * (Responder class A & C) with QP set to error + * - More WQEs are posted by IB client(kernel/user) when QP in error + * - QP is destroyed + * + * Note: No locking of the RQ is neccessary as there are multiple trigger points + * for flushing RQEs within OFED verbs model. + */ +int sif_flush_rq(struct sif_dev *sdev, struct sif_rq *rq, struct sif_qp *target_qp, + int max_flushed_in_err) +{ + int len, real_len; + struct sif_rq_sw *rq_sw = get_sif_rq_sw(sdev, rq->index); + int ret = 0; + u32 head, tail; + enum sif_mqp_type mqp_type = SIF_MQP_SW; + DECLARE_SIF_CQE_POLL(sdev, lcqe); + + /* if flush RQ is in progress, set FLUSH_RQ_IN_FLIGHT. 
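+	 * The thread that owns FLUSH_RQ_IN_PROGRESS observes this bit when it
+	 * is done and restarts the flush itself (see the test_and_clear_bit of
+	 * FLUSH_RQ_IN_FLIGHT and the flush_rq_again label further down).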
+ */ + if (test_bit(FLUSH_RQ_IN_PROGRESS, &rq_sw->flags)) { + set_bit(FLUSH_RQ_IN_FLIGHT, &rq_sw->flags); + return ret; + } + + /* if race condition happened while trying to flush RQ, + * set the FLUSH_RQ_IN_FLIGHT, and let the other party does the job. + */ + if (test_and_set_bit(FLUSH_RQ_IN_PROGRESS, &rq_sw->flags)) { + set_bit(FLUSH_RQ_IN_FLIGHT, &rq_sw->flags); + return ret; + } + + if (!sif_feature(disable_rq_flush)) + len = min(max_flushed_in_err, atomic_read(&rq_sw->length)); + else + len = 0; + if (len == 0) + goto error; + + sif_log(sdev, SIF_INFO_V, "flushing %d entries out of %d/%d entries remaining", + len, atomic_read(&rq_sw->length), rq->entries); + + /* Workaround #622 v2 step 1: ModifyQP to RESET + * The QP must be in the RESET state to avoid race condition. + * sif_flush_rq will only be called when the QP is + * in ERROR state. As for now, keeping the same coding style to + * check whether the qp flags SIF_QPF_HW_OWNED is clear. + * If it is clear, it means that the QP is in the shadowed + * software error state (actual hw state is in RESET). + * + * TBD - Should we add new PSIF_QP_STATE_SHADOWED_ERROR state, + * at least to me it is more readable? + */ + mutex_lock(&target_qp->lock); + /* qp lock must be held to make sure not other thread is trying to do modify_qp_hw to RESET */ + mqp_type = sif_modify_qp_is_ok(target_qp, target_qp->last_set_state, IB_QPS_RESET, IB_QP_STATE); + + if (mqp_type == SIF_MQP_HW) { + struct ib_qp_attr attr = { + .qp_state = IB_QPS_ERR + }; + + ret = modify_qp_hw_wa_qp_retry(sdev, target_qp, &attr, IB_QP_STATE); + + if (ret) + sif_log(sdev, SIF_INFO, "qp %d RESET failed, ret %d", + target_qp->qp_idx, ret); + + } + mutex_unlock(&target_qp->lock); + + /* Workaround #622 v2 step 2: Invalidate RQ + * Invalidation of an RQ causes PSIF to flush it's caches for that RQ. + * If PSIF finds the RQ invalid, it will attempt to fetch it. + * It is then required to be valid (otherwise it will be interpreted as an error + * by PSIF (see #2134). So software cannot rely upon the completion of the invalidate + * to signal that the descriptor can be re-used, instead it will have to + * verify by checking the final write-back of the descriptor, which will have + * valid set to 0 by PSIF. In the general case we handle this lazy and check before we + * try to re-use. The request is posted with no completion requested as we + * do not need the completion: + */ + if (!(test_bit(RQ_IS_INVALIDATED, &rq_sw->flags))) { + ret = sif_invalidate_rq_hw(sdev, rq->index, PCM_POST); + if (ret) { + sif_log(sdev, SIF_INFO, + "Invalidate rq_hw failed, status %d", ret); + goto error; + } + set_bit(RQ_IS_INVALIDATED, &rq_sw->flags); + } + + /* Make sure the RQ is sofware owned: */ + ret = poll_wait_for_rq_writeback(sdev, rq); + if (ret) + goto error; + + /* The RQ is now software owned and the (after a successful invalidate) so we + * should be able to trust rq_hw::head_indx - better than scanning the CQ + * for unprocessed elements: + * Note that only the lowest 14 bits of the sequence number in head_indx is + * valid: + */ +flush_rq_again: + head = get_psif_rq_hw__head_indx(&rq->d); + tail = rq_sw->next_seq; + real_len = rq_length(rq, head, tail & ((1 << 14) - 1)) & ((1 << 14) - 1); + + /* Workaround #622 v2 step 3: Check the last completion on the CQ + * The rq_sw->length is used to track the length of a queue + * with #posted - #completed. If the calculated real_len is + * smaller than the len, it means that a completion is missing. 
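+	 * For example: with 8 RQEs posted and 3 completions polled,
+	 * rq_sw->length is 5; if the head/tail arithmetic only accounts for 4
+	 * outstanding entries, one RWQE was consumed by HW without a polled
+	 * completion, and sif_fixup_cqes() below counts the completions still
+	 * pending in the CQ so that only len - nfixup flush-in-error CQEs are
+	 * generated.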
+ * Instead of loooping RQ to find rqe of the completed wc_id, the + * rq_sw->length represents the #posted - #completed, and nfixup + * represents the remaining completions after the QP moved to RESET. + * Thus, the number of flush-in error that must be generated is + * rq_sw->length - nfixup. + */ + if (!(test_bit(FLUSH_RQ_FIRST_TIME, &rq_sw->flags))) { + /* need to use a flag to differentiate between the first call of + * sif_flush_rq or the subsequent call. The race condition where + * HW acquired a RWQE but does not generate a completion can + * only happen at the first call of sif_flush_rq. This is because + * the QP state is moved to RESET. + * Besides, if the generated completion arrived later and + * FLUSH_RQ_IN_FLIGHT is set, the test of real_len < len + * might be true. + */ + len = atomic_read(&rq_sw->length); + if (real_len < len) { + int nfixup; + u32 cq_idx = get_psif_qp_core__rcv_cq_indx(&target_qp->d.state); + struct sif_cq *cq = rq ? get_sif_cq(sdev, cq_idx) : NULL; + + nfixup = sif_fixup_cqes(cq, NULL, target_qp); + sif_log(sdev, SIF_RQ, + "RQ %d: updating calculated entries from %d to %d - %d (%d)", + rq->index, real_len, len, nfixup, len - nfixup); + real_len = len - nfixup; + } + set_bit(FLUSH_RQ_FIRST_TIME, &rq_sw->flags); + } + + /* Now find the actual 32 bit seq.no */ + head = tail - real_len; + + sif_log(sdev, SIF_RQ, + "RQ %d not empty: sz %d, head %d, next_seq %d, %d/%d entries at exit", + rq->index, rq->entries, head, tail, len, real_len); + + if (!real_len) + goto error; + + /* Workaround #622 v2 step 4: generate flush in error completion + * Generate flushed in error completions: + * these give no pqp completions but may in theory fail + */ + while (real_len > 0) { + sif_log(sdev, SIF_PQP, "rq %d, len %d", rq->index, real_len); + ret = sif_gen_rq_flush_cqe(sdev, rq, head, target_qp); + if (ret) + sif_log(sdev, SIF_INFO, "rq %d, len %d, sif_gen_rq_flush_cqe returned %d", + rq->index, real_len, ret); + if (ret == -EAGAIN) { + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) + goto error; + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) + goto error; + lcqe.written = false; + continue; + } + if (ret < 0) + goto error; + real_len--; + head++; + } + + /* Finally generate a sync.completion for us on the PQP itself + * to allow us to wait for the whole to complete: + */ + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "rq %d, cqe %p gen_pqp_cqe returned %d", + rq->index, &lcqe, ret); + goto error; + } + + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "rq %d, cqe %p poll_cq_waitfor returned %d", + rq->index, &lcqe, ret); + goto error; + } + + sif_log(sdev, SIF_INFO_V, "RQ %d: received completion on cq %d seq 0x%x - done", + rq->index, rq->cq_idx, lcqe.cqe.seq_num); + + /* Make sure hardware pointer reflects the flushed situation */ + set_psif_rq_hw__head_indx(&rq->d, head); + wmb(); + + /* if FLUSH_RQ_IN_FLIGHT is set, it means another party is trying to + * flush the rq at the same time. This should be retried + * once as no more than one asynchronous event will be generated if + * QP is in ERROR state. This is to take care of a scenario where + * QP is modified to ERROR explicitly and at the same time received + * the asynchronous event. Nevertheless, the RQ entry changes in between + * of these two scenario that can trigger flush rq. + */ + if (test_and_clear_bit(FLUSH_RQ_IN_FLIGHT, &rq_sw->flags)) + goto flush_rq_again; + +error: + clear_bit(FLUSH_RQ_IN_PROGRESS, &rq_sw->flags); + return ret = ret > 0 ? 
0 : ret; +} + + +int free_rq(struct sif_dev *sdev, int rq_idx) +{ + struct sif_rq *rq; + int stat; + + rq = get_sif_rq(sdev, rq_idx); + sif_log(sdev, SIF_RQ, "entry %d", rq_idx); + + stat = atomic_dec_and_test(&rq->refcnt); + if (!stat) { + sif_log(sdev, SIF_RQ, "rq %d still in use - ref.cnt %d", + rq_idx, atomic_read(&rq->refcnt)); + return -EBUSY; + } + + sif_release_rq(sdev, rq->index); + return 0; +} + + +void sif_release_rq(struct sif_dev *sdev, int index) +{ + struct sif_rq *rq = get_sif_rq(sdev, index); + struct sif_pd *pd = rq->pd; + + if (!pd) { + sif_log(sdev, SIF_INFO, "Internal error: no pd associated with rq %d", index); + return; + } + + sif_unmap_ctx(sdev, &rq->mmu_ctx); + + sif_mem_free(rq->mem); + sif_clear_rq_sw(sdev, index); + + if (!sif_feature(disable_invalidate_rq)) + sif_free_rq_hw_idx(pd, index); +} + +void sif_dfs_print_rq_hw(struct seq_file *s, struct sif_dev *sdev, loff_t pos) +{ + struct sif_rq *rq; + struct sif_rq_sw *rq_sw; + volatile struct psif_rq_hw *rq_hw; + u32 tail, head; + int qlen; + + if (unlikely(pos < 0)) { + seq_puts(s, "# Index head sw_tail entries queue_len nmbr_sge next_seq srq_lim\n"); + return; + } + rq = get_sif_rq(sdev, pos); + rq_hw = &rq->d; + rq_sw = get_sif_rq_sw(sdev, pos); + + head = get_psif_rq_hw__head_indx(rq_hw); + tail = get_psif_rq_sw__tail_indx(&rq_sw->d); + qlen = atomic_read(&rq_sw->length); + + seq_printf(s, "%7llu %5u %8u %8u %9u %8u %8u %7u", pos, + head, tail, rq->entries, qlen, rq->sg_entries, rq_sw->next_seq, rq->srq_limit); + if (rq->is_srq & rq->xrc_domain) + seq_puts(s, "\t[XRC-SRQ]\n"); + else if (rq->is_srq) + seq_puts(s, "\t[SRQ]\n"); + else + seq_puts(s, "\n"); +} diff --git a/drivers/infiniband/hw/sif/sif_rq.h b/drivers/infiniband/hw/sif/sif_rq.h new file mode 100644 index 000000000000..be8bb21fadc2 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_rq.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_rq.h: Interface to sif receive queues + */ + +#ifndef _SIF_RQ_H +#define _SIF_RQ_H + +struct sif_rq { + volatile struct psif_rq_hw d; /* Hardware descriptor */ + struct ib_srq ibsrq ____cacheline_internodealigned_in_smp; /* Only used if this is an SRQ */ + spinlock_t lock ____cacheline_internodealigned_in_smp; + struct sif_mmu_ctx mmu_ctx; + struct sif_pd *pd; /* Ref to owning protection domain */ + int index; + int cq_idx; /* Default compl.queue index to use, if any */ + bool user_mode; /* Set if this is an RQ to be mapped to user space */ + bool is_srq; /* Set if this is a shared receive queue */ + int xrc_domain; /* If != 0: This is an XRC SRQ member of this domain idx */ + atomic_t refcnt; /* Ref.count for usage as a shared receive queue */ + u16 entries; /* Allocated entries */ + u16 entries_user; /* Entries reported to user (entries -1 if max) */ + u32 sg_entries; /* Max receive scatter/gather configured for this rq */ + u32 mask; /* entries - 1 for modulo using & */ + u32 extent; + u16 srq_limit; + struct sif_mem *mem; /* Allocated queue memory */ +}; + +static inline struct sif_rq *to_srq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct sif_rq, ibsrq); +} + +int poll_wait_for_rq_writeback(struct sif_dev *sdev, struct sif_rq *rq); + +/* Allocate a receive queue - if @srq_init_attr is non-nil + * this is a shared receive queue (SRQ) + * A return value >= 0 is the index of the receive queue descriptor allocated + * otherwise it is -errno + */ +int alloc_rq(struct sif_dev *sdev, struct sif_pd *pd, + u32 entries, u32 sge_entries, + struct ib_srq_init_attr *srq_init_attr, + bool user_mode); + +/* Invalidate the RQ cache and flush a desired amount of + * the remaining entries in the given receive queue. + * @target_qp indicates the value of the local_qp field in the generated + * completion but is not interpreted by SIF in any way. + */ +int sif_flush_rq(struct sif_dev *sdev, struct sif_rq *rq, + struct sif_qp *target_qp, int max_flushed_in_err); + +int free_rq(struct sif_dev *sdev, int rq_idx); + +/* Low level callbacks to release memory for these queues + * Called from sif_hiw::handle_invalidate_wc + */ +void sif_release_rq(struct sif_dev *sdev, int index); + +void sif_dfs_print_rq_hw(struct seq_file *s, struct sif_dev *sdev, loff_t pos); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_sndrcv.c b/drivers/infiniband/hw/sif/sif_sndrcv.c new file mode 100644 index 000000000000..c2afdab16da0 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_sndrcv.c @@ -0,0 +1,1152 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_sndrcv.c: Implementation of post send/recv logic for SIF + */ +#include +#include +#include +#include +#include + +#include "sif_dev.h" +#include "sif_query.h" +#include "sif_defs.h" +#include "sif_base.h" +#include "sif_sndrcv.h" +#include "sif_qp.h" +#include "sif_mr.h" +#include "sif_tqp.h" +#include "sif_r3.h" +#include "psif_hw_setget.h" +#include "sif_checksum.h" +#include + + +/* Handle a NULL terminated array of send work requests */ +#define SQS_ACTIVE (get_psif_sq_hw__sq_next(&sq->d) != 0xFFFFFFFF) +int sif_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_qp *qp = to_sqp(ibqp); + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx); + unsigned long flags; + bool doorbell_mode; + bool last; + u16 first_seq; + const int nmbr_wrs_to_bulk_process = 32; + int ret = 0; + int n; + + sif_log(sdev, SIF_SND, "on qp_idx %d wr 0x%p ibv type %d", + qp->qp_idx, wr, wr->opcode); + + if (unlikely(qp->type > PSIF_QP_TRANSPORT_MANSP2)) { + sif_log(sdev, SIF_INFO, "Invalid QP type"); + ret = -EINVAL; + goto err_post_send_unlocked; + } + + if (unlikely(is_epsa_tunneling_qp(ibqp->qp_type))) { + sif_log(sdev, SIF_QP, "epsa tunneling post_send"); + return sif_epsa_tunneling_post_send(ibqp, wr, bad_wr); + } + + /* PSIF does not support SQD. Per IBTA 11.4.1.1, error is only returned + * when the QP is in the RESET, INIT or RTR states. + */ + if (unlikely(qp->last_set_state < IB_QPS_RTS)) { + sif_log(sdev, SIF_INFO, "Invalid QP state - expected RTS(%d) found %d!", + (int)IB_QPS_RTS, qp->last_set_state); + ret = -EINVAL; + goto err_post_send_unlocked; + } + + while (wr) { + /* Workaround #3595: ring doorbell if SQS active */ + doorbell_mode = qp->flags & SIF_QPF_FORCE_SQ_MODE || SQS_ACTIVE; + + /* We need to serialize sends on the same send queue + * so we need to keep sq->lock around it all + */ + spin_lock_irqsave(&sq->lock, flags); + first_seq = sq_sw->last_seq + 1; + for (n = 0; wr && n < nmbr_wrs_to_bulk_process; ++n, wr = wr->next) { + last = !wr->next || n == (nmbr_wrs_to_bulk_process - 1); + ret = sif_post_send_single(ibqp, wr, &doorbell_mode, last, &first_seq); + if (ret < 0) + goto err_post_send; + } + spin_unlock_irqrestore(&sq->lock, flags); + } + + if ((qp->type != PSIF_QP_TRANSPORT_MANSP1) + && (qp->last_set_state == IB_QPS_ERR)) { + ret = 0; + goto flush_sq_wa4074; + } + + + sif_log(sdev, SIF_SND, "Exit: success"); + return 0; + +err_post_send: + spin_unlock_irqrestore(&sq->lock, flags); + +err_post_send_unlocked: + *bad_wr = wr; + +flush_sq_wa4074: + if ((qp->type != PSIF_QP_TRANSPORT_MANSP1) + && (qp->last_set_state == IB_QPS_ERR)) { + if (post_process_wa4074(sdev, qp)) + sif_log(sdev, SIF_INFO, "failed to flush SQ %d", qp->qp_idx); + } + + sif_log(sdev, SIF_SND, "Exit: error %d", ret); + return ret; + +} +#undef SQS_ACTIVE + + +/* The copy_from_user function on x86_64 calls might_fault() to verify that + * it is not called from interrupt context. However with our use case the memory is guaranteed + * to be pinned, so no faults will ever happen. 
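+ * (In this driver the copy may run under sq->lock with interrupts off, see
+ * sif_post_send, which is presumably why the might_fault() check is best
+ * avoided even though no fault can actually occur.)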
+ * + * TBD: Sparc does not define _copy_from_user - just use copy_from _user for now + */ +inline unsigned long sif_copy_from_user(void *to, const void __user *from, unsigned int n) +{ +#ifdef __x86_64__ + return _copy_from_user(to, from, n); +#else + return copy_from_user(to, from, n); +#endif +} + + +static int copy_sg(struct sif_qp *qp, void *dest, u64 vaddr, u32 len) +{ + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + + if (qp->ibqp.uobject) { + unsigned long not_copied; + + sif_log(sdev, SIF_SND, "Copy sg len %d from user addr 0x%llx to %p", + len, vaddr, dest); + not_copied = sif_copy_from_user(dest, (void __user *)vaddr, len); + if (not_copied) { + sif_log(sdev, SIF_INFO, + "copy_from_user: Failed to copy %ld/%d bytes from uaddr %llx", + not_copied, len, vaddr); + return -EFAULT; + } + } else { + sif_log(sdev, SIF_SND, "Copy sge len %d from kernel addr 0x%llx to %p", + len, vaddr, dest); + memcpy(dest, (void *)vaddr, len); + } + return 0; +} + + +/* Copy the first @sg_cnt sg entries of @wr into the inline space + */ + +/* TBD: Consider cleaning up/unrolling this into one copy + * into temp buffer for csumming/cb copy_convert + * and one other plain copy into send queue: + */ +static int prep_inline_part(struct sif_qp *qp, struct ib_send_wr *wr, int sg_cnt, + struct psif_cb *wqe, struct psif_wr_local *la, u32 sqe_seq, + bool is_phys_addr) +{ + int ret; + int wr_len = 0; + struct sif_sq *sq; + struct psif_sq_entry *sqe; + struct psif_key *key; + + /* collect buffer only supports 256 byte inlined, this first part + * of the inline data must be handled in host byte order to + * make sure the checksum gets right: + */ + int cb_len = min_t(int, ((qp->max_inline_data + CB_KICK_MASK) & ~CB_KICK_MASK), CB_LENGTH); + int space = qp->max_inline_data; + int copy = 0; + int remaining = -1; + int i; + u32 len = 0; + u64 addr = 0; + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + + u8 buf[CB_LENGTH]; + u8 *dbuf = buf; + + if (wr->send_flags & IB_SEND_IP_CSUM) { + /* Cannot use collect-buffer for inline data when offloading */ + cb_len = 0; + } + + sq = get_sif_sq(sdev, qp->qp_idx); + sqe = get_sq_entry(sq, sqe_seq); + + sif_log(sdev, SIF_SND, "inline from %d sges, buf at %p sqe at %p", sg_cnt, buf, sqe); + + for (i = 0; i < sg_cnt; ++i) { + if (unlikely(remaining >= 0)) { + /* Switch to copying directly into send queue + * @copy already holds the offset + */ + dbuf = ((u8 *)sqe->payload); + if (remaining > 0) { + addr += len; + len = remaining; + remaining = -1; + goto do_copy; + } else + remaining = -1; + } + len = wr->sg_list[i].length; + addr = wr->sg_list[i].addr; + + if (len > 0) { + u32 lkey = wr->sg_list[i].lkey; + + key = safe_get_key(sdev, lkey); + if (!key || PSIF_DMA_KEY_INVALID == get_psif_key__lkey_state(key)) { + sif_log(sdev, SIF_INFO, + "Attempt to do inline copying from an invalid MR with lkey %d at addr 0x%llx", + lkey, addr); + return -EPERM; + } + } + +do_copy: + wr_len += len; + if (unlikely(dbuf == buf && wr_len >= cb_len)) { + remaining = wr_len - cb_len; + len -= remaining; + wr_len -= remaining; + if (remaining) + i--; /* Run an extra iter to copy remainder */ + } else if (unlikely(copy + len > space)) { + sif_log(sdev, SIF_INFO, + "Inline space exhausted: available %d, copied %d, len %d", + space, copy, len); + return -ENOMEM; + } + if (is_phys_addr) { + u64 *kva = phys_to_virt(addr); + + sif_log(sdev, SIF_SND, + "Phys-addr %llx -> %llx copy %d len %d", + addr, (u64)kva, copy, len); + memcpy((void *)&dbuf[copy], (void *)kva, len); + ret = 0; + } else { + ret = 
copy_sg(qp, &dbuf[copy], addr, len); + } + if (ret < 0) + return ret; + copy += len; + } + + if (buf == dbuf && copy & CB_KICK_MASK) { + /* Pad out the misaligned end data */ + memset(&buf[copy], 0, CB_KICK_ALIGN - (copy & CB_KICK_MASK)); + } + + sif_log(sdev, SIF_QP, "wr_len is %d bytes, cb_len %d bytes", wr_len, cb_len); + if (cb_len > 0) { + /* Convert payload twice to get checksum right. + * The 32 bit version of the checksumming in PSIF does not + * have the property that checksumming of the same data + * on different endian hosts yields the same checksum.. + */ + copy_conv_to_sw(wqe->payload, buf, cb_len); + } + wqe->wr.collect_length = min(wr_len, cb_len); + return wr_len; +} + +static inline int prep_inline(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe, + struct psif_wr_local *la, u32 sqe_seq, + bool is_phys_addr) +{ + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + int wr_len = prep_inline_part(qp, wr, wr->num_sge, wqe, la, sqe_seq, is_phys_addr); + + if (wr_len < 0) + return wr_len; + if (wr_len) { + /* la must point to the start of the payload in the send queue + * to have the whole message available in case of retries: + */ + la->addr = get_sqe_dma(sq, sqe_seq) + offsetof(struct psif_sq_entry, payload); + la->lkey = sq->sg_mr->index; + } + la->length = wr_len; + return wr_len; +} + +/* Helper funcs declared below */ +static void prep_atomic(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe); +static int prep_send(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe, + bool inlined, struct psif_wr_local *la, u32 sqe_idx); +static int prep_send_lso(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe, + bool inlined, struct psif_wr_local *la, u32 sqe_idx); +static int prep_remote_addr(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe); + + +/* Return bypass mode offset or 0 if invalid for post_sends (see below) + * (PSIF will take care of rejecting the post) + */ + +inline u64 mr_uv2dma(struct sif_dev *sdev, int idx) +{ + struct sif_mr *mr = safe_get_sif_mr(sdev, idx); + + if (mr) + return mr->mmu_ctx.uv2dma; + return 0; +} + + +/* + * Handle send of a single wr - can be called from any context. + * + * Use either CB mode or DB mode. In CB mode, wqe is allocated, + * written to SQ, SW pointer updated, and finally the wqe is written + * to the CB. In DB mode, the wqe is allocated and written to the + * SQ. On the last wqe, SW pointer is updated and the doorbell is rung + * with the seq number of the first sqe. 
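+ * A rough example of DB mode with a list of three WRs: if last_seq was 4,
+ * sqes 5, 6 and 7 are built in the SQ, the sw tail index is updated when
+ * the last one is posted, and a single doorbell is then rung with
+ * first_seq = 5.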
+ */ +int sif_post_send_single(struct ib_qp *ibqp, struct ib_send_wr *wr, bool *use_db, bool last, u16 *first_seq) +{ + bool inlined = false; + u64 csum; + struct psif_cb wqe; + struct psif_sq_entry *sqe; + int cb_len = 0; + int cb_len_8 = 0; + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_qp *qp = to_sqp(ibqp); + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + int ret = 0; + u16 head, sq_seq, q_sz; + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx); + bool is_ud = qp->type == PSIF_QP_TRANSPORT_UD; + struct sif_sq_hdl *wh; + + if (wr->num_sge > sq->sg_entries) { + sif_log(sdev, SIF_SND, "attempt to post wr with %d/%d sg entries", + wr->num_sge, sq->sg_entries); + return -EINVAL; + } + + sq_seq = ++sq_sw->last_seq; + head = sq_sw->head_seq; + q_sz = sq_length(sq, head, sq_seq); + + if (q_sz > sq->entries) { + sif_log(sdev, SIF_INFO, + "Send Queue %d full - head %d, tail %d, entries %d, sge_entries %u, sq->user_mode: %s, sq->alloc_sz: %llu", + sq->cq_idx, head, sq_seq, sq->entries, sq->sg_entries, + (sq->user_mode) ? "[yes]" : "[no]", sq->mem->size); + ret = -EAGAIN; + goto fail; + } + + + sqe = get_sq_entry(sq, sq_seq); + + memset(&wqe, 0, sizeof(wqe)); + + wqe.wr.tsu_qosl = qp->qosl; + wqe.wr.eps_tag = qp->eps_tag; + + ret = prep_remote_addr(qp, wr, &wqe); + if (ret) + goto fail; + + if (wr->send_flags & IB_SEND_FENCE) /* RC only */ + wqe.wr.fence = 1; + + if (qp->flags & SIF_QPF_DYNAMIC_MTU) + wqe.wr.dynamic_mtu_enable = 1; + + wqe.wr.completion = sq->complete_all; + if (wr->send_flags & IB_SEND_SIGNALED) + wqe.wr.completion = 1; + + inlined = wr->send_flags & IB_SEND_INLINE; + + if (qp->qp_idx < 4) { + /* Field valid for QP0/1 only */ + wqe.wr.port = qp->port - 1; + + /* and in the work request we must use "real" QP numbers as well */ + wqe.wr.local_qp = qp->qp_idx & 1; + } else + wqe.wr.local_qp = qp->qp_idx; + + if (wr->opcode == IB_WR_SEND_WITH_IMM || + wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { + wqe.wr.imm = cpu_to_be32(wr->ex.imm_data); + } + + /* TBD: only set if wr opcode allows it */ + if (wr->send_flags & IB_SEND_SOLICITED) + wqe.wr.se = 1; + + if (wr->send_flags & IB_SEND_IP_CSUM) { + wqe.wr.l3_checksum_en = 1; + wqe.wr.l4_checksum_en = 1; + qp->ipoib_tx_csum_l3++; + qp->ipoib_tx_csum_l4++; + } + switch (wr->opcode) { + case IB_WR_LSO: + { + struct psif_wr_local *la = &wqe.wr.details.send.ud.local_addr; + + if (!supports_offload(qp)) { + sif_log(sdev, SIF_INFO, + "LSO WR on qp %d which does not support offloading", + qp->qp_idx); + ret = -EINVAL; + goto fail; + } + ret = prep_send_lso(qp, wr, &wqe, inlined, la, sq_seq); + if (ret < 0) + goto fail; + break; + } + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + { + struct psif_wr_local *la = (is_ud ? 
+ &wqe.wr.details.send.ud.local_addr : + &wqe.wr.details.send.uc_rc_xrc.local_addr); + ret = prep_send(qp, wr, &wqe, inlined, la, sq_seq); + if (ret < 0) + goto fail; + break; + } + case IB_WR_RDMA_READ: + /* RDMA READ does not support dynamic MTU */ + wqe.wr.dynamic_mtu_enable = 0; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + { + struct psif_wr_local *la = &wqe.wr.details.rdma.local_addr; + struct psif_wr_remote *ra = &wqe.wr.details.rdma.remote_addr; + + ra->addr = wr->wr.rdma.remote_addr; + ra->rkey = wr->wr.rdma.rkey; + + ret = prep_send(qp, wr, &wqe, inlined, la, sq_seq); + if (ret < 0) + goto fail; + + ra->length = ret; + break; + } + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + prep_atomic(qp, wr, &wqe); + break; + case IB_WR_SEND_WITH_INV: + case IB_WR_RDMA_READ_WITH_INV: + sif_log(sdev, SIF_SND, "Opcode not implemented"); + ret = -EOPNOTSUPP; + goto fail; + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: + { + /* Bug 3844, WA for HW bug 3683 */ + bool masked_atomics_defeatured = PSIF_REVISION(sdev) <= 3; + + if (masked_atomics_defeatured) + sif_log(sdev, SIF_SND, "Opcode not supported"); + else + sif_log(sdev, SIF_SND, "Opcode not yet implemented"); + ret = -EOPNOTSUPP; + goto fail; + } + default: + sif_log(sdev, SIF_SND, "Unsupported opcode"); + ret = -EINVAL; + goto fail; + } + + sif_log(sdev, SIF_SND, + "copied %d bytes inline, num_sgl %d, sqe at %p", + wqe.wr.collect_length, wqe.wr.num_sgl, sqe); + cb_len_8 = sizeof(struct psif_wr) + + ((wqe.wr.collect_length + 7) & ~7); + cb_len = sizeof(struct psif_wr) + + ((wqe.wr.collect_length + CB_KICK_MASK) & ~CB_KICK_MASK); + + wqe.wr.sq_seq = sq_seq; + wqe.wr.tsu_sl = qp->tsl; + + /* Map sqe (repr.by index in sq) to this wr_id */ + wh = get_sq_hdl(sq, sq_seq); + wh->wr_id = wr->wr_id; + wh->sq_seq = sq_seq; + wh->used = true; + + sif_log(sdev, SIF_SND, "wr_id %llx at tail 0x%x sq_seq_num %d%s", + wr->wr_id, sq_seq & sq->mask, wqe.wr.sq_seq, (wqe.wr.completion ? " [req.compl]" : "")); + + /* We can safely checksum any "hole" due to end misalignment + byte swap + * towards the end of the inline data + * as prep_inline has nil'ed these bytes out: + */ + if (qp->nocsum) { + wqe.wr.checksum = qp->magic; + } else { + csum = csum32_partial(&wqe, cb_len_8, qp->magic); + csum = csum32_fold(csum); + wqe.wr.checksum = csum; + } + sif_log(sdev, SIF_SND, "op %s checksum %x cb_len 0x%x", + string_enum_psif_wr_type(wqe.wr.op), + wqe.wr.checksum, cb_len); + sif_logs(SIF_DUMP, write_struct_psif_wr(NULL, 0, &wqe.wr)); + + /* First update send queue (any further inline data beyond cb_len + * has already been copied in prep_inline: + */ + copy_conv_to_hw(sqe, &wqe, cb_len); + + /* A heuristic mechanism to determine the traffic pattern. */ + /* Even though traffic_patterns.mask is being set by handle_wc, no + * lock is used.The reason is that the mask is used to get a "rough" + * idea about the underlying traffic pattern without adding latency + * in the driver. + */ + qp->traffic_patterns.mask = (qp->traffic_patterns.mask << 1) | + HEUR_TX_DIRECTION; + sif_log_perf(sdev, SIF_PERF_V, "qp:traffic_pattern %x", + qp->traffic_patterns.mask); + /* If the traffic pattern shows that it's not latency sensitive, + * use SQ mode by ringing the doorbell. + * In a latency sensitive traffic pattern, a SEND should + * be accompanied by a WC_OPCODE_RECEIVE_SEND. Thus, + * a latency sensitve traffic pattern should have + * half_of_bits(sizeof(traffic_patterns.submask[n)) set. 
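+ * As a rough illustration: each 16-bit submask records the direction of
+ * the most recent postings/completions, so a strict request/response
+ * pattern (every send matched by a receive) has a population count of
+ * about 8, while a one-way streaming sender drives it towards 16.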
+ * The constant 7 and 9 are used below as we are adding one + * to half_of_bits(sizeof(traffic_patterns.submask[n])) + * as the tolerance. + */ + if (((hweight16(qp->traffic_patterns.submask[0]) < 7) || + (hweight16(qp->traffic_patterns.submask[0]) > 9)) || + ((hweight16(qp->traffic_patterns.submask[1]) < 7) || + (hweight16(qp->traffic_patterns.submask[1]) > 9))) + *use_db = true; + + /* Flush writes before updating the sw pointer, + * This is necessary to ensure that the sqs do not see + * an incomplete entry. + * NB! Note that as opposed to software consuming + * queues this value should point to the last used entry, not the first + * unused: + */ + if (!*use_db || last) { + wmb(); + set_psif_sq_sw__tail_indx(&sq_sw->d, sq_seq); + } + + /* Finally write to collect buffer or ring doorbell if last */ + if (*use_db && last) + /* Write doorbell for first WR when we process the last request */ + sif_doorbell_from_sqe(qp, *first_seq, true); + else if (!*use_db) + if (sif_cb_write(qp, &wqe.wr, cb_len)) { + /*vcb lock busy, convert to db mode */ + if (last) + sif_doorbell_from_sqe(qp, sq_seq, true); + else { + *use_db = true; + *first_seq = sq_seq; + } + } + + return ret; +fail: + sif_log(sdev, SIF_SND, "Exit: Fail to post_send a WR"); + sif_logs(SIF_DUMP, write_struct_psif_wr(NULL, 0, &wqe.wr)); + + /* Avoid "using" the allocated entry */ + sq_sw->last_seq--; + return ret; +} /* end sif_post_send_single */ + + +static int get_gsi_qp_idx(struct sif_qp *qp) +{ + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + int pma_qp_idx = sdev->pma_qp_idxs[!!(qp->qp_idx & 2)]; + struct sif_qp *pma_qp = get_sif_qp(sdev, pma_qp_idx); + struct sif_rq_sw *rq_sw; + int gsi_qlen, pma_qlen; + + rq_sw = get_sif_rq_sw(sdev, qp->rq_idx); + gsi_qlen = atomic_read(&rq_sw->length); + rq_sw = get_sif_rq_sw(sdev, pma_qp->rq_idx); + pma_qlen = atomic_read(&rq_sw->length); + + return (gsi_qlen <= pma_qlen) ? qp->qp_idx : pma_qp->qp_idx; +} + + +int sif_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct sif_qp *qp = to_sqp(ibqp); + struct sif_rq *rq; + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + bool need_pma_pxy_qp = eps_version_ge(es, 0, 57) + && (qp->qp_idx == 1 || qp->qp_idx == 3); + + + sif_log(sdev, SIF_RCV, "Enter: wr_id 0x%llx qp_idx %d", + wr->wr_id, qp->qp_idx); + + if (need_pma_pxy_qp) { + qp = get_sif_qp(sdev, get_gsi_qp_idx(qp)); + sif_log(sdev, SIF_RCV, "Redirect wr_id 0x%llx to qp_idx %d", + wr->wr_id, qp->qp_idx); + } + + if (qp->last_set_state == IB_QPS_RESET) { + sif_log(sdev, SIF_INFO, "Invalid QP state (IB_QPS_RESET)"); + return -EINVAL; + } + + rq = get_sif_rq(sdev, qp->rq_idx); + + if (wr->num_sge > rq->sg_entries) { + sif_log(sdev, SIF_INFO, "qp only supports %d receive sg entries - wr has %d", + rq->sg_entries, wr->num_sge); + return -ENOMEM; + } + + return post_recv(sdev, qp, rq, wr, bad_wr); +} + + +/* Post a list of receives - can be called from any context */ +int post_recv(struct sif_dev *sdev, struct sif_qp *qp, struct sif_rq *rq, + struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr) +{ + struct sif_rq_sw *rq_sw = get_sif_rq_sw(sdev, rq->index); + int ret = 0; + u32 rq_len; + + unsigned long flags; + + if (unlikely(rq->user_mode)) { + sif_log(sdev, SIF_INFO, + "rq %d: Attempt to use kernel API to post to user mode receive queue", + rq->index); + return -EINVAL; + } + + if (!wr) + return ret; + + /* TBD: Revisit locking scheme again later + * to allow more parallelism. 
For now serialize to avoid + * having to handle "holes": + */ + spin_lock_irqsave(&rq->lock, flags); + + for (; wr; wr = wr->next) { + struct psif_rq_entry *rqe; + struct psif_rq_entry lrqe; + struct psif_rq_scatter *sge; + int i = 0; + int rqe_sz = 8 + wr->num_sge*sizeof(struct psif_rq_scatter); + int max_rqe_sz = 8 + rq->sg_entries*sizeof(struct psif_rq_scatter); + + rq_len = atomic_inc_return(&rq_sw->length); + if (rq_len > rq->entries) { + sif_log(sdev, SIF_INFO, "queue full - rq %d entries %d len %d", + rq->index, rq->entries, rq_len); + atomic_dec(&rq_sw->length); + ret = -ENOMEM; + goto err_post_recv; + } + if (wr->num_sge > rq->sg_entries) { + sif_log(sdev, SIF_INFO, "too many sges - rq %d sges configured %d, sges in wr %d", + rq->index, rq->sg_entries, wr->num_sge); + atomic_dec(&rq_sw->length); + ret = -EINVAL; + goto err_post_recv; + } + + rqe = get_rq_entry(rq, rq_sw->next_seq++); + + /* On the receive side we use the full wr_id directly */ + lrqe.rqe_id = wr->wr_id; + + sge = lrqe.scatter; + for (i = 0; i < wr->num_sge; i++) { + u32 lkey = wr->sg_list[i].lkey; + + sge[i].lkey = lkey; + sge[i].base_addr = wr->sg_list[i].addr + mr_uv2dma(sdev, lkey); + sge[i].length = wr->sg_list[i].length; + sif_log(sdev, SIF_RCV, + "sg_adr 0x%llx sg_len %d lkey %d", + wr->sg_list[i].addr, wr->sg_list[i].length, lkey); + } + + copy_conv_to_hw(rqe, &lrqe, rqe_sz); + + /* As per PRM, unused sges shall be zero, which is endian neutral */ + if (max_rqe_sz > rqe_sz) + memset(rqe->scatter + wr->num_sge, 0, max_rqe_sz - rqe_sz); + + sif_log(sdev, SIF_RCV, + " entries %u extent %u RQ %d next_seq %x length %d", + rq->entries, rq->extent, rq->index, + rq_sw->next_seq, atomic_read(&rq_sw->length)); + } + /* Enforce reordering of new rq entries and tail */ + wmb(); + set_psif_rq_sw__tail_indx(&rq_sw->d, rq_sw->next_seq); + /* Enforce visibility of rq tail on hw */ + smp_wmb(); + + sif_log(sdev, SIF_RCV, "Exit: success"); +err_post_recv: + spin_unlock_irqrestore(&rq->lock, flags); + *bad_wr = wr; + + /* WA #622, Check if QP in ERROR, flush RQ */ + if (!rq->is_srq && is_regular_qp(qp) && qp->last_set_state == IB_QPS_ERR) { + if (sif_flush_rq(sdev, rq, qp, atomic_read(&rq_sw->length))) + sif_log(sdev, SIF_INFO, "failed to flush RQ %d", rq->index); + } + + return ret; +} + +int sif_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_qp *qp = to_sqp(ibqp); + struct psif_epsc_csr_rsp rsp; + struct psif_epsc_csr_req req; + + sif_log(sdev, SIF_MC, "qp %d mc gid %llx.%llx lid 0x%x", + qp->qp_idx, gid->global.subnet_prefix, gid->global.interface_id, lid); + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_MC_ATTACH; + req.u.mc.qp = qp->qp_idx; + req.u.mc.port = qp->port; /* The EPS uses IB port space */ + /* union ib_gid contains BE gids and we do copy_convert later.. 
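+	 * (hence the be64_to_cpu conversions just below, keeping the gid
+	 * halves in CPU order within the request)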
*/ + req.u.mc.mgid_0 = be64_to_cpu(gid->global.subnet_prefix); + req.u.mc.mgid_1 = be64_to_cpu(gid->global.interface_id); + return sif_epsc_wr(sdev, &req, &rsp); +} + +int sif_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_qp *qp = to_sqp(ibqp); + struct psif_epsc_csr_rsp rsp; + struct psif_epsc_csr_req req; + + sif_log(sdev, SIF_MC, "qp %d mc gid %llx.%llx lid 0x%x", + qp->qp_idx, gid->global.subnet_prefix, gid->global.interface_id, lid); + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_MC_DETACH; + req.u.mc.qp = qp->qp_idx; + req.u.mc.port = qp->port; /* The EPS uses IB port space */ + /* union ib_gid contains BE gids and we do copy_convert later.. */ + req.u.mc.mgid_0 = be64_to_cpu(gid->global.subnet_prefix); + req.u.mc.mgid_1 = be64_to_cpu(gid->global.interface_id); + return sif_epsc_wr(sdev, &req, &rsp); +} + + +/* Workaround to emulate extra send sg entries from software: + * We use the available inline space and copy the first fitting + * xsg = wr->num_sge - hw_max + 1 entries into this space: + */ +static int prep_sw_sg(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe, + struct psif_wr_local *la, u32 sqe_seq) +{ + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct psif_sq_entry *sqe = get_sq_entry(sq, sqe_seq); + void *sgl_start = sq_sgl_offset(sq, sqe); + struct psif_rq_scatter *sge = sq->tmp_sge; + int i; + int xsg = wr->num_sge - SIF_HW_MAX_SEND_SGE + 1; + int xi = -1; + int pi = 0; + u32 xcnt = 0; + u32 len = 0; + int ret; + u32 xlen = 0; + u64 addr = 0; + int space = qp->max_inline_data; + + la->addr = get_sqe_dma(sq, sqe_seq) + sq->sgl_offset; + la->lkey = sq->sg_mr->index; + + for (i = 0; i < wr->num_sge; i++) { + if (i == xsg) + space -= 256; /* We can no longer use the inline bytes */ + xlen += wr->sg_list[i].length; + sif_log(sdev, SIF_SND, "xsg %d, xlen 0x%x space 0x%x", xsg, xlen, space); + if (xcnt < xsg) { + xcnt++; + if (xcnt < xsg) + continue; + } + if (xlen <= space) { + xi = i - xsg + 1; + break; + } + xlen -= wr->sg_list[i - xsg].length; + } + if (xi < 0) { + /* If our worst case calculations are right, this should not happen.. */ + sif_log(sdev, SIF_INFO, "Failed to find sg entries to collapse into inline space!"); + return -ENOMEM; + } + if (xi == 0) { + ret = prep_inline_part(qp, wr, xsg, wqe, la, sqe_seq, false); + if (ret < 0) + return ret; + } else { + /* TBD: We can consider merging xsg + 1 entries into two + * sg entries, one containing the first entries, but for now + * keep it simple and just not use the first 256 bytes: + */ + u8 *dbuf = ((u8 *)sqe->payload); + int copy = 0; + + for (i = xi; i < xi + xsg; i++) { + u32 lkey = wr->sg_list[i].lkey; + + len = wr->sg_list[i].length; + addr = wr->sg_list[i].addr; + if (len > 0) { + struct psif_key *key = safe_get_key(sdev, lkey); + + if (!key || PSIF_DMA_KEY_INVALID == get_psif_key__lkey_state(key)) { + sif_log(sdev, SIF_INFO, + "Attempt to do inline copying from an invalid MR with lkey %d at addr 0x%llx", + wr->sg_list[i].lkey, addr); + return -EPERM; + } + } + + ret = copy_sg(qp, &dbuf[copy], addr, len); + if (ret < 0) + return ret; + copy += len; + } + } + + la->length = 0; + for (i = 0; i < wr->num_sge; i++) { + u32 lkey; + u32 offset = i ? 
256 : 0; + + if (i == xi) { + sge[pi].lkey = sq->sg_mr->index; + sge[pi].base_addr = + get_sqe_dma(sq, sqe_seq) + + offsetof(struct psif_sq_entry, payload) + offset; + sge[pi].length = xlen; + la->length += xlen; + i += xsg - 1; + sif_log(sdev, SIF_SND, + "sg_list[%d]: sge entry: dma addr 0x%llx, len = %d, lkey %d", + pi, sge[pi].base_addr, sge[pi].length, sge[pi].lkey); + pi++; + continue; + } + lkey = wr->sg_list[i].lkey; + sge[pi].base_addr = wr->sg_list[i].addr + + mr_uv2dma(sdev, lkey); + sge[pi].lkey = wr->sg_list[i].lkey; + sge[pi].length = wr->sg_list[i].length; + la->length += sge[pi].length; + sif_log(sdev, SIF_SND, + "sg_list[%d]: sge entry: dma addr 0x%llx, len = %d, lkey %d", + pi, sge[pi].base_addr, sge[pi].length, sge[pi].lkey); + pi++; + } + sif_log(sdev, SIF_SND, + "ready with sgl_start %p, sg list addr 0x%llx, message len %d, lkey %d, sge %p", + sgl_start, la->addr, la->length, la->lkey, sge); + + copy_conv_to_hw(sgl_start, sge, + sizeof(struct psif_rq_scatter) * SIF_HW_MAX_SEND_SGE); + wqe->wr.num_sgl = SIF_HW_MAX_SEND_SGE - 1; + return la->length; +} + + +static int prep_send(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe, + bool inlined, struct psif_wr_local *la, u32 sqe_seq) +{ + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + int ret = 0; + int num_sge; + int use_inline_first_sge = 0; + + if (inlined) + return prep_inline(qp, wr, wqe, la, sqe_seq, false); + + la->length = 0; + num_sge = wr->num_sge; + if (num_sge == 0) { + sif_log(sdev, SIF_SND, "no sge entries - local_addr left as 0"); + return 0; + } + if (!sif_feature(disable_inline_first_sge) && qp->ulp_type == RDS_ULP && num_sge == 2 + && wr->sg_list[0].length <= qp->max_inline_data) { + use_inline_first_sge = 1; + } + + if (use_inline_first_sge) { + int wr_len; + u32 lkey = wr->sg_list[0].lkey; + struct sif_mr *mr = safe_get_sif_mr(sdev, lkey); + int mem_type = mr ? 
mr->mem->mem_type : 0; + bool is_phys_addr = mem_type != SIFMT_UMEM; + + sif_log(sdev, SIF_SND, "qp_%d handle special case; " + "#sge == 2 && sg[0].len == 48 max_inline_data %d, mem_type %d", + qp->qp_idx, qp->max_inline_data, mem_type); + /* Copy first sge inline */ + if ((wr->sg_list[0].length + wr->sg_list[1].length) <= qp->max_inline_data) { + sif_log(sdev, SIF_SND, "qp_%d Inlining both %d + %d = %d", + qp->qp_idx, + wr->sg_list[0].length, + wr->sg_list[1].length, + (wr->sg_list[0].length + wr->sg_list[1].length)); + return prep_inline(qp, wr, wqe, la, sqe_seq, is_phys_addr); + } + wr_len = prep_inline_part(qp, wr, 1, wqe, la, sqe_seq, is_phys_addr); + if (wr_len < 0) + return wr_len; + lkey = wr->sg_list[1].lkey; + /* Subtract to get address "correct" for hw-usage */ + la->addr = wr->sg_list[1].addr + mr_uv2dma(sdev, lkey) - wr_len; + la->lkey = lkey; + la->length = wr_len + wr->sg_list[1].length; + num_sge = 1; + sif_log(sdev, SIF_SND, + "Changed to single sge user addr 0x%llx dma addr 0x%llx, message len %d, key %d collect_len %d wr_len %d", + wr->sg_list[1].addr, la->addr, la->length, lkey, wqe->wr.collect_length, wr_len); + } else if (num_sge == 1) { + /* Single entry S/G list result after inlining */ + u32 lkey = wr->sg_list[0].lkey; + + la->addr = wr->sg_list[0].addr + mr_uv2dma(sdev, lkey); + la->lkey = lkey; + la->length += wr->sg_list[0].length; + sif_log(sdev, SIF_SND, + "single sge user addr 0x%llx dma addr 0x%llx, message len %d, key %d", + wr->sg_list[0].addr, la->addr, la->length, lkey); + } else if (unlikely(wr->num_sge > SIF_HW_MAX_SEND_SGE)) { + return prep_sw_sg(qp, wr, wqe, la, sqe_seq); + } else { + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct psif_sq_entry *sqe = get_sq_entry(sq, sqe_seq); + void *sgl_start = sq_sgl_offset(sq, sqe); + struct psif_rq_scatter *sge = sq->tmp_sge; + int i; + + la->addr = get_sqe_dma(sq, sqe_seq) + sq->sgl_offset; + la->lkey = sq->sg_mr->index; + + for (i = 0; i < num_sge; i++) { + u32 lkey = wr->sg_list[i].lkey; + + sge[i].base_addr = wr->sg_list[i].addr + + mr_uv2dma(sdev, lkey); + sge[i].lkey = wr->sg_list[i].lkey; + sge[i].length = wr->sg_list[i].length; + la->length += sge[i].length; + sif_log(sdev, SIF_SND, + "sg_list[%d]: sge entry: dma addr 0x%llx, len = %d, lkey %d", + i, sge[i].base_addr, sge[i].length, sge[i].lkey); + } + sif_log(sdev, SIF_SND, + "ready with sgl_start %p, sg list addr 0x%llx, message len %d, lkey %d, sge %p", + sgl_start, la->addr, la->length, la->lkey, sge); + + copy_conv_to_hw(sgl_start, sge, + sizeof(struct psif_rq_scatter) * wr->num_sge); + ret = la->length; + } + /* 0 here means a single entry, but input 0 must also be 0 */ + wqe->wr.num_sgl = num_sge ? 
num_sge - 1 : 0; + return ret; +} +static int prep_send_lso(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe, + bool inlined, struct psif_wr_local *la, u32 sqe_seq) +{ + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + void *sgl_start; + int ret = 0; + int i; + u8 *p8; + struct sif_sq *sq; + struct psif_sq_entry *sqe; + struct psif_rq_scatter *sge; + const int stencil_sge = 1; + + sq = get_sif_sq(sdev, qp->qp_idx); + sqe = get_sq_entry(sq, sqe_seq); + sge = sq->tmp_sge; + sgl_start = sq_sgl_offset(sq, sqe); + + if (unlikely(wr->num_sge >= SIF_HW_MAX_SEND_SGE || wr->num_sge < 1)) { + sif_log(sdev, SIF_INFO, "attempt to post lso wr with %d/%d sg entries", + wr->num_sge, sq->sg_entries); + return -EINVAL; + } + + wqe->wr.details.send.ud.mss = wr->wr.ud.mss; + + la->addr = get_sqe_dma(sq, sqe_seq) + sq->sgl_offset; + la->lkey = sq->sg_mr->index; + la->length = 0; + + /* copy stencil to payload-area in send_queue */ + p8 = (u8 *)wr->wr.ud.header; + memcpy((u8 *)sqe->payload, p8, wr->wr.ud.hlen); + + sge[0].base_addr = get_sqe_dma(sq, sqe_seq) + + offsetof(struct psif_sq_entry, payload) + mr_uv2dma(sdev, la->lkey); + sge[0].lkey = sq->sg_mr->index; + sge[0].length = wr->wr.ud.hlen; + la->length += sge[0].length; + + sif_log(sdev, SIF_SND, + "sg_list[%d]: sge entry: dma addr 0x%llx, len = %d, lkey %d", + 0, sge[0].base_addr, sge[0].length, sge[0].lkey); + + for (i = 0; i < wr->num_sge; i++) { + u32 lkey = wr->sg_list[i].lkey; + + sge[i+1].base_addr = wr->sg_list[i].addr + mr_uv2dma(sdev, lkey); + sge[i+1].lkey = wr->sg_list[i].lkey; + sge[i+1].length = wr->sg_list[i].length; + la->length += sge[i+1].length; + sif_log(sdev, SIF_SND, + "sg_list[%d]: sge entry: dma addr 0x%llx, len = %d, lkey %d", + i+1, sge[i+1].base_addr, sge[i+1].length, sge[i+1].lkey); + } + copy_conv_to_hw(sgl_start, sge, + sizeof(struct psif_rq_scatter) * (wr->num_sge+1)); + + wmb(); + wqe->wr.num_sgl = wr->num_sge - 1 + stencil_sge; + sif_log(sdev, SIF_SND, + "num_sgl %d, sqe at %p la ->addr 0x%llx ->lkey %d ->length %d %d", wqe->wr.num_sgl, sqe, + la->addr, la->lkey, la->length, la->length-sge[0].length); + qp->ipoib_tx_lso_pkt++; + qp->ipoib_tx_lso_bytes += (la->length - sge[0].length); + return ret; +} + + +static int prep_remote_addr(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe) +{ + struct sif_ah *ah = NULL; + struct psif_ah *ah_p; + bool is_dr = false; + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + + sif_log(sdev, SIF_SND, ""); + switch (qp->type) { + case PSIF_QP_TRANSPORT_UD: + if (!wr->wr.ud.ah) { + sif_log(sdev, SIF_INFO, "No ah supplied for ud packet"); + return -EINVAL; + } + ah = to_sah(wr->wr.ud.ah); + ah_p = get_ah(sdev, ah->index); + is_dr = get_psif_ah__remote_lid(ah_p) == 0xffff; + + /* Direct routed packets are destined for the SMA at uf 33. 
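+		 * A packet is treated as directed route when the AH's remote
+		 * LID is the permissive LID 0xffff, cf. the is_dr check above.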
+ * For all other packets this field is ignored by the hw: + */ + if (is_dr) + wqe->wr.destuf = 33; + wqe->wr.details.send.ud.remote_addr.ah_indx + = ah->index; + wqe->wr.details.send.ud.qp.qkey = wr->wr.ud.remote_qkey; + wqe->wr.details.send.ud.qp.remote_qp = wr->wr.ud.remote_qpn; + wqe->wr.ud_pkt = 1; + break; + case PSIF_QP_TRANSPORT_UC: + case PSIF_QP_TRANSPORT_RC: + break; + case PSIF_QP_TRANSPORT_XRC: + wqe->wr.xrc_hdr.xrqd_id = wr->xrc_remote_srq_num; + break; + default: + sif_log(sdev, SIF_INFO, + "unhandled transport type %s", string_enum_psif_qp_trans(qp->type)); + return -EINVAL; + } + wqe->wr.op = ib2sif_wr_op(wr->opcode, is_dr); + return 0; +} + + + +static void prep_atomic(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe) +{ + struct psif_wr_local *la = &wqe->wr.details.atomic.local_addr; + struct psif_wr_remote *ra = &wqe->wr.details.atomic.remote_addr; + + la->addr = wr->sg_list[0].addr; + la->lkey = wr->sg_list[0].lkey; + la->length = sizeof(long); + + ra->addr = wr->wr.atomic.remote_addr; + ra->rkey = wr->wr.atomic.rkey; + ra->length = sizeof(long); + + /* Payload order as in IB header */ + if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + wqe->payload[0] = cpu_to_be64(wr->wr.atomic.swap); + wqe->payload[1] = cpu_to_be64(wr->wr.atomic.compare_add); + wqe->wr.collect_length = 16; + } else { + wqe->payload[0] = cpu_to_be64(wr->wr.atomic.compare_add); + wqe->wr.collect_length = 8; + } +} diff --git a/drivers/infiniband/hw/sif/sif_sndrcv.h b/drivers/infiniband/hw/sif/sif_sndrcv.h new file mode 100644 index 000000000000..af1a535b7871 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_sndrcv.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_sndrcv.h: Interface to IB send/receive, MAD packet recv and + * multicast send/recv + */ + +#ifndef __SIF_SNDRCV_H +#define __SIF_SNDRCV_H + +struct sif_rq; +struct sif_dev; + +int sif_post_send(struct ib_qp *ibqp, + struct ib_send_wr *wr, struct ib_send_wr **bad_wr); +int sif_post_recv(struct ib_qp *ibqp, + struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); + +int sif_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); +int sif_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int post_recv(struct sif_dev *sdev, struct sif_qp *qp, struct sif_rq *rq, + struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); + +/* Send a single wr */ +int sif_post_send_single(struct ib_qp *ibqp, struct ib_send_wr *wr, bool *use_db, bool last, u16 *first_seq); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_spt.c b/drivers/infiniband/hw/sif/sif_spt.c new file mode 100644 index 000000000000..a2faa4b0fca5 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_spt.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Vinay Shaw + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_spt.c: Experimental implementation of shared use of the OS's page tables. 
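+ * In this mode the hw MMU context is pointed directly at the process' own
+ * pgd/pud/pmd/pte entries (see sif_set_mmu_ctx below) rather than at page
+ * tables built by the driver.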
+ * Default is to use private page tables - shared page tables can be enabled using + * a vendor flag. This implementation assumes that physical addresses and DMA addresses + * are 1-1, which might not in general be the case if going through an IOMMU. + */ + +#include "sif_mmu.h" +#include "sif_dev.h" +#include "sif_base.h" +#include "sif_dma.h" +#include "sif_hwi.h" +#include "sif_spt.h" + +#include +#include +#include +#include + + +#define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE) +#define PUD_ALIGN(addr) ALIGN(addr, PUD_SIZE) +#define PGDIR_ALIGN(addr) ALIGN(addr, PGDIR_SIZE) + + +static void set_ctx_w_page(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + enum psif_table_level level, + enum psif_page_size pg_sz, u64 val) +{ + struct psif_mmu_cntx *hw_ctx = &ctx->mctx; + + hw_ctx->page_size = pg_sz; + hw_ctx->table_ptr = ((val) >> PAGE_SHIFT) & ~PSIF_TABLE_PTR_MASK; + hw_ctx->table_level = level; + sif_log(sdev, SIF_MMU, "pte 0x%08llx level %d", val, level); +} + + +static int sif_set_mmu_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *sctx, + struct sif_mem *mem, bool write); + +int sif_spt_map_gva_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + bool write) +{ + int ret; + + if (!(mem->mem_type == SIFMT_UMEM) || !mem->m.u.umem) { + sif_log(sdev, SIF_MMU, "Only implemented for user space mappings!"); + return -EINVAL; + } + + ret = sif_set_mmu_ctx(sdev, ctx, mem, write); + if (ret) + goto mmctx_failed; + return 0; + +mmctx_failed: + return ret; +} + + +static int sif_set_mmu_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, + struct sif_mem *mem, bool write) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + u64 start = ctx->base; + u64 len = ctx->size; + struct psif_mmu_cntx *pctx = &ctx->mctx; + int npgds, npuds, npmds, nptes; + int ret = 0; + + sif_log(sdev, SIF_MMU, "start 0x%llx len 0x%llx", start, len); + + if (len == 0) + goto err; + + pgd = pgd_offset(mem->m.u.umem->mm, start); + if (pgd_none(*pgd)) + goto err; + + ctx->pt = (void *)pgd; /* Misuse pt to save the pointer to avoid going via mm at dealloc time */ + ctx->mt = SIFMT_ZERO; + pud = pud_offset(pgd, start); + if (pud_none(*pud)) + goto err; + + pctx->wr_access = write; + pctx->translation_type = MMU_GVA2GPA_MODE; + + npgds = PGDIR_ALIGN(len + (start & ~PGDIR_MASK)) >> PGDIR_SHIFT; + npuds = PUD_ALIGN(len + (start & ~PUD_MASK)) >> PUD_SHIFT; + +#ifndef __aarch64__ + if (pud_large(*pud)) { + ptep = (pte_t *) pud; + pte = *ptep; + + if (!pte_present(pte)) { + sif_log(sdev, SIF_MMU, + "Page not present, bugging out.."); + BUG(); + goto err; + } + + if (npuds == 1) { + set_ctx_w_page(sdev, ctx, PAGE_LEVEL2, PAGE_SIZE_IA32E_1GB, + pte_val(pte)); + } else if (npgds == 1) + set_ctx_w_page(sdev, ctx, PAGE_LEVEL3, PAGE_SIZE_IA32E_1GB, + pgd_val(*pgd)); +#ifdef CONFIG_X86 + else + set_ctx_w_page(sdev, ctx, PAGE_LEVEL4, PAGE_SIZE_IA32E_1GB, + read_cr3()); +#endif + goto out; + } +#endif /* !__aarch64__ */ + + pmd = pmd_offset(pud, start); + if (pmd_none(*pmd)) + goto err; + + npmds = PMD_ALIGN(len + (start & ~PMD_MASK)) >> PMD_SHIFT; + +#ifndef __aarch64__ + if (pmd_large(*pmd)) { + ptep = (pte_t *) pmd; + pte = *ptep; + + if (!pte_present(pte)) { + sif_log(sdev, SIF_MMU, + "Page not present, bugging out.."); + BUG(); + goto err; + } + + if (npmds == 1) { + set_ctx_w_page(sdev, ctx, PAGE_LEVEL1, PAGE_SIZE_IA32E_2MB, + pte_val(pte)); + } else if (npuds == 1) + set_ctx_w_page(sdev, ctx, PAGE_LEVEL2, PAGE_SIZE_IA32E_2MB, + pud_val(*pud)); + else if (npgds == 1) + set_ctx_w_page(sdev, ctx, 
PAGE_LEVEL3, PAGE_SIZE_IA32E_2MB, + pgd_val(*pgd)); +#ifdef CONFIG_X86 + else + set_ctx_w_page(sdev, ctx, PAGE_LEVEL4, PAGE_SIZE_IA32E_2MB, + read_cr3()); +#endif + goto out; + } +#endif /* !__aarch64__ */ + + ptep = pte_offset_map(pmd, start); + pte = *ptep; + if (!pte_present(pte)) { + sif_log(sdev, SIF_MMU, "Page not present, bugging out.."); + BUG(); + goto err; + } + + nptes = PAGE_ALIGN(len + (start & ~PAGE_MASK)) >> PAGE_SHIFT; + if (nptes == 1) { + set_ctx_w_page(sdev, ctx, PAGE_LEVEL0, PAGE_SIZE_IA32E_4KB, pte_val(pte)); + } else if (npmds == 1) { + set_ctx_w_page(sdev, ctx, PAGE_LEVEL1, PAGE_SIZE_IA32E_4KB, pmd_val(*pmd)); + } else if (npuds == 1) { + set_ctx_w_page(sdev, ctx, PAGE_LEVEL2, PAGE_SIZE_IA32E_4KB, pud_val(*pud)); + } else if (npgds == 1) { + set_ctx_w_page(sdev, ctx, PAGE_LEVEL3, PAGE_SIZE_IA32E_4KB, pgd_val(*pgd)); +#ifdef CONFIG_X86 + } else { + set_ctx_w_page(sdev, ctx, PAGE_LEVEL4, PAGE_SIZE_IA32E_4KB, read_cr3()); +#endif + } + goto out; +err: + sif_log(sdev, SIF_MMU, "Error in setting mmu context"); + ret = -1; +out: + return ret; +} + +void sif_spt_unmap_gva_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *sctx) +{ + u64 start = sctx->base; + u64 len = sctx->size; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + + int npgds, npuds, npmds, nptes; + + sif_log(sdev, SIF_MMU, "start 0x%llx len 0x%llx", start, len); + + if (len == 0) + goto err; + + pgd = (pgd_t *)sctx->pt; + if (pgd_none(*pgd)) + goto err; + + if (pgd_none(*pgd)) { + sif_log(sdev, SIF_MMU, "Table entry(pgd) already freed"); + goto out; + } + + pud = pud_offset(pgd, start); + if (pud_none(*pud)) { + sif_log(sdev, SIF_MMU, "Table entry(pud) already freed"); + goto out; + } + + npgds = PGDIR_ALIGN(len + (start & ~PGDIR_MASK)) >> PGDIR_SHIFT; + npuds = PUD_ALIGN(len + (start & ~PUD_MASK)) >> PUD_SHIFT; + +#ifndef __aarch64__ + if (pud_large(*pud)) { + ptep = (pte_t *) pud; + pte = *ptep; + + if (!pte_present(pte)) { + sif_log(sdev, SIF_MMU, + "Page not present, bugging out.."); + BUG(); + goto err; + } + goto out; + } +#endif /* !__aarch64__ */ + + pmd = pmd_offset(pud, start); + if (pmd_none(*pmd)) { + sif_log(sdev, SIF_MMU, "Table entry(pmd) already freed"); + goto out; + } + + npmds = PMD_ALIGN(len + (start & ~PMD_MASK)) >> PMD_SHIFT; + +#ifndef __aarch64__ + if (pmd_large(*pmd)) { + ptep = (pte_t *) pmd; + pte = *ptep; + + if (!pte_present(pte)) { + sif_log(sdev, SIF_MMU, + "Page not present, bugging out.."); + BUG(); + goto err; + } + goto out; + } +#endif /* !__aarch64__ */ + + ptep = pte_offset_map(pmd, start); + pte = *ptep; + if (!pte_present(pte)) { + sif_log(sdev, SIF_MMU, "Page not present, bugging out.."); + BUG(); + goto err; + } + + nptes = PAGE_ALIGN(len + (start & ~PAGE_MASK)) >> PAGE_SHIFT; + + goto out; +err: + sif_log(sdev, SIF_MMU, "Error releasing mmu context"); +out: + return; +} + diff --git a/drivers/infiniband/hw/sif/sif_spt.h b/drivers/infiniband/hw/sif/sif_spt.h new file mode 100644 index 000000000000..b66ef57de32e --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_spt.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_spt.h: Experimental (still unsafe) + * implementation of direct use of the operating system's + * page tables (shared page tables) + */ + +#ifndef _SIF_SPT_H +#define _SIF_SPT_H + +struct sif_dev; +struct sif_mmu_ctx; + + +#define PSIF_TABLE_PTR_SHIFT 52 +#define PSIF_TABLE_PTR_SIZE (_AC(1, UL) << PSIF_TABLE_PTR_SHIFT) +#define PSIF_TABLE_PTR_MASK (~(PSIF_TABLE_PTR_SIZE-1)) + +int sif_spt_map_gva_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + bool write); + +void sif_spt_unmap_gva_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_sq.c b/drivers/infiniband/hw/sif/sif_sq.c new file mode 100644 index 000000000000..2d5bcd26e532 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_sq.c @@ -0,0 +1,518 @@ +/* + * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_sq.c: Implementation of the send queue side of an IB queue pair + */ + +#include +#include "sif_dev.h" +#include "sif_base.h" +#include "sif_defs.h" +#include "sif_dma.h" +#include "sif_mmu.h" +#include "sif_pt.h" +#include "sif_mr.h" +#include "sif_sq.h" +#include "sif_hwi.h" +#include "psif_hw_setget.h" +#include +#include + +/* Figure out the minimal space needed in each send queue element + * given the input sizes. + * + * We also use this space to collapse sg entries if we need to emulate more + * sg entries in software than what hardware supports. + * + * TBD: Note that the SQS sometimes checksums more data + * (up to 256 bytes depending on max_inline??) which we then cannot use + * as sg list data area. + * Note also that no sgl is needed in PSIF for the single sg entry case: + */ + +static u32 compute_sq_extent(u32 sge_entries, u32 max_inline_data, + u32 *sgl_offset, u32 *min_extent_p, + u32 *sgl_size_p, u32 *max_inline_p) +{ + u32 hw_sge_entries = min_t(u32, SIF_HW_MAX_SEND_SGE, sge_entries); + u32 sgl_size = sge_entries > 1 ? hw_sge_entries * sizeof(struct psif_wr_local) : 0; + u32 xsge = sge_entries - hw_sge_entries; + + /* This amount must be reserved for 0-padded inline data due to + * restrictions in the SQS: + */ + u32 sqs_headroom = min(256U, ((max_inline_data + 63U) & ~63U)); + u32 sqs_inline_extra = + max_inline_data > sqs_headroom ? max_inline_data - sqs_headroom : 0; + + /* This applies to UD only, with max 4K message size: + * Set aside room for inlining of @xsge sg entries. + * Average size of an sge entry will be max 256 bytes, add an extra + * 256 to handle the case where we cannot use the initial inline space: + */ + u32 xsge_space = !xsge ? 
0 : (xsge + 2) * 256; + + u32 min_extent = sizeof(struct psif_wr) + + sqs_headroom + + max(max(sqs_inline_extra, sgl_size), xsge_space); + + u32 real_extent = roundup_pow_of_two(min_extent); + + if (sgl_offset) + *sgl_offset = real_extent - sgl_size; + if (sgl_size_p) + *sgl_size_p = sgl_size; + if (min_extent_p) + *min_extent_p = min_extent; + if (max_inline_p) + *max_inline_p = max_t(int, xsge_space - sqs_headroom, sqs_inline_extra); + return real_extent; +} + + +int sif_alloc_sq(struct sif_dev *sdev, struct sif_pd *pd, + struct sif_qp *qp, struct ib_qp_cap *cap, + bool user_mode, int wr_hdl_sz) +{ + /* Send queues always uses same indexes as the corresponding qp */ + int ret = 0; + int extent_log2; + struct sif_sq *sq; + struct sif_sq_sw *sq_sw; + struct psif_sq_hw *sq_hw_p; + struct psif_sq_rspq *sq_rspq_p; + struct psif_sq_sw lsq_sw; + struct psif_sq_hw lsq_hw; + struct psif_sq_entry sqe; + + u32 min_entries = cap->max_send_wr; + u32 max_entries; + u32 entries_log2; + u32 min_extent; + u32 sgl_size; + u32 max_inline; + u64 alloc_sz; + dma_addr_t dma_start; + bool need_page_aligned; + bool need_wa_4049 = PSIF_REVISION(sdev) <= 3; + + + max_entries = roundup_pow_of_two(max(2U, min_entries)); + entries_log2 = order_base_2(max_entries); + + if (entries_log2 > SIF_SW_MAX_SQE_LOG2) { + sif_log(sdev, SIF_INFO, + "requested %d entries -> %d but sif only supports %d", + cap->max_send_wr, max_entries, SIF_SW_MAX_SQE); + return -ENFILE; /* Limited by 4 bit size_log2 field in sq desc */ + } + + sq = get_sif_sq(sdev, qp->qp_idx); + sq_sw = get_sif_sq_sw(sdev, qp->qp_idx); + sq->index = qp->qp_idx; + sq->wr_hdl_sz = wr_hdl_sz; + + /* Due to IB standard requirements for ssn = 1 on the first packet + * on a QP and that psif now uses send queue sequence number == ssn + * we must initialize so the first packet is sent on index 1. + * Also the send queue in psif uses last_seq == last used seq instead of + * next_seq == next seq to use.. + * NB! This applies only to the send queue - we start at index 0 on all the others! + */ + sq_sw->last_seq = sq_sw->head_seq = 0; + + sq_hw_p = get_sq_hw(sdev, qp->qp_idx); + + sq->entries = max_entries; + sq->mask = max_entries - 1; + sq->sg_entries = need_wa_4049 ? roundup_pow_of_two(cap->max_send_sge) : cap->max_send_sge; + + sq->extent = compute_sq_extent(sq->sg_entries, cap->max_inline_data, + &sq->sgl_offset, &min_extent, &sgl_size, &max_inline); + + qp->max_inline_data = cap->max_inline_data; + if (sq->extent > min_extent) { + int extra_extent = sq->extent - min_extent; + + if (sq->sg_entries > SIF_HW_MAX_SEND_SGE) { + qp->max_inline_data = max_inline + extra_extent; + } else if (cap->max_inline_data >= 256) { + sif_log(sdev, SIF_QP, "QP %d has room for %d bytes of extra inline space", + qp->qp_idx, extra_extent); + qp->max_inline_data += extra_extent; + } + } + + extent_log2 = order_base_2(sq->extent); + alloc_sz = max_entries * sq->extent; + + /* Only whole pages must be exposed to user space. 
+ * For simplicity we impose the same for reliable QPs as their SQs + * have to be page aligned to ensure proper access from SQ_CMPL: + */ + need_page_aligned = user_mode || reliable_qp(qp->type); + + if (need_page_aligned && (alloc_sz & ~PAGE_MASK)) + alloc_sz = (alloc_sz + ~PAGE_MASK) & PAGE_MASK; + sq->user_mode = user_mode; + + if (alloc_sz <= SIF_MAX_CONT) + sq->mem = sif_mem_create_dmacont(sdev, alloc_sz, GFP_KERNEL, DMA_BIDIRECTIONAL); + else { + alloc_sz = (alloc_sz + ~PMD_MASK) & PMD_MASK; + sq->mem = sif_mem_create(sdev, alloc_sz >> PMD_SHIFT, + alloc_sz, SIFMT_2M, GFP_KERNEL | __GFP_ZERO, + DMA_BIDIRECTIONAL); + } + if (!sq->mem) { + sif_log(sdev, SIF_INFO, "Failed to allocate %llu bytes of SQ buffer pool", + alloc_sz); + ret = -ENOMEM; + goto err_alloc_dma; + } + + dma_start = sif_mem_dma(sq->mem, 0); + + sif_log(sdev, SIF_QP, "SQ dma %pad va 0x%p, sz %d, min_extent %d -> extent %d", + &dma_start, sif_mem_kaddr(sq->mem, 0), sq->entries, min_extent, sq->extent); + sif_log(sdev, SIF_SQ, "SQ wr sz %ld, sgl_offset/sz %d/%d, max_inline %d, max sge %d", + sizeof(sqe.wr), sq->sgl_offset, sgl_size, + qp->max_inline_data, sq->sg_entries); + + sq->wr_hdl = kzalloc(max_entries * sq->wr_hdl_sz, GFP_KERNEL); + if (!sq->wr_hdl) { + sif_log(sdev, SIF_INFO, "Failed to allocate wr_hdl table!"); + ret = -ENOMEM; + goto err_alloc_wrid; + } + + if (qp->type != PSIF_QP_TRANSPORT_MANSP1 && (qp->max_inline_data || sgl_size)) { + /* Allocate a DMA validation entry to be used for sif to access + * s/g lists, which we put in the spare space between entries + * in the send queue. This MR is also used by the SQS to access + * inline data. + */ + sq->sg_mr = alloc_mr(sdev, pd, sq->mem, dma_start, 0); + if (IS_ERR(sq->sg_mr)) { + ret = PTR_ERR(sq->sg_mr); + sif_log(sdev, SIF_INFO, "Failed to allocate lkey for s/g list (%d)", + ret); + goto err_alloc_sg_mr; + } + } + + /* Initialize hw part of descriptor */ + memset(&lsq_hw, 0, sizeof(lsq_hw)); + + lsq_hw.size_log2 = entries_log2; + lsq_hw.extent_log2 = extent_log2; + /* TBD: mmu_context */ + + /* See comment above */ + lsq_hw.last_seq = 0; + lsq_hw.base_addr = dma_start; + lsq_hw.sq_max_inline = min(256U, qp->max_inline_data); + lsq_hw.sq_max_sge = sq->sg_entries - 1; + + /* These are needed for sq mode to work */ + lsq_hw.sq_next.next_qp_num = 0xffffff; + lsq_hw.sq_next.next_null = 0xff; + + /* Allocate mmu context for the send queue - only read access needed + * for the queue itself: + */ + ret = sif_map_ctx(sdev, &sq->mmu_ctx, sq->mem, lsq_hw.base_addr, + alloc_sz, false); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to set mmu context for sq %d", + sq->index); + goto err_map_ctx; + } + + + lsq_hw.mmu_cntx = sq->mmu_ctx.mctx; + + /* Write network byte order copy */ + copy_conv_to_hw(sq_hw_p, &lsq_hw, sizeof(lsq_hw)); + + /* Initialize sw part of descriptor */ + memset(&lsq_sw, 0, sizeof(lsq_sw)); + + copy_conv_to_hw(&sq_sw->d, &lsq_sw, sizeof(lsq_sw)); + + spin_lock_init(&sq->lock); + + sq_rspq_p = get_sq_rspq(sdev, qp->qp_idx); + + /* We need to set the (network byte order) + * fields next_qp_num and rspq_next to all 1's (see bug 3479) + * TBD: This needs to be properly set up in psifapi + */ + sq_rspq_p->something_tbd[0] = (u64)-1; + return 0; + + sif_unmap_ctx(sdev, &sq->mmu_ctx); +err_map_ctx: + if (sq->sg_mr) + dealloc_mr(sdev, sq->sg_mr); +err_alloc_sg_mr: + kfree(sq->wr_hdl); +err_alloc_wrid: + sif_mem_free(sq->mem); +err_alloc_dma: + return ret; +} + + +int sif_flush_sqs(struct sif_dev *sdev, struct sif_sq *sq) +{ + ulong start_time = jiffies; 
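+	/* The flush below stops the SQS with a doorbell write and then polls
+	 * sq_next until hw sets the destroyed bit and sq_next reads 0xffffffff.
+	 * The deadline computed below is re-armed whenever sq_next makes
+	 * progress, so only a genuinely stuck queue ends up in -ETIMEDOUT.
+	 */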
+ ulong timeout = start_time + sdev->min_resp_ticks * 2; + struct sif_qp *qp = get_sif_qp(sdev, sq->index); + bool sqs_idle = false; + u32 sq_next; + u32 prev_sq_next; + struct psif_wr wr; + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, sq->index); + + if (qp->ibqp.xrcd) /* XRC target QPs dont have any valid sqs setup */ + return 0; + + memset(&wr, 0, sizeof(struct psif_wr)); + wr.local_qp = sq->index; + + /* Trigger a stop of SQS (rev2 feature) */ + sif_doorbell_write(qp, &wr, false); + + prev_sq_next = sq_next = get_psif_sq_hw__sq_next(&sq->d); + + sif_log(sdev, SIF_SQ, "Entering sq_hw poll for sq %d: last_seq %d head_seq %d sq_next %x", + sq->index, sq_sw->last_seq, sq_sw->head_seq, sq_next); + for (;;) { + if (!sqs_idle) { + sqs_idle = get_psif_sq_hw__destroyed(&sq->d); + if (sqs_idle) { + rmb(); /* Make sure we observe sq_next after the + * destroyed bit has been set + */ + sq_next = get_psif_sq_hw__sq_next(&sq->d); + } + } + if (sqs_idle && sq_next == 0xffffffff) + break; + if (sq_next != prev_sq_next) { + /* Reset timeout */ + timeout = jiffies + sdev->min_resp_ticks * 2; + sif_log(sdev, SIF_INFO_V, "sq %d: sq_next moved from %d -> %d", + sq->index, prev_sq_next, sq_next); + } else if (time_is_before_jiffies(timeout)) { + if (sif_feature(pcie_trigger)) + force_pcie_link_retrain(sdev); + sif_log(sdev, SIF_INFO, + "Error: sq %d timed out - waited %d ms for SQ flush. Idle:%d sq_next:%x", + sq->index, jiffies_to_msecs(jiffies - start_time), sqs_idle, sq_next); + return -ETIMEDOUT; + } + /* TBD: No sleep necessary as this should be really quick (?) */ + cpu_relax(); + prev_sq_next = sq_next; + sq_next = get_psif_sq_hw__sq_next(&sq->d); + } + + sif_log(sdev, SIF_SQ, " sq %d: done waiting for SQS to finish", sq->index); + return 0; +} + + +void sif_free_sq(struct sif_dev *sdev, struct sif_qp *qp) +{ + struct sif_sq *sq; + volatile struct psif_sq_hw *sq_hw_p; + volatile struct psif_sq_sw *sq_sw_p; + + int index = qp->qp_idx; + + sq = get_sif_sq(sdev, index); + sif_log(sdev, SIF_SQ, "idx %d", sq->index); + + sq_sw_p = get_sq_sw(sdev, index); + sq_hw_p = &sq->d; + + if (reliable_qp(qp->type) && qp->sq_cmpl_map_valid) + sif_sq_cmpl_unmap_sq(sdev, sq); + + sif_unmap_ctx(sdev, &sq->mmu_ctx); + + /* We clear the whole sq field including sq_hw below */ + sif_clear_sq_sw(sdev, index); + + if (sq->sg_mr) + dealloc_mr(sdev, sq->sg_mr); + + sif_mem_free(sq->mem); + kfree(sq->wr_hdl); + memset(sq, 0, sizeof(struct sif_sq)); +} + + +/* Setup of the root node(s) of a page table mapping all + * active send queues: + */ +int sif_sq_cmpl_setup(struct sif_table *tp) +{ + u32 max_sq_extent = compute_sq_extent(16, sif_max_inline, + NULL, NULL, NULL, NULL); + struct sif_dev *sdev = tp->sdev; + + tp->ext_sz = SIF_SW_MAX_SQE * max_sq_extent; /* Largest possible send queue */ + tp->table_sz = (size_t)tp->ext_sz * tp->entry_cnt; + tp->sif_base = SIF_SQ_CMPL_START; + tp->mem = sif_mem_create_ref(sdev, SIFMT_CS, tp->sif_base, tp->table_sz, + GFP_KERNEL); + + sif_log(sdev, SIF_SQ, "ext.sz %d entry cnt %d max sq extent 0x%x tbl.sz 0x%lx", + tp->ext_sz, tp->entry_cnt, max_sq_extent, tp->table_sz); + return 0; +} + + +/* Map/unmap the page table of a send queue in the sq_cmpl mapping + * The way to map it depends on the map type of the send queue itself: + */ +int sif_sq_cmpl_map_sq(struct sif_dev *sdev, struct sif_sq *sq) +{ + struct sif_table *sctp = &sdev->ba[sq_cmpl]; + + /* Start offset of this send queue in the large virtual sq_cmpl mapping: */ + u64 virt_base = sctp->mmu_ctx.base + (u64)sq->index * sctp->ext_sz; + 
u64 size = sq->mem->size; + + return sif_map_ctx_part(sdev, &sctp->mmu_ctx, sq->mem, virt_base, size); +} + + +int sif_sq_cmpl_unmap_sq(struct sif_dev *sdev, struct sif_sq *sq) +{ + struct sif_table *sctp = &sdev->ba[sq_cmpl]; + + /* Start offset of this send queue in the large virtual sq_cmpl mapping: */ + u64 virt_base = sctp->mmu_ctx.base + (u64)sq->index * sctp->ext_sz; + u64 size = sq->mem->size; + + sif_log(sdev, SIF_SQ, "sq %d, virt_base 0x%llx size 0x%llx", sq->index, virt_base, size); + return sif_unmap_gva_ctx_part(sdev, &sctp->mmu_ctx, virt_base, size); +} + + +void sif_dfs_print_sq_hw(struct seq_file *s, struct sif_dev *sdev, loff_t pos) +{ + struct sif_sq *sq; + int qlen; + u32 head, tail; + struct psif_sq_hw lhw; + struct sif_sq_sw *sq_sw; + struct sif_qp *qp; + int tsv; + + if (unlikely(pos < 0)) { + seq_puts(s, "# N = next_null, T = sq_timestamp_valid, D = sq_done, X = destroyed\n"); + seq_puts(s, "# [----------------------- sw view ----------------------] [----------- hw view ------------]\n"); + seq_puts(s, "# Index cq_idx head tail q_sz q_len q_high max_sge inline head tail n.qp N T D X\n"); + return; + } + sq = get_sif_sq(sdev, pos); + sq_sw = get_sif_sq_sw(sdev, pos); + qp = get_sif_qp(sdev, pos); + + /* Check for QP0/1 which is reserved but not initialized */ + if (sq->entries == 0) + return; + + head = sq_sw->head_seq; + tail = sq_sw->last_seq; + qlen = sq_length(sq, head, tail); + + copy_conv_to_sw(&lhw, &sq->d, sizeof(lhw)); + tsv = lhw.sq_timestamp_valid; + + seq_printf(s, "%7lld %7d %8d %8d %8d %9d %9d %7d %6d %8d%8d %06x %2x %d %d %d\n", + pos, + sq->cq_idx, head, tail, sq->entries, qlen, sq->max_outstanding, + sq->sg_entries, qp->max_inline_data, + get_psif_sq_sw__tail_indx(&sq_sw->d), lhw.last_seq, + lhw.sq_next.next_qp_num, lhw.sq_next.next_null, + tsv, lhw.sq_done, lhw.destroyed); +} + + +void sif_dfs_print_sq_cmpl(struct seq_file *s, struct sif_dev *sdev, loff_t pos) +{ + struct sif_sq *sq; + struct sif_qp *qp; + struct sif_table *sctp = &sdev->ba[sq_cmpl]; + u64 virt_base; + dma_addr_t val; + u64 pte_cnt, i; + dma_addr_t dma_start; + struct sif_mmu_ctx *ctx = &sctp->mmu_ctx; + + if (unlikely(pos < 0)) { + u64 table_ptr = sif_pt_dma_root(ctx->pt); + + seq_printf(s, "# - mmu_cntx: root %016llx, level %d\n", + table_ptr, sctp->mmu_ctx.mctx.table_level); + seq_puts(s, "# Index psif vaddr #pages @pte[0] pte[0..]\n"); + return; + } + sq = get_sif_sq(sdev, pos); + qp = get_sif_qp(sdev, pos); + virt_base = sctp->mmu_ctx.base + (u64)sq->index * sctp->ext_sz; + + /* Check for QP0/1 which is reserved but not initialized */ + if (sq->entries == 0) + return; + + /* Only QPs with multipacket support is mapped here; */ + if (!reliable_qp(qp->type)) + return; + + if (sif_pt_entry(ctx->pt, virt_base, &dma_start, &val)) + return; + + pte_cnt = 1; /* TBD: read the correct value to report all pages the pt refers to */ + seq_printf(s, " %6lld %016llx %6lld @%pad: [", pos, virt_base, pte_cnt, &dma_start); + for (i = 0; i < pte_cnt; i++) { + if (i > 0) + seq_puts(s, ","); + seq_printf(s, "%pad", &val); + } + seq_puts(s, "]\n"); +} + + +bool multipacket_qp(enum psif_qp_trans type) +{ + switch (type) { + case PSIF_QP_TRANSPORT_RC: + case PSIF_QP_TRANSPORT_UC: + case PSIF_QP_TRANSPORT_XRC: + return true; + default: + return false; + } +} + + +bool reliable_qp(enum psif_qp_trans type) +{ + return + type == PSIF_QP_TRANSPORT_RC || + type == PSIF_QP_TRANSPORT_XRC; +} diff --git a/drivers/infiniband/hw/sif/sif_sq.h b/drivers/infiniband/hw/sif/sif_sq.h new file mode 100644 index 
000000000000..6d0cc306f4c3 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_sq.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_sq.h: Implementation of the send queue side of an IB queue pair + */ + +#ifndef __SIF_SQ_H +#define __SIF_SQ_H + +struct sif_sq_hdl { + u64 wr_id; /* Stored work id */ + u32 sq_seq; /* Extra sanity checks */ + bool used; +}; + + +struct sif_sq { + volatile struct psif_sq_hw d; /* Hardware descriptor */ + /* Serializes access to sq_sw->last_seq (alloc of new sqes): */ + spinlock_t lock ____cacheline_internodealigned_in_smp; + struct sif_mmu_ctx mmu_ctx; + int index; /* Send queue index (same as the qp index) */ + int cq_idx; /* Default send compl.queue index to use */ + u32 sg_entries; /* Max send scatter/gather configured for this sq */ + u16 entries; + u16 mask; /* entries - 1 for modulo using & */ + u16 max_outstanding; /* Longest observed send queue len */ + u8 complete_all; /* Gets or'ed into completion bit in WRs */ + u32 extent; + u32 sgl_offset; /* Offset from start of the sqe where the sgl starts */ + bool user_mode; /* Set if this is an SQ to be mapped to user space */ + struct sif_mem *mem; /* Allocated queue memory */ + void *wr_hdl; /* map from sq entry index to wr_id + optional bookkeeping */ + int wr_hdl_sz; /* Sz of each elem. in wr_hdl - PQP and std send path uses different sizes */ + struct sif_mr *sg_mr; /* DMA val.entry for the sge list when in the send queue */ + struct psif_rq_scatter tmp_sge[16]; /* Temp.storage for buildup of LE sge list */ +}; + + +/* Lookup function for the handle for a particular request: */ +static inline struct sif_sq_hdl *get_sq_hdl(struct sif_sq *sq, u32 seq) +{ + return (struct sif_sq_hdl *)(sq->wr_hdl + sq->wr_hdl_sz * (seq & sq->mask)); +} + +int sif_sq_cmpl_setup(struct sif_table *tp); + +int sif_alloc_sq(struct sif_dev *sdev, struct sif_pd *pd, + struct sif_qp *qp, struct ib_qp_cap *cap, + bool user_mode, int sq_hdl_sz); + +void sif_free_sq(struct sif_dev *sdev, struct sif_qp *qp); + +int sif_flush_sqs(struct sif_dev *sdev, struct sif_sq *sq); + +int sif_sq_cmpl_map_sq(struct sif_dev *sdev, struct sif_sq *sq); +int sif_sq_cmpl_unmap_sq(struct sif_dev *sdev, struct sif_sq *sq); + +/* Line printers for debugfs files */ +void sif_dfs_print_sq_hw(struct seq_file *s, struct sif_dev *sdev, loff_t pos); +void sif_dfs_print_sq_cmpl(struct seq_file *s, struct sif_dev *sdev, loff_t pos); + +bool multipacket_qp(enum psif_qp_trans type); +bool reliable_qp(enum psif_qp_trans type); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_srq.c b/drivers/infiniband/hw/sif/sif_srq.c new file mode 100644 index 000000000000..db40c54829d7 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_srq.c @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_srq.c: Interface to shared receive queues for SIF + */ + +#include +#include "sif_dev.h" +#include "sif_qp.h" +#include "sif_srq.h" +#include "sif_base.h" +#include "sif_defs.h" +#include "sif_sndrcv.h" + +struct ib_srq *sif_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata) +{ + int rq_idx; + struct sif_dev *sdev = to_sdev(ibpd->device); + struct sif_rq *rq; + ulong user_flags = 0; + int ret = 0; + bool user_mode = udata != NULL; + + if (sif_feature(disable_srq)) + return ERR_PTR(-EOPNOTSUPP); + + if (udata) { + struct sif_create_srq_ext cmd; + + ret = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + if (ret) + goto err_create_srq; + user_flags = cmd.flags; + + if (sif_vendor_enable(SVF_kernel_mode, user_flags)) + user_mode = false; + } + + sif_log(sdev, SIF_SRQ, "%s", (user_mode ? "(user)" : "(kernel)")); + + rq_idx = alloc_rq(sdev, to_spd(ibpd), srq_init_attr->attr.max_wr, + srq_init_attr->attr.max_sge, srq_init_attr, user_mode); + if (rq_idx < 0) { + ret = rq_idx; + goto err_create_srq; + } + + rq = get_sif_rq(sdev, rq_idx); + + if (udata) { + struct sif_create_srq_resp_ext resp; + + memset(&resp, 0, sizeof(resp)); + resp.index = rq_idx; + resp.extent = rq->extent; + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) + goto err_udata; + } + + srq_init_attr->attr.max_wr = rq->entries_user; + + return &rq->ibsrq; +err_udata: + free_rq(sdev, rq->index); +err_create_srq: + return ERR_PTR(ret); +} + +#define ARM_SRQ_HOLDOFF (10 + jiffies) + +static int sif_arm_srq(struct sif_dev *sdev, struct sif_rq *srq, u32 srq_limit) +{ + int ret; + struct psif_wr wr; + struct psif_cq_entry *cqe; + DECLARE_SIF_CQE_POLL_WITH_RR_PQP(sdev, lcqe); + struct sif_pqp *pqp = lcqe.pqp; + + if (unlikely(!pqp)) + return -EAGAIN; + + memset(&wr, 0, sizeof(struct psif_wr)); + + wr.completion = 1; + wr.op = PSIF_WR_SET_SRQ_LIM; + wr.details.su.srq_lim = srq_limit; + wr.details.su.u2.rq_id = srq->index; + +try_again: + if (time_is_after_jiffies((unsigned long)atomic64_read(&pqp->qp->arm_srq_holdoff_time))) { + cpu_relax(); + goto try_again; + } + + atomic64_set(&pqp->qp->arm_srq_holdoff_time, ARM_SRQ_HOLDOFF); + pqp->qp->srq_idx = srq->index; + + ret = sif_pqp_poll_wr(sdev, &wr, &lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "pqp request failed with errno %d", ret); + return ret; + } + + cqe = &lcqe.cqe; + if (cqe->status != PSIF_WC_STATUS_SUCCESS) { + sif_log(sdev, SIF_INFO, "failed with status %s(%d) for cq_seq %d", + string_enum_psif_wc_status(cqe->status), cqe->status, cqe->seq_num); + return -EIO; + } + + srq->srq_limit = srq_limit; + + return 0; +} + +int sif_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct sif_dev *sdev = to_sdev(ibsrq->device); + struct sif_rq *srq = to_srq(ibsrq); + u16 srq_limit; + int ret; + + if (attr_mask & IB_SRQ_MAX_WR) { + sif_log(sdev, SIF_SRQ, "SRQ_MAX_WR not supported"); + return -EINVAL; + } + + if (attr_mask & IB_SRQ_LIMIT) { + srq_limit = attr->srq_limit & 0x3fff; + if (srq_limit >= srq->entries) + return -EINVAL; + + ret = sif_arm_srq(sdev, srq, srq_limit); + if (ret) + return ret; + } + return 0; +} + +int sif_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct sif_rq *srq = to_srq(ibsrq); + + attr->max_wr = srq->entries; + attr->max_sge = srq->sg_entries; + attr->srq_limit = srq->srq_limit; + + return 0; +} + +int 
sif_destroy_srq(struct ib_srq *ibsrq) +{ + int sts; + struct sif_dev *sdev = to_sdev(ibsrq->device); + struct sif_rq *rq = to_srq(ibsrq); + + sif_log(sdev, SIF_SRQ, "rq %d", rq->index); + + if (atomic_read(&rq->refcnt) > 1) + return -EBUSY; + + /* An SRQ cannot be flushed with flushed-in-error completions + * as we don't know which completion queue to generate + * the flushed-in-error completions for, and this should be fine + * from a standards perspective: + * IB spec refs: 10.2.9.4, 11.2.3.4. + */ + sts = sif_invalidate_rq_hw(sdev, rq->index, PCM_WAIT); + if (sts) { + sif_log(sdev, SIF_INFO, + "Invalidate rq_hw failed"); + } + + return free_rq(sdev, rq->index); +} + +int sif_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr) +{ + struct sif_dev *sdev = to_sdev(ibsrq->device); + struct sif_rq *rq = to_srq(ibsrq); + + sif_logi(ibsrq->device, SIF_SRQ, "rq %d (SRQ)", rq->index); + + return post_recv(sdev, NULL, rq, recv_wr, bad_recv_wr); +} diff --git a/drivers/infiniband/hw/sif/sif_srq.h b/drivers/infiniband/hw/sif/sif_srq.h new file mode 100644 index 000000000000..8ea4b32b70bd --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_srq.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_srq.h: Interface to internal Shared receive queue logic for SIF + */ + +#ifndef __SIF_SRQ_H +#define __SIF_SRQ_H + +struct ib_srq *sif_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); +int sif_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask, struct ib_udata *udata); +int sif_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr); +int sif_destroy_srq(struct ib_srq *ibsrq); + +int sif_post_srq_recv(struct ib_srq *ibsrq, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_tqp.c b/drivers/infiniband/hw/sif/sif_tqp.c new file mode 100644 index 000000000000..2c883481eaed --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_tqp.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Wei Lin Guay + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_tqp.c: Implementation of EPSA tunneling QP for SIF + */ +#include +#include +#include "sif_tqp.h" +#include "psif_hw_setget.h" +#include "sif_defs.h" + +/* + * This is a host-EPSA mailbox function that is called via ib_post_send() + * The conditions and assumptions are:- + * 1. qp_type == IB_QPT_EPSA_TUNNELING. + * 2. opcode == IB_WR_SEND_WITH_IMM + * 3. Only receive completion - no send completion will be generated. + * 4. Only the first wr.sge will be handled. + * 5. 
wr.ex.imm_data is the EPSA number (EPSA_N), which must be in the range 0-3.
+ */
+int sif_epsa_tunneling_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+				 struct ib_send_wr **bad_wr)
+{
+	struct psif_epsc_csr_req req;
+	struct psif_epsc_csr_rsp rsp;
+	struct sif_dev *sdev = to_sdev(ibqp->device);
+
+	/* The status of the epsa mailbox communication is logged in the received cq: */
+	struct sif_cq *cq = to_scq(ibqp->recv_cq);
+	struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index);
+	volatile struct psif_cq_entry *cqe;
+	struct psif_cq_entry lcqe;
+	u32 seqno;
+	int ret;
+
+	memset(&req, 0, sizeof(req));
+	memset(&rsp, 0, sizeof(rsp));
+
+	req.uf = 0;
+	req.opcode = EPSC_A_COMMAND;
+	req.u.epsa_cmd.cmd = EPSA_GENERIC_CMD;
+	req.u.epsa_cmd.length = wr->sg_list[0].length;
+	req.u.epsa_cmd.host_addr = wr->sg_list[0].addr;
+	req.u.epsa_cmd.key = wr->sg_list[0].lkey;
+
+	if (wr->ex.imm_data > 3) {
+		sif_log(sdev, SIF_INFO, "Failed to post WR: invalid EPSA number in imm_data");
+		return -EINVAL;
+	}
+
+	sif_log(sdev, SIF_SND, "len %d host addr 0x%llx key 0x%x",
+		req.u.epsa_cmd.length, req.u.epsa_cmd.host_addr, req.u.epsa_cmd.key);
+
+	ret = sif_eps_wr(sdev, u32_to_mbox(wr->ex.imm_data), &req, &rsp);
+
+	seqno = cq_sw->next_seq;
+	cqe = get_cq_entry(cq, seqno);
+
+	memset(&lcqe, 0, sizeof(lcqe));
+	/* construct the required info for WC during poll_cq.
+	 * As for now include the wr_id, mailbox status, qp_num, and status:
+	 */
+	lcqe.seq_num = seqno;
+	lcqe.wc_id.rq_id = wr->wr_id;
+	lcqe.vendor_err = rsp.status;
+	lcqe.qp = ibqp->qp_num;
+	lcqe.status = ret == 0 ? PSIF_WC_STATUS_SUCCESS : PSIF_WC_STATUS_GENERAL_ERR;
+
+	copy_conv_to_hw(cqe, &lcqe, sizeof(*cqe));
+
+	return ret;
+}
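
For reference, a minimal caller-side sketch of how the tunneling QP above might be used; it is illustrative only and not part of the patch. It assumes an already created EPSA tunneling QP (tqp), a DMA-mapped buffer described by dma_addr/len/lkey, and an epsa_num in the 0-3 range accepted by sif_epsa_tunneling_post_send(); post_epsa_buffer() is a hypothetical helper name.

#include <rdma/ib_verbs.h>

/* Hypothetical helper, for illustration only */
static int post_epsa_buffer(struct ib_qp *tqp, u64 dma_addr, u32 len,
			    u32 lkey, u32 epsa_num)
{
	struct ib_sge sge = {
		.addr   = dma_addr,	/* becomes req.u.epsa_cmd.host_addr */
		.length = len,		/* becomes req.u.epsa_cmd.length */
		.lkey   = lkey,		/* becomes req.u.epsa_cmd.key */
	};
	struct ib_send_wr wr = {
		.wr_id       = 1,
		.sg_list     = &sge,
		.num_sge     = 1,		/* only the first sge is used */
		.opcode      = IB_WR_SEND_WITH_IMM,
		.ex.imm_data = epsa_num,	/* selects the EPSA mailbox (0..3) */
	};
	struct ib_send_wr *bad_wr;

	/* No send completion is generated; the outcome is reported as a
	 * receive completion on tqp->recv_cq with vendor_err holding the
	 * mailbox status.
	 */
	return ib_post_send(tqp, &wr, &bad_wr);
}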