--- /dev/null
+/*
+ * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_pt.c: SIF (private) page table management
+ */
+
+#include <linux/mm.h>
+#include <linux/kref.h>
+#include <linux/seq_file.h>
+#include "sif_dev.h"
+#include "sif_mr.h"
+#include "sif_mem.h"
+#include "sif_pt.h"
+#include "sif_base.h"
+
+/* A kmem_cache to allocate the nodes in the rb_trees */
+static struct kmem_cache *pt_page_cache;
+
+static inline void *sif_pt_cache_alloc(struct sif_dev *sdev, gfp_t flags)
+{
+#ifdef CONFIG_NUMA
+ void *n;
+
+ n = kmem_cache_alloc_node(pt_page_cache, flags, sdev->pdev->dev.numa_node);
+ if (n)
+ return n;
+
+ sif_log(sdev, SIF_INFO, "Warning: unable to allocate mem on numa node %d",
+ sdev->pdev->dev.numa_node);
+#endif
+ return kmem_cache_alloc(pt_page_cache, flags);
+}
+
+
+/* Defined below */
+static int init_top(struct sif_pt *pt, u64 vstart, int npages);
+
+
+int sif_pt_init(void)
+{
+ pt_page_cache = KMEM_CACHE(sif_pt_page, 0);
+ if (!pt_page_cache)
+ return -ENOMEM;
+ sif_log0(SIF_INFO, "order PAGE_SIZE = %d", order_base_2(PAGE_SIZE));
+ return 0;
+}
+
+void sif_pt_exit(void)
+{
+ kmem_cache_destroy(pt_page_cache);
+}
+
+/* some utilities: */
+
+/* Find the optimal page size (represented by the leaf level)
+ * to use based on device capabilities, configuration and a max_shift
+ * value (typically based on the contiguousness of the memory).
+ * The result is adjusted against the corresponding pair of virtual
+ * and dma addresses to ensure that it is possible to create a mapping at that
+ * level. pte_ext_shift is set to the log2 of the stride between
+ * each valid pte (for the odd sized leaf pages).
+ * @vaddr and @dma_addr are the virtual and DMA start addresses of the mapping
+ * and must agree on their offset within the base page.
+ */
+int find_optimal_leaf_level(struct sif_dev *sdev, u32 max_shift,
+ u64 vaddr, u64 dma_addr, u64 size,
+ u8 *leaf_level, u8 *pte_ext_shift)
+{
+ u32 shift, adj_page_shift, page_shift;
+ unsigned long smallest_misalign;
+ u32 bits = sizeof(dma_addr_t) << 3;
+
+ /* Page size not supported by the device configuration.
+ * TBD: Remove (should not happen unless there is a programming error)
+ */
+ if (sdev->mi.page_shift > max_shift) {
+ sif_log(sdev, SIF_INFO,
+ "Failed to find a valid leaf level (page_shift %d, max_shift %d)",
+ sdev->mi.page_shift, max_shift);
+ return -EINVAL;
+ }
+
+ *leaf_level = 0;
+ *pte_ext_shift = 0;
+ shift = sdev->mi.page_shift;
+
+ switch (shift) {
+ case 12:
+ /* Device configured for Intel page sizes:
+ * In x86 mode for PSIF 2.1 only 4K base page size is supported
+ */
+ if (max_shift < 21)
+ break;
+ *leaf_level = 1;
+ if (max_shift < 30)
+ break;
+ *leaf_level = 2;
+ break;
+ case 13: /* Device configured for Sparc page sizes */
+ if (max_shift < 16)
+ break;
+ *pte_ext_shift = 3; /* 64K base page - only populate every 8th leaf entry */
+ if (max_shift < 19)
+ break;
+ *pte_ext_shift = 6; /* 512K base page - only populate every 64th leaf entry */
+ if (max_shift < 22)
+ break;
+ *leaf_level = 1;
+ *pte_ext_shift = 0;
+ if (max_shift < 25)
+ break;
+ *pte_ext_shift = 3; /* Fits 32M pages at level 1 - every 8th 4M entry */
+ if (max_shift < 28)
+ break;
+ *pte_ext_shift = 6; /* Fits 256M pages at level 1 - every 64th 4M entry */
+ if (max_shift < 31)
+ break;
+ *leaf_level = 2;
+ *pte_ext_shift = 0; /* Fits 2GB pages at level 2 */
+ if (max_shift < 34)
+ break;
+ *pte_ext_shift = 3; /* Fits 16GB pages at level 2 - every 8th 2GB entry */
+ if (max_shift < 37)
+ break;
+ break;
+ default:
+ BUG();
+ }
+ if (*leaf_level) {
+ page_shift = shift + (*leaf_level * sdev->mi.level_shift);
+ smallest_misalign = (dma_addr ^ vaddr) & ((1 << page_shift) - 1);
+ if (smallest_misalign & ~PAGE_MASK) {
+ sif_log(sdev, SIF_INFO,
+ "Failed to create page table: misaligned VA/DMA (0x%lx) dma 0x%llx vaddr 0x%llx",
+ smallest_misalign, dma_addr, vaddr);
+ return -EINVAL;
+ }
+
+ if (smallest_misalign) {
+ adj_page_shift = find_first_bit(&smallest_misalign, bits);
+ *leaf_level = (adj_page_shift - shift) / sdev->mi.level_shift;
+ sif_log(sdev, SIF_PT,
+ "misaligned VA/DMA adj: leaf_level %d, page_shift %d, smallest_misalign 0x%lx, adj_page_shift %d",
+ *leaf_level,
+ page_shift, smallest_misalign, adj_page_shift);
+ page_shift = adj_page_shift;
+ }
+ /* TBD: Remove - just for debugging */
+ if (*leaf_level > 3) {
+ sif_log(sdev, SIF_INFO,
+ "haywire leaf level %d - should not be possible - setting safe value 0",
+ *leaf_level);
+ *leaf_level = 0;
+ return -EINVAL;
+ }
+ if (*leaf_level) {
+ /* Check if we can do equally well with a lower level pointer */
+ int size_order = order_base_2(size);
+ int size_shift = page_shift - size_order;
+
+ if (size_shift < 0)
+ goto out;
+ sif_log(sdev, SIF_PT, "order %d page_shift %d size_shift %d",
+ size_order, page_shift, size_shift);
+ if (size_shift > 0) {
+ u32 new_leaf_level =
+ ((page_shift - size_shift + sdev->mi.level_shift - 1 - shift)
+ / sdev->mi.level_shift);
+ sif_log(sdev, SIF_PT, "new_leaf_level %d", new_leaf_level);
+ if (new_leaf_level < *leaf_level) {
+ *leaf_level = new_leaf_level;
+ sif_log(sdev, SIF_PT,
+ "size_shift %d, size adjusted leaf_level %d",
+ size_shift, *leaf_level);
+ }
+ }
+ }
+ }
+out:
+ sif_log(sdev, SIF_PT, "shift %d leaf_level %d", shift, *leaf_level);
+ return 0;
+}
+
+/* Find the aligned size of a region for a given page alignment
+ * (i.e. the number of bytes covered by the pages of size @alignment
+ * needed to address (start, len))
+ */
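+/* E.g. with 4K alignment, aligned_size(0x1800, 0x1000, 0x1000) covers the two
+ * pages at 0x1000 and 0x2000 and returns 0x2000 (illustrative values only).
+ */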
+u64 aligned_size(u64 start, u64 len, u64 alignment)
+{
+ u64 mask = alignment - 1;
+ u64 aligned_start = start & ~mask;
+ u64 aligned_end = (start + len + mask) & ~mask;
+
+ return aligned_end - aligned_start;
+}
+
+/* Find the union of the two ranges including non-overlapped parts */
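+/* E.g. merging (start 0x1000, size 0x2000) with (start 0x2000, size 0x4000)
+ * yields start 0x1000 and size 0x5000 (illustrative values only).
+ */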
+static u64 merge_ranges(u64 start1, u64 size1, u64 start2, u64 size2, u64 *new_size)
+{
+ u64 new_start = min(start1, start2);
+ u64 new_end = max(start1 + size1, start2 + size2);
+ *new_size = new_end - new_start;
+ return new_start;
+}
+
+static u32 level_to_pageshift(struct sif_pt *pt, int level)
+{
+ struct sif_mem_info *mi = &pt->sdev->mi;
+
+ level++;
+ if (level < 0 || level > 4)
+ sif_log(pt->sdev, SIF_INFO, "level %d", level);
+ BUG_ON(level < 0 || level > 4);
+ return mi->page_shift + mi->level_shift * level;
+}
+
+static u64 level_to_pagesize(struct sif_pt *pt, int level)
+{
+ return (1ull << level_to_pageshift(pt, level));
+}
+
+static u64 level_to_pagemask(struct sif_pt *pt, int level)
+{
+ return (level_to_pagesize(pt, level) - 1);
+}
+
+
+u32 sif_pt_page_shift(struct sif_pt *pt)
+{
+ return level_to_pageshift(pt, pt->leaf_level - 1);
+}
+
+/* Find the required amount of page table memory, in number of
+ * page table page sized pages.
+ * If pt->fixed_top is set, reserve space for a final page at each of the levels
+ * even if only one entry is necessary.
+ *
+ * NB! Sets pt->top_level as a side effect
+ */
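+/* Rough example (assumptions: 4K base pages, 512 ptes per page, leaf_level 0,
+ * fixed_top not set): a naturally aligned 1G mapping needs 512 pte pages plus
+ * a single pmd page, and top_level is set to 1.
+ */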
+static u32 table_mem_need(struct sif_pt *pt, u64 vstart, u64 mapsize)
+{
+ u64 aligned_size_pte;
+ u64 aligned_size_pmd;
+ u64 aligned_size_pud;
+ u64 aligned_size_pgd;
+ u64 aligned_size_pml4;
+ u64 psz;
+ int nptes, npmds, npuds, npgds, pte_pages;
+ int pshift;
+ /* If we need to guarantee that the top node remains the same, we must build
+ * a max level page table
+ */
+ int single = pt->fixed_top ? 1 : 0;
+ struct sif_dev *sdev = pt->sdev;
+
+ /* Determine what setup to use for the kmem object based on the initial mapsize:
+ * We use 4K pages for now, and set sg_size to the number of pages needed to
+ * support mapsize + the full chain of pages if we need a 4-level table:
+ */
+ psz = sdev->mi.page_size;
+ aligned_size_pte = aligned_size(vstart, mapsize, psz);
+ psz <<= sdev->mi.level_shift;
+ aligned_size_pmd = aligned_size(vstart, mapsize, psz);
+ psz <<= sdev->mi.level_shift;
+ aligned_size_pud = aligned_size(vstart, mapsize, psz);
+ psz <<= sdev->mi.level_shift;
+ aligned_size_pgd = aligned_size(vstart, mapsize, psz);
+ psz <<= sdev->mi.level_shift;
+ aligned_size_pml4 = aligned_size(vstart, mapsize, psz);
+
+ sif_log(pt->sdev, SIF_MMU, "aligned lengths: pte %llx pmd %llx pud %llx pgd %llx pml4 %llx",
+ aligned_size_pte, aligned_size_pmd, aligned_size_pud,
+ aligned_size_pgd, aligned_size_pml4);
+
+ pshift = sdev->mi.page_shift + sdev->mi.level_shift;
+ nptes = aligned_size_pmd >> pshift;
+ pshift += sdev->mi.level_shift;
+ npmds = nptes > 1 ? aligned_size_pud >> pshift : single;
+ pshift += sdev->mi.level_shift;
+ npuds = npmds > 1 ? aligned_size_pgd >> pshift : single;
+ pshift += sdev->mi.level_shift;
+ npgds = npuds > 1 ? aligned_size_pml4 >> pshift : single;
+
+ pte_pages = pt->leaf_level ? 0 : nptes;
+
+ sif_log(pt->sdev, SIF_MMU, "npgds %d, npuds %d, npmds: %d, pte_pages %d",
+ npgds, npuds, npmds, pte_pages);
+
+ pt->top_level = single ? 3 : (npgds ? 3 : (npuds ? 2 : (npmds ? 1 : 0)));
+ return pte_pages + npmds + npuds + npgds;
+}
+
+/* Find the page table entry index of the pte referring to
+ * the page starting at @vaddr, given the page shift of that level
+ */
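+/* E.g. with 512 ptes per page and page_shift 21, vaddr 0x40200000 yields
+ * index (0x40200000 >> 21) & 511 == 1 (illustrative values only).
+ */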
+static inline int sif_pte_index(struct sif_dev *sdev, u64 vaddr, u64 page_shift)
+{
+ return (vaddr >> page_shift) & (sdev->mi.ptes_per_page - 1);
+}
+
+
+
+
+static void pt_free_page(struct sif_pt *pt, struct sif_pt_page *n)
+{
+ list_add_tail(&n->list, &pt->freelist);
+ n->parent = NULL;
+ n->vaddr = 0;
+}
+
+
+/* Destructor callback for kref */
+static void sif_pt_release(struct kref *kref)
+{
+ struct sif_pt *pt = container_of(kref, struct sif_pt, refcnt);
+ struct list_head *np;
+ struct list_head *npp;
+ struct sif_pt_page *n;
+
+ sif_log(pt->sdev, SIF_MMU_V, "at %p", pt);
+
+ if (pt->top)
+ pt_free_page(pt, pt->top);
+
+ /* Actual cleanup */
+ list_for_each_safe(np, npp, &pt->freelist) {
+ n = list_entry(np, struct sif_pt_page, list);
+ kfree(n);
+ }
+ if (pt->m.sg_size)
+ sif_kmem_free(pt->sdev, &pt->m);
+ kfree(pt);
+}
+
+
+/* Create a sif_page_table object, and if @size > 0,
+ * map the range starting at @sg to a mapping starting at virtual
+ * address @vstart with size @size, using @page_shift bits within each page.
+ * The object can later be resized using sif_pt_extend/sif_pt_free_part.
+ * Set @modifiable to allow the table to be extended and shrunk.
+ * Set @fixed_top to have pt guarantee that the top node remains constant,
+ * in which case it will always be a level 4 tree.
+ */
+struct sif_pt *sif_pt_create(struct sif_dev *sdev, struct scatterlist *sg,
+ u64 vstart, size_t size, u32 page_shift,
+ bool modifiable, bool fixed_top)
+{
+ int ret = 0;
+ int i;
+ dma_addr_t dma_start = sg ? sg_dma_address(sg) : 0;
+ struct sif_pt *pt = sif_kmalloc(sdev, sizeof(*pt), GFP_KERNEL | __GFP_ZERO);
+
+ if (!pt)
+ return NULL;
+
+ /* sub-page misalignment in vstart must correspond with
+ * misalignment in dma address but sg entries are page aligned:
+ */
+ dma_start += vstart & ~PAGE_MASK;
+
+ sif_log(sdev, SIF_MMU, "vstart %llx, size %lx, page_shift %d%s", vstart, size,
+ page_shift, (modifiable ? " (modifiable)" : ""));
+ pt->sdev = sdev;
+ pt->fixed_top = fixed_top;
+ pt->modifiable = modifiable;
+
+ ret = find_optimal_leaf_level(sdev, page_shift,
+ vstart, dma_start, size,
+ &pt->leaf_level, &pt->pte_ext_shift);
+ if (ret)
+ goto extend_failed;
+
+ pt->page_shift = sdev->mi.page_shift + pt->leaf_level * sdev->mi.level_shift;
+ pt->ptes_per_page = 1 << sdev->mi.level_shift;
+
+ for (i = 0; i < PT_LEVELS; i++)
+ pt->pmd[i] = RB_ROOT;
+ kref_init(&pt->refcnt);
+ mutex_init(&pt->lock);
+ INIT_LIST_HEAD(&pt->freelist);
+
+ ret = sif_pt_extend(pt, sg, vstart, size);
+ if (ret < 0)
+ goto extend_failed;
+ return pt;
+
+extend_failed:
+ kfree(pt);
+ return NULL;
+}
+
+
+struct sif_pt *sif_pt_create_for_mem(struct sif_mem *mem,
+ u64 vstart, u32 page_shift, bool modifiable, bool fixed_top)
+{
+ int ret = 0;
+ int i;
+ struct sif_dev *sdev = mem->sdev;
+ struct sif_pt *pt = sif_kmalloc(sdev, sizeof(*pt), GFP_KERNEL | __GFP_ZERO);
+ size_t size = mem->size;
+
+ if (!pt)
+ return NULL;
+
+ sif_log(sdev, SIF_MMU, "vstart %llx, size %lx, page_shift %d%s", vstart, size,
+ page_shift, (modifiable ? " (modifiable)" : ""));
+ pt->sdev = sdev;
+ pt->fixed_top = fixed_top;
+ pt->modifiable = modifiable;
+ ret = find_optimal_leaf_level(sdev, page_shift,
+ vstart, sif_mem_dma(mem, 0), size,
+ &pt->leaf_level, &pt->pte_ext_shift);
+ if (ret)
+ goto extend_failed;
+
+ pt->page_shift = sdev->mi.page_shift + pt->leaf_level * sdev->mi.level_shift;
+ pt->ptes_per_page = 1 << sdev->mi.level_shift;
+
+ for (i = 0; i < PT_LEVELS; i++)
+ pt->pmd[i] = RB_ROOT;
+ kref_init(&pt->refcnt);
+ mutex_init(&pt->lock);
+ INIT_LIST_HEAD(&pt->freelist);
+
+ ret = sif_pt_extend_with_mem(pt, mem, vstart);
+ if (ret < 0)
+ goto extend_failed;
+ return pt;
+
+extend_failed:
+ kfree(pt);
+ return NULL;
+}
+
+
+/* Create an empty, extendable sif page table object */
+struct sif_pt *sif_pt_create_empty(struct sif_dev *sdev, u64 vstart, enum sif_mem_type map_mt)
+{
+ u32 page_shift = sdev->mi.page_shift;
+ struct sif_pt *pt;
+ int ret;
+
+ if (map_mt == SIFMT_2M)
+ page_shift += sdev->mi.level_shift;
+
+ pt = sif_pt_create(sdev, NULL, vstart, 0, page_shift, true, map_mt == SIFMT_CS);
+ if (!pt)
+ return NULL;
+
+ if (map_mt == SIFMT_CS) {
+ /* Allocate an empty top page table page to get an address to send to PSIF: */
+ pt->top_level = 3;
+ ret = init_top(pt, 0, 1);
+ if (ret) {
+ sif_kmem_free(pt->sdev, &pt->m);
+ return NULL;
+ }
+ }
+ return pt;
+}
+
+
+/* DMA address of root pointer of page table */
+dma_addr_t sif_pt_dma_root(struct sif_pt *pt)
+{
+ return pt->top ? sg_dma_address(pt->top->page) : 0;
+}
+
+/* SIF level of root pointer */
+u8 sif_pt_root_table_level(struct sif_pt *pt)
+{
+ return pt->top_level + 1;
+}
+
+
+/* Create sif_pt_page objects for @npages new pages for the page list in @sgl
+ * and insert them into the freelist:
+ */
+static int add_pages_to_freelist(struct sif_pt *pt, struct scatterlist *sgl, int npages)
+{
+ struct scatterlist *sg;
+ struct sif_pt_page *n;
+ int i;
+
+ for_each_sg(sgl, sg, npages, i) {
+ n = sif_pt_cache_alloc(pt->sdev, GFP_KERNEL | __GFP_ZERO);
+ if (!n)
+ return -ENOMEM;
+ sif_log(pt->sdev, SIF_MMU_V, "i = %d: sg %p", i, sg);
+ n->page = sg;
+ list_add_tail(&n->list, &pt->freelist);
+ }
+ return 0;
+}
+
+
+/* TBD: Consider allocating more than a single page at a time from @m object
+ * as sif_kmem_find_sg_list is O(n) where n is the number of sg arrays in @m.
+ */
+static struct sif_pt_page *pt_alloc_page(struct sif_pt *pt, u64 vaddr)
+{
+ int ret;
+ struct scatterlist *sg;
+ struct sif_pt_page *n;
+
+ if (list_empty(&pt->freelist)) {
+ ret = sif_kmem_extend(pt->sdev, &pt->m, PAGE_SIZE, GFP_KERNEL);
+ if (ret < 0)
+ goto failed;
+ sg = sif_kmem_find_sg_idx(&pt->m, ret);
+ ret = add_pages_to_freelist(pt, sg, 1);
+ if (ret < 0)
+ goto failed;
+ }
+
+ n = list_first_entry(&pt->freelist, struct sif_pt_page, list);
+ list_del(&n->list);
+ n->vaddr = vaddr;
+ return n;
+failed:
+ return ERR_PTR(ret);
+}
+
+
+
+static struct sif_pt_page *replace_top(struct sif_pt *pt, u64 vaddr)
+{
+ /* insert a new top node, put the old one into the
+ * empty rbtree for this level, and link the old top node from
+ * the new top:
+ */
+ u64 aligned_vaddr, top_pagesize;
+ u64 pt_shift, ptv;
+ u64 *pmd;
+ int i;
+ struct sif_pt_page *ep;
+ struct sif_dev *sdev = pt->sdev;
+
+ if (pt->top->usecnt == 1) {
+ /* Top node not used, just reuse with different va */
+ pt->top->vaddr = vaddr;
+ return pt->top;
+ }
+
+ pt->top->usecnt--;
+ /* Loop until we have a top node that spans vaddr */
+ do {
+ int level = pt->top_level;
+ struct rb_root *root = &pt->pmd[level];
+ struct rb_node **np = &root->rb_node;
+
+ top_pagesize = level_to_pagesize(pt, ++pt->top_level);
+ aligned_vaddr = pt->top->vaddr & ~(top_pagesize - 1);
+
+ rb_link_node(&pt->top->node, NULL, np);
+ rb_insert_color(&pt->top->node, root);
+ ep = pt->top;
+ pt->top = pt_alloc_page(pt, aligned_vaddr);
+ if (IS_ERR(pt->top)) {
+ ep = pt->top;
+ pt->top = NULL;
+ return ep;
+ }
+
+ ep->parent = pt->top;
+ pmd = sg_virt(pt->top->page);
+ pt_shift = level_to_pageshift(pt, level);
+ i = sif_pte_index(sdev, ep->vaddr, pt_shift);
+ ptv = sg_dma_address(ep->page) | PT_PAGE_PRESENT;
+ sif_log(sdev, SIF_MMU_V, "level %d: pmd[%d](%p) = %llx", level, i, &pmd[i], ptv);
+ BUG_ON(pmd[i] != 0);
+ pmd[i] = ptv;
+ pt->top->usecnt++;
+
+ sif_log(sdev, SIF_MMU,
+ "New top node at dma addr %pad level %d - aligned at %llx, page sz. %llx",
+ &sg_dma_address(pt->top->page), pt->top_level, aligned_vaddr, top_pagesize);
+ } while (vaddr < aligned_vaddr || vaddr >= aligned_vaddr + top_pagesize);
+
+ return NULL;
+}
+
+
+
+/* Find the page table page at level @level whose first entry references the sif virtual address @vaddr.
+ * @vaddr is assumed to be aligned to the appropriate alignment for the level.
+ * If the page does not exist, allocate a new one and add it:
+ */
+static struct sif_pt_page *find_insert_page(struct sif_pt *pt, u8 level, u64 vaddr)
+{
+ struct rb_root *root = &pt->pmd[level];
+ struct rb_node **np = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct sif_pt_page *ep;
+ struct sif_dev *sdev = pt->sdev;
+
+ sif_log(sdev, SIF_MMU, "level %d vaddr %llx", level, vaddr);
+ if (level == pt->top_level) {
+ if (likely(vaddr == pt->top->vaddr))
+ return pt->top;
+
+ /* (possibly recursively) build up a new top node that spans both
+ * the old tree and the new subtree:
+ */
+ ep = replace_top(pt, vaddr);
+ if (ep)
+ return ep;
+ }
+
+ while (*np) {
+ ep = container_of(*np, struct sif_pt_page, node);
+ parent = *np;
+ if (vaddr < ep->vaddr)
+ np = &((*np)->rb_left);
+ else if (vaddr > ep->vaddr)
+ np = &((*np)->rb_right);
+ else {
+ sif_log(sdev, SIF_PT,
+ "Level %d: Found page at vaddr %llx with dma addr %pad",
+ level, ep->vaddr, &sg_dma_address(ep->page));
+ return ep;
+ }
+ }
+
+ /* Allocate and insert a new node into the tree */
+ ep = pt_alloc_page(pt, vaddr);
+ if (IS_ERR(ep))
+ return ep;
+
+ sif_log(sdev, SIF_PT, "Allocated new pt page for vaddr %llx with dma addr %pad",
+ vaddr, &sg_dma_address(ep->page));
+
+ rb_link_node(&ep->node, parent, np);
+ rb_insert_color(&ep->node, root);
+ return ep;
+}
+
+
+/* Find an element in the tree for the given level, return NULL if it does not
+ * exist:
+ */
+static struct sif_pt_page *find_page(struct sif_pt *pt, u8 level, u64 vaddr)
+{
+ struct rb_root *root;
+ struct rb_node *n;
+ struct rb_node *parent = NULL;
+ struct sif_pt_page *ep;
+
+ if (level == pt->top_level)
+ return pt->top;
+
+ root = &pt->pmd[level];
+ n = root->rb_node;
+
+ sif_log(pt->sdev, SIF_MMU_V, "level %d vaddr %llx", level, vaddr);
+ while (n) {
+ ep = container_of(n, struct sif_pt_page, node);
+ parent = n;
+ if (vaddr < ep->vaddr)
+ n = n->rb_left;
+ else if (vaddr > ep->vaddr)
+ n = n->rb_right;
+ else
+ return ep;
+ }
+ return NULL;
+}
+
+
+static inline struct sif_pt_page *next_page(struct sif_pt_page *p)
+{
+ struct rb_node *node = rb_next(&p->node);
+
+ if (node)
+ return container_of(node, struct sif_pt_page, node);
+ else
+ return NULL;
+}
+
+static inline struct sif_pt_page *prev_page(struct sif_pt_page *p)
+{
+ struct rb_node *node = rb_prev(&p->node);
+
+ if (node)
+ return container_of(node, struct sif_pt_page, node);
+ else
+ return NULL;
+}
+
+static inline struct sif_pt_page *first_page(struct sif_pt *pt, int level)
+{
+ struct rb_node *node = rb_first(&pt->pmd[level]);
+
+ if (node)
+ return container_of(node, struct sif_pt_page, node);
+ else
+ return NULL;
+}
+
+static inline struct sif_pt_page *last_page(struct sif_pt *pt, int level)
+{
+ struct rb_node *node = rb_last(&pt->pmd[level]);
+
+ if (node)
+ return container_of(node, struct sif_pt_page, node);
+ else
+ return NULL;
+}
+
+
+/* Create the page table tree from the given vaddr upwards, until
+ * we reach an existing node or find the top node. Update use counts on the
+ * involved nodes:
+ */
+static struct sif_pt_page *find_next(struct sif_pt *pt, u8 level, u64 vaddr)
+{
+ u64 vaddr_up = 0;
+ struct sif_pt_page *pt_page_start = find_insert_page(pt, level, vaddr);
+ struct sif_pt_page *pt_page;
+ struct sif_pt_page *pt_parent;
+ struct sif_dev *sdev = pt->sdev;
+ int i;
+
+ if (pt_page_start == pt->top || IS_ERR(pt_page_start))
+ return pt_page_start;
+
+ sif_log(sdev, SIF_MMU_V, "level %d vaddr %llx", level, vaddr);
+
+ pt_page = pt_page_start;
+ for (;;) {
+ u64 pt_shift, ptv;
+ u64 *pmd;
+
+ pt_shift = level_to_pageshift(pt, level);
+ pt_parent = pt_page->parent;
+ level++;
+ if (pt_parent) {
+ /* We found an existing node - rest of the tree upwards is ok */
+ break;
+ }
+ vaddr_up = vaddr & ~level_to_pagemask(pt, level);
+ if (level == pt->top_level && vaddr_up == pt->top->vaddr) {
+ sif_log(sdev, SIF_PT, "found top at level %d", level);
+ pt_parent = pt->top;
+ } else {
+ sif_log(sdev, SIF_PT, "searching at level %d/%d from vaddr %llx",
+ level, pt->top_level, vaddr_up);
+ pt_parent = find_insert_page(pt, level, vaddr_up);
+ }
+
+ if (IS_ERR(pt_parent))
+ return pt_parent;
+
+ pt_page->parent = pt_parent;
+
+ /* Set page pointer in parent */
+ pmd = sg_virt(pt_parent->page);
+ i = sif_pte_index(sdev, vaddr, pt_shift);
+ ptv = sg_dma_address(pt_page->page) | PT_PAGE_PRESENT;
+ sif_log(sdev, SIF_MMU_V, "level %d: pmd[%d](%p) = %llx", level, i, &pmd[i], ptv);
+ WARN_ON(pmd[i] != 0);
+ pmd[i] = ptv;
+
+ pt_parent->usecnt++;
+ if (pt_parent == pt->top || pt_parent->usecnt > 1)
+ break;
+ pt_page = pt_parent;
+ vaddr = vaddr_up;
+ }
+ return pt_page_start;
+}
+
+
+static int populate_pt(struct sif_pt *pt, struct scatterlist *sg,
+ u64 vstart, size_t size)
+{
+ int level = pt->leaf_level;
+ u64 va, vend, incr;
+ u64 pt_shift = level_to_pageshift(pt, level-1); /* page shift for the level below us */
+ u64 page_flags = PT_PAGE_PRESENT;
+ struct sif_dev *sdev = pt->sdev;
+ u64 small_page_misalign;
+ u64 large_page_misalign = 0;
+ off_t sg_offset; /* Running page aligned offset within the current sg */
+
+ /* If level > 0 we must set the PS bit to indicate that this is a leaf node
+ * We also have two levels of alignment to consider:
+ */
+ if (level > 0) {
+ small_page_misalign = vstart & level_to_pagemask(pt, level - 2);
+ large_page_misalign = (vstart & level_to_pagemask(pt, level - 1)) - small_page_misalign;
+ page_flags |= PT_PAGE_PS;
+ } else
+ small_page_misalign = (vstart & level_to_pagemask(pt, level - 1));
+
+
+ /* Populate the table at level @level - assuming no overlap */
+ vend = vstart + size;
+ va = vstart & ~level_to_pagemask(pt, level - 1);
+
+ /* Depending on alignment we might need to point to a DMA address
+ * way ahead of the first sg, but aligned to the first small page size:
+ */
+ sg_offset = -large_page_misalign;
+ incr = level_to_pagesize(pt, level - 1) << pt->pte_ext_shift;
+
+ sif_log(sdev, SIF_PT,
+ "level %d mis (0x%llx,0x%llx) vstart %llx -> %llx size %lx pte_ext_shift %d, incr 0x%llx sg_offset %#lx",
+ level, small_page_misalign, large_page_misalign, vstart, va, size,
+ pt->pte_ext_shift, incr, sg_offset);
+
+ while (va < vend) {
+ struct sif_pt_page *pt_page;
+ u64 *pmd;
+ int i;
+ u64 va_up = va & ~level_to_pagemask(pt, level);
+
+ pt_page = find_next(pt, level, va_up);
+ if (IS_ERR(pt_page))
+ return PTR_ERR(pt_page);
+
+ pmd = sg_virt(pt_page->page);
+ i = sif_pte_index(sdev, va, pt_shift);
+ for (; i < sdev->mi.ptes_per_page && va < vend; i++) {
+ u64 ptv;
+
+ if (!sg) {
+ sif_log(sdev, SIF_INFO,
+ "##### pt at %p: level %d: failed to find next sg at va %llx (vstart,size) = (%llx,%lx))",
+ pt, level, va, vstart, size);
+ return -EIO;
+ }
+ ptv = (sg_dma_address(sg) + sg_offset) | page_flags;
+ WARN_ON(pmd[i] != 0);
+ sif_log(sdev, SIF_PT_V, "va %llx: level %d: pmd[%d](%p) = %llx",
+ va, level, i, &pmd[i], ptv);
+ pmd[i] = ptv;
+ pt_page->usecnt++;
+ va += incr;
+ sg_offset += incr;
+ /* At this point size might be the end aligned size at this level so
+ * make sure to terminate at the end of the sg list:
+ */
+ while (sg && sg_offset >= sg_dma_len(sg)) {
+ if (incr > sdev->mi.page_size)
+ sif_log(sdev, SIF_PT_VV,
+ "sg_offset %#lx sg->length %x sg_dma_len(sg) %x",
+ sg_offset, sg->length, sg_dma_len(sg));
+ sg_offset -= sg_dma_len(sg);
+ sg = sg_next(sg);
+ }
+ /* Note that we must handle both small incr in large pages and opposite! */
+ if (unlikely(sg_offset && sg_offset < incr))
+ return 0; /* We're done - vend in the middle of a higher level page */
+ }
+ }
+
+ return 0;
+}
+
+
+/* sif_mem iterator based page table population - needed for special types */
+static int populate_pt_from_mem(struct sif_pt *pt, struct sif_mem *mem, u64 vstart, bool fast_path)
+{
+ u8 level = pt->leaf_level;
+ u64 va, vend, incr;
+ u64 pt_shift = level_to_pageshift(pt, level-1); /* page shift for the level below us */
+ u64 page_flags = PT_PAGE_PRESENT;
+ struct sif_mem_iter mi;
+ struct sif_dev *sdev = pt->sdev;
+ u64 small_page_misalign;
+ u64 large_page_misalign = 0;
+ off_t sg_offset; /* Running page aligned offset within the current sg */
+
+ /* If level > 0 we must set the PS bit to indicate that this is a leaf node
+ * We also have two levels of alignment to consider:
+ */
+ if (level > 0) {
+ small_page_misalign = vstart & level_to_pagemask(pt, level - 2);
+ large_page_misalign = (vstart & level_to_pagemask(pt, level - 1)) - small_page_misalign;
+ page_flags |= PT_PAGE_PS;
+ } else
+ small_page_misalign = (vstart & level_to_pagemask(pt, level - 1));
+
+ /* Populate the table at level @level - assuming no overlap */
+ vend = vstart + mem->size;
+ va = vstart & ~level_to_pagemask(pt, level - 1);
+
+ /* Depending on alignment we might need to point to a DMA address
+ * way ahead of the first sg, but aligned to the first small page size:
+ */
+ sg_offset = -large_page_misalign;
+ incr = level_to_pagesize(pt, level - 1) << pt->pte_ext_shift;
+ sif_mem_iter_init(mem, &mi);
+
+ sif_log(sdev, SIF_PT,
+ "level %d mis (0x%llx,0x%llx) vstart %llx -> %llx size %llx pte_ext_shift %d, incr 0x%llx sg_offset %#lx",
+ level, small_page_misalign, large_page_misalign, vstart, va, mem->size,
+ pt->pte_ext_shift, incr, sg_offset);
+
+ while (va < vend) {
+ struct sif_pt_page *pt_page;
+ u64 *pmd;
+ int i;
+ u64 va_up = va & ~level_to_pagemask(pt, level);
+
+ pt_page = find_next(pt, level, va_up);
+ if (IS_ERR(pt_page))
+ return PTR_ERR(pt_page);
+
+ pmd = sg_virt(pt_page->page);
+ i = sif_pte_index(sdev, va, pt_shift);
+ for (; i < sdev->mi.ptes_per_page && va < vend; i++) {
+ u64 ptv;
+
+ ptv = (sif_mem_iter_dma(&mi) + sg_offset) | page_flags;
+ BUG_ON(!(ptv & ~0x81));
+ sif_log(sdev, SIF_PT_V, "level %d: pmd[%d](%p) = %llx", level, i, &pmd[i], ptv);
+ pmd[i] = ptv;
+ if (!fast_path)
+ pt_page->usecnt++;
+ va += incr;
+ sg_offset += incr;
+ if (va < vend) {
+ int ret = sif_mem_iter_advance(&mi, sg_offset);
+
+ if (ret) {
+ sif_log(sdev, SIF_MMU_V, "No page for vaddr %llx", va);
+ return ret;
+ }
+ sg_offset = 0;
+ }
+ }
+ }
+
+ return 0;
+}
+
+
+/* (safe) observe leaf node of page table at @vaddr */
+int sif_pt_entry(struct sif_pt *pt, u64 vaddr, dma_addr_t *entry, dma_addr_t *val)
+{
+ int ret = 0;
+ struct sif_pt_page *p;
+ struct sif_dev *sdev = pt->sdev;
+ u64 *pmd;
+ u64 pt_shift;
+ u64 va_up;
+ u8 level;
+ int i, ip;
+
+ mutex_lock(&pt->lock);
+ level = pt->leaf_level;
+ va_up = vaddr & ~level_to_pagemask(pt, level);
+ pt_shift = level_to_pageshift(pt, level-1);
+ p = find_page(pt, level, va_up);
+ if (p) {
+ pmd = sg_virt(p->page);
+ i = sif_pte_index(sdev, vaddr, pt_shift);
+ *val = pmd[i];
+ pmd = sg_virt(p->parent->page);
+ ip = sif_pte_index(sdev, va_up, level_to_pageshift(pt, level));
+ *entry = pmd[ip];
+ sif_log(sdev, SIF_MMU_V,
+ "Page at vaddr %llx, lookup vaddr %llx at index %d: entry(idx = %d): %pad, value: %pad",
+ va_up, vaddr, i, ip, entry, val);
+ } else {
+ sif_log(sdev, SIF_MMU_V, "Page at vaddr %llx not found", va_up);
+ ret = -EINVAL;
+ }
+ mutex_unlock(&pt->lock);
+ return ret;
+}
+
+
+/* Remove the reference to @remove_addr from page @p.
+ * If the use count drops to 0, return the page to the freelist
+ * and (if at leaf level) return the next page in the rb_tree; otherwise return
+ * the same page.
+ */
+static struct sif_pt_page *remove_page_ref(struct sif_pt *pt, struct sif_pt_page *p,
+ u64 remove_addr, u8 level)
+{
+ struct sif_pt_page *np = p;
+ u64 *pmd = sg_virt(p->page);
+ int index = sif_pte_index(pt->sdev, remove_addr, level_to_pageshift(pt, level-1));
+ u64 dma_addr = sg_dma_address(p->page);
+
+ BUG_ON(p->usecnt < 1);
+ pmd[index] = 0;
+
+ p->usecnt--;
+ sif_log(pt->sdev, SIF_PT_VV,
+ "level %d: index = %d ps = %d, page - dma at 0x%llx - use count %d",
+ level, index, level_to_pageshift(pt, level-1), dma_addr, p->usecnt);
+ if (!p->usecnt) {
+ if (p->parent)
+ remove_page_ref(pt, p->parent, p->vaddr, level + 1);
+ else
+ BUG_ON(p != pt->top);
+ if (level == pt->leaf_level)
+ np = next_page(p);
+ if (pt->top != p) /* We dont use the rbtree for the top node */
+ rb_erase(&p->node, &pt->pmd[level]);
+ else
+ pt->top = NULL; /* So we can check if removal is needed in sif_pt_release() */
+ pt_free_page(pt, p);
+ }
+ return np;
+}
+
+/* size of each sg list used to maintain page table pages
+ * when fixed_top is set (currently only used by the sq_cmpl table)
+ * We want it reasonably large as we index in constant time into the list
+ * but use a linear scan to navigate the chain of lists
+ */
+#define FIXED_TOP_SG_SIZE 0x1000
+
+static int init_top(struct sif_pt *pt, u64 vstart, int npages)
+{
+ u64 aligned_vaddr = vstart & ~(level_to_pagesize(pt, pt->top_level) - 1);
+ int ret;
+ size_t sg_size = pt->fixed_top ? FIXED_TOP_SG_SIZE : max(npages, 1);
+
+ /* Single pte table necessary for WA for Bug #4096 */
+ if (pt->top_level < pt->leaf_level) {
+ sif_log(pt->sdev, SIF_PT_V, "Adjusting top level %d -> %d",
+ pt->top_level, pt->leaf_level);
+ pt->top_level = pt->leaf_level;
+ }
+
+ ret = sif_kmem_init(pt->sdev, &pt->m, sg_size, (u64)npages << PAGE_SHIFT,
+ PAGE_SHIFT, GFP_KERNEL, DMA_TO_DEVICE);
+ if (ret < 0)
+ return ret;
+
+ ret = add_pages_to_freelist(pt, pt->m.sg, pt->m.sg_max);
+ if (ret)
+ return ret;
+
+ /* Create the top node of the page table: */
+ pt->top = pt_alloc_page(pt, aligned_vaddr);
+ if (unlikely(IS_ERR(pt->top))) {
+ int ret = PTR_ERR(pt->top);
+
+ pt->top = NULL;
+ return ret;
+ }
+ sif_log(pt->sdev, SIF_PT_V,
+ "Created top node at kva %p, dma addr %pad level %d for vstart %llx - aligned at %llx",
+ sg_virt(pt->top->page), &sg_dma_address(pt->top->page),
+ pt->top_level, vstart, aligned_vaddr);
+
+ if (pt->modifiable) {
+ /* avoid that this node gets freed if all mappings are removed */
+ pt->top->usecnt++;
+ }
+ return 0;
+}
+
+
+inline void reinit_top(struct sif_pt *pt, u64 vstart)
+{
+ u64 aligned_vaddr = vstart & ~(level_to_pagesize(pt, pt->top_level) - 1);
+
+ sif_log(pt->sdev, SIF_PT_V,
+ "Reused top node at dma addr %pad level %d for vstart %llx - aligned at %llx",
+ &sg_dma_address(pt->top->page), pt->top_level, vstart, aligned_vaddr);
+ pt->top->vaddr = aligned_vaddr;
+}
+
+
+static u64 recalc_vstart(struct sif_pt *pt)
+{
+ struct sif_dev *sdev = pt->sdev;
+ struct sif_pt_page *p = first_page(pt, pt->leaf_level);
+ u64 page_shift = level_to_pageshift(pt, pt->leaf_level - 1);
+ int i;
+
+ if (p) {
+ u64 *pmd = sg_virt(p->page);
+
+ for (i = 0; i < sdev->mi.ptes_per_page; i++)
+ if (pmd[i]) {
+ u64 nvaddr = p->vaddr + (i << page_shift);
+ u64 delta_sz = nvaddr - pt->vstart;
+
+ sif_log(sdev, SIF_PT_V, "vstart %llx -> %llx (vsize %llx -> %llx)",
+ pt->vstart, nvaddr, pt->vsize, pt->vsize - delta_sz);
+ pt->vsize -= delta_sz;
+ return nvaddr;
+ }
+ }
+ pt->vsize = 0;
+ pt->vstart = 0;
+ return 0;
+}
+
+static u64 recalc_size(struct sif_pt *pt)
+{
+ struct sif_dev *sdev = pt->sdev;
+ struct sif_pt_page *p = last_page(pt, pt->leaf_level);
+ u64 page_shift = level_to_pageshift(pt, pt->leaf_level - 1);
+ int i;
+
+ if (p) {
+ u64 *pmd = sg_virt(p->page);
+
+ for (i = sdev->mi.ptes_per_page - 1; i >= 0; i--)
+ if (pmd[i]) {
+ u64 nend = p->vaddr + ((i+1) << page_shift);
+ u64 nvsize = nend - pt->vstart;
+
+ sif_log(sdev, SIF_MMU_V, "vstart at %llx, size %llx -> %llx",
+ pt->vstart, pt->vsize, nvsize);
+ return nvsize;
+ }
+ }
+ pt->vsize = 0;
+ pt->vstart = 0;
+ return 0;
+}
+
+
+
+/* Extend a page table at DMA address @vstart with the list starting at @sg with size @size */
+int sif_pt_extend(struct sif_pt *pt, struct scatterlist *sg, u64 vstart, size_t size)
+{
+ int ret = 0;
+ u32 npages;
+ u64 page_mask = level_to_pagesize(pt, pt->leaf_level - 1) - 1;
+ u64 new_start;
+ u64 new_size;
+
+ if (!size)
+ return 0;
+
+ sif_log(pt->sdev, SIF_MMU, "** vstart %llx size %lx page size %llx leaf_level %d **",
+ vstart, size, page_mask + 1, pt->leaf_level);
+ mutex_lock(&pt->lock);
+
+ /* Calculate a good size of each sg table in the kmem object: */
+ if (!pt->top) {
+ /* This is a blank pt - allocate and set up the initial structures */
+ npages = table_mem_need(pt, vstart, size);
+
+ ret = init_top(pt, vstart, npages);
+ if (ret)
+ goto kmem_ext_failed;
+
+ new_start = vstart;
+ new_size = size;
+ } else if (pt->vsize == 0) {
+ new_start = vstart;
+ new_size = size;
+ reinit_top(pt, vstart);
+ } else {
+ if (!pt->modifiable) {
+ sif_log(pt->sdev, SIF_INFO, "error: Attempt to modify an unmodifiable page table");
+ mutex_unlock(&pt->lock);
+ return -EINVAL;
+ }
+ new_start = merge_ranges(pt->vstart, pt->vsize, vstart, size, &new_size);
+ sif_log(pt->sdev, SIF_MMU_V, "new_start %llx new_size %llx **",
+ new_start, new_size);
+ }
+
+ kref_get(&pt->refcnt);
+
+ ret = populate_pt(pt, sg, vstart, size);
+ if (ret)
+ goto populate_failed;
+
+ /* sync the whole table memory to make sure the changes are reflected:
+ * TBD: Optimize to only sync the parts that have actually been modified.
+ * With this code we will potentially sync a long page freelist as well:
+ */
+ dma_sync_sg_for_device(pt->sdev->ib_dev.dma_device, pt->m.sg, pt->m.sg_max, DMA_TO_DEVICE);
+
+ pt->vstart = new_start;
+ pt->vsize = new_size;
+ mutex_unlock(&pt->lock);
+ return ret;
+populate_failed:
+ kref_put(&pt->refcnt, sif_pt_release);
+kmem_ext_failed:
+ sif_kmem_free(pt->sdev, &pt->m);
+ mutex_unlock(&pt->lock);
+ return ret;
+}
+
+
+
+/* Extend a page table at DMA address @vstart with the contents of @mem */
+int sif_pt_extend_with_mem(struct sif_pt *pt, struct sif_mem *mem, u64 vstart)
+{
+ int ret = 0;
+ u32 npages;
+ u64 page_mask = level_to_pagesize(pt, pt->leaf_level - 1) - 1;
+ u64 new_start;
+ u64 new_size;
+ size_t size = mem->size;
+
+ if (!size)
+ return 0;
+
+ sif_log(pt->sdev, SIF_MMU, "** vstart %llx size %lx page size %llx leaf level %d **",
+ vstart, size, page_mask + 1, pt->leaf_level);
+ mutex_lock(&pt->lock);
+
+ /* Calculate a good size of each sg table in the kmem object: */
+ if (!pt->top) {
+ /* This is a blank pt - allocate and set up the initial structures */
+ npages = table_mem_need(pt, vstart, size);
+
+ ret = init_top(pt, vstart, npages);
+ if (ret)
+ goto kmem_ext_failed;
+
+ new_start = vstart;
+ new_size = size;
+ } else if (!pt->modifiable) {
+ sif_log(pt->sdev, SIF_INFO, "error: Attempt to modify an unmodifiable page table");
+ mutex_unlock(&pt->lock);
+ return -EINVAL;
+ } else if (pt->vsize == 0) {
+ new_start = vstart;
+ new_size = size;
+ reinit_top(pt, vstart);
+ } else {
+ new_start = merge_ranges(pt->vstart, pt->vsize, vstart, size, &new_size);
+ sif_log(pt->sdev, SIF_MMU_V, "new_start %llx new_size %llx **",
+ new_start, new_size);
+ }
+
+ kref_get(&pt->refcnt);
+
+ ret = populate_pt_from_mem(pt, mem, vstart, false);
+
+ /* sync the whole table memory to make sure the changes are reflected:
+ * TBD: Optimize to only sync the parts that have actually been modified.
+ * With this code we will potentially sync a long page freelist as well:
+ */
+ dma_sync_sg_for_device(pt->sdev->ib_dev.dma_device, pt->m.sg, pt->m.sg_max, DMA_TO_DEVICE);
+
+ pt->vstart = new_start;
+ pt->vsize = new_size;
+ mutex_unlock(&pt->lock);
+ return ret;
+
+kmem_ext_failed:
+ sif_kmem_free(pt->sdev, &pt->m);
+ mutex_unlock(&pt->lock);
+ return ret;
+}
+
+
+/* Shrink a page table to no longer contain DMA address start @sg and size @size */
+int sif_pt_free_part(struct sif_pt *pt, u64 vstart, size_t size)
+{
+ struct sif_pt_page *p;
+ int level = pt->leaf_level;
+ u64 va = vstart & ~level_to_pagemask(pt, level - 1);
+ u64 va_up = va & ~level_to_pagemask(pt, level);
+ u64 vend = vstart + size;
+ u64 page_size;
+ int ret = 0;
+
+ sif_log(pt->sdev, SIF_PT_V, "** vstart %llx -> %llx, size %lx **", vstart, va, size);
+
+ page_size = level_to_pagesize(pt, level - 1);
+ mutex_lock(&pt->lock);
+ p = find_page(pt, level, va_up);
+ if (!p) {
+ sif_log(pt->sdev, SIF_INFO, "vaddr %llx not found at level %d",
+ va_up, level);
+ ret = -EINVAL; /* va not mapped */
+ goto failed;
+ }
+
+ while (va < vend && p) {
+ p = remove_page_ref(pt, p, va, level);
+ if (!p)
+ break;
+ if (va < p->vaddr)
+ va = p->vaddr;
+ else
+ va += page_size;
+ }
+ if (vstart == pt->vstart) {
+ pt->vsize -= size;
+ pt->vstart += size;
+ if (size == pt->vsize)
+ pt->vstart = pt->vsize = 0;
+ else
+ pt->vstart = recalc_vstart(pt);
+ }
+ if (vend == pt->vstart + pt->vsize) {
+ pt->vsize -= size;
+ if (size == pt->vsize)
+ pt->vstart = pt->vsize = 0;
+ else
+ pt->vsize = recalc_size(pt);
+ }
+
+ /* sync the whole table memory to make sure the changes are reflected:
+ * TBD: Optimize to only sync the parts that have actually been modified.
+ * With this code we will potentially sync a long page freelist as well:
+ */
+ dma_sync_sg_for_device(pt->sdev->ib_dev.dma_device, pt->m.sg, pt->m.sg_max, DMA_TO_DEVICE);
+
+ mutex_unlock(&pt->lock);
+ return kref_put(&pt->refcnt, sif_pt_release);
+
+failed:
+ mutex_unlock(&pt->lock);
+ return ret;
+}
+
+/* Free remaining mappings */
+int sif_pt_free(struct sif_pt *pt)
+{
+ int ret = 0;
+
+ if (pt->vsize) {
+ int ref = atomic_read(&pt->refcnt.refcount);
+
+ if (ref == 2)
+ ret = sif_pt_free_part(pt, pt->vstart, pt->vsize);
+ else {
+ sif_log(pt->sdev, SIF_MMU_V, "failed - vstart %llx, sz %llx, refcnt %d",
+ pt->vstart, pt->vsize, ref);
+ return -EBUSY;
+ }
+ }
+ if (!ret) {
+ sif_log(pt->sdev, SIF_MMU_V, "refcnt %d", atomic_read(&pt->refcnt.refcount) - 1);
+ ret = kref_put(&pt->refcnt, sif_pt_release);
+ if (!ret)
+ return -EBUSY;
+ ret = 0;
+ }
+ return ret;
+}
+
+
+
+/* Remap the (remappable) page table to be used starting at vstart for the range of mem */
+int sif_pt_remap_for_mem(struct sif_pt *pt, struct sif_mem *mem, u32 page_shift,
+ u64 vstart)
+{
+ /* We optimize the case where @vstart is aligned in a way that allows
+ * the page table to be reused directly. For now we just handle the case where
+ * the old and new vaddr and the size is the same, which is the case for RDS,
+ * our main use case for FMR at this stage.
+ * For all other cases, we just do a full cycle of free/extend_with_mem:
+ */
+ int ret = 0;
+
+ if (pt->vstart != vstart || pt->vsize != mem->size || pt->page_shift != page_shift) {
+ ret = sif_pt_free_part(pt, pt->vstart, pt->vsize);
+ if (ret)
+ return ret;
+ ret = sif_pt_extend_with_mem(pt, mem, vstart);
+ return ret;
+ }
+
+ sif_log(pt->sdev, SIF_MMU_V, "** vstart %llx size %llx **", vstart, mem->size);
+ mutex_lock(&pt->lock);
+
+ /* Fast path: Repopulate ptes directly - all ref.cnts are kept as is: */
+
+ ret = populate_pt_from_mem(pt, mem, vstart, true);
+
+ /* sync the whole table memory to make sure the changes are reflected:
+ * TBD: Optimize to only sync the parts that have actually been modified.
+ * With this code we will potentially sync a long page freelist as well:
+ */
+ if (!ret)
+ dma_sync_sg_for_device(pt->sdev->ib_dev.dma_device, pt->m.sg, pt->m.sg_max, DMA_TO_DEVICE);
+ mutex_unlock(&pt->lock);
+ return ret;
+}
+
+
+/* Called from debugfs key file - caller assumes this function will
+ * finish the line in the file:
+ */
+void sif_pt_dfs_print(struct seq_file *s, struct sif_dev *sdev, loff_t pos)
+{
+ /* First figure out if a pt object exists for this key,
+ * we only care about MR keys here yet:
+ */
+ struct sif_pt *pt;
+ struct sif_mr *mr = safe_get_sif_mr(sdev, pos);
+
+ pt = mr ? mr->mmu_ctx.pt : NULL;
+ if (!pt) {
+ seq_puts(s, "\n");
+ return;
+ }
+
+ seq_printf(s, " %3d %3d %4lld\n",
+ pt->top_level, pt->leaf_level, pt->m.size >> pt->m.page_shift);
+}
--- /dev/null
+/*
+ * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_pt.h: SIF (private) page table management.
+ * API for managing a sif specific page table which can be referenced from
+ * multiple MMU contexts.
+ */
+
+#ifndef _SIF_PT_H
+#define _SIF_PT_H
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include "sif_mem.h"
+
+struct seq_file;
+
+/* rb_tree entries to track virtual addresses
+ * in this page table.
+ */
+struct sif_pt_page {
+ struct rb_node node; /* Linkage for pt->pmd */
+ struct list_head list; /* Linkage for freelist */
+ struct scatterlist *page; /* Pointer to info on the used page within pt->m */
+ struct sif_pt_page *parent; /* Pointer to the parent page in the page table */
+ u64 vaddr; /* Virtual address mapped by the page table page */
+ u32 usecnt; /* Number of entries in use in the referred pt page */
+};
+
+
+/* Number of page table page levels we support:
+ * Level 0 = pte pages, 1 = pmd pages, 2 = pud pages, 3 = pgdir pages.
+ * This equals psif_table_level - 1 as we do not represent the pages themselves.
+ *
+ * Example: the corresponding page_shift will then e.g. be 12 (4K pages) for level -1 and 21 (2M)
+ * for level 1 in the default x86 case. For Sparc, several level 0 page sizes are
+ * supported, which gives multiple alternatives for the lowest level.
+ */
+#define PT_LEVELS 4
+
+/* Lower bits with special meaning
+ * from the Intel page table spec
+ */
+#define PT_PAGE_PRESENT 0x1 /* Page is present */
+#define PT_PAGE_PS 0x80 /* If set (at level >= 0) page is a leaf pointer even at level > 0 */
+#define PT_PAGE_SHIFT 12 /* Number of insignificant bits in a sif page table pointer */
+
+/* SIF driver representation of a generic
+ * driver maintained page table.
+ *
+ * Note that the base leaf page size is
+ * based on the "theoretical" smallest page, e.g. with 2M pages it will be 4K = shift 12.
+ * Whether that size is actually used is then determined by leaf_level.
+ */
+struct sif_pt {
+ struct sif_dev *sdev; /* Device this mapping is valid for */
+ bool fixed_top; /* If set, pt guarantees that the top node remains constant */
+ bool modifiable; /* Set if this page table should support modification */
+ u8 top_level; /* Page table level of top node, 0 means no table */
+ u8 leaf_level; /* Page table level of leaf node */
+ u8 pte_ext_shift; /* Only populate every (1 << pte_ext_shift) pte */
+ u16 ptes_per_page; /* #ptes per page table page - also defines size of the pt page */
+ u32 page_shift; /* Base leaf page shift in use for this table */
+ u64 vstart; /* Start of the mapping in VA as seen from SIF */
+ u64 vsize; /* Extent of the mapping (including any holes) */
+ struct sif_pt_page *top;/* Top level page table page exposed to sif */
+ struct mutex lock; /* Protects modifications to the page table data structure */
+ struct kref refcnt; /* Keep track of users of this page table */
+ struct sif_kmem m; /* DMA mapped store for page table memory */
+ struct rb_root pmd[PT_LEVELS];/* Pr.level lookup table from offset to page table page */
+ struct list_head freelist; /* list of DMA mapped pt pages not currently in use */
+};
+
+
+/* Called from sif_init/exit to set up/clean up global data structures */
+int sif_pt_init(void);
+void sif_pt_exit(void);
+
+/* Called from debugfs key file */
+void sif_pt_dfs_print(struct seq_file *s, struct sif_dev *sdev, loff_t pos);
+
+/* Create a referenced sif page table object with an empty top level page */
+struct sif_pt *sif_pt_create_empty(struct sif_dev *sdev, u64 vstart, enum sif_mem_type map_mt);
+
+/* Create a sif page table object of size @mapsize using memory referenced by @sg,
+ * with the SIF virtual address starting at @vstart, which must be aligned at a page
+ * size boundary compatible with the page sizes used by the memory type of the backing store.
+ * @sg is assumed to be a valid (possibly chained) scatterlist long enough to provide
+ * backing for @mapsize.
+ * Set @modifiable to allow the table to be extended and shrunk.
+ * Set @fixed_top to have pt guarantee that the top node remains constant,
+ * in which case it will always be a level 4 tree.
+ */
+struct sif_pt *sif_pt_create(struct sif_dev *sdev, struct scatterlist *sg,
+ u64 vstart, size_t mapsize,
+ u32 page_shift, bool modifiable, bool fixed_top);
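+/* Illustrative use (parameter values below are assumptions, not requirements):
+ *
+ *	pt = sif_pt_create(sdev, sg, vstart, size, 21, true, false);
+ *	if (!pt)
+ *		return -ENOMEM;
+ */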
+
+/* Create a sif page table from a mem object:
+ * Set @fixed_top to prepare for a table where the top node is fixed:
+ * (will always be a level 4 tree)
+ */
+struct sif_pt *sif_pt_create_for_mem(struct sif_mem *mem, u64 vstart,
+ u32 page_shift, bool modifiable, bool fixed_top);
+
+/* Remap the (remappable) page table to be used starting at vstart for the range of mem,
+ * i.e. replace the current mapping with a new one, preserving the top node
+ * (but possibly reusing it at a different level!)
+ */
+int sif_pt_remap_for_mem(struct sif_pt *pt, struct sif_mem *mem,
+ u32 page_shift, u64 vstart);
+
+/* Extend a page table at DMA address @vstart with the list starting at @sg with size @size */
+int sif_pt_extend(struct sif_pt *pt, struct scatterlist *sg, u64 vstart, size_t size);
+
+/* Extend a page table at DMA address @vstart with the contents of @mem */
+int sif_pt_extend_with_mem(struct sif_pt *pt, struct sif_mem *mem, u64 vstart);
+
+/* DMA address of root pointer of page table */
+dma_addr_t sif_pt_dma_root(struct sif_pt *pt);
+
+/* SIF level of root pointer */
+u8 sif_pt_root_table_level(struct sif_pt *pt);
+
+/* Leaf page shift (number of bits within page) of this page table */
+u32 sif_pt_page_shift(struct sif_pt *pt);
+
+/* Observe leaf node of page table at @vaddr */
+int sif_pt_entry(struct sif_pt *pt, u64 vaddr, dma_addr_t *entry, dma_addr_t *val);
+
+/* free a part of the page table and dereference */
+int sif_pt_free_part(struct sif_pt *pt, u64 vstart, size_t size);
+
+/* Free this page table. If more than one reference has been created (using sif_pt_extend),
+ * return -EBUSY, i.e. this call can be used as the direct counterpart of sif_pt_create, but not if
+ * the mapping has been referenced more than once, in which case sif_pt_free_part must be called
+ * with the same start and size as in the extend calls to clean up properly before a final sif_pt_free:
+ */
+int sif_pt_free(struct sif_pt *pt);
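+/* Illustrative life cycle of a modifiable table (error handling omitted;
+ * values are assumptions):
+ *
+ *	pt = sif_pt_create_empty(sdev, vstart, SIFMT_2M);
+ *	sif_pt_extend(pt, sg, vstart, size);
+ *	...
+ *	sif_pt_free_part(pt, vstart, size);
+ *	sif_pt_free(pt);
+ */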
+
+/* Miscellaneous utilities: */
+
+/* Find the aligned size of a region for a given page alignment
+ * (i.e. the number of bytes covered by the pages of size @alignment
+ * needed to address (start, len))
+ */
+u64 aligned_size(u64 start, u64 len, u64 alignment);
+
+/* Find the optimal page size (represented by the leaf level)
+ * to use based on device capabilities, configuration and a max_shift
+ * value (typically based on the contiguousness of the memory).
+ * The result is adjusted against the corresponding pair of virtual
+ * and dma addresses to ensure that it is possible to create a mapping at that
+ * level. pte_ext_shift is set to the log2 of the stride between
+ * each valid pte (for the odd sized leaf pages)
+ */
+ */
+int find_optimal_leaf_level(struct sif_dev *sdev, u32 max_shift,
+ u64 vaddr, u64 dma_addr, u64 size,
+ u8 *leaf_level, u8 *pte_ext_shift);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_qp.c: Implementation of IB queue pair logic for sif
+ */
+
+#include <linux/random.h>
+#include <rdma/ib_verbs.h>
+#include "sif_dev.h"
+#include "sif_defs.h"
+#include "sif_qp.h"
+#include "sif_ah.h"
+#include "sif_sq.h"
+#include "sif_pqp.h"
+#include "sif_dma.h"
+#include "sif_user.h"
+#include "sif_base.h"
+#include "sif_mr.h"
+#include "sif_xrc.h"
+#include "sif_query.h"
+#include "sif_hwi.h"
+#include "sif_user.h"
+#include "psif_hw_data.h"
+#include "psif_hw_setget.h"
+#include "psif_hw_csr.h"
+#include "sif_ibcq.h"
+#include "sif_sndrcv.h"
+#include <linux/delay.h>
+#include <linux/seq_file.h>
+
+/* Work-around for bz 3646 */
+static unsigned char bug_3646_conv_table[32] = {
+ 0,
+ 18,
+ 20,
+ 21,
+ 22,
+ 23,
+ 24,
+ 25,
+ 26,
+ 27,
+ 28,
+ 29,
+ 30,
+ 31,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+};
+
+static int reset_qp(struct sif_dev *sdev, struct sif_qp *qp);
+
+static int sif_create_pma_qp(struct ib_pd *ibpd,
+ struct ib_qp_init_attr *init_attr,
+ struct sif_qp_init_attr sif_attr);
+
+static int poll_wait_for_qp_writeback(struct sif_dev *sdev, struct sif_qp *qp)
+{
+ unsigned long timeout = sdev->min_resp_ticks;
+ unsigned long timeout_real = jiffies + timeout;
+ enum psif_qp_state state = PSIF_QP_STATE_INIT;
+
+ sif_log(sdev, SIF_QP, "enter qp %d", qp->qp_idx);
+ do {
+ /* Make sure the update from hw is observed in correct order */
+ smp_rmb();
+ state = get_psif_qp_core__state(&qp->d.state);
+
+ if (state == PSIF_QP_STATE_RESET)
+ break;
+
+ if (time_is_before_jiffies(timeout_real))
+ cond_resched();
+ else {
+ sif_log(sdev, SIF_INFO,
+ "Timeout waiting for write back for QP %d - last state %s",
+ qp->qp_idx, string_enum_psif_qp_state(state));
+
+ if (unlikely(sif_debug_mask & SIF_QP_V)) {
+ struct psif_query_qp lqqp;
+ int ret;
+
+ ret = epsc_query_qp(qp, &lqqp);
+ if (ret)
+ sif_log(sdev, SIF_QP_V,
+ "Unable to retrieve qp state for qp %d from epsc, status %d",
+ qp->qp_idx, ret);
+ else
+ sif_logs(SIF_QP_V, write_struct_psif_query_qp(NULL, 0, &lqqp));
+ }
+
+ return -ETIMEDOUT;
+ }
+ } while (true);
+
+ sif_log(sdev, SIF_QP, "exit - write-back observed on qp %d", qp->qp_idx);
+ return 0;
+}
+
+static int send_epsa_proxy_qp_sq_key(struct sif_dev *sdev, u32 lkey,
+ int qpnum,
+ enum psif_mbox_type eps_num)
+{
+ struct psif_epsc_csr_req req;
+ struct psif_epsc_csr_rsp rsp;
+ int ret;
+
+ memset(&req, 0, sizeof(req));
+ req.opcode = EPSC_A_COMMAND;
+ req.u.epsa_cmd.cmd = EPSA_GET_PROXY_QP_SQ_KEY;
+ req.u.epsa_cmd.key = lkey;
+ req.u.epsa_cmd.qpnum = qpnum;
+ ret = sif_eps_wr(sdev, eps_num, &req, &rsp);
+
+ return ret;
+}
+
+struct sif_qp *create_qp(struct sif_dev *sdev,
+ struct ib_qp_init_attr *init_attr,
+ struct sif_qp_init_attr *sif_attr)
+{
+ struct sif_qp *qp, *rqp = NULL;
+ struct sif_sq *sq;
+ struct psif_qp qpi;
+ struct sif_rq *rq = NULL;
+ struct sif_pd *pd = sif_attr->pd;
+
+ int ret = 0;
+ int rq_idx = -1;
+ int request_qpn = -1;
+ int index;
+ bool mark_dirty = false;
+ struct sif_cq *send_cq = NULL;
+ struct sif_cq *recv_cq = NULL;
+ u32 flags = init_attr->create_flags;
+ u32 max_sge;
+ int min_tso_inline;
+
+ if (init_attr->send_cq)
+ send_cq = to_scq(init_attr->send_cq);
+ if (init_attr->recv_cq)
+ recv_cq = to_scq(init_attr->recv_cq);
+
+ /* Software needs to support more than the max hw send sge for UD - see #1883 */
+ max_sge =
+ sif_attr->qp_type == PSIF_QP_TRANSPORT_UD ? SIF_SW_MAX_UD_SEND_SGE : SIF_HW_MAX_SEND_SGE;
+
+ if (init_attr->cap.max_send_sge > max_sge) {
+ sif_log(sdev, SIF_INFO, "illegal max send sge %d, SIF only supports %d",
+ init_attr->cap.max_send_sge, max_sge);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (init_attr->cap.max_inline_data > sif_max_inline) {
+ sif_log(sdev, SIF_INFO,
+ "%d bytes of inline data requested - supported max %u - this limit is defined by module parameter max_inline",
+ init_attr->cap.max_inline_data, sif_max_inline);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (init_attr->qp_type <= IB_QPT_GSI) {
+ /* IB verbs port numbers start at 1 while psif starts w/port 0 */
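+ /* e.g. (port 1, SMI) -> 0, (port 1, GSI) -> 1, (port 2, SMI) -> 2, (port 2, GSI) -> 3 */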
+ int qpn = init_attr->qp_type + ((init_attr->port_num - 1) << 1);
+ int ok = atomic_add_unless(&sdev->sqp_usecnt[qpn], 1, 1);
+
+ if (!ok) {
+ sif_log(sdev, SIF_INFO,
+ "Attempt to create QP %d for port %d more than once",
+ init_attr->qp_type, init_attr->port_num);
+ return ERR_PTR(-EBUSY);
+ }
+ request_qpn = qpn;
+ sif_log(sdev, SIF_QP, "Requested qp %d, port %d",
+ init_attr->qp_type, init_attr->port_num);
+ }
+
+ /* Allow allocation of qp 0/1 */
+ index = request_qpn >= 0 ? request_qpn : sif_alloc_qp_idx(pd);
+ if (index < 0) {
+ rqp = ERR_PTR(-ENOMEM);
+ sif_log(sdev, SIF_QP, "sif_alloc_qp_idx failed");
+ goto err_alloc_index;
+ }
+ qp = get_sif_qp(sdev, index);
+
+ /* Set this temporarily - needed by reporting of qp write-back check */
+ qp->qp_idx = index;
+ /*
+ * We add an sge (with the stencil) when sending with TSO. The stencil is stored at
+ * the beginning of the inline area. TSO implies checksumming, which in turn
+ * requires that no inline data can be used. It is therefore necessary to ensure
+ * that we have at least 64 bytes of inline buffering.
+ */
+ min_tso_inline = 64;
+ if ((flags & IB_QP_CREATE_IPOIB_UD_LSO) &&
+ init_attr->cap.max_inline_data < min_tso_inline) {
+ sif_log(sdev, SIF_INFO,
+ "Create LSO QP; qp_%d max_sge %d inline_size %d qp_type %d; modifing max_inline_size to %d",
+ index, init_attr->cap.max_send_sge, init_attr->cap.max_inline_data,
+ init_attr->qp_type, min_tso_inline);
+ init_attr->cap.max_inline_data = min_tso_inline;
+ }
+
+ if (init_attr->qp_type == IB_QPT_RC || init_attr->qp_type == IB_QPT_XRC_INI) {
+ /* Required in anticipation of Atomics use */
+ init_attr->cap.max_inline_data = max(init_attr->cap.max_inline_data, 16U);
+ }
+
+ /* Now, before we can write the QP state, we must ensure that any previous usage
+ * has been completed (the writeback after modify_qp to RESET happens asynchronously
+ * after the modify_qp request completes).
+ */
+ ret = poll_wait_for_qp_writeback(sdev, qp);
+ if (ret) {
+ /* Don't release this desc as it is probably not safe to use anymore */
+ mark_dirty = true;
+ rqp = ERR_PTR(ret);
+ goto err_lazy_wb;
+ }
+
+ memset(qp, 0, sizeof(struct sif_qp));
+ qp->qp_idx = index;
+ qp->ulp_type = sif_attr->ulp_type;
+
+ if (qp->ulp_type == RDS_ULP) {
+ int new_max_inline = CB_LENGTH; /* collectbuffer_length is max 256 */
+
+ sif_log(sdev, SIF_QP,
+ "Create QP; qp_%d max_sge %d inline_size %d qp_type %d; modifing max_inline_size to %d",
+ index, init_attr->cap.max_send_sge, init_attr->cap.max_inline_data,
+ init_attr->qp_type, new_max_inline);
+ init_attr->cap.max_inline_data = new_max_inline;
+ }
+
+ if (init_attr->qp_type <= IB_QPT_GSI) {
+ qp->port = init_attr->port_num;
+ if (init_attr->qp_type == IB_QPT_SMI)
+ qp->flags |= SIF_QPF_SMI;
+ else if (init_attr->qp_type == IB_QPT_GSI)
+ qp->flags |= SIF_QPF_GSI;
+ } else {
+ /* Let port 1 be default: init_attr->port_num is only valid for qp 0/1 */
+ qp->port = 1;
+ }
+
+ qp->last_set_state = IB_QPS_RESET;
+ qp->tracked_state = IB_QPS_RESET;
+ qp->mtu = IB_MTU_4096;
+ qp->type = sif_attr->qp_type;
+
+ /* TBD: Optimize this log to a single stmt */
+ if (send_cq)
+ sif_log(sdev, SIF_QP, "qpn %d, qp 0x%p send cq %d (type %s) port %d, pd %d",
+ index, qp, send_cq->index, string_enum_psif_qp_trans(qp->type),
+ qp->port, pd->idx);
+ else
+ sif_log(sdev, SIF_QP, "qpn %d, qp 0x%p [no send cq] (type %s) port %d, pd %d",
+ index, qp, string_enum_psif_qp_trans(qp->type), qp->port, pd->idx);
+
+ /* The PQP does not have any receive queue, nor does the XRC qp,
+ * where RQs are selected per work request via wr.xrc_hdr.xrqd_id
+ */
+ if (is_regular_qp(qp)) {
+ if (init_attr->srq) {
+ rq = to_srq(init_attr->srq);
+ if (atomic_add_unless(&rq->refcnt, 1, 0)) {
+ rq_idx = rq->index;
+ sif_log(sdev, SIF_QP, "Connected qp %d to SRQ %d",
+ index, rq_idx);
+ } else {
+ sif_log(sdev, SIF_INFO,
+ "failed to connect qp %d to SRQ %d, rq invalid",
+ index, rq_idx);
+ rqp = ERR_PTR(-ENODEV);
+ goto err_rq_fail;
+ }
+ } else {
+ rq_idx = alloc_rq(sdev, pd, init_attr->cap.max_recv_wr,
+ init_attr->cap.max_recv_sge, NULL,
+ sif_attr->user_mode);
+ if (rq_idx >= 0)
+ rq = get_sif_rq(sdev, rq_idx);
+ }
+ if (rq_idx < 0) {
+ rqp = ERR_PTR(rq_idx);
+ goto err_rq_fail;
+ }
+
+ /* Adjust requested values based on what we got: */
+ init_attr->cap.max_recv_wr = rq->entries_user;
+ }
+ qp->rq_idx = rq_idx;
+
+ if (rq && !init_attr->srq) {
+ /* Check/update max sge cap: */
+ if (rq->sg_entries > init_attr->cap.max_recv_sge) {
+ sif_log(sdev, SIF_QP, "recv sge adjusted (%d -> %d)",
+ init_attr->cap.max_recv_sge, rq->sg_entries);
+ init_attr->cap.max_recv_sge = rq->sg_entries;
+ }
+
+ /* Store cq reference for cleanup purposes */
+ if (recv_cq)
+ rq->cq_idx = recv_cq->index;
+ }
+
+
+ /* The sq always gets the same index as the QP */
+ ret = sif_alloc_sq(sdev, pd, qp, &init_attr->cap,
+ sif_attr->user_mode, sif_attr->sq_hdl_sz);
+ if (ret < 0) {
+ rqp = ERR_PTR(ret);
+ goto err_sq_fail;
+ }
+
+ /* Store the send completion queue index as the default, since
+ * for psif the send cq number is a parameter in each work request
+ */
+ sq = get_sif_sq(sdev, qp->qp_idx);
+ sq->cq_idx = send_cq ? send_cq->index : (u32)-1; /* XRC recv only */
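+ /* complete_all requests a CQE for every send WR (IB_SIGNAL_ALL_WR);
+ * otherwise only WRs posted with an explicit signal flag generate completions.
+ */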
+ sq->complete_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR ? 1 : 0;
+
+ /* Adjust requested values based on what we got: */
+ init_attr->cap.max_send_wr = sq->entries;
+
+ /* Initialization of qp state via local copy */
+ memset(&qpi, 0, sizeof(struct psif_qp));
+
+ if (multipacket_qp(qp->type)) {
+ qpi.state.sq_clog2_extent = order_base_2(sq->extent);
+ qpi.state.sq_clog2_size = order_base_2(sq->entries);
+ }
+ qpi.state.retry_sq_seq = 0;
+ qpi.state.state = ib2sif_qp_state(IB_QPS_RESET);
+ qpi.state.pd = pd->idx;
+ if (!sif_feature(zero_magic)) {
+ qp->magic = prandom_u32();
+ qpi.state.magic = qp->magic;
+ }
+ qpi.state.transport_type = qp->type;
+ if (qp->type == PSIF_QP_TRANSPORT_XRC && init_attr->xrcd)
+ qpi.state.xrc_domain = to_sxrcd(init_attr->xrcd)->index;
+ qpi.state.rq_indx = rq_idx;
+ qpi.state.rq_is_srq = !!init_attr->srq || (init_attr->qp_type == IB_QPT_XRC_TGT);
+ qpi.state.send_cq_indx = send_cq ? send_cq->index : (u32)-1;
+ qpi.state.rcv_cq_indx = recv_cq ? recv_cq->index : (u32)-1;
+
+ qpi.state.mstate = APM_MIGRATED;
+ qpi.state.path_mtu = ib2sif_path_mtu(qp->mtu);
+ /* Last acked psn must be initialized to one less than xmit_psn
+ * and it is a 24 bit value. See issue #1011
+ */
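+ /* E.g. with xmit_psn == 0 the 24-bit predecessor becomes (0 - 1) & 0xffffff == 0xffffff */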
+ qpi.state.xmit_psn = 0;
+ qpi.state.last_acked_psn = 0xffffff;
+ qpi.state.qosl = qp->qosl = sif_attr->qosl;
+
+ /* See #2402/#2770 */
+ if (sif_feature(infinite_rnr)) {
+ qpi.state.rnr_retry_init = 7;
+ qpi.state.rnr_retry_count = 7;
+ qpi.state.min_rnr_nak_time = 26; /* Bug 3646, this is about 160 us */
+ }
+
+ if (flags & IB_QP_NO_CSUM)
+ qpi.state.no_checksum = 1;
+
+ if (sif_attr->proxy != SIFPX_OFF) {
+ /* This is a proxy QP */
+ qpi.state.proxy_qp_enable = 1;
+ qp->eps_tag |= EPS_TAG_FROM_HOST;
+ ret = send_epsa_proxy_qp_sq_key(sdev, sq->sg_mr->index,
+ qp->qp_idx,
+ proxy_to_mbox(sif_attr->proxy));
+ if (ret)
+ sif_log(sdev, SIF_QP, "send_epsa_proxy_qp_sq_key failed");
+ }
+
+ if (sif_attr->user_mode)
+ qp->flags |= SIF_QPF_USER_MODE;
+
+ if (flags & IB_QP_CREATE_IPOIB_UD_LSO) {
+ qp->flags |= SIF_QPF_IPOIB;
+ qpi.state.ipoib_enable = 1;
+ qpi.state.ipoib = 1;
+ }
+
+ /* PSIF extensions */
+ if (flags & IB_QP_CREATE_EOIB) {
+ qp->flags |= SIF_QPF_EOIB;
+ qpi.state.eoib_enable = 1;
+ qpi.state.eoib = 1;
+ qpi.state.eoib_type = EOIB_QKEY_ONLY;
+ }
+ if (flags & IB_QP_CREATE_RSS)
+ qpi.state.rss_enable = 1;
+ if (flags & IB_QP_CREATE_HDR_SPLIT)
+ qpi.state.hdr_split_enable = 1;
+ if (flags & IB_QP_CREATE_RCV_DYNAMIC_MTU)
+ qpi.state.rcv_dynamic_mtu_enable = 1;
+ if (flags & IB_QP_CREATE_SND_DYNAMIC_MTU)
+ qpi.state.send_dynamic_mtu_enable = 1;
+
+ /* according to ib_verbs.h init_attr->port_num is only valid for QP0/1 */
+ if (init_attr->qp_type <= IB_QPT_GSI)
+ qpi.path_a.port = init_attr->port_num - 1;
+
+ sif_log(sdev, SIF_QP, "qp %d path_a.port = %d", qp->qp_idx, qpi.path_a.port);
+
+ /* Write composed entry to shared area */
+ copy_conv_to_hw(&qp->d, &qpi, sizeof(struct psif_qp));
+
+ mutex_init(&qp->lock); /* TBD: Sync scheme! */
+
+ /* Users should see qp 0/1 even though qp 0/1 is mapped to qp 2/3 for
+ * port 2
+ */
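+ /* I.e. qp_idx 0,1,2,3 -> visible qp_num 0,1,0,1 - any other qp_idx is exposed as-is */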
+ qp->ibqp.qp_num = qp->qp_idx > 3 ? qp->qp_idx : (qp->qp_idx & 0x1);
+
+ /* For the priv. QP types we need to set some other elements in the
+ * ib verbs struct as well
+ */
+ if (qp->type == PSIF_QP_TRANSPORT_MANSP1) {
+ qp->ibqp.device = &sdev->ib_dev;
+ qp->ibqp.qp_num = qp->qp_idx;
+ qp->ibqp.qp_type = IB_QPT_UD;
+ }
+
+ qp->flush_sq_done_wa4074 = false;
+
+ ret = sif_dfs_add_qp(sdev, qp);
+ if (ret)
+ goto err_dfs_qp;
+ /* Initialize the synchronization between destroy qp and event handling. */
+ init_completion(&qp->can_destroy);
+
+ /* a qp can only be destroyed if refcnt == 0.*/
+ atomic_set(&qp->refcnt, 1);
+
+ return qp;
+
+err_dfs_qp:
+ sif_free_sq(sdev, qp);
+err_sq_fail:
+ if (rq && !rq->is_srq)
+ free_rq(sdev, rq_idx);
+err_rq_fail:
+err_lazy_wb:
+ if (!mark_dirty)
+ sif_free_qp_idx(pd, qp->qp_idx);
+err_alloc_index:
+ return rqp;
+}
+
+/* PMA proxy QP */
+static int sif_create_pma_qp(struct ib_pd *ibpd,
+ struct ib_qp_init_attr *init_attr,
+ struct sif_qp_init_attr sif_attr)
+{
+ struct ib_qp *ret = NULL;
+ struct sif_dev *sdev;
+ struct sif_pd *pd;
+ struct sif_qp *qp;
+
+ sdev = to_sdev(ibpd->device);
+ pd = to_spd(ibpd);
+ /* Let's override IB_QPT_GSI with IB_QPT_UD */
+ init_attr->qp_type = IB_QPT_UD;
+
+ qp = create_qp(sdev, init_attr, &sif_attr);
+
+ if (IS_ERR(qp)) {
+ /* Convert internal error to the right type: */
+ ret = (struct ib_qp *)qp;
+ goto err_create_qp;
+ }
+ qp->flags |= SIF_QPF_PMA_PXY;
+ qp->port = init_attr->port_num;
+ sdev->pma_qp_idxs[qp->port - 1] = qp->qp_idx;
+
+ /* Make dfs and query_qp happy: */
+ qp->ibqp.device = &sdev->ib_dev;
+ qp->ibqp.pd = &sdev->pd->ibpd;
+
+ /* Set back IB_QPT_GSI */
+ init_attr->qp_type = IB_QPT_GSI;
+
+ sif_log(sdev, SIF_QP, "Exit: success 0x%p proxy qp %d - real qp %d",
+ &qp->ibqp, qp->ibqp.qp_num, qp->qp_idx);
+ return qp->qp_idx;
+
+err_create_qp:
+ sif_log(sdev, SIF_QP, "Exit: failed");
+ return 0;
+}
+
+struct ib_qp *sif_create_qp(struct ib_pd *ibpd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct sif_dev *sdev;
+ struct sif_qp *qp;
+ struct sif_pd *pd;
+ struct sif_xrcd *xrcd = NULL;
+ struct ib_qp *ret = NULL;
+ enum ib_qp_create_flags flags = init_attr->create_flags;
+ ulong user_flags = 0;
+
+ struct sif_qp_init_attr sif_attr = {
+ .qp_type = ib2sif_qp_type(init_attr->qp_type),
+ .user_mode = udata != NULL,
+ .sq_hdl_sz = sizeof(struct sif_sq_hdl),
+ };
+
+
+ /* First we need to locate the device pointer -
+ * if this is an XRC QP ibpd will be NULL:
+ */
+ if (init_attr->qp_type == IB_QPT_XRC_TGT) {
+ if (!init_attr->xrcd) {
+ sif_log0(SIF_INFO, "Error: missing XRC domain for XRC qp");
+ return ERR_PTR(-EINVAL);
+ }
+
+ xrcd = to_sxrcd(init_attr->xrcd);
+ sdev = to_sdev(init_attr->xrcd->device);
+
+ pd = xrcd->pd;
+ } else {
+ sdev = to_sdev(ibpd->device);
+ pd = to_spd(ibpd);
+ }
+
+ sif_attr.pd = pd;
+
+ sif_log(sdev, SIF_QP, "Enter qp_type %d%s", init_attr->qp_type,
+ (udata ? " (user call)" : ""));
+
+ /* TBD: How to handle this? */
+ if (flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+ sif_log(sdev, SIF_QP, "flag IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK set (ignored)");
+
+ if (flags & IB_QP_CREATE_PROXY) {
+ /* We don't know the actual EPSA to use here but QPs don't care */
+ sif_attr.proxy = SIFPX_EPSA_1;
+ }
+
+ /* TBD: Verify that user params such as the send cq are authorized?? */
+ if (!xrcd && !init_attr->send_cq) {
+ sif_log(sdev, SIF_INFO, "No send completion queue specified");
+ ret = ERR_PTR(-EINVAL);
+ goto err_create_qp;
+ }
+
+ if (!xrcd && !init_attr->recv_cq) {
+ sif_log(sdev, SIF_INFO, "No receive completion queue specified");
+ ret = ERR_PTR(-EINVAL);
+ goto err_create_qp;
+ }
+
+ if (udata && init_attr->qp_type <= IB_QPT_GSI) {
+ sif_log(sdev, SIF_INFO, "Attempt to create SMI/GSI QP %d from user space",
+ init_attr->qp_type);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (udata) {
+ struct sif_create_qp_ext cmd;
+ int rv = ib_copy_from_udata(&cmd, udata, sizeof(cmd));
+
+ if (rv) {
+ ret = ERR_PTR(rv);
+ goto err_create_qp;
+ }
+ user_flags = cmd.flags;
+ if (sif_vendor_enable(proxy_mode, user_flags))
+ sif_attr.proxy = cmd.proxy;
+
+ if (sif_vendor_enable(SVF_kernel_mode, user_flags))
+ sif_attr.user_mode = false;
+
+ if (sif_vendor_enable(tsu_qosl, user_flags))
+ sif_attr.qosl = QOSL_LOW_LATENCY;
+
+ if (sif_vendor_enable(no_checksum, user_flags)) {
+ /* update the init_attr->create_flags directly.
+ * This will allow the same code path if umem can pass this as a
+ * create_qp flag via struct ibv_qp_init_attr_ex in the future:
+ */
+ init_attr->create_flags |= IB_QP_NO_CSUM;
+ }
+ }
+
+ /* TBD: check init_attr params against device cap-limits */
+ /* TBD update ib_qp_cap? */
+ if (sif_vendor_enable(dynamic_mtu, user_flags)) {
+ /* TBD - check the device capabilities to determine whether to
+ * create the qp with support for send/receive dynamic MTU.
+ */
+ init_attr->create_flags |= IB_QP_CREATE_RCV_DYNAMIC_MTU;
+ init_attr->create_flags |= IB_QP_CREATE_SND_DYNAMIC_MTU;
+ }
+
+ /* best effort to determine the ULP caller. */
+ if (!sif_attr.user_mode)
+ sif_attr.ulp_type = sif_find_kernel_ulp_caller();
+
+ qp = create_qp(sdev, init_attr, &sif_attr);
+
+ if (IS_ERR(qp)) {
+ /* Convert internal error to the right type: */
+ ret = (struct ib_qp *)qp;
+ goto err_create_qp;
+ } else {
+ sif_log(sdev, SIF_QP, "Exit: success 0x%p ib qp %d - real qp %d%s",
+ &qp->ibqp, qp->ibqp.qp_num, qp->qp_idx,
+ (sif_attr.user_mode ? " (user mode)" : ""));
+ }
+
+ qp->qosl = sif_attr.qosl;
+ qp->nocsum = init_attr->create_flags & IB_QP_NO_CSUM;
+
+
+
+ if (sif_vendor_enable(dynamic_mtu, user_flags)) {
+ /* TBD - the dynamic mtu flag should only be set during modify_qp in CM
+ * or OOB establishment. It is only set if remote dynamic_mtu_supported &&
+ * local dynamic_send_mtu_supported. As create_qp should not be in
+ * the critical path, this code is kept separate from the setting of the
+ * IB_QP_CREATE_RCV_DYNAMIC_MTU and IB_QP_CREATE_SND_DYNAMIC_MTU flags
+ * to remind ourselves that this needs to be implemented separately.
+ */
+ sif_log(sdev, SIF_QP, "Enabling forced dynamic MTU for qp %d", qp->qp_idx);
+ qp->flags |= SIF_QPF_DYNAMIC_MTU;
+ }
+
+ if (sif_vendor_enable(SQ_mode, user_flags)) {
+ sif_log(sdev, SIF_QP, "Enabling forced SQ mode for qp %d", qp->qp_idx);
+ qp->flags |= SIF_QPF_FORCE_SQ_MODE;
+ }
+
+ if (udata) {
+ struct sif_create_qp_resp_ext resp;
+ struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx);
+ int rv;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.qp_idx = qp->qp_idx;
+ resp.sq_extent = sq->extent;
+ resp.sq_sgl_offset = sq->sgl_offset;
+ resp.sq_mr_idx = sq->sg_mr ? sq->sg_mr->index : 0;
+ resp.sq_dma_handle = sif_mem_dma(sq->mem, 0);
+ if (init_attr->qp_type != IB_QPT_XRC_INI && init_attr->qp_type != IB_QPT_XRC_TGT) {
+ /* XRC qps do not have any rq */
+ struct sif_rq *rq = get_sif_rq(sdev, qp->rq_idx);
+
+ resp.rq_idx = qp->rq_idx;
+ resp.rq_extent = rq->extent;
+ }
+
+ resp.magic = get_psif_qp_core__magic(&qp->d.state);
+ rv = ib_copy_to_udata(udata, &resp, sizeof(resp));
+ if (rv) {
+ ret = ERR_PTR(rv);
+ goto err_udata;
+ }
+ }
+ /* Support for PMA_PXY QP bug #3357 */
+ if (init_attr->qp_type == IB_QPT_GSI
+ && eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 57)) {
+ int pma_qp_idx = sif_create_pma_qp(ibpd, init_attr, sif_attr);
+
+ if (!pma_qp_idx)
+ sif_log(sdev, SIF_INFO, "Create PMA_PXY qp %d port %d failed",
+ qp->qp_idx, init_attr->port_num);
+ }
+
+ return &qp->ibqp;
+err_udata:
+ destroy_qp(sdev, qp);
+err_create_qp:
+ sif_log(sdev, SIF_QP, "Exit: failed");
+ return ret;
+}
+
+
+/* Modify qp implementation related: */
+
+
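+/* Classify a requested state transition: SIF_MQP_SW transitions are applied
+ * directly to the software owned QP descriptor, SIF_MQP_HW transitions are
+ * posted to the EPSC firmware, SIF_MQP_IGN is silently ignored, and
+ * SIF_MQP_ERR is rejected (see the dispatch in modify_qp below).
+ */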
+enum sif_mqp_type sif_modify_qp_is_ok(struct sif_qp *qp, enum ib_qp_state cur_state,
+ enum ib_qp_state next_state, enum ib_qp_attr_mask mask)
+{
+ struct sif_dev *sdev = to_sdev(qp->ibqp.device);
+ enum ib_qp_type type = qp->ibqp.qp_type;
+ int ret;
+ enum rdma_link_layer ll = IB_LINK_LAYER_INFINIBAND;
+
+ /* PSIF treats XRC just as any other RC QP */
+ if (type == IB_QPT_XRC_INI || type == IB_QPT_XRC_TGT)
+ type = IB_QPT_RC;
+ ret = ((qp->type == PSIF_QP_TRANSPORT_MANSP1 || is_epsa_tunneling_qp(type)) ? 1 :
+ ib_modify_qp_is_ok(cur_state, next_state, type, mask, ll));
+ if (!ret)
+ return SIF_MQP_ERR;
+ switch (cur_state) {
+ case IB_QPS_RESET:
+ if (qp->tracked_state == IB_QPS_SQD)
+ qp->tracked_state = IB_QPS_RESET;
+ return SIF_MQP_SW;
+ case IB_QPS_INIT:
+ if (next_state == IB_QPS_INIT || next_state == IB_QPS_RESET ||
+ next_state == IB_QPS_ERR)
+ return SIF_MQP_SW;
+ /* else fall-through */
+ case IB_QPS_RTS:
+ /* TBD: Eliminate hack to behave like mlx on this: */
+ if (unlikely(qp->tracked_state == IB_QPS_SQD &&
+ next_state != IB_QPS_RESET && next_state != IB_QPS_ERR))
+ return SIF_MQP_ERR;
+ if (unlikely(next_state == IB_QPS_SQD)) {
+ qp->tracked_state = next_state; /* To fail on future transitions */
+ return SIF_MQP_IGN; /* Allow, but ignore as MLX does */
+ }
+ /* else fall-through */
+ case IB_QPS_RTR:
+ if (unlikely(next_state == IB_QPS_SQD))
+ return SIF_MQP_ERR;
+ return SIF_MQP_HW;
+ case IB_QPS_SQE:
+ return SIF_MQP_HW;
+ case IB_QPS_ERR:
+ /* Bug #3933 WA for HW bug 3928
+ * For this specific transition, modify qp must be done based
+ * on current qp ownership (towards HW only if HW owned)
+ */
+ return (PSIF_REVISION(sdev) <= 3)
+ && !(qp->flags & SIF_QPF_HW_OWNED) ?
+ SIF_MQP_SW : SIF_MQP_HW;
+ default:
+ return SIF_MQP_IGN;
+ }
+}
+
+
+
+static int modify_qp_sw(struct sif_dev *sdev, struct sif_qp *qp,
+ struct ib_qp_attr *qp_attr, int qp_attr_mask);
+static int modify_qp_hw(struct sif_dev *sdev, struct sif_qp *qp,
+ struct ib_qp_attr *qp_attr, int qp_attr_mask);
+
+
+
+int modify_qp_hw_wa_qp_retry(struct sif_dev *sdev, struct sif_qp *qp,
+ struct ib_qp_attr *qp_attr, int qp_attr_mask)
+{
+ struct ib_qp_attr mod_attr = {
+ .qp_state = IB_QPS_ERR
+ };
+
+ bool need_wa_3713 = PSIF_REVISION(sdev) <= 3
+ && IS_PSIF(sdev)
+ && qp_attr_mask & IB_QP_STATE && qp_attr->qp_state == IB_QPS_RESET;
+
+ /* WA for duplicate CQEs */
+ bool need_wa_4074 = PSIF_REVISION(sdev) <= 3
+ && (qp->type != PSIF_QP_TRANSPORT_MANSP1)
+ && qp_attr_mask & IB_QP_STATE && qp_attr->qp_state == IB_QPS_ERR
+ && IS_PSIF(sdev);
+
+ int ret = 0;
+
+ if (need_wa_3713 || need_wa_4074) {
+ if (qp->type != PSIF_QP_TRANSPORT_MANSP1)
+ ret = pre_process_wa4074(sdev, qp);
+
+ if (ret) {
+ if (ret != -1)
+ sif_log(sdev, SIF_INFO, "Failed to pre-process WA4074, ret - %d", ret);
+ }
+ }
+
+ if (need_wa_3713) {
+ /* Workaround for bug #3713 part 2 - see #3714 */
+ ret = modify_qp_hw(sdev, qp, &mod_attr, IB_QP_STATE);
+ if (ret)
+ sif_log(sdev, SIF_INFO, "implicit modify qp %d to ERR failed - ignoring",
+ qp->qp_idx);
+ }
+
+ ret = modify_qp_hw(sdev, qp, qp_attr, qp_attr_mask);
+
+ if (need_wa_3713 || need_wa_4074) {
+ struct ib_qp_attr attr = {
+ .qp_state = IB_QPS_RESET
+ };
+
+ if (need_wa_4074) {
+ ret = modify_qp_hw(sdev, qp, &attr, IB_QP_STATE);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "qp %d RESET failed, ret %d", qp->qp_idx, ret);
+ goto err_modify_qp_wa;
+ }
+ /* Restore QP SW state to ERROR */
+ qp->last_set_state = qp->tracked_state = IB_QPS_ERR;
+ }
+
+ qp->flags &= ~SIF_QPF_HW_OWNED;
+
+ if (qp->type != PSIF_QP_TRANSPORT_MANSP1)
+ ret = post_process_wa4074(sdev, qp);
+
+ if (ret)
+ sif_log(sdev, SIF_INFO, "Failed to post-process WA #4074 %d", ret);
+ }
+err_modify_qp_wa:
+
+ return ret;
+}
+
+int notify_epsc_pma_qp(struct sif_dev *sdev, int qp_idx, short port)
+{
+ struct sif_eps *es = &sdev->es[sdev->mbox_epsc];
+ struct psif_epsc_csr_req req;
+ struct psif_epsc_csr_rsp rsp;
+ int ret = -1;
+
+ if (eps_version_ge(es, 0, 57)) {
+ memset(&req, 0, sizeof(req));
+ memset(&rsp, 0, sizeof(rsp));
+ req.opcode = EPSC_SET;
+ req.u.set.data.op = EPSC_QUERY_PMA_REDIRECT_QP;
+ req.u.set.data.index = port;
+ req.u.set.data.value = qp_idx;
+
+ ret = sif_epsc_wr_poll(sdev, &req, &rsp);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "Failed to configure epsc PMA_PXY QP\n");
+ return ret;
+ }
+ return ret;
+ } else
+ return -EINVAL;
+}
+
+int sif_modify_qp(struct ib_qp *ibqp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_udata *udata)
+{
+ struct sif_qp *qp = to_sqp(ibqp);
+ struct sif_dev *sdev = to_sdev(ibqp->device);
+ struct sif_qp *pma_qp = NULL;
+ struct sif_eps *es = &sdev->es[sdev->mbox_epsc];
+ int ret = 0;
+ bool need_pma_pxy_qp = eps_version_ge(es, 0, 57)
+ && (qp_attr->qp_state != IB_QPS_RTS)
+ && (qp->qp_idx == 1 || qp->qp_idx == 3);
+
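+ /* QP1/QP3 (the per-port GSI QPs) are shadowed by PMA proxy QPs;
+ * !!(qp_idx & 2) selects the per-port slot (qp 1 -> slot 0, qp 3 -> slot 1).
+ */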
+ if (need_pma_pxy_qp) {
+ pma_qp = get_sif_qp(sdev, sdev->pma_qp_idxs[!!(qp->qp_idx & 2)]);
+ ret = modify_qp(sdev, pma_qp, qp_attr, qp_attr_mask, true, udata);
+ if (ret)
+ sif_log(sdev, SIF_INFO, "Modify PMA_PXY QP %d failed",
+ pma_qp->qp_idx);
+ else if (qp_attr->qp_state == IB_QPS_RTR) {
+ ret = notify_epsc_pma_qp(sdev, pma_qp->qp_idx, pma_qp->port);
+ if (ret)
+ sif_log(sdev, SIF_INFO, "Notify epsc PMA_PXY QP %d failed",
+ pma_qp->qp_idx);
+ }
+ }
+
+ return modify_qp(sdev, qp, qp_attr, qp_attr_mask,
+ true, udata);
+}
+
+
+int modify_qp(struct sif_dev *sdev, struct sif_qp *qp,
+ struct ib_qp_attr *qp_attr, int qp_attr_mask,
+ bool fail_on_same_state, struct ib_udata *udata)
+{
+ int ret = 0;
+ struct ib_qp *ibqp = &qp->ibqp;
+ enum ib_qp_state cur_state, new_state;
+ enum sif_mqp_type mqp_type = SIF_MQP_IGN;
+
+ sif_log(sdev, SIF_QP, "Enter: qpn %d qp_idx %d mask 0x%x",
+ ibqp->qp_num, qp->qp_idx, qp_attr_mask);
+
+ /* WA #622, RQ flush from error completion in userspace */
+ if (udata && is_regular_qp(qp)) {
+ struct sif_modify_qp_ext cmd;
+ struct sif_rq *rq = get_sif_rq(sdev, qp->rq_idx);
+
+ ret = ib_copy_from_udata(&cmd, udata, sizeof(cmd));
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "ib_copy_from_udata failed, sts %d, qp %d, size %ld",
+ ret, qp->qp_idx, sizeof(cmd));
+ return ret;
+ }
+
+ switch (cmd.flush) {
+ case FLUSH_RQ:
+ ret = sif_flush_rq(sdev, rq, qp, rq->entries);
+ if (ret)
+ sif_log(sdev, SIF_INFO, "failed to flush RQ %d",
+ rq->index);
+ return ret;
+ case FLUSH_SQ:
+ ret = post_process_wa4074(sdev, qp);
+ if (ret)
+ sif_log(sdev, SIF_INFO, "failed to flush SQ %d", qp->qp_idx);
+ return ret;
+ default:
+ break;
+ }
+ }
+
+ mutex_lock(&qp->lock);
+
+ cur_state = qp_attr_mask & IB_QP_CUR_STATE ?
+ qp_attr->cur_qp_state : qp->last_set_state;
+
+ new_state = qp_attr_mask & IB_QP_STATE ? qp_attr->qp_state : cur_state;
+
+ if (!fail_on_same_state && cur_state == qp_attr->qp_state) {
+ /* Silently ignore.. (used at destroy time) */
+ goto sif_mqp_ret;
+ }
+
+ mqp_type = sif_modify_qp_is_ok(qp, cur_state, new_state, qp_attr_mask);
+ switch (mqp_type) {
+ case SIF_MQP_SW:
+ ret = modify_qp_sw(sdev, qp, qp_attr, qp_attr_mask);
+ break;
+ case SIF_MQP_HW:
+ ret = modify_qp_hw_wa_qp_retry(sdev, qp, qp_attr, qp_attr_mask);
+ break;
+ case SIF_MQP_IGN:
+ break;
+ case SIF_MQP_ERR:
+ default:
+ sif_log(sdev, SIF_INFO, "illegal state change from %d to %d for qp %d",
+ cur_state, new_state, qp->qp_idx);
+ ret = -EINVAL;
+ }
+
+sif_mqp_ret:
+ if (!ret && !(mqp_type == SIF_MQP_IGN)) {
+ /* TBD: Is this needed? */
+ qp_attr->cur_qp_state = new_state;
+ }
+
+ /* The QP ownership flag must be updated before releasing
+ * the lock in order to avoid race conditions
+ */
+ switch (new_state) {
+ case IB_QPS_RESET:
+ qp->flags &= ~SIF_QPF_HW_OWNED;
+ break;
+ case IB_QPS_RTR:
+ qp->flags |= SIF_QPF_HW_OWNED;
+ break;
+ default:
+ /* No extra actions needed */
+ break;
+ }
+
+ mutex_unlock(&qp->lock);
+
+ if (ret)
+ return ret;
+
+ /* Bug #3933 - WA for HW bug 3928
+ * enable/disable the HW ownership QP flag
+ */
+ switch (new_state) {
+ case IB_QPS_ERR:
+ if (is_regular_qp(qp)) {
+ struct sif_rq *rq = get_sif_rq(sdev, qp->rq_idx);
+
+ /* WA #3850: if SRQ, generate LAST_WQE event */
+ if (rq->is_srq && qp->ibqp.event_handler) {
+ struct ib_event ibe = {
+ .device = &sdev->ib_dev,
+ .event = IB_EVENT_QP_LAST_WQE_REACHED,
+ .element.qp = &qp->ibqp
+ };
+
+ qp->ibqp.event_handler(&ibe, qp->ibqp.qp_context);
+ } else if (rq && !rq->is_srq) {
+ /* WA #622: if regular RQ, flush */
+ ret = sif_flush_rq(sdev, rq, qp, rq->entries);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "failed to flush RQ %d",
+ rq->index);
+ return ret;
+ }
+ }
+ }
+ break;
+ case IB_QPS_RESET:
+ /* clean all state associated with this QP */
+ ret = reset_qp(sdev, qp);
+ break;
+ default:
+ /* No extra actions needed */
+ break;
+ }
+ return ret;
+}
+
+
+static void set_qp_path_hw(struct sif_qp *qp, struct psif_epsc_csr_modify_qp *mct,
+ struct ib_qp_attr *qp_attr, int qp_attr_mask, bool alternate)
+{
+ struct psif_qp_path *path;
+ struct ib_ah_attr *ah_attr;
+ struct sif_dev *sdev = to_sdev(qp->ibqp.device);
+ struct psif_csr_modify_qp_ctrl *ctrl_attr = &mct->ctrl;
+ u8 ipd = 0;
+
+ /* IBV_QP_ALT_PATH Set the alternative path via:
+ * alt_ah_attr, alt_pkey_index, alt_port_num and
+ * alt_timeout.
+ */
+ if (alternate) {
+ ctrl_attr->alt_path = 1;
+ path = &mct->data.alternate_path;
+ ah_attr = &qp_attr->alt_ah_attr;
+ path->pkey_indx = qp_attr->alt_pkey_index;
+ path->local_ack_timeout = qp_attr->alt_timeout;
+ path->port = qp_attr->alt_port_num - 1;
+ sif_log(sdev, SIF_QP, "Alternate pkey_indx %d local_ack_timeout %d, port %d",
+ qp_attr->alt_pkey_index, qp_attr->alt_timeout, qp_attr->alt_port_num + 1);
+ } else {
+ ctrl_attr->prim_path = 1;
+ /* TBD: Does this belong here? */
+ ctrl_attr->pkey_index = 1;
+ path = &mct->data.primary_path;
+ ah_attr = &qp_attr->ah_attr;
+ path->pkey_indx = qp->pkey_index;
+ /* Use the value set by IB_QP_PORT: */
+ path->port = qp->port - 1;
+ sif_log(sdev, SIF_QP, "Primary pkey_indx %d local_ack_timeout %d, port %d",
+ qp_attr->pkey_index, qp_attr->timeout, qp_attr->port_num + 1);
+ }
+ path->sl = ah_attr->sl;
+ path->remote_lid = ah_attr->dlid;
+ path->local_lid_path = ah_attr->src_path_bits;
+
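+ /* Mark the path as loopback when the destination lid matches this
+ * port's lid (with the source path bits OR-ed in).
+ */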
+ path->loopback =
+ (sdev->port[path->port].lid | path->local_lid_path) == ah_attr->dlid ?
+ LOOPBACK : NO_LOOPBACK;
+
+ /* sif_calc_ipd does not set ipd if it fails; in that case, ipd remains 0. */
+ sif_calc_ipd(sdev, qp->port, (enum ib_rate) ah_attr->static_rate, &ipd);
+ path->ipd = ipd;
+
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ path->use_grh = USE_GRH;
+ path->remote_gid_0 = cpu_to_be64(ah_attr->grh.dgid.global.subnet_prefix);
+ path->remote_gid_1 = cpu_to_be64(ah_attr->grh.dgid.global.interface_id);
+ path->flowlabel = ah_attr->grh.flow_label;
+ path->hoplmt = ah_attr->grh.hop_limit;
+ /* TBD: ah_attr->grh.sgid_index? */
+
+ sif_log(sdev, SIF_QP, " - with grh dgid %llx.%llx",
+ ah_attr->grh.dgid.global.subnet_prefix,
+ ah_attr->grh.dgid.global.interface_id);
+ }
+
+ if (qp_attr_mask & IB_QP_TIMEOUT) {
+ path->local_ack_timeout = qp_attr->timeout;
+ sif_log(sdev, SIF_QP, " - with timeout %d", qp_attr->timeout);
+ }
+
+ sif_log(sdev, SIF_QP, "local_lid_path %d, remote_lid %d %s, QP(ipd):%d %s",
+ path->local_lid_path, path->remote_lid, (path->loopback ? "(loopback)" : ""),
+ path->ipd, (alternate ? "(alternate)" : ""));
+}
+
+static int modify_qp_hw(struct sif_dev *sdev, struct sif_qp *qp,
+ struct ib_qp_attr *qp_attr, int qp_attr_mask)
+{
+ struct psif_epsc_csr_rsp resp;
+ struct psif_epsc_csr_req req;
+ struct psif_epsc_csr_modify_qp *mct = &req.u.modify_qp;
+ struct psif_csr_modify_qp_ctrl *ctrl_attr = &mct->ctrl;
+ struct psif_csr_modify_qp_ctrl *cmd = &mct->ctrl;
+ int ret = 0;
+
+ memset(&req, 0, sizeof(req));
+
+ req.opcode = EPSC_MODIFY_QP;
+
+ cmd->cmd = QP_CMD_MODIFY;
+
+ if (qp->qp_idx <= 3) {
+ /* sif requires "real" QP numbers in modify_qp */
+ cmd->qp_num = qp->qp_idx & 1;
+ cmd->port_num = qp->qp_idx >> 1;
+ } else
+ cmd->qp_num = qp->qp_idx;
+
+ if (qp_attr_mask & IB_QP_STATE) {
+ ctrl_attr->qp_state = 1;
+ mct->data.state = ib2sif_qp_state(qp_attr->qp_state);
+ }
+
+ if (qp->last_set_state == IB_QPS_INIT && qp_attr->qp_state == IB_QPS_RTR) {
+ /* Bug #3933 - WA for HW bug 3928
+ * QP hw state must be set to INIT before modify_qp_hw to RTR
+ */
+ volatile struct psif_qp *qps;
+
+ qps = &qp->d;
+ set_psif_qp_core__state(&qps->state, PSIF_QP_STATE_INIT);
+
+ /* For INIT -> RTR the rest of the attrs are set directly in the descriptor: */
+ ret = modify_qp_sw(sdev, qp, qp_attr, qp_attr_mask & ~IB_QP_STATE);
+
+ /* Flag to the FW that this is the PQP */
+ if (qp->type == PSIF_QP_TRANSPORT_MANSP1)
+ req.flags |= EPSC_FL_PQP;
+ if (ret)
+ goto err_modify_qp;
+ else
+ goto ok_modify_qp_sw;
+ }
+
+ if (qp_attr_mask & IB_QP_CUR_STATE) {
+ ctrl_attr->use_current_state = 1;
+ cmd->current_state = ib2sif_qp_state(qp_attr->cur_qp_state);
+
+ /* TBD: Remove this sanity check later: */
+ if (qp_attr->cur_qp_state != qp->last_set_state)
+ sif_log(sdev, SIF_QP,
+ "** WARNING: possible state inconsistency (user %d, driver %d)",
+ qp->last_set_state, qp_attr->cur_qp_state);
+ }
+
+ if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) {
+ /* TBD: Needed? */
+ sif_log(sdev, SIF_QP,
+ "IB_QP_EN_SQD_ASYNC_NOTIFY needed!");
+ goto err_modify_qp;
+ }
+
+ if (qp_attr_mask & IB_QP_ACCESS_FLAGS) {
+ /* TBD: qp_rcv_cap must be set and the whole struct psif_qp_rcv_cap
+ * must be set if any of its values are modified
+ * - must keep driver copies of this
+ */
+
+ /* TBD: (qp_attr->qp_access_flags & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; ? */
+ mct->data.rdma_rd_enable =
+ (qp_attr->qp_access_flags & IB_ACCESS_REMOTE_READ) ? 1 : 0;
+ mct->data.rdma_wr_enable =
+ (qp_attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
+ mct->data.atomic_enable =
+ (qp_attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0;
+ /* IB_ACCESS_MW_BIND not supported (?) */
+ }
+
+ /* This section must be before IB_QP_AV */
+ if (qp_attr_mask & IB_QP_PKEY_INDEX) {
+ /* TBD: Argument check on index value ? */
+ qp->pkey_index = qp_attr->pkey_index;
+ }
+
+ /* This section must be before IB_QP_AV */
+ if (qp_attr_mask & IB_QP_PORT) {
+ if (qp_attr->port_num < 1 || qp_attr->port_num > 2) {
+ sif_log(sdev, SIF_INFO, "Modify port: Illegal port %d specified for qp %d",
+ qp_attr->port_num, qp->qp_idx);
+ ret = -EINVAL;
+ goto err_modify_qp;
+ }
+ sif_log(sdev, SIF_QP, "Modify port to %d for qp %d",
+ qp_attr->port_num, qp->qp_idx);
+ qp->port = qp_attr->port_num;
+ }
+
+ if (qp_attr_mask & IB_QP_QKEY) {
+ ctrl_attr->qkey = 1;
+ mct->data.rx_qkey = qp_attr->qkey;
+
+ sif_log(sdev, SIF_QP, "Assign QKEY 0x%x for qp %d",
+ qp_attr->qkey, qp->qp_idx);
+
+ }
+
+ if (qp_attr_mask & IB_QP_AV)
+ set_qp_path_hw(qp, mct, qp_attr, qp_attr_mask, false);
+
+ if (qp_attr_mask & IB_QP_PATH_MTU) {
+ if (!ib_legal_path_mtu(qp_attr->path_mtu)) {
+ sif_log(sdev, SIF_INFO, "Illegal MTU encoding %d", qp_attr->path_mtu);
+ ret = -EINVAL;
+ goto err_modify_qp;
+ }
+ ctrl_attr->path_mtu = 1;
+ if ((qp->type == PSIF_QP_TRANSPORT_RC) && sif_feature(force_rc_2048_mtu)) {
+ if (qp_attr->path_mtu > IB_MTU_2048)
+ qp_attr->path_mtu = IB_MTU_2048;
+ }
+ mct->data.path_mtu = ib2sif_path_mtu(qp_attr->path_mtu);
+ qp->mtu = qp_attr->path_mtu;
+ }
+
+ if (qp_attr_mask & IB_QP_TIMEOUT) {
+ ctrl_attr->local_ack_timeout = 1;
+ if (!(qp_attr_mask & (IB_QP_AV|IB_QP_ALT_PATH)))
+ mct->data.primary_path.local_ack_timeout = qp_attr->timeout;
+ }
+
+ if (qp_attr_mask & IB_QP_RETRY_CNT) {
+ ctrl_attr->error_retry_count = 1;
+ mct->data.error_retry_count = qp_attr->retry_cnt;
+ }
+
+ if (qp_attr_mask & IB_QP_RNR_RETRY) {
+ ctrl_attr->rnr_retry_count = 1;
+ mct->data.rnr_retry_count = qp_attr->rnr_retry;
+ }
+
+ if (qp_attr_mask & IB_QP_RQ_PSN) {
+ /* expected receive PSN */
+ ctrl_attr->expected_psn = 1;
+ mct->data.expected_psn = qp_attr->rq_psn;
+ }
+
+ if (qp_attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+ /* This is the sending side */
+ ctrl_attr->max_outstanding = 1;
+ if (qp_attr->max_rd_atomic == 0) {
+ sif_log(sdev, SIF_QP,
+ "IB_QP_MAX_QP_RD_ATOMIC value 0 incrementing to 1");
+ qp_attr->max_rd_atomic = 1;
+ }
+ if (qp_attr->max_rd_atomic > 16 || qp_attr->max_rd_atomic < 0) {
+ /* As per IBTA 9.4.4 & 11.2.4.2 */
+ sif_log(sdev, SIF_INFO,
+ "IB_QP_MAX_QP_RD_ATOMIC value %u out of range",
+ qp_attr->max_rd_atomic);
+ ret = -EINVAL;
+ goto err_modify_qp;
+ }
+ mct->data.max_outstanding = qp_attr->max_rd_atomic;
+ }
+
+ if (qp_attr_mask & IB_QP_ALT_PATH) {
+ if (qp_attr->alt_port_num < 1 || qp_attr->alt_port_num > 2) {
+ sif_log(sdev, SIF_INFO, "Illegal alternate port %d specified for qp %d",
+ qp_attr->alt_port_num, qp->qp_idx);
+ ret = -EINVAL;
+ goto err_modify_qp;
+ }
+ set_qp_path_hw(qp, mct, qp_attr, qp_attr_mask, true);
+ }
+
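+ /* The force_wa_3646 feature presumably remaps the IBTA min RNR NAK timer
+ * encoding via bug_3646_conv_table to values the HW handles correctly
+ * (cf. the bug 3646 note on min_rnr_nak_time in create_qp).
+ */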
+ if (qp_attr_mask & IB_QP_MIN_RNR_TIMER) {
+ ctrl_attr->min_rnr_nak_time = 1;
+ mct->data.min_rnr_nak_time = sif_feature(force_wa_3646) ?
+ bug_3646_conv_table[qp_attr->min_rnr_timer & 0x1F] :
+ qp_attr->min_rnr_timer & 0x1F;
+ }
+
+ if (qp_attr_mask & IB_QP_SQ_PSN) {
+ /* Send packet sequence number */
+ ctrl_attr->xmit_psn = 1;
+ mct->data.xmit_psn = qp_attr->sq_psn;
+ }
+
+ if (qp_attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+ /* Currently hard coded to 16 in psif */
+ if (unlikely(qp_attr->max_dest_rd_atomic > 16)) {
+ sif_log(sdev, SIF_QP,
+ "IB_QP_MAX_DEST_RD_ATOMIC value %u out of range - psif supports 16 as a hard coded value",
+ qp_attr->max_dest_rd_atomic);
+ goto err_modify_qp;
+ } else if (qp_attr->max_dest_rd_atomic < 16) {
+ sif_log(sdev, SIF_QP,
+ "IB_QP_MAX_DEST_RD_ATOMIC value %u ignored - psif supports 16 as a hard coded value",
+ qp_attr->max_dest_rd_atomic);
+ }
+ }
+
+ if (qp_attr_mask & IB_QP_PATH_MIG_STATE) {
+ ctrl_attr->mig_state = 1;
+ mct->data.mstate = ib2sif_mig_state(qp_attr->path_mig_state);
+ }
+
+ if (qp_attr_mask & IB_QP_CAP) {
+ sif_log(sdev, SIF_QP, "IB_QP_CAP not supported by PSIF");
+ goto err_modify_qp;
+ }
+
+ if (qp_attr_mask & IB_QP_DEST_QPN) {
+ /* Since this is only valid from the init state which is
+ * owned by software anyway, we set it directly from software
+ * (see issues #929, #1027)
+ */
+ qp->remote_qp = qp_attr->dest_qp_num;
+ set_psif_qp_core__remote_qp(&qp->d.state, qp_attr->dest_qp_num);
+ sif_log(sdev, SIF_QP, "Modified remote qp (hw), qp_idx: %d, value %d\n",
+ qp->qp_idx, qp_attr->dest_qp_num);
+ }
+
+ok_modify_qp_sw:
+
+ /*
+ * On modify to RTR, we set the TSU SL (tsl), because we have
+ * port # and sl present in the QP state at this point.
+ */
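+ /* tsl selection order: QP0 uses the per-port qp0_tsl, privileged (MANSP1)
+ * QPs use the per-port pqp_rcn_tsl, and all other QPs are looked up in the
+ * sl2tsl[sl][port][qosl] table.
+ */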
+ if ((qp_attr_mask & IB_QP_STATE) && (qp_attr->qp_state == IB_QPS_RTR)) {
+ int sl = get_psif_qp_path__sl(&qp->d.path_a);
+ int port = qp->port - 1;
+ enum psif_tsu_qos qosl = qp->qosl;
+
+ if (cmd->qp_num == 0)
+ qp->tsl = sdev->qp0_tsl[qp->port - 1];
+ else if (qp->type == PSIF_QP_TRANSPORT_MANSP1)
+ qp->tsl = sdev->pqp_rcn_tsl[qp->port - 1];
+ else
+ qp->tsl = sdev->sl2tsl[sl][port][(int)qosl];
+
+ set_psif_qp_core__tsl(&qp->d.state, qp->tsl);
+
+ /* Tell user-lib about tsl to use */
+ if (qp->flags & SIF_QPF_USER_MODE) {
+ struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx);
+
+ sq_sw->tsl = qp->tsl;
+ }
+
+ sif_log(sdev, SIF_TSL,
+ "%s qp_idx: %d with sl: %d, port: %d, qosl: %s tsl: %d",
+ qp->type == PSIF_QP_TRANSPORT_MANSP1 ? "privileged" : "regular",
+ qp->qp_idx, sl, qp->port, string_enum_psif_tsu_qos(qosl) + 5, qp->tsl);
+ }
+
+ {
+ struct sif_eps_cqe lcqe;
+ u16 seq_num;
+
+ lcqe.rsp = &resp;
+ init_completion(&lcqe.cmpl);
+
+ ret = sif_post_epsc_wr(sdev, &req, &seq_num, &lcqe, true);
+ if (ret)
+ goto err_modify_qp;
+
+ if (reliable_qp(qp->type)
+ && (qp_attr_mask & IB_QP_STATE)) {
+ if ((qp->last_set_state == IB_QPS_INIT)
+ && (qp_attr->qp_state == IB_QPS_RTR)) {
+ /* Map the new send queue into the global sq_cmpl
+ * PSIF-only address map, see #944
+ */
+ ret = sif_sq_cmpl_map_sq(sdev, get_sif_sq(sdev, qp->qp_idx));
+ if (ret)
+ goto err_modify_qp;
+
+ qp->sq_cmpl_map_valid = true;
+
+ } else if ((qp->sq_cmpl_map_valid)
+ && (qp_attr->qp_state == IB_QPS_RESET)) {
+ /* Unmap the send queue from the global sq_cmpl PSIF-only address map */
+ ret = sif_sq_cmpl_unmap_sq(sdev, get_sif_sq(sdev, qp->qp_idx));
+ if (ret)
+ goto err_modify_qp;
+
+ qp->sq_cmpl_map_valid = false;
+ }
+ }
+
+ ret = sif_epsc_waitfor(sdev, seq_num, &lcqe);
+ if (ret)
+ goto err_modify_qp;
+ }
+
+ if (resp.status != EPSC_SUCCESS) {
+ sif_log(sdev, SIF_INFO, "qp %d failed with status %s",
+ qp->qp_idx, string_enum_psif_epsc_csr_status(resp.status));
+ goto err_modify_qp;
+ }
+
+ /* sif_logs(SIF_DUMP, write_struct_psif_qp(0, 1, (const struct psif_qp *)&qp->d)); */
+ sif_log(sdev, SIF_QP, "qp %d done QP state %d -> %d",
+ qp->qp_idx, qp->last_set_state,
+ (qp_attr_mask & IB_QP_STATE ? qp_attr->qp_state : qp->last_set_state));
+
+ if (qp_attr_mask & IB_QP_STATE)
+ qp->last_set_state = qp_attr->qp_state;
+
+ return ret;
+
+err_modify_qp:
+ if (resp.status == EPSC_MODIFY_INVALID_QP_STATE)
+ ret = -ESPIPE;
+
+ if (!ret)
+ ret = -EINVAL;
+ if (qp_attr_mask & IB_QP_STATE)
+ sif_log(sdev, SIF_QPE,
+ "qp %d failed - mask 0x%x cur.state %d, requested state %d, ret %d",
+ qp->qp_idx, qp_attr_mask, qp->last_set_state,
+ qp_attr->qp_state,
+ ret);
+ else
+ sif_log(sdev, SIF_QPE, "qp %d failed - mask 0x%x no state trans requested, ret %d",
+ qp->qp_idx, qp_attr_mask, ret);
+
+ sif_logs(SIF_DUMP, write_struct_psif_qp(NULL, 1, (const struct psif_qp *)&qp->d));
+ return ret;
+}
+
+
+static void set_qp_path_sw(struct sif_qp *qp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, bool alternate)
+{
+ volatile struct psif_qp_path *path;
+ struct ib_ah_attr *ah_attr;
+ struct sif_dev *sdev = to_sdev(qp->ibqp.device);
+ unsigned int local_lid_path;
+ u8 psif_port;
+ u8 ipd = 0;
+
+ if (alternate) {
+ path = &qp->d.path_b;
+ ah_attr = &qp_attr->alt_ah_attr;
+ set_psif_qp_path__pkey_indx(path, qp_attr->alt_pkey_index);
+ set_psif_qp_path__local_ack_timeout(path, qp_attr->alt_timeout);
+ set_psif_qp_path__port(path, qp_attr->alt_port_num - 1);
+ } else {
+ path = &qp->d.path_a;
+ ah_attr = &qp_attr->ah_attr;
+ set_psif_qp_path__pkey_indx(path, qp->pkey_index);
+ /* Use the value set by IB_QP_PORT: */
+ set_psif_qp_path__port(path, qp->port - 1);
+ }
+ set_psif_qp_path__sl(path, ah_attr->sl);
+
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ set_psif_qp_path__use_grh(path, USE_GRH);
+ set_psif_qp_path__remote_gid_0(path, cpu_to_be64(ah_attr->grh.dgid.global.subnet_prefix));
+ set_psif_qp_path__remote_gid_1(path, cpu_to_be64(ah_attr->grh.dgid.global.interface_id));
+ set_psif_qp_path__flowlabel(path, ah_attr->grh.flow_label);
+ set_psif_qp_path__hoplmt(path, ah_attr->grh.hop_limit);
+ /* TBD: ah_attr->grh.sgid_index? */
+
+ sif_log(sdev, SIF_QP, " - with grh dgid %llx.%llx",
+ be64_to_cpu(path->remote_gid_0),
+ be64_to_cpu(path->remote_gid_1));
+ }
+
+ if (qp_attr_mask & IB_QP_TIMEOUT) {
+ set_psif_qp_path__local_ack_timeout(path, qp_attr->timeout);
+ sif_log(sdev, SIF_QP, " - with timeout %d", qp_attr->timeout);
+ }
+
+ qp->remote_lid = ah_attr->dlid;
+ set_psif_qp_path__remote_lid(path, ah_attr->dlid);
+ local_lid_path = ah_attr->src_path_bits;
+ psif_port = get_psif_qp_path__port(path);
+ set_psif_qp_path__local_lid_path(path, local_lid_path);
+ set_psif_qp_path__loopback(path,
+ (sdev->port[psif_port].lid | local_lid_path) == ah_attr->dlid ?
+ LOOPBACK : NO_LOOPBACK);
+
+ /* sif_calc_ipd does not set ipd if it fails; in that case, ipd remains 0. */
+ sif_calc_ipd(sdev, qp->port, (enum ib_rate) ah_attr->static_rate, &ipd);
+ set_psif_qp_path__ipd(path, ipd);
+
+ sif_log(sdev, SIF_QP, "port %d lid %d(%#x) local_lid_path %d(%#x) remote_lid %d(%#x)",
+ ah_attr->port_num,
+ sdev->port[psif_port].lid,
+ sdev->port[psif_port].lid,
+ ah_attr->src_path_bits,
+ ah_attr->src_path_bits,
+ ah_attr->dlid,
+ ah_attr->dlid);
+
+ sif_log(sdev, SIF_QP, "(path_%c) psif_port %d, remote_lid %d(%#x) %s",
+ (alternate ? 'b' : 'a'),
+ psif_port,
+ get_psif_qp_path__remote_lid(path), get_psif_qp_path__remote_lid(path),
+ (get_psif_qp_path__loopback(path) == LOOPBACK ? "(loopback)" : "(not loopback)"));
+}
+
+static int modify_qp_sw(struct sif_dev *sdev, struct sif_qp *qp,
+ struct ib_qp_attr *qp_attr, int qp_attr_mask)
+{
+ int ret = 0;
+ volatile struct psif_qp *qps;
+ struct sif_rq *rq = NULL;
+
+ if (qp->rq_idx >= 0)
+ rq = get_sif_rq(sdev, qp->rq_idx);
+
+ qps = &qp->d;
+
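+ /* On the RESET -> INIT transition, clear the QP's progress state
+ * (PSN/MSN tracking, scatter index etc.) and start the send sequence at 1.
+ */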
+ if ((qp_attr_mask & IB_QP_STATE)
+ && (qp->last_set_state == IB_QPS_RESET)
+ && (qp_attr->qp_state == IB_QPS_INIT)) {
+ set_psif_qp_core__bytes_received(&qps->state, 0);
+ set_psif_qp_core__committed_received_psn(&qps->state, 0);
+ set_psif_qp_core__expected_psn(&qps->state, 0);
+ set_psif_qp_core__last_committed_msn(&qps->state, 0);
+ set_psif_qp_core__last_received_outstanding_msn(&qps->state, 0);
+ set_psif_qp_core__msn(&qps->state, 0); /* According to Brian 11.9.2012 */
+ set_psif_qp_core__scatter_indx(&qps->state, 0);
+ set_psif_qp_core__spin_hit(&qps->state, 0);
+ set_psif_qp_core__sq_seq(&qps->state, 1);
+ set_psif_qp_core__srq_pd(&qps->state, 0);
+ }
+
+ if (qp_attr_mask & IB_QP_CUR_STATE && qp_attr->cur_qp_state != qp->last_set_state) {
+ sif_log(sdev, SIF_INFO,
+ "Error: current state %d - user expected %d",
+ qp->last_set_state, qp_attr->cur_qp_state);
+ ret = -EINVAL;
+ goto err_modify_qp;
+ }
+
+ /* Bug #3933 - WA for HW bug 3928
+ * ibv_query_qp might report the wrong state when in state IBV_QPS_ERR:
+ * the QP hw state is kept in RESET for modify_qp_sw transitions to the INIT or ERR states
+ */
+ if (qp_attr_mask & IB_QP_STATE)
+ if ((qp_attr->qp_state != IB_QPS_INIT && qp_attr->qp_state != IB_QPS_ERR)
+ || (PSIF_REVISION(sdev) > 3))
+ set_psif_qp_core__state(&qps->state, ib2sif_qp_state(qp_attr->qp_state));
+
+ if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) {
+ sif_log(sdev, SIF_INFO,
+ "IB_QP_EN_SQD_ASYNC_NOTIFY needed!");
+ ret = -EINVAL;
+ goto err_modify_qp;
+ }
+
+ if (qp_attr_mask & IB_QP_ACCESS_FLAGS) {
+
+ set_psif_qp_core__rdma_rd_enable(&qps->state,
+ ((qp_attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
+ ? 1 : 0));
+ set_psif_qp_core__rdma_wr_enable(&qps->state,
+ ((qp_attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
+ ? 1 : 0));
+ set_psif_qp_core__atomic_enable(&qps->state,
+ ((qp_attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)
+ ? 1 : 0));
+ }
+
+ /* This section must be before IB_QP_AV */
+ if (qp_attr_mask & IB_QP_PKEY_INDEX) {
+ volatile struct psif_qp_path *path = &qp->d.path_a;
+
+ /* TBD: Argument check on index value ? */
+ qp->pkey_index = qp_attr->pkey_index;
+ set_psif_qp_path__pkey_indx(path, qp->pkey_index);
+ sif_log(sdev, SIF_QP, "pkey_indx in primary path set to %d", qp->pkey_index);
+
+ }
+
+ /* This section must be before IB_QP_AV */
+ if (qp_attr_mask & IB_QP_PORT) {
+ if (qp_attr->port_num < 1 || qp_attr->port_num > 2) {
+ sif_log(sdev, SIF_INFO, "Modify port: Illegal port %d specified for qp %d",
+ qp_attr->port_num, qp->qp_idx);
+ ret = -EINVAL;
+ goto err_modify_qp;
+ }
+ sif_log(sdev, SIF_QP, "Modify port to %d for qp %d",
+ qp_attr->port_num, qp->qp_idx);
+ qp->port = qp_attr->port_num;
+ }
+
+ if (qp_attr_mask & IB_QP_QKEY) {
+
+ /* Set the 'ipoib' and 'ipoib_enable' fields for UD QPs with the IPoIB QKey */
+ /* TBD: The IPoIB QKEY value is hardcoded. We need to figure out how the
+ * driver should ask the FW for this value
+ */
+ if (qp_attr->qkey == 0x00000b1b) {
+ set_psif_qp_core__ipoib(&qps->state, 1);
+ set_psif_qp_core__ipoib_enable(&qps->state, 1);
+ }
+
+ set_psif_qp_core__qkey(&qps->state, qp_attr->qkey);
+
+ sif_log(sdev, SIF_QP, "Assign QKEY 0x%x for qp %d",
+ qp_attr->qkey, qp->qp_idx);
+ }
+
+ if (qp_attr_mask & IB_QP_AV)
+ set_qp_path_sw(qp, qp_attr, qp_attr_mask, false);
+
+ if (qp_attr_mask & IB_QP_PATH_MTU) {
+ if (!ib_legal_path_mtu(qp_attr->path_mtu)) {
+ sif_log(sdev, SIF_INFO, "Illegal MTU encoding %d", qp_attr->path_mtu);
+ ret = -EINVAL;
+ goto err_modify_qp;
+ }
+ if ((qp->type == PSIF_QP_TRANSPORT_RC) && sif_feature(force_rc_2048_mtu)) {
+ if (qp_attr->path_mtu > IB_MTU_2048)
+ qp_attr->path_mtu = IB_MTU_2048;
+ }
+ sif_log(sdev, SIF_QP, "Modify path_mtu to %d for qp %d",
+ qp_attr->path_mtu, qp->qp_idx);
+ set_psif_qp_core__path_mtu(&qps->state,
+ ib2sif_path_mtu(qp_attr->path_mtu));
+ qp->mtu = qp_attr->path_mtu;
+ }
+
+ if (!(qp_attr_mask & (IB_QP_AV|IB_QP_ALT_PATH))) {
+ /* Set these values also if a path does not get set */
+ if (qp_attr_mask & IB_QP_TIMEOUT)
+ set_psif_qp_path__local_ack_timeout(&qps->path_a, qp_attr->timeout);
+ }
+
+ if (qp_attr_mask & IB_QP_RETRY_CNT) {
+ set_psif_qp_core__error_retry_init(&qps->state, qp_attr->retry_cnt);
+ set_psif_qp_core__error_retry_count(&qps->state, qp_attr->retry_cnt);
+ }
+
+ if (qp_attr_mask & IB_QP_RNR_RETRY) {
+ int rnr_value = qp_attr->retry_cnt;
+
+ set_psif_qp_core__rnr_retry_init(&qps->state, rnr_value);
+ set_psif_qp_core__rnr_retry_count(&qps->state, qp_attr->rnr_retry);
+ }
+
+ if (qp_attr_mask & IB_QP_RQ_PSN)
+ set_psif_qp_core__expected_psn(&qps->state, qp_attr->rq_psn);
+
+ if (qp_attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+ /* This is the sending side */
+ if (unlikely(qp_attr->max_rd_atomic > 16)) {
+ sif_log(sdev, SIF_QP,
+ "IB_QP_MAX_QP_RD_ATOMIC value %u out of range - psif supports no more than 16",
+ qp_attr->max_rd_atomic);
+ qp_attr->max_rd_atomic = 16;
+ }
+ set_psif_qp_core__max_outstanding(&qps->state, qp_attr->max_rd_atomic);
+ }
+
+ if (qp_attr_mask & IB_QP_ALT_PATH) {
+ if (qp_attr->alt_port_num < 1 || qp_attr->alt_port_num > 2) {
+ sif_log(sdev, SIF_INFO, "Illegal alternate port %d specified for qp %d",
+ qp_attr->alt_port_num, qp->qp_idx);
+ ret = -EINVAL;
+ goto err_modify_qp;
+ }
+ set_qp_path_sw(qp, qp_attr, qp_attr_mask, true);
+ }
+
+ if (qp_attr_mask & IB_QP_MIN_RNR_TIMER)
+ set_psif_qp_core__min_rnr_nak_time(&qps->state,
+ bug_3646_conv_table[qp_attr->min_rnr_timer & 0x1F]);
+
+ if (qp_attr_mask & IB_QP_SQ_PSN) {
+ /* last_acked_psn must be 1 less (modulo 24 bit) than xmit_psn
+ * (see issue #1011)
+ */
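+ /* E.g. sq_psn 1 -> last_acked_psn 0, while sq_psn 0 wraps to 0xFFFFFF */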
+ u32 prev = qp_attr->sq_psn == 0 ? 0xFFFFFF : qp_attr->sq_psn - 1;
+
+ set_psif_qp_core__xmit_psn(&qps->state, qp_attr->sq_psn);
+ set_psif_qp_core__last_acked_psn(&qps->state, prev);
+ }
+
+ if (qp_attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+ /* Currently hard coded to 16 in psif */
+ if (unlikely(qp_attr->max_dest_rd_atomic > 16)) {
+ sif_log(sdev, SIF_INFO,
+ "IB_QP_MAX_DEST_RD_ATOMIC value %u out of range - psif supports 16 as a hard coded value",
+ qp_attr->max_dest_rd_atomic);
+ ret = -EINVAL;
+ goto err_modify_qp;
+ } else if (qp_attr->max_dest_rd_atomic < 16) {
+ sif_log(sdev, SIF_QP,
+ "IB_QP_MAX_DEST_RD_ATOMIC value %u ignored - psif supports 16 as a hard coded value",
+ qp_attr->max_dest_rd_atomic);
+ }
+ }
+
+ if (qp_attr_mask & IB_QP_PATH_MIG_STATE)
+ set_psif_qp_core__mstate(&qps->state,
+ ib2sif_mig_state(qp_attr->path_mig_state));
+
+ if (qp_attr_mask & IB_QP_CAP) {
+ sif_log(sdev, SIF_INFO, "resizing QP not implemented");
+ sif_log(sdev, SIF_INFO, "IB_QP_CAP needed!");
+ ret = -EOPNOTSUPP;
+ goto err_modify_qp;
+ }
+
+ if (qp_attr_mask & IB_QP_DEST_QPN) {
+ set_psif_qp_core__remote_qp(&qps->state, qp_attr->dest_qp_num);
+ sif_log(sdev, SIF_QP, "Modified remote qp (sw), local qp_idx: %d, remote_qp %d\n",
+ qp->qp_idx, qp_attr->dest_qp_num);
+ }
+
+ /* Set the valid bit whenever we transition to INIT */
+ if (rq && !rq->is_srq && qp_attr_mask & IB_QP_STATE && qp_attr->qp_state == IB_QPS_INIT)
+ set_psif_rq_hw__valid(&rq->d, 1);
+
+ sif_log(sdev, SIF_QP, "qp %d done QP state %d -> %d",
+ qp->qp_idx, qp->last_set_state,
+ (qp_attr_mask & IB_QP_STATE ? qp_attr->qp_state : qp->last_set_state));
+
+ if (qp_attr_mask & IB_QP_STATE)
+ qp->last_set_state = qp_attr->qp_state;
+
+ return ret;
+err_modify_qp:
+ return ret;
+}
+
+
+static int sif_query_qp_sw(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
+static int sif_query_qp_hw(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
+
+int sif_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+ bool use_hw = false;
+ struct sif_qp *qp = to_sqp(ibqp);
+ struct sif_dev *sdev = to_sdev(ibqp->device);
+
+ sif_logi(ibqp->device, SIF_QP, "last_set_state %d", qp->last_set_state);
+
+ switch (qp->last_set_state) {
+ case IB_QPS_RESET:
+ case IB_QPS_INIT:
+ break;
+ default:
+ /* Bug #3933 - WA for HW bug 3928
+ * ibv_query_qp might report wrong state when in state IBV_QPS_ERR
+ * Query must be done based on current ownership (towards HW only if HW owned)
+ */
+ if (PSIF_REVISION(sdev) <= 3 || qp->flush_sq_done_wa4074)
+ use_hw = (qp->flags & SIF_QPF_HW_OWNED);
+ else
+ use_hw = true;
+ break;
+ }
+
+ return use_hw ?
+ sif_query_qp_hw(ibqp, qp_attr, qp_attr_mask, qp_init_attr) :
+ sif_query_qp_sw(ibqp, qp_attr, qp_attr_mask, qp_init_attr);
+}
+
+enum ib_qp_state get_qp_state(struct sif_qp *qp)
+{
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct ib_qp_init_attr init_attr;
+ struct ib_qp_attr attr;
+
+ memset(&attr, 0, sizeof(attr));
+ memset(&init_attr, 0, sizeof(init_attr));
+
+ if (sif_query_qp(ibqp, &attr, IB_QP_STATE, &init_attr)) {
+ sif_logi(ibqp->device, SIF_INFO,
+ "query_qp failed for qp %d", ibqp->qp_num);
+ return -1;
+ }
+ return attr.qp_state;
+}
+
+static void get_qp_path_sw(struct sif_qp *qp, struct ib_qp_attr *qp_attr, bool alternate)
+{
+ volatile struct psif_qp_path *path;
+ struct ib_ah_attr *ah_attr;
+ enum psif_use_grh use_grh;
+ volatile struct psif_qp_path *alt_path;
+ struct ib_ah_attr *alt_ah_attr;
+
+ alt_path = &qp->d.path_b;
+ alt_ah_attr = &qp_attr->alt_ah_attr;
+ path = &qp->d.path_a;
+ ah_attr = &qp_attr->ah_attr;
+
+ ah_attr->sl = get_psif_qp_path__sl(path);
+ use_grh = get_psif_qp_path__use_grh(path);
+
+ if (use_grh == USE_GRH) {
+ ah_attr->ah_flags |= IB_AH_GRH;
+ ah_attr->grh.dgid.global.subnet_prefix = get_psif_qp_path__remote_gid_0(path);
+ ah_attr->grh.dgid.global.interface_id = get_psif_qp_path__remote_gid_1(path);
+ ah_attr->grh.flow_label = get_psif_qp_path__flowlabel(path);
+ ah_attr->grh.hop_limit = get_psif_qp_path__hoplmt(path);
+ /* TBD: ah_attr->grh.sgid_index? */
+ }
+
+ qp_attr->pkey_index = get_psif_qp_path__pkey_indx(path);
+ qp_attr->timeout = get_psif_qp_path__local_ack_timeout(path);
+
+ ah_attr->port_num = get_psif_qp_path__port(path) + 1;
+ ah_attr->dlid = get_psif_qp_path__remote_lid(path);
+ ah_attr->src_path_bits = get_psif_qp_path__local_lid_path(path);
+
+ alt_ah_attr->port_num = get_psif_qp_path__port(alt_path) + 1;
+ alt_ah_attr->dlid = get_psif_qp_path__remote_lid(alt_path);
+ alt_ah_attr->src_path_bits = get_psif_qp_path__local_lid_path(alt_path);
+}
+
+
+
+static int sif_query_qp_sw(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+ struct sif_dev *sdev = to_sdev(ibqp->device);
+ struct sif_qp *qp = to_sqp(ibqp);
+ volatile struct psif_qp *qps = &qp->d;
+ struct sif_rq *rq = NULL;
+ struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx);
+ int ret = 0;
+
+ if (qp->type != PSIF_QP_TRANSPORT_XRC)
+ rq = get_sif_rq(sdev, qp->rq_idx);
+
+ /* Mellanox almost completely ignores the mask on both
+ * input and output and reports all attributes regardless,
+ * as opposed to what man ibv_query_qp indicates.
+ * Since this behaviour is relied upon by e.g. qperf,
+ * we probably have no other meaningful choice than
+ * to report back everything even with mask 0.
+ */
+ sif_log(sdev, SIF_QP, "qp_attr_mask 0x%x", qp_attr_mask);
+
+ memset(qp_init_attr, 0, sizeof(struct ib_qp_init_attr));
+ memset(qp_attr, 0, sizeof(struct ib_qp_attr));
+
+ qp_attr->qp_state = qp_attr->cur_qp_state = qp->last_set_state;
+ qp_attr->qp_access_flags |=
+ get_psif_qp_core__rdma_rd_enable(&qps->state) ? IB_ACCESS_REMOTE_READ : 0;
+ qp_attr->qp_access_flags |=
+ get_psif_qp_core__rdma_wr_enable(&qps->state) ? IB_ACCESS_REMOTE_WRITE : 0;
+ qp_attr->qp_access_flags |=
+ get_psif_qp_core__atomic_enable(&qps->state) ? IB_ACCESS_REMOTE_ATOMIC : 0;
+
+ qp_attr->pkey_index = get_psif_qp_path__pkey_indx(&qps->path_a);
+ qp_attr->port_num = qp->port;
+ qp_attr->qkey = get_psif_qp_core__qkey(&qps->state);
+ get_qp_path_sw(qp, qp_attr, qp_attr_mask & IB_QP_ALT_PATH);
+
+ qp_attr->path_mtu = sif2ib_path_mtu(get_psif_qp_core__path_mtu(&qps->state));
+ qp_attr->timeout = get_psif_qp_path__local_ack_timeout(&qps->path_a);
+ qp_attr->retry_cnt = get_psif_qp_core__error_retry_count(&qps->state);
+ qp_attr->rnr_retry = get_psif_qp_core__rnr_retry_count(&qps->state);
+ qp_attr->rq_psn = get_psif_qp_core__expected_psn(&qps->state);
+ qp_attr->min_rnr_timer = get_psif_qp_core__min_rnr_nak_time(&qps->state);
+ qp_attr->sq_psn = get_psif_qp_core__xmit_psn(&qps->state);
+ qp_attr->path_mig_state = sif2ib_mig_state(get_psif_qp_core__mstate(&qps->state));
+ qp_attr->dest_qp_num = get_psif_qp_core__remote_qp(&qps->state);
+
+ /* TBD: Revisit this: This value is currently hard coded to 16 in psif */
+ qp_attr->max_dest_rd_atomic = 16;
+
+ qp_init_attr->port_num = qp->port;
+ if (rq) {
+ if (rq->is_srq)
+ qp_init_attr->srq = &rq->ibsrq;
+ qp_init_attr->cap.max_recv_wr = rq->entries_user;
+ qp_init_attr->cap.max_recv_sge = rq->sg_entries;
+ }
+ qp_init_attr->cap.max_send_wr = sq->entries;
+ qp_init_attr->cap.max_send_sge = sq->sg_entries;
+ qp_init_attr->cap.max_inline_data = qp->max_inline_data;
+
+ /* TBD: What to do with this:
+ * IB_QP_MAX_QP_RD_ATOMIC = (1<<13),
+ */
+ return ret;
+}
+
+static void get_qp_path_hw(struct psif_query_qp *qqp, struct ib_qp_attr *qp_attr, bool alternate)
+{
+ struct psif_qp_path *path;
+ struct ib_ah_attr *ah_attr;
+ enum psif_use_grh use_grh;
+ struct psif_qp_path *alt_path;
+ struct ib_ah_attr *alt_ah_attr;
+
+ alt_path = &qqp->alternate_path;
+ alt_ah_attr = &qp_attr->alt_ah_attr;
+ path = &qqp->primary_path;
+ ah_attr = &qp_attr->ah_attr;
+
+ ah_attr->sl = path->sl;
+ use_grh = path->use_grh;
+
+ if (use_grh == USE_GRH) {
+ ah_attr->ah_flags |= IB_AH_GRH;
+ ah_attr->grh.dgid.global.subnet_prefix = path->remote_gid_0;
+ ah_attr->grh.dgid.global.interface_id = path->remote_gid_1;
+ ah_attr->grh.flow_label = path->flowlabel;
+ ah_attr->grh.hop_limit = path->hoplmt;
+ /* TBD: ah_attr->grh.sgid_index? */
+ }
+ qp_attr->pkey_index = path->pkey_indx;
+ qp_attr->timeout = path->local_ack_timeout;
+ qp_attr->port_num = path->port + 1;
+
+ qp_attr->alt_pkey_index = alt_path->pkey_indx;
+ qp_attr->alt_timeout = alt_path->local_ack_timeout;
+ qp_attr->alt_port_num = alt_path->port + 1;
+
+
+
+ ah_attr->port_num = path->port + 1;
+ ah_attr->dlid = path->remote_lid;
+ ah_attr->src_path_bits = path->local_lid_path;
+
+ alt_ah_attr->port_num = alt_path->port + 1;
+ alt_ah_attr->dlid = alt_path->remote_lid;
+ alt_ah_attr->src_path_bits = alt_path->local_lid_path;
+}
+
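+/* Compute the DMA address of this QP's query_qp response area (the qqp field
+ * embedded in the QP descriptor table entry): a plain DMA address for
+ * bypass-mapped tables or when epsc_gva_permitted() is false, otherwise an
+ * address within the table's MMU context.
+ */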
+u64 sif_qqp_dma_addr(struct sif_dev *sdev, struct sif_qp *qps)
+{
+ struct sif_table *tp = &sdev->ba[qp];
+ u64 offset = qps->qp_idx * tp->ext_sz + offsetof(struct sif_qp, qqp);
+
+ if (tp->mmu_ctx.mt == SIFMT_BYPASS)
+ return sif_mem_dma(tp->mem, offset);
+ else if (!epsc_gva_permitted(sdev))
+ return sif_mem_dma(tp->mem, offset);
+ else
+ return tp->mmu_ctx.base + offset;
+}
+
+/* Internal query qp implementation - updates the local query qp state for this QP */
+int epsc_query_qp(struct sif_qp *sqp, struct psif_query_qp *lqqp)
+{
+ int ret;
+ struct psif_epsc_csr_rsp cqe;
+ struct psif_epsc_csr_req req;
+ struct psif_csr_modify_qp_ctrl *cmd = &req.u.query_qp.ctrl;
+ struct sif_dev *sdev = to_sdev(sqp->ibqp.device);
+
+ /* This function can potentially use the same qqp data structure reentrantly,
+ * but we don't care since we know that EPSC operations get sequenced
+ */
+
+ memset(&req, 0, sizeof(req));
+ req.opcode = EPSC_QUERY_QP;
+ cmd->cmd = QP_CMD_QUERY;
+ if (sqp->qp_idx <= 3) {
+ cmd->qp_num = sqp->qp_idx & 1;
+ cmd->port_num = sqp->qp_idx >> 1;
+ } else
+ cmd->qp_num = sqp->qp_idx;
+ req.u.query_qp.address = sif_qqp_dma_addr(sdev, sqp);
+
+ if (!epsc_gva_permitted(sdev))
+ req.u.query_qp.mmu_cntx = sif_mmu_ctx_passthrough(true);
+ else
+ req.u.query_qp.mmu_cntx = sdev->ba[qp].mmu_ctx.mctx;
+ ret = sif_epsc_wr_poll(sdev, &req, &cqe);
+
+ /* Copy data irrespective of how the EPSC operation went */
+ if (eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 31))
+ copy_conv_to_sw(lqqp, &sqp->qqp, sizeof(*lqqp));
+ else
+ memcpy(lqqp, &sqp->qqp, sizeof(*lqqp));
+
+ return ret;
+}
+
+
+static int sif_query_qp_hw(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+ int ret = 0;
+ struct sif_qp *qp = to_sqp(ibqp);
+ struct sif_dev *sdev = to_sdev(ibqp->device);
+ struct sif_rq *rq = NULL;
+ struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx);
+ struct psif_query_qp lqqp;
+
+ /* Take QP lock to avoid any race condition on updates to last_set_state: */
+ mutex_lock(&qp->lock);
+
+ ret = epsc_query_qp(qp, &lqqp);
+ if (!ret)
+ qp->last_set_state = sif2ib_qp_state(lqqp.qp.state);
+ mutex_unlock(&qp->lock);
+
+ if (ret)
+ return ret;
+
+ if (qp->type != PSIF_QP_TRANSPORT_XRC)
+ rq = get_sif_rq(sdev, qp->rq_idx);
+
+ /* Mellanox almost completely ignores the mask on both
+ * input and output and reports all attributes regardless,
+ * as opposed to what man ibv_query_qp indicates.
+ * Since this behaviour is relied upon by e.g. qperf,
+ * we probably have no other meaningful choice than
+ * to report back everything even with mask 0.
+ */
+ sif_log(sdev, SIF_QP|SIF_DUMP, "qp %d, qp_attr_mask 0x%x", qp->qp_idx, qp_attr_mask);
+ sif_logs(SIF_DUMP, write_struct_psif_query_qp(NULL, 0, &lqqp));
+
+
+ memset(qp_init_attr, 0, sizeof(struct ib_qp_init_attr));
+ memset(qp_attr, 0, sizeof(struct ib_qp_attr));
+
+ qp_attr->qp_state = qp_attr->cur_qp_state = qp->last_set_state;
+ qp_attr->qp_access_flags |= lqqp.qp.rdma_rd_enable ? IB_ACCESS_REMOTE_READ : 0;
+ qp_attr->qp_access_flags |= lqqp.qp.rdma_wr_enable ? IB_ACCESS_REMOTE_WRITE : 0;
+ qp_attr->qp_access_flags |= lqqp.qp.atomic_enable ? IB_ACCESS_REMOTE_ATOMIC : 0;
+
+ qp_attr->pkey_index = lqqp.primary_path.pkey_indx;
+ qp_attr->port_num = lqqp.primary_path.port + 1;
+ qp_attr->qkey = lqqp.qp.qkey;
+ get_qp_path_hw(&lqqp, qp_attr, qp_attr_mask & IB_QP_ALT_PATH);
+
+ qp_attr->path_mtu = sif2ib_path_mtu(lqqp.qp.path_mtu);
+ qp_attr->timeout = lqqp.primary_path.local_ack_timeout;
+ qp_attr->retry_cnt = lqqp.qp.error_retry_count;
+ qp_attr->rnr_retry = lqqp.qp.rnr_retry_count;
+ qp_attr->rq_psn = lqqp.qp.expected_psn;
+ qp_attr->min_rnr_timer = lqqp.qp.min_rnr_nak_time;
+ qp_attr->sq_psn = lqqp.qp.xmit_psn;
+ qp_attr->path_mig_state = sif2ib_mig_state(lqqp.qp.mstate);
+ qp_attr->dest_qp_num = lqqp.qp.remote_qp;
+
+ /* TBD: Revisit this: This value is currently hard coded to 16 in psif */
+ qp_attr->max_dest_rd_atomic = 16;
+
+ qp_init_attr->port_num = qp->port; /* TBD: Use primary path info here as well? */
+
+ if (rq) {
+ if (rq->is_srq)
+ qp_init_attr->srq = &rq->ibsrq;
+ qp_init_attr->cap.max_recv_wr = rq->entries_user;
+ qp_init_attr->cap.max_recv_sge = rq->sg_entries;
+ }
+ qp_init_attr->cap.max_send_wr = sq->entries;
+ qp_init_attr->cap.max_send_sge = sq->sg_entries;
+ qp_init_attr->cap.max_inline_data = qp->max_inline_data;
+
+ /* TBD: What to do with these..
+ * IB_QP_MAX_QP_RD_ATOMIC = (1<<13),
+ */
+ return ret;
+}
+
+
+int sif_destroy_qp(struct ib_qp *ibqp)
+{
+ struct sif_qp *qp = to_sqp(ibqp);
+ struct sif_dev *sdev = to_sdev(ibqp->device);
+ struct sif_eps *es = &sdev->es[sdev->mbox_epsc];
+ bool need_pma_pxy_qp = eps_version_ge(es, 0, 57)
+ && (qp->qp_idx == 1 || qp->qp_idx == 3);
+
+ sif_log(sdev, SIF_QP, "qp_num %d", ibqp->qp_num);
+
+ /* Destroy PMA_PXY QPs associated with QP1/3 */
+ if (need_pma_pxy_qp) {
+ struct sif_qp *pma_qp = NULL;
+ int pma_qp_idx;
+ int ret;
+
+ pma_qp_idx = sdev->pma_qp_idxs[!!(qp->qp_idx & 2)];
+ pma_qp = get_sif_qp(sdev, pma_qp_idx);
+
+ /* clearing epsc PMA_PXY QP redirection */
+ ret = notify_epsc_pma_qp(sdev, -1, qp->port);
+ if (ret)
+ sif_log(sdev, SIF_QP,
+ "Failed to clear epsc PMA_PXY rerirect for qp_num %d", pma_qp_idx);
+ destroy_qp(sdev, pma_qp);
+ }
+
+ return destroy_qp(sdev, qp);
+}
+
+
+int destroy_qp(struct sif_dev *sdev, struct sif_qp *qp)
+{
+ int ret;
+ int index = qp->qp_idx;
+ struct sif_pd *pd = qp->ibqp.pd ? to_spd(qp->ibqp.pd) : to_sxrcd(qp->ibqp.xrcd)->pd;
+ struct ib_qp_attr mod_attr = {
+ .qp_state = IB_QPS_RESET
+ };
+ struct sif_rq *rq = NULL;
+ bool reuse_ok = true;
+
+ /* See bug #3496 */
+ if (sif_feature(no_multipacket_qp_reuse)) {
+ switch (qp->type) {
+ case PSIF_QP_TRANSPORT_UD:
+ case PSIF_QP_TRANSPORT_MANSP1:
+ reuse_ok = true;
+ break;
+ default:
+ reuse_ok = false;
+ break;
+ }
+ }
+
+ sif_log(sdev, SIF_QP, "## Enter qp_idx %d", index);
+
+ if (is_regular_qp(qp))
+ rq = get_sif_rq(sdev, qp->rq_idx);
+
+ /* Make sure event handling is performed before resetting the qp. */
+ if (atomic_dec_and_test(&qp->refcnt))
+ complete(&qp->can_destroy);
+ wait_for_completion(&qp->can_destroy);
+
+ /* A modify to RESET causes an implicit reset_qp() once the new state is RESET */
+ ret = modify_qp(sdev, qp, &mod_attr, IB_QP_STATE, false, NULL);
+ if (ret)
+ sif_log(sdev, SIF_INFO, "modify qp %d to RESET failed, sts %d", index, ret);
+
+ if (!(qp->flags & SIF_QPF_USER_MODE)) {
+ int nfixup;
+ struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx);
+ u32 cq_idx = get_psif_qp_core__rcv_cq_indx(&qp->d.state);
+ struct sif_cq *send_cq = (sq && sq->cq_idx >= 0) ? get_sif_cq(sdev, sq->cq_idx) : NULL;
+ struct sif_cq *recv_cq = rq ? get_sif_cq(sdev, cq_idx) : NULL;
+
+ if (send_cq) {
+ nfixup = sif_fixup_cqes(send_cq, sq, qp);
+ if (nfixup < 0) {
+ sif_log(sdev, SIF_INFO,
+ "sif_fixup_cqes: on qp %d send cq %d failed with error %d",
+ qp->qp_idx, sq->cq_idx, nfixup);
+ goto fixup_failed;
+ }
+ sif_log(sdev, SIF_QP, "sif_fixup_cqes: fixed %d CQEs in sq.cq %d",
+ nfixup, sq->cq_idx);
+ }
+ if (recv_cq && recv_cq != send_cq) {
+ nfixup = sif_fixup_cqes(recv_cq, sq, qp);
+ if (nfixup < 0) {
+ sif_log(sdev, SIF_INFO,
+ "sif_fixup_cqes: on qp %d recv cq %d failed with error %d",
+ qp->qp_idx, cq_idx, nfixup);
+ goto fixup_failed;
+ }
+ sif_log(sdev, SIF_QP, "sif_fixup_cqes: fixed %d CQEs in rq.cq %d",
+ nfixup, cq_idx);
+
+ }
+ }
+
+fixup_failed:
+ if (qp->qp_idx < 4) {
+ /* Special QP cleanup */
+ int ok = atomic_add_unless(&sdev->sqp_usecnt[qp->qp_idx], -1, 0);
+
+ if (!ok) {
+ sif_log(sdev, SIF_INFO,
+ "Attempt to destroy an uncreated QP %d", qp->qp_idx);
+ return -EINVAL;
+ }
+ }
+
+ sif_dfs_remove_qp(qp);
+
+ sif_free_sq(sdev, qp);
+
+ if (rq) {
+ ret = free_rq(sdev, qp->rq_idx);
+ if (ret && (ret != -EBUSY || !rq->is_srq))
+ return ret;
+ }
+
+ if (index > 3 && reuse_ok)
+ sif_free_qp_idx(pd, index);
+
+ sif_log(sdev, SIF_QP, "## Exit success (qp_idx %d)", index);
+ return 0;
+}
+
+/* Set this QP back to the initial state
+ * (called by modify_qp after a successful modify to RESET)
+ */
+static int reset_qp(struct sif_dev *sdev, struct sif_qp *qp)
+{
+ volatile struct psif_qp *qps = &qp->d;
+ struct sif_rq *rq = NULL;
+ struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx);
+ bool need_wa_3713 = false;
+
+ /* Bring down order needed by rev2 according to bug #3480 */
+ int ret = poll_wait_for_qp_writeback(sdev, qp);
+
+ if (ret)
+ goto failed;
+
+ if (is_regular_qp(qp))
+ rq = get_sif_rq(sdev, qp->rq_idx);
+
+ /* WA 3713 special handling */
+ need_wa_3713 = (PSIF_REVISION(sdev) <= 3)
+ && IS_PSIF(sdev) /* Next check if there is a retry outstanding */
+ && !qp->flush_sq_done_wa4074
+ && (get_psif_qp_core__retry_tag_committed(&qp->d.state) !=
+ get_psif_qp_core__retry_tag_err(&qp->d.state))
+ && (qp->qp_idx != sdev->flush_qp);
+
+ if (need_wa_3713) {
+ ret = reset_qp_flush_retry(sdev);
+ if (ret < 0)
+ sif_log(sdev, SIF_INFO, "Flush_retry special handling failed with ret %d", ret);
+
+ }
+
+
+ /* if the send queue scheduler is running, wait for
+ * it to terminate:
+ */
+ ret = sif_flush_sqs(sdev, sq);
+ if (ret)
+ goto failed;
+
+ sif_logs(SIF_DUMP,
+ write_struct_psif_qp(NULL, 1, (struct psif_qp *)&qp->d));
+
+failed:
+ if (ret) {
+ /* TBD: Debug case - should never fail? */
+ if (qp->type != PSIF_QP_TRANSPORT_MANSP1)
+ return ret;
+ }
+
+ /* Reset the SQ pointers */
+ if (!qp->ibqp.xrcd) {
+ struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx);
+
+ memset(sq_sw, 0, sizeof(*sq_sw));
+ set_psif_sq_sw__tail_indx(&sq_sw->d, 0);
+ set_psif_sq_hw__last_seq(&sq->d, 0);
+ set_psif_sq_hw__destroyed(&sq->d, 0);
+ }
+
+ /* Invalidate the RQ and set it in a consistent state for reuse */
+ if (rq && !rq->is_srq) {
+ struct sif_rq_sw *rq_sw = get_sif_rq_sw(sdev, rq->index);
+
+ if (!(test_bit(RQ_IS_INVALIDATED, &rq_sw->flags))) {
+ ret = sif_invalidate_rq_hw(sdev, rq->index, PCM_POST);
+ if (ret) {
+ sif_log(sdev, SIF_INFO,
+ "Invalidate rq_hw failed, status %d", ret);
+ return ret;
+ }
+ set_bit(RQ_IS_INVALIDATED, &rq_sw->flags);
+ }
+
+ /* Make sure the RQ is software owned: */
+ ret = poll_wait_for_rq_writeback(sdev, rq);
+ if (ret)
+ return ret;
+
+ /* Reset pointers */
+ memset(rq_sw, 0, sizeof(*rq_sw));
+ set_psif_rq_hw__head_indx(&rq->d, 0);
+ }
+
+ mb();
+
+ if (multipacket_qp(qp->type) && IS_PSIF(sdev) && PSIF_REVISION(sdev) > 2) {
+ int i;
+ int loop_count = 1;
+
+ /* bz #3794: WA for HW bug 3198, VAL issuing read to uninitialized DMA VT entry */
+ if (qp->type == PSIF_QP_TRANSPORT_UC && PSIF_REVISION(sdev) <= 3)
+ loop_count = 64;
+
+ /* Invalidate the SGL cache (mapped to the qp type)
+ * TBD: We can consider a posted inv.req and check lazily upon reuse
+ */
+
+ for (i = 0; i < loop_count; ++i) {
+ ret = sif_invalidate_qp(sdev, qp->qp_idx, PCM_WAIT);
+ if (ret) {
+ sif_log(sdev, SIF_INFO,
+ "Invalidate SGL cache failed");
+ return ret;
+ }
+ cpu_relax();
+ }
+ }
+
+ /* Reset counters to same values used at QP create
+ * Last acked psn must be initialized to one less than xmit_psn
+ * and it is a 24 bit value. See issue #1011
+ */
+ set_psif_qp_core__xmit_psn(&qps->state, 0);
+ set_psif_qp_core__last_acked_psn(&qps->state, 0xffffff);
+ qp->flush_sq_done_wa4074 = false;
+
+ return ret;
+}
+
+
+
+void sif_dfs_print_qp(struct seq_file *s, struct sif_dev *sdev,
+ loff_t pos)
+{
+ struct sif_qp *qp;
+ volatile struct psif_qp *qps;
+ struct psif_qp lqps;
+
+ if (unlikely(pos < 0)) {
+ seq_puts(s, "Index\tState\tRecvCQ\tSendCQ\tRQ\tRemQP\tType\n");
+ return;
+ }
+
+ qp = get_sif_qp(sdev, pos);
+ qps = &qp->d;
+ copy_conv_to_sw(&lqps, qps, sizeof(struct psif_qp));
+
+ if (pos <= 3 && atomic_read(&sdev->sqp_usecnt[pos]) != 1)
+ return;
+
+ seq_printf(s, "%llu\t%d\t", pos, qp->last_set_state);
+
+ if (qp->rq_idx == -1)
+ seq_puts(s, "[none]");
+ else
+ seq_printf(s, "%u", lqps.state.rcv_cq_indx);
+
+ seq_printf(s, "\t%u\t", lqps.state.send_cq_indx);
+
+ if (qp->rq_idx == -1)
+ seq_puts(s, "[none]");
+ else
+ seq_printf(s, "%u", lqps.state.rq_indx);
+
+ seq_printf(s, "\t%u", lqps.state.remote_qp);
+ seq_printf(s, "\t%s", string_enum_psif_qp_trans(lqps.state.transport_type)+18);
+ if (lqps.state.proxy_qp_enable)
+ seq_puts(s, "\t[proxy]\n");
+ else if (is_epsa_tunneling_qp(qp->ibqp.qp_type))
+ seq_puts(s, "\t[EPSA tunneling]\n");
+ else if (qp->ulp_type == RDS_ULP)
+ seq_puts(s, "\t[RDS]\n");
+ else if (qp->ulp_type == IPOIB_CM_ULP)
+ seq_puts(s, "\t[IPOIB_CM]\n");
+ else if (qp->flags & SIF_QPF_EOIB)
+ seq_puts(s, "\t[EoIB]\n");
+ else if (qp->flags & SIF_QPF_IPOIB)
+ seq_puts(s, "\t[IPoIB]\n");
+ else if (qp->flags & SIF_QPF_NO_EVICT)
+ seq_puts(s, "\t[no_evict]\n");
+ else if (qp->flags & SIF_QPF_FLUSH_RETRY)
+ seq_puts(s, "\t[flush_retry]\n");
+ else if (qp->flags & SIF_QPF_KI_STENCIL)
+ seq_puts(s, "\t[ki_stencil]\n");
+ else if (qp->flags & SIF_QPF_PMA_PXY)
+ if (qp->port == 1)
+ seq_puts(s, "\t[PMA_PXY_QP_P1]\n");
+ else
+ seq_puts(s, "\t[PMA_PXY_QP_P2]\n");
+ else if (qp->flags & SIF_QPF_SMI)
+ if (qp->port == 1)
+ seq_puts(s, "\t[SMI_QP_P1]\n");
+ else
+ seq_puts(s, "\t[SMI_QP_P2]\n");
+ else if (qp->flags & SIF_QPF_GSI)
+ if (qp->port == 1)
+ seq_puts(s, "\t[GSI_QP_P1]\n");
+ else
+ seq_puts(s, "\t[GSI_QP_P2]\n");
+ else
+ seq_puts(s, "\n");
+}
+
+void sif_dfs_print_ipoffload(struct seq_file *s, struct sif_dev *sdev, loff_t pos)
+{
+ struct sif_qp *qp;
+
+ if (unlikely(pos < 0)) {
+ seq_printf(s, "#%7s %10s %21s %21s %21s\n",
+ "", "TX csum", "---- RX l3_csum ----", "---- RX l4_csum ----",
+ "-------- LSO --------");
+ seq_printf(s, "#%7s %10s %10s %10s %10s %10s %10s %10s\n",
+ "Index", "", "ok", "err", "ok", "err", "pkt", "bytes");
+ return;
+ }
+
+ qp = get_sif_qp(sdev, pos);
+
+ if (qp->flags & SIF_QPF_IPOIB || qp->flags & SIF_QPF_EOIB) {
+ if (pos <= 3 && atomic_read(&sdev->sqp_usecnt[pos]) != 1)
+ return;
+
+ seq_printf(s, "%8llu ", pos);
+ seq_printf(s, "%10llu ",
+ qp->ipoib_tx_csum_l3);
+ seq_printf(s, "%10llu %10llu ",
+ qp->ipoib_rx_csum_l3_ok, qp->ipoib_rx_csum_l3_err);
+ seq_printf(s, "%10llu %10llu ",
+ qp->ipoib_rx_csum_l4_ok, qp->ipoib_rx_csum_l4_err);
+ seq_printf(s, "%10llu %10llu\n",
+ qp->ipoib_tx_lso_pkt, qp->ipoib_tx_lso_bytes);
+ }
+}
+
+bool has_srq(struct sif_dev *sdev, struct sif_qp *qp)
+{
+ struct sif_rq *rq = has_rq(qp) ? get_sif_rq(sdev, qp->rq_idx) : NULL;
+
+ return rq && rq->is_srq;
+}
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_qp.h: Interface to internal IB queue pair logic for sif
+ */
+
+#ifndef __SIF_QP_H
+#define __SIF_QP_H
+#include "psif_hw_data.h"
+#include "sif_rq.h"
+#include "sif_sq.h"
+#include "sif_ibqp.h"
+
+struct sif_dev;
+struct seq_file;
+struct sif_sq;
+struct sif_rq;
+
+#define CB_LENGTH 256
+#define CB_KICK_ALIGN 64
+#define CB_KICK_MASK (CB_KICK_ALIGN - 1)
+
+enum sif_qp_flags {
+ SIF_QPF_EOIB = 0x1,
+ SIF_QPF_IPOIB = 0x2,
+ SIF_QPF_FORCE_SQ_MODE = 0x1000, /* Set by vendor specific flag to enforce use of SQ mode */
+ SIF_QPF_NO_EVICT = 0x2000, /* Special fake qp with do_not_evict set (see #3552) */
+ SIF_QPF_KI_STENCIL = 0x4000, /* Special stencil qp set up for efficient key invalidates */
+ SIF_QPF_DYNAMIC_MTU = 0x8000, /* Set by vendor specific flag to enforce use of dynamic MTU */
+ SIF_QPF_FLUSH_RETRY = 0x10000, /* Special fake rc qp to flush retry (see #3714) */
+ SIF_QPF_USER_MODE = 0x20000, /* User (udata != NULL) and not kernel verbs */
+ SIF_QPF_PMA_PXY = 0x100000, /* Performance management interface QP type */
+ SIF_QPF_SMI = 0x200000, /* Subnet management interface QP type */
+ SIF_QPF_GSI = 0x400000, /* General services interface QP type */
+ SIF_QPF_HW_OWNED = 0x1000000,/* Indicates HW ownership */
+};
+
+struct dentry;
+
+/*
+ * TBD - not suitable for kernel.org:
+ * For now, the stack unwind is done in sif_create_qp() within the sif driver.
+ * UEK 4.1.12 is used as the baseline, since that kernel has the
+ * ib_create_qp -> ib_create_qp_ex call chain.
+ * Thus, the level is set to 4, matching what the Oracle kernel implements,
+ * to retrieve the ULP.
+*/
+#define STACK_UNWIND_LEVEL 4
+/*
+ * sif_create_qp = __builtin_return_address(0)
+ * ib_create_qp = __builtin_return_address(1)
+ * ib_create_qp_ex = __builtin_return_address(2)
+ * if (rdma_cm)
+ * rdma_create_qp = __builtin_return_address(3)
+ * ULP = __builtin_return_address(4)
+*/
+
+/* The enum to determine what is the ULP caller
+ */
+enum kernel_ulp_type {
+ OTHER_ULP = 0,
+ RDS_ULP = 1,
+ IPOIB_CM_ULP = 2,
+ IPOIB_ULP = 3,
+};
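+
+/* Illustrative sketch only (an assumption, not part of this patch): one way
+ * the return address at STACK_UNWIND_LEVEL could be mapped to a
+ * kernel_ulp_type. The driver's real helper is sif_find_kernel_ulp_caller()
+ * (used by sif_query_device()); the body below merely sketches the idea
+ * using the generic kallsyms helpers:
+ *
+ *	char sym[KSYM_SYMBOL_LEN];
+ *	unsigned long ra = (unsigned long)__builtin_return_address(STACK_UNWIND_LEVEL);
+ *
+ *	if (!kernel_text_address(ra))
+ *		return OTHER_ULP;
+ *	sprint_symbol(sym, ra);
+ *	if (strstr(sym, "rds_"))
+ *		return RDS_ULP;
+ *	if (strstr(sym, "ipoib_cm_"))
+ *		return IPOIB_CM_ULP;
+ *	if (strstr(sym, "ipoib_"))
+ *		return IPOIB_ULP;
+ *	return OTHER_ULP;
+ */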
+
+struct sif_qp_init_attr {
+ struct sif_pd *pd;
+ enum psif_qp_trans qp_type;
+ enum sif_proxy_type proxy;
+ enum psif_tsu_qos qosl;
+ enum kernel_ulp_type ulp_type; /* the ulp caller hint */
+ bool user_mode;
+ int sq_hdl_sz;
+};
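+
+/* For an in-driver example of filling in a sif_qp_init_attr, see
+ * sif_hw_allocate_flush_qp() in sif_r3.c, which creates the internal
+ * flush_retry RC QP with qosl = QOSL_LOW_LATENCY.
+ */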
+
+struct sif_qp {
+ volatile struct psif_qp d; /* Hardware QPSC entry */
+ struct ib_qp ibqp ____cacheline_internodealigned_in_smp;
+
+ /* Data area for query_qp results: */
+ struct psif_query_qp qqp ____cacheline_internodealigned_in_smp;
+
+ /* Pack the members used in critical path in as few cache lines as possible */
+ union {
+ u16 submask[2];
+ u32 mask;
+ } traffic_patterns; /* heuristic mask to determine the traffic pattern */
+ enum kernel_ulp_type ulp_type; /* the ulp caller hint */
+ atomic_t refcnt; /* qp refcnt to sync between destroy qp and event handling. */
+ struct completion can_destroy; /* use to synchronize destroy qp with event handling */
+ struct mutex lock ____cacheline_internodealigned_in_smp;
+ int qp_idx; /* qp and sq index */
+ int rq_idx;
+ u32 max_inline_data; /* Requested max inline for this QP */
+
+ /* Next 6 members are copy from the qp state */
+ u32 remote_qp;
+ u32 magic;
+ bool nocsum;
+ enum psif_tsu_qos qosl;
+ u8 tsl;
+ u16 remote_lid;
+
+ u16 eps_tag; /* Value to use for the eps_tag field (proxy_qp) */
+ short port; /* IB port number (= sif port# + 1) */
+ u32 flags;
+ enum ib_qp_state last_set_state;
+ enum psif_qp_trans type; /* PSIF transport type set up for this QP */
+
+ /* The following members are not used in critical path */
+ u16 pkey_index; /* Default PKEY index as set by IB_QP_PKEY */
+ enum ib_mtu mtu; /* Currently set mtu */
+ enum ib_qp_state tracked_state; /* TBD: This is a hack: make SQD fail the same way mlx does for SQD */
+ struct dentry *dfs_qp; /* Raw qp dump debugfs handle - used by sif_debug.c */
+ bool sq_cmpl_map_valid;
+
+ int srq_idx; /* WA #3952: Track SRQ for modify_srq(used only for pQP) */
+ atomic64_t arm_srq_holdoff_time;/* Wait-time,if the pQP is held for a prev modify_srq */
+
+ bool flush_sq_done_wa4074; /* WA #4074: Track if QP state changes are already applied */
+
+ u64 ipoib_tx_csum_l3;
+ u64 ipoib_tx_csum_l4;
+ u64 ipoib_rx_csum_l3_ok;
+ u64 ipoib_rx_csum_l3_err;
+ u64 ipoib_rx_csum_l4_ok;
+ u64 ipoib_rx_csum_l4_err;
+ u64 ipoib_tx_lso_pkt;
+ u64 ipoib_tx_lso_bytes;
+};
+
+
+/* Definition of PSIF EPSA tunneling QP using IB_QPT_RESERVED1 */
+#define IB_QPT_EPSA_TUNNELING IB_QPT_RESERVED1
+
+/* Command used to invalidate a collect buffer by writing to offset 0xff8 */
+#define PSIF_WR_CANCEL_CMD_BE 0xff00000000000000ULL
+
+/* HEURISTIC BITS used for TX/RX direction. */
+#define HEUR_RX_DIRECTION (~1ULL)
+#define HEUR_TX_DIRECTION (1ULL)
+
+static inline bool supports_offload(struct sif_qp *qp)
+{
+ return qp->flags & (SIF_QPF_EOIB | SIF_QPF_IPOIB);
+}
+
+static inline int psif_supported_trans(enum psif_qp_trans type)
+{
+ return type != PSIF_QP_TRANSPORT_RSVD1;
+}
+
+static inline bool is_regular_qp(struct sif_qp *qp)
+{
+ return (qp->type != PSIF_QP_TRANSPORT_MANSP1 &&
+ qp->type != PSIF_QP_TRANSPORT_XRC);
+}
+
+static inline bool is_epsa_tunneling_qp(enum ib_qp_type type)
+{
+ return type == IB_QPT_EPSA_TUNNELING;
+}
+
+static inline struct sif_qp *to_sqp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct sif_qp, ibqp);
+}
+
+struct sif_qp *create_qp(struct sif_dev *sdev,
+ struct ib_qp_init_attr *init_attr,
+ struct sif_qp_init_attr *sif_attr);
+
+int destroy_qp(struct sif_dev *sdev, struct sif_qp *qp);
+
+
+int modify_qp(struct sif_dev *sdev, struct sif_qp *qp,
+ struct ib_qp_attr *qp_attr, int qp_attr_mask,
+ bool fail_on_same_state, struct ib_udata *udata);
+
+enum ib_qp_state get_qp_state(struct sif_qp *qp);
+
+/* Line printers for debugfs files */
+void sif_dfs_print_qp(struct seq_file *s, struct sif_dev *sdev, loff_t pos);
+void sif_dfs_print_ipoffload(struct seq_file *s, struct sif_dev *sdev, loff_t pos);
+
+/* SIF specific type of handling of a modify QP operation:
+ *
+ */
+enum sif_mqp_type {
+ SIF_MQP_ERR, /* Illegal transition */
+ SIF_MQP_SW, /* Software handled transition */
+ SIF_MQP_HW, /* Hardware handled transition */
+ SIF_MQP_IGN, /* Silently ignored transition req */
+ SIF_MQP_MAX
+};
+
+u64 sif_qqp_dma_addr(struct sif_dev *sdev, struct sif_qp *qps);
+
+/* Internal query qp implementation - stores a host order query qp state in lqqp */
+int epsc_query_qp(struct sif_qp *qp, struct psif_query_qp *lqqp);
+
+/* EPSC configuration to forward PMA responses to the remapped qp_idx */
+int notify_epsc_pma_qp(struct sif_dev *sdev, int qp_idx, short port);
+
+enum sif_mqp_type sif_modify_qp_is_ok(
+ struct sif_qp *qp,
+ enum ib_qp_state cur_state,
+ enum ib_qp_state next_state,
+ enum ib_qp_attr_mask mask
+);
+
+static inline enum psif_mbox_type proxy_to_mbox(enum sif_proxy_type proxy)
+{
+ switch (proxy) {
+ case SIFPX_EPSA_1:
+ return MBOX_EPSA0;
+ case SIFPX_EPSA_2:
+ return MBOX_EPSA1;
+ case SIFPX_EPSA_3:
+ return MBOX_EPSA2;
+ case SIFPX_EPSA_4:
+ return MBOX_EPSA3;
+ default:
+ break;
+ }
+ return (enum psif_mbox_type) -1;
+}
+
+int modify_qp_hw_wa_qp_retry(struct sif_dev *sdev, struct sif_qp *qp,
+ struct ib_qp_attr *qp_attr, int qp_attr_mask);
+
+static inline bool has_rq(struct sif_qp *qp)
+{
+ return qp->rq_idx >= 0;
+}
+
+bool has_srq(struct sif_dev *sdev, struct sif_qp *qp);
+
+static inline bool ib_legal_path_mtu(enum ib_mtu mtu)
+{
+ return (mtu >= IB_MTU_256) && (mtu <= IB_MTU_4096);
+}
+
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_query.c: SIF implementation of some of IB query APIs
+ */
+#include <linux/version.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_mad.h>
+#include "sif_dev.h"
+#include "sif_query.h"
+#include "sif_defs.h"
+#include "sif_qp.h"
+
+int epsc_query_device(struct sif_dev *sdev, struct psif_epsc_device_attr *ldev)
+{
+ int ret;
+ struct psif_epsc_csr_rsp cqe;
+ struct psif_epsc_csr_req req;
+ struct sif_eps *es = &sdev->es[sdev->mbox_epsc];
+
+ memset(&req, 0, sizeof(req));
+ /* MMU context nil - passthrough */
+ req.opcode = EPSC_QUERY_DEVICE;
+ req.u.query_hw.address =
+ (u64)es->data_dma_hdl + offsetof(struct sif_epsc_data, dev);
+ req.u.query_hw.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx;
+ ret = sif_epsc_wr(sdev, &req, &cqe);
+
+ /* Copy data irrespective of how the EPSC operation went */
+ if (eps_version_ge(es, 0, 31))
+ copy_conv_to_sw(ldev, &es->data->dev, sizeof(*ldev));
+ else
+ memcpy(ldev, &es->data->dev, sizeof(*ldev));
+
+ return ret;
+}
+
+int sif_query_device(struct ib_device *ibdev, struct ib_device_attr *props)
+{
+ int ret;
+ struct sif_dev *sdev = to_sdev(ibdev);
+ struct sif_eps *es = &sdev->es[sdev->mbox_epsc];
+ struct psif_epsc_device_attr ldev;
+
+ ret = epsc_query_device(sdev, &ldev);
+ if (ret)
+ return ret;
+
+ memset(props, 0, sizeof(*props));
+ /* TBD: x.y.z - 16 bit per sublevel - we use x.y.0 for now */
+ props->fw_ver = (u64)es->ver.fw_major << 32 | (u64)es->ver.fw_minor << 16;
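+ /* Example: firmware version 0.58 would be reported here as
+ * (0ULL << 32) | (58ULL << 16) = 0x3a0000.
+ */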
+ props->sys_image_guid = cpu_to_be64(ldev.sys_image_guid);
+ props->max_mr_size = ~0ull;
+ props->page_size_cap = 0xfffffe00; /* TBD: Sensible value? Use what Mellanox uses */
+ props->vendor_id = ldev.vendor_id;
+ props->vendor_part_id = ldev.vendor_part_id;
+ props->hw_ver = ldev.hw_ver;
+ props->max_qp = sdev->ba[qp].entry_cnt; /* TBD: min(ldev.max_qp, sdev->ba[qp].entry_cnt) */
+ props->max_qp_wr = min_t(u32, SIF_SW_MAX_SQE, ldev.max_srq_wr); /* Max on _any_ work queue */
+ props->device_cap_flags =
+ IB_DEVICE_BAD_PKEY_CNTR |
+ IB_DEVICE_BAD_QKEY_CNTR |
+ IB_DEVICE_AUTO_PATH_MIG |
+ IB_DEVICE_CURR_QP_STATE_MOD |
+ IB_DEVICE_SHUTDOWN_PORT |
+ IB_DEVICE_PORT_ACTIVE_EVENT |
+ IB_DEVICE_SYS_IMAGE_GUID |
+ IB_DEVICE_RC_RNR_NAK_GEN |
+ IB_DEVICE_UD_IP_CSUM |
+ IB_DEVICE_UD_TSO |
+ IB_DEVICE_XRC |
+ IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
+
+ /* Return max_sge = SIF_HW_MAX_SEND_SGE - 1 when the kernel ULP caller
+ * is IPoIB connected mode:
+ */
+ props->max_sge = (sif_find_kernel_ulp_caller() == IPOIB_CM_ULP) ?
+ SIF_HW_MAX_SEND_SGE - 1 : SIF_HW_MAX_SEND_SGE;
+ props->max_sge_rd = ldev.max_sge_rd;
+ props->max_cq = sdev->ba[cq_sw].entry_cnt;
+ props->max_cqe = SIF_SW_MAX_CQE;
+ /* Make sure we never fill the CQ completely on rev 1-3 - Bug #3657 */
+ if (PSIF_REVISION(sdev) <= 3)
+ props->max_cqe = SIF_SW_MAX_CQE - 1;
+ props->max_mr = sdev->ba[key].entry_cnt;
+ props->max_pd = SIF_MAX_PD_INDEX - 1; /* 0 not used, limited by hw field size */
+ props->max_qp_rd_atom = ldev.max_qp_rd_atom;
+ props->max_ee_rd_atom = ldev.max_ee_rd_atom;
+ props->max_res_rd_atom = props->max_qp_rd_atom * sdev->ba[qp].entry_cnt;
+ props->max_qp_init_rd_atom = ldev.max_qp_init_rd_atom;
+ props->max_ee_init_rd_atom = ldev.max_ee_init_rd_atom;
+ props->atomic_cap = ldev.atomic_cap;
+ props->max_ee = ldev.max_ee;
+ props->max_rdd = ldev.max_rdd;
+ props->max_mw = ldev.max_mw;
+ props->max_raw_ipv6_qp = min_t(u32, ldev.max_raw_ipv6_qp, props->max_qp);
+ props->max_raw_ethy_qp = min_t(u32, ldev.max_raw_ethy_qp, props->max_qp);
+ props->max_mcast_grp = ldev.max_mcast_grp;
+ props->max_mcast_qp_attach = ldev.max_mcast_qp_attach;
+ props->max_total_mcast_qp_attach = ldev.max_total_mcast_qp_attach;
+ props->max_ah = sdev->ba[ah].entry_cnt;
+ props->max_fmr = props->max_mr;
+ props->max_map_per_fmr = 0x7ffff000; /* Should be props->max_mr_size but that breaks ibv_devinfo */
+ props->max_srq = sdev->ba[rq_hw].entry_cnt;
+ props->max_srq_wr = ldev.max_srq_wr;
+ props->max_srq_sge = ldev.max_srq_sge;
+ props->max_pkeys = ldev.max_pkeys;
+ props->local_ca_ack_delay = ldev.local_ca_ack_delay;
+ return ret;
+}
+
+
+
+static int epsc_query_port(struct sif_dev *sdev, u8 port, struct psif_epsc_port_attr *lpa)
+{
+ int ret;
+ struct sif_eps *es = &sdev->es[sdev->mbox_epsc];
+ struct psif_epsc_csr_rsp cqe;
+ struct psif_epsc_csr_req req;
+ const u8 psif_port = port - 1; /* sif port index starts at 0 */
+ struct psif_epsc_port_attr *ps;
+
+ if (port < 1 || port > 2) {
+ sif_log(sdev, SIF_INFO, "error: request for port %d while PSIF has only 2 ports",
+ port);
+ return -EINVAL;
+ }
+
+ ps = &es->data->port[psif_port];
+
+ memset(&req, 0, sizeof(req));
+ req.opcode = psif_port == PORT_1 ? EPSC_QUERY_PORT_1 : EPSC_QUERY_PORT_2;
+ req.u.query_hw.address =
+ (u64)es->data_dma_hdl + offsetof(struct sif_epsc_data, port[psif_port]);
+ req.u.query_hw.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx;
+
+ ret = sif_epsc_wr(sdev, &req, &cqe);
+
+ /* Copy data irrespective of how the EPSC operation went */
+ if (eps_version_ge(es, 0, 31))
+ copy_conv_to_sw(lpa, ps, sizeof(*lpa));
+ else
+ memcpy(lpa, ps, sizeof(*lpa));
+
+ if (!ret)
+ sif_log(sdev, SIF_VERBS, "port %d lid %d sm_lid %d seq 0x%llx",
+ port, lpa->lid, lpa->sm_lid, cqe.seq_num);
+ else
+ sif_log(sdev, SIF_INFO, "error: port %d seq 0x%llx failed with status %s (ret = %d)",
+ port, cqe.seq_num, string_enum_psif_epsc_csr_status(cqe.status),
+ ret);
+ return ret;
+}
+
+int sif_calc_ipd(struct sif_dev *sdev, u8 port, enum ib_rate static_rate, u8 *ipd)
+{
+ int path = ib_rate_to_mult(static_rate);
+ int link, ret;
+ struct ib_port_attr lpa;
+
+ if (static_rate == IB_RATE_PORT_CURRENT) {
+ *ipd = 0;
+ return 0;
+ }
+
+ if (unlikely(path < 0)) {
+ sif_log(sdev, SIF_INFO, " Invalid static rate = %x\n",
+ path);
+ return -EINVAL;
+ }
+
+ ret = sif_query_port(&sdev->ib_dev, port, &lpa);
+ if (unlikely(ret != 0)) {
+ sif_log(sdev, SIF_INFO, "Failed to query port %u\n", port);
+ return ret;
+ }
+ /* 2^active_width * active_speed */
+ link = (1 << lpa.active_width)*lpa.active_speed;
+
+ if (path >= link)
+ *ipd = 0;
+ else
+ *ipd = (link/path)-1;
+ return 0;
+}
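+
+/* Worked example for sif_calc_ipd() above, assuming the standard
+ * ib_port_width/ib_port_speed encodings: a 4X QDR link has
+ * active_width = 2 and active_speed = 4, so link = (1 << 2) * 4 = 16
+ * (40 Gb/s in units of 2.5 Gb/s). Requesting IB_RATE_10_GBPS gives
+ * path = ib_rate_to_mult(IB_RATE_10_GBPS) = 4, hence *ipd = 16/4 - 1 = 3.
+ */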
+
+
+int sif_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props)
+{
+ int ret;
+ struct sif_dev *sdev = to_sdev(ibdev);
+ struct psif_epsc_port_attr lpa;
+
+ ret = epsc_query_port(sdev, port, &lpa);
+ memset(props, 0, sizeof(*props));
+ props->state = lpa.state;
+ props->max_mtu = IB_MTU_4096;
+ props->active_mtu = lpa.active_mtu;
+ props->gid_tbl_len = lpa.gid_tbl_len;
+ props->port_cap_flags = lpa.port_cap_flags;
+ props->max_msg_sz = lpa.max_msg_sz;
+ props->bad_pkey_cntr = lpa.bad_pkey_cntr;
+ props->qkey_viol_cntr = lpa.qkey_viol_cntr;
+ props->pkey_tbl_len = lpa.pkey_tbl_len;
+ props->lid = lpa.lid;
+ props->sm_lid = lpa.sm_lid;
+ props->lmc = lpa.lmc;
+ props->max_vl_num = lpa.max_vl_num;
+ props->sm_sl = lpa.sm_sl;
+ props->subnet_timeout = lpa.subnet_timeout;
+ props->init_type_reply = lpa.init_type_reply;
+ props->active_width = lpa.active_width;
+ props->active_speed = lpa.active_speed;
+ props->phys_state = lpa.phys_state;
+
+ /* Cache values */
+ sdev->port[port - 1] = *props;
+ return ret;
+}
+
+int sif_query_gid(struct ib_device *ibdev, u8 port_num, int index, union ib_gid *gid)
+{
+ int ret = 0;
+ ulong log_class = SIF_VERBS;
+ struct sif_dev *sdev = to_sdev(ibdev);
+ struct psif_epsc_csr_rsp cqe;
+ struct psif_epsc_csr_req req;
+
+ memset(&req, 0, sizeof(req));
+ req.opcode = EPSC_QUERY_GID;
+ req.u.query_table.port = port_num;
+ req.u.query_table.index = index;
+ ret = sif_epsc_wr(sdev, &req, &cqe);
+ if (ret)
+ return ret;
+
+ /* Apparently clients expect to get GIDs in network byte order
+ * which requires an extra swap here:
+ */
+ gid->global.subnet_prefix = be64_to_cpu(cqe.data);
+ gid->global.interface_id = be64_to_cpu(cqe.info);
+
+ if (ret)
+ log_class = SIF_INFO;
+ sif_logi(ibdev, log_class,
+ " port_num %d, GID Table index %d - > %llx.%llx",
+ port_num, index, gid->global.subnet_prefix, gid->global.interface_id);
+ return ret;
+}
+
+
+int sif_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+ u16 *pkey)
+{
+ int ret = 0;
+ struct sif_dev *sdev = to_sdev(ibdev);
+ struct psif_epsc_csr_rsp cqe;
+ struct psif_epsc_csr_req req;
+
+ memset(&req, 0, sizeof(req));
+ req.opcode = EPSC_QUERY_PKEY;
+ req.u.query_table.port = port;
+ req.u.query_table.index = index;
+ ret = sif_epsc_wr(sdev, &req, &cqe);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "port %u index %u: Failed with status %d", port, index, ret);
+ return ret;
+ }
+ *pkey = (u16)cqe.data;
+ sif_logi(ibdev, SIF_VERBS_V, "port %u index %u -> key 0x%x",
+ port, index, *pkey);
+ return ret;
+}
+
+
+/* Called from sif_modify_device when IB_DEVICE_MODIFY_EXTENDED is set
+ * PSIF specific extension bits defined in sif_verbs.h
+ */
+static int sif_modify_device_extended(struct sif_dev *sdev, struct ib_device_modify *device_modify,
+ struct psif_epsc_csr_req *req)
+{
+ struct sif_device_modify *dm =
+ container_of(device_modify, struct sif_device_modify, ib);
+
+ /* TBD: Simplifying firmware support? */
+ sif_log(sdev, SIF_INFO, "uf %d eoib_ctrl %x eoib_data %x (not implemented)",
+ dm->uf, dm->eoib_ctrl, dm->eoib_data);
+ return -EOPNOTSUPP;
+}
+
+
+int sif_modify_device(struct ib_device *ibdev,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify)
+{
+ int ret = 0;
+ struct sif_dev *sdev = to_sdev(ibdev);
+ struct psif_epsc_csr_rsp cqe;
+ struct psif_epsc_csr_req req;
+
+ memset(&req, 0, sizeof(req));
+ req.opcode = EPSC_MODIFY_DEVICE;
+ if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) {
+ req.u.device.modify_mask |= PSIF_DEVICE_MODIFY_SYS_IMAGE_GUID;
+ sif_logi(ibdev, SIF_VERBS, "sys_image_guid = 0x%llx",
+ device_modify->sys_image_guid);
+ req.u.device.sys_image_guid = device_modify->sys_image_guid;
+ }
+ if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) {
+ req.u.device.modify_mask |= PSIF_DEVICE_MODIFY_NODE_DESC;
+ sif_logi(ibdev, SIF_VERBS, "node_desc = %s",
+ device_modify->node_desc);
+ strncpy(req.u.device.node_desc, device_modify->node_desc,
+ ARRAY_SIZE(req.u.device.node_desc)-1);
+ strncpy(ibdev->node_desc, device_modify->node_desc,
+ ARRAY_SIZE(ibdev->node_desc)-1);
+ }
+
+ /** PSIF specific extensions (sif_verbs.h) **/
+ if (device_modify_mask & IB_DEVICE_MODIFY_EXTENDED)
+ ret = sif_modify_device_extended(sdev, device_modify, &req);
+
+ ret = sif_epsc_wr(sdev, &req, &cqe);
+ if (ret)
+ sif_log(sdev, SIF_INFO, "Failed with status %d", ret);
+ return ret;
+}
+
+int sif_modify_port(struct ib_device *ibdev,
+ u8 port, int port_modify_mask,
+ struct ib_port_modify *props)
+{
+ int ret = 0;
+ struct sif_dev *sdev = to_sdev(ibdev);
+ struct psif_epsc_csr_rsp cqe;
+ struct psif_epsc_csr_req req;
+
+ sif_logi(ibdev, SIF_VERBS,
+ "via eps - port %d mask %x init_type %d, set mask %x, clr mask %x",
+ port, port_modify_mask,
+ props->init_type,
+ props->set_port_cap_mask,
+ props->clr_port_cap_mask);
+
+ memset(&req, 0, sizeof(req));
+ /* TBD: Why both port and different op for port 1 and 2? */
+ req.u.port.port = port;
+ if (port == 1)
+ req.opcode = EPSC_MODIFY_PORT_1;
+ else if (port == 2)
+ req.opcode = EPSC_MODIFY_PORT_2;
+ else {
+ /* No such port */
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* TBD: Check later on if we can let this mask straight through 1-1 */
+ if (port_modify_mask & IB_PORT_SHUTDOWN)
+ req.u.port.modify_mask |= PSIF_PORT_SHUTDOWN;
+ if (port_modify_mask & IB_PORT_INIT_TYPE) {
+ req.u.port.modify_mask |= PSIF_PORT_INIT_TYPE;
+ req.u.port.init_type = props->init_type;
+ }
+ if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
+ req.u.port.modify_mask |= PSIF_PORT_RESET_QKEY_CNTR;
+ if (port_modify_mask & (1<<4))
+ req.u.port.modify_mask |= PSIF_PORT_RESET_PKEY_CNTR;
+ req.u.port.set_port_cap_mask = props->set_port_cap_mask;
+ req.u.port.clr_port_cap_mask = props->clr_port_cap_mask;
+ ret = sif_epsc_wr(sdev, &req, &cqe);
+ if (ret)
+ sif_log(sdev, SIF_INFO, "Failed with status %d", ret);
+out:
+ return ret;
+}
+
+
--- /dev/null
+/*
+ * Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_query.h: SIF implementation of some of IB query APIs
+ */
+
+#ifndef _SIF_QUERY_H
+#define _SIF_QUERY_H
+#include "psif_hw_data.h"
+#include "sif_epsc.h"
+#include "sif_fwa.h"
+
+/* Max size of firmware version info */
+#define MAX_FW_VERSION_INFO_SZ 4096
+
+/* DMA mapped structure to receive query data in
+ * We only need one of these and we protect user access to
+ * it with sif_epsc->lock
+ */
+
+struct sif_epsc_data {
+ struct psif_epsc_device_attr dev;
+ struct psif_epsc_port_attr port[2];
+ struct psif_epsc_log_stat log;
+
+ /* fixed buffer space for special FWA client needs */
+ char fw_version[MAX_FW_VERSION_INFO_SZ]; /* Data area for firmware version info */
+ char flash[MAX_FWA_NL_PAYLOAD]; /* Data area for flash support */
+ char epsc_cli[MAX_FWA_NL_PAYLOAD]; /* Data area for EPSC CLI response*/
+ char vimm_agent[MAX_FWA_NL_PAYLOAD]; /* Data area for VIMM agent */
+ char log_data_area[0]; /* Data area will be allocated right after this struct */
+};
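+
+/* The EPSC query requests in sif_query.c address this structure by its DMA
+ * offset, e.g. for a port query:
+ *
+ *	req.u.query_hw.address =
+ *		(u64)es->data_dma_hdl + offsetof(struct sif_epsc_data, port[psif_port]);
+ */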
+
+int sif_query_device(struct ib_device *ibdev, struct ib_device_attr *props);
+
+int sif_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props);
+int sif_query_gid(struct ib_device *ibdev, u8 port_num, int index, union ib_gid *gid);
+int sif_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+ u16 *pkey);
+
+int sif_calc_ipd(struct sif_dev *sdev, u8 port, enum ib_rate static_rate,
+ u8 *ipd);
+
+int sif_modify_device(struct ib_device *ibdev,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify);
+
+int sif_modify_port(struct ib_device *ibdev,
+ u8 port, int port_modify_mask,
+ struct ib_port_modify *props);
+
+/* Populate ldev with host endian query_device info requested from the epsc */
+int epsc_query_device(struct sif_dev *sdev, struct psif_epsc_device_attr *ldev);
+
+
+static inline bool epsc_gva_permitted(struct sif_dev *sdev)
+{
+ /* None of the planned SIBS versions supports GVA2GPA for EPSC mappings */
+ return !IS_SIBS(sdev) && sdev->pdev->revision != 2 && !sif_feature(passthrough_query_qp);
+}
+
+static inline bool eps_version_ge(struct sif_eps *es, u16 major, u16 minor)
+{
+ return EPSC_API_VERSION(es->ver.epsc_major, es->ver.epsc_minor) >=
+ EPSC_API_VERSION(major, minor);
+}
+
+static inline bool eps_fw_version_ge(struct sif_eps *es, u16 major, u16 minor)
+{
+ return EPSC_API_VERSION(es->ver.fw_major, es->ver.fw_minor) >=
+ EPSC_API_VERSION(major, minor);
+}
+
+static inline bool eps_fw_version_lt(struct sif_eps *es, u16 major, u16 minor)
+{
+ return EPSC_API_VERSION(es->ver.fw_major, es->ver.fw_minor) <
+ EPSC_API_VERSION(major, minor);
+}
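+
+/* Typical usage: eps_version_ge(es, 0, 57) gates the PMA_PXY proxy QP
+ * handling in sif_destroy_qp(), while eps_fw_version_lt(es, 0, 58) decides
+ * whether the do-not-evict QP workaround is needed in sif_r3_init().
+ */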
+
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_r3.c: Special handling specific for psif revision 3 and earlier
+ */
+#include "sif_dev.h"
+#include "sif_r3.h"
+#include "sif_base.h"
+#include "sif_query.h"
+#include "sif_qp.h"
+#include "sif_ibqp.h"
+#include "sif_sndrcv.h"
+#include "sif_ibcq.h"
+#include "sif_defs.h"
+#include "psif_hw_setget.h"
+
+/* Declared below: */
+static void sif_hw_free_flush_qp(struct sif_dev *sdev);
+static int sif_hw_allocate_flush_qp(struct sif_dev *sdev);
+static int sif_hw_allocate_dne_qp(struct sif_dev *sdev);
+static void sif_hw_free_dne_qp(struct sif_dev *sdev);
+
+static int outstanding_wqes(struct sif_dev *sdev, struct sif_qp *qp, u16 *head);
+static u16 cq_walk_wa4074(struct sif_dev *sdev, struct sif_qp *qp, bool *last_seq_set);
+static u16 walk_and_update_cqes(struct sif_dev *sdev, struct sif_qp *qp, u16 head, u16 end);
+
+int sif_r3_init(struct sif_dev *sdev)
+{
+ int ret;
+ bool dne_qp_alloc = false;
+
+ if (eps_fw_version_lt(&sdev->es[sdev->mbox_epsc], 0, 58)) {
+ ret = sif_hw_allocate_dne_qp(sdev);
+ if (ret)
+ return ret;
+ dne_qp_alloc = true;
+ }
+
+ /* Init the flush_retry qp lock */
+ mutex_init(&sdev->flush_lock);
+ ret = sif_hw_allocate_flush_qp(sdev);
+ if (ret)
+ goto flush_retry_failed;
+
+ return 0;
+flush_retry_failed:
+ if (dne_qp_alloc)
+ sif_hw_free_dne_qp(sdev);
+ return ret;
+}
+
+
+void sif_r3_deinit(struct sif_dev *sdev)
+{
+ sif_hw_free_flush_qp(sdev);
+ if (eps_fw_version_lt(&sdev->es[sdev->mbox_epsc], 0, 58))
+ sif_hw_free_dne_qp(sdev);
+}
+
+
+static int sif_hw_allocate_dne_qp(struct sif_dev *sdev)
+{
+ int ret;
+ int idx = sif_alloc_qp_idx(sdev->pd);
+ struct sif_qp *qp;
+ struct psif_qp lqp;
+ struct psif_query_qp lqqp;
+
+ if (idx < 0) {
+ sif_log(sdev, SIF_INFO, "Unable to reserve QP index for the do-not-evict qp");
+ return -ENOMEM;
+ }
+ sdev->dne_qp = idx;
+ qp = get_sif_qp(sdev, idx);
+ /* Make dfs and query_qp happy: */
+ qp->qp_idx = idx;
+ qp->ibqp.device = &sdev->ib_dev;
+ qp->ibqp.pd = &sdev->pd->ibpd;
+ qp->rq_idx = -1;
+ qp->last_set_state = IB_QPS_RTS;
+ qp->flags = SIF_QPF_NO_EVICT;
+ mutex_init(&qp->lock);
+
+ memset(&lqp, 0, sizeof(struct psif_qp));
+
+ lqp.state.do_not_evict = 1;
+ lqp.state.timeout_time = 0xffffffffffffULL; /* 48 bits */
+ lqp.state.state = PSIF_QP_STATE_RTS;
+ lqp.state.timer_running = 1;
+ lqp.state.transport_type = PSIF_QP_TRANSPORT_RC;
+
+ /* Write composed entry to shared area */
+ copy_conv_to_hw(&qp->d, &lqp, sizeof(struct psif_qp));
+
+ /* Do a query_qp to make PSIF fill its cache with it
+ * - we don't care about the results from the query other than
+ * that the operation succeeds:
+ */
+ ret = epsc_query_qp(qp, &lqqp);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "query_qp failed with status %d", ret);
+ return ret;
+ }
+ ret = sif_dfs_add_qp(sdev, qp);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "Failed to allocate do-not-evict qp, index %d", idx);
+ return ret;
+ }
+ sif_log(sdev, SIF_INFO, "Allocated do-not-evict qp, index %d", idx);
+ return 0;
+}
+
+
+
+static void sif_hw_free_dne_qp(struct sif_dev *sdev)
+{
+ if (sdev->dne_qp) {
+ /* Modify it to reset via error to flush it out.
+ * We cannot use destroy_qp since it is not a "fully configured" QP:
+ */
+ struct sif_qp *qp = get_sif_qp(sdev, sdev->dne_qp);
+ struct ib_qp_attr mod_attr = {
+ .qp_state = IB_QPS_RESET,
+ };
+ modify_qp_hw_wa_qp_retry(sdev, qp, &mod_attr, IB_QP_STATE);
+ sif_dfs_remove_qp(qp);
+ sif_free_qp_idx(sdev->pd, sdev->dne_qp);
+ sdev->dne_qp = 0;
+ }
+}
+
+
+static int sif_hw_allocate_flush_qp(struct sif_dev *sdev)
+{
+ int ret = 0;
+ struct sif_qp *qp = NULL;
+ struct sif_cq *cq = NULL;
+
+ struct ib_qp_init_attr init_attr = {
+ .event_handler = NULL,
+ .srq = NULL,
+ .cap = {
+ .max_send_wr = 64,
+ .max_recv_wr = 64,
+ .max_send_sge = 1,
+ .max_recv_sge = 1,
+ },
+ .sq_sig_type = IB_SIGNAL_ALL_WR,
+ .qp_type = IB_QPT_RC,
+ };
+
+ struct sif_qp_init_attr sif_attr = {
+ .pd = sdev->pd,
+ .qp_type = ib2sif_qp_type(init_attr.qp_type),
+ .user_mode = false,
+ .sq_hdl_sz = sizeof(struct sif_sq_hdl),
+ .qosl = QOSL_LOW_LATENCY,
+ };
+
+ enum ib_qp_attr_mask qp_attr_mask =
+ IB_QP_STATE |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS;
+
+ struct ib_qp_attr qp_attr = {
+ .qp_state = IB_QPS_INIT,
+ .pkey_index = 0,
+ .port_num = 1,
+ .qp_access_flags =
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_ATOMIC,
+ };
+
+ struct ib_port_attr lpa;
+
+ /* No QPs when running in limited mode */
+ if (sdev->limited_mode)
+ return 0;
+
+ ret = sif_query_port(&sdev->ib_dev, 1, &lpa);
+ if (unlikely(ret)) {
+ sif_log(sdev, SIF_INFO, "Failed to query port 1");
+ goto err_query_port;
+ }
+
+ /* CQ */
+ cq = create_cq(sdev->pd,
+ init_attr.cap.max_send_wr + init_attr.cap.max_recv_wr,
+ 1, SIFPX_OFF, false);
+ if (IS_ERR(cq)) {
+ sif_log(sdev, SIF_INFO, "Failed to create CQ for flush_retry QP");
+ return -EINVAL;
+ }
+ init_attr.send_cq = &cq->ibcq;
+ init_attr.recv_cq = &cq->ibcq;
+ cq->ibcq.device = &sdev->ib_dev; /* Make destroy cq happy */
+
+ /* QP */
+ qp = create_qp(sdev, &init_attr, &sif_attr);
+ if (IS_ERR(qp)) {
+ sif_log(sdev, SIF_INFO, "Failed to create flush_retry QP");
+ ret = -EINVAL;
+ goto err_create_qp;
+ }
+
+ sif_log(sdev, SIF_QP, "Exit: success flush_retry qp 0x%p ib qp %d - real qp %d",
+ &qp->ibqp, qp->ibqp.qp_num, qp->qp_idx);
+
+
+ /* Make query & modify qp happy */
+ qp->ibqp.qp_num = qp->qp_idx;
+ qp->ibqp.device = &sdev->ib_dev;
+ qp->ibqp.pd = &sdev->pd->ibpd;
+ qp->ibqp.qp_type = init_attr.qp_type;
+ qp->type = sif_attr.qp_type;
+ qp->port = 1;
+ qp->flags = SIF_QPF_FLUSH_RETRY;
+
+ ret = sif_modify_qp(&qp->ibqp, &qp_attr, qp_attr_mask, NULL);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "modify_qp to init failed with status %d", ret);
+ goto err_modify_qp;
+ }
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_RTR;
+ qp_attr.path_mtu = IB_MTU_2048;
+ qp_attr.dest_qp_num = qp->qp_idx;
+ qp_attr.rq_psn = 0;
+ qp_attr.max_dest_rd_atomic = 1;
+ qp_attr.min_rnr_timer = 1;
+ qp_attr.ah_attr.dlid = lpa.lid;
+ qp_attr.ah_attr.port_num = 1;
+ qp_attr_mask =
+ IB_QP_STATE |
+ IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER;
+
+ ret = sif_modify_qp(&qp->ibqp, &qp_attr, qp_attr_mask, NULL);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "modify_qp to RTR failed with status %d", ret);
+ goto err_modify_qp;
+ }
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_RTS;
+ qp_attr.sq_psn = 0;
+ qp_attr.timeout = 6;
+ qp_attr.retry_cnt = 7;
+ qp_attr.rnr_retry = 7;
+ qp_attr.max_rd_atomic = 1;
+ qp_attr_mask =
+ IB_QP_STATE |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN |
+ IB_QP_MAX_QP_RD_ATOMIC;
+
+ ret = sif_modify_qp(&qp->ibqp, &qp_attr, qp_attr_mask, NULL);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "modify_qp to RTS failed with status %d", ret);
+ goto err_modify_qp;
+ }
+
+ sdev->flush_qp = qp->qp_idx;
+ sif_log(sdev, SIF_INFO, "Allocated flush-retry qp, index %d", sdev->flush_qp);
+
+ return ret;
+
+err_modify_qp:
+ destroy_qp(sdev, qp);
+err_create_qp:
+ destroy_cq(cq);
+err_query_port:
+ sdev->flush_qp = 0;
+ sif_log(sdev, SIF_INFO, "Allocated flush-retry qp failed");
+
+ return ret;
+}
+
+static void sif_hw_free_flush_qp(struct sif_dev *sdev)
+{
+ struct sif_qp *qp = NULL;
+ struct sif_sq *sq = NULL;
+ struct sif_cq *cq = NULL;
+
+ if (sdev->flush_qp) {
+ qp = get_sif_qp(sdev, sdev->flush_qp);
+ sq = get_sif_sq(sdev, sdev->flush_qp);
+ cq = get_sif_cq(sdev, sq->cq_idx);
+
+ destroy_qp(sdev, qp);
+ destroy_cq(cq);
+ sdev->flush_qp = 0;
+
+ sif_log(sdev, SIF_QP, "destroy_qp %d success", qp->qp_idx);
+ }
+}
+
+void sif_r3_recreate_flush_qp(struct sif_dev *sdev)
+{
+ /* For simplicity we just destroy the old
+ * and allocate a new flush_retry qp.
+ */
+ mutex_lock(&sdev->flush_lock);
+ sif_hw_free_flush_qp(sdev);
+ sif_hw_allocate_flush_qp(sdev);
+ mutex_unlock(&sdev->flush_lock);
+}
+
+int reset_qp_flush_retry(struct sif_dev *sdev)
+{
+ struct sif_qp *qp = NULL;
+ struct psif_query_qp lqqp;
+
+ struct ib_send_wr *sbad_wr;
+ struct ib_send_wr snd_wr = {
+ .wr_id = 0x1,
+ .sg_list = NULL,
+ .opcode = IB_WR_SEND,
+ .num_sge = 0, /* ZERO byte */
+ .next = NULL,
+ };
+ struct ib_recv_wr *rbad_wr;
+ struct ib_recv_wr rcv_wr = {
+ .wr_id = 0x2,
+ .sg_list = NULL,
+ .next = NULL,
+ .num_sge = 0,
+ };
+
+ struct sif_rq *rq = NULL;
+ struct sif_cq *cq = NULL;
+
+ int ret = 0;
+ int rte, rtc;
+ int count;
+ unsigned long timeout = sdev->min_resp_ticks;
+ unsigned long timeout_real;
+
+ /* Get access to the flush_retry QP */
+ mutex_lock(&sdev->flush_lock);
+
+ if (!sdev->flush_qp) {
+ sif_log(sdev, SIF_INFO, "special handling WA_3713 failed: flush_qp does not exist");
+ ret = -EINVAL;
+ goto err_flush_qp;
+ }
+
+ qp = get_sif_qp(sdev, sdev->flush_qp);
+
+ /* Query flush_retry QP */
+ ret = epsc_query_qp(qp, &lqqp);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "epsc_query_qp failed with status %d", ret);
+ goto fail;
+ }
+
+ /* Store retry_tag_err and retry_tag_committed */
+ rte = lqqp.qp.retry_tag_err;
+ rtc = lqqp.qp.retry_tag_committed;
+
+ /* Post one zero byte send */
+ ret = sif_post_send(&qp->ibqp, &snd_wr, &sbad_wr);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "sif_post_send failed with status %d", ret);
+ goto fail;
+ }
+
+ timeout_real = jiffies + timeout;
+ while (rte == lqqp.qp.retry_tag_err || rtc == lqqp.qp.retry_tag_committed) {
+ if (time_is_after_jiffies(timeout_real)) {
+ cond_resched();
+ ret = epsc_query_qp(qp, &lqqp);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "epsc_query_qp failed with status %d", ret);
+ goto fail;
+ }
+ } else {
+ sif_log(sdev, SIF_INFO, "Timeout waiting for flush retry");
+ ret = -ETIMEDOUT;
+ goto fail;
+ }
+ }
+
+ /* Post an RQE to the RQ */
+ ret = sif_post_recv(&qp->ibqp, &rcv_wr, &rbad_wr);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "sif_post_recv failed with status %d", ret);
+ goto fail;
+ }
+
+ /* Poll out the completions of the CQ */
+ rq = get_sif_rq(sdev, qp->rq_idx);
+ cq = get_sif_cq(sdev, rq->cq_idx);
+
+ count = 0;
+ timeout_real = jiffies + timeout;
+ while (count < 2) {
+ struct ib_wc wcs[2];
+ int sts = sif_poll_cq(&cq->ibcq, 2, wcs);
+
+ if (sts < 0) {
+ sif_log(sdev, SIF_INFO, "sif_poll_cq failed with status %d", sts);
+ ret = sts;
+ goto fail;
+ } else
+ count += sts;
+
+ if (time_is_after_jiffies(timeout_real))
+ cond_resched();
+ else {
+ sif_log(sdev, SIF_INFO, "Timeout waiting for completions");
+ for (sts = 0; sts < count; sts++)
+ sif_log(sdev, SIF_INFO, "wr_id %lld status %d opcode %d",
+ wcs[sts].wr_id, wcs[sts].status, wcs[sts].opcode);
+ goto fail;
+ }
+ }
+
+ mutex_unlock(&sdev->flush_lock);
+ return ret;
+fail:
+ sif_hw_free_flush_qp(sdev);
+ sif_hw_allocate_flush_qp(sdev);
+ mutex_unlock(&sdev->flush_lock);
+ return ret;
+
+err_flush_qp:
+ mutex_unlock(&sdev->flush_lock);
+ return ret;
+}
+
+static int outstanding_wqes(struct sif_dev *sdev, struct sif_qp *qp, u16 *head)
+{
+ struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx);
+ struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx);
+ struct psif_query_qp lqqp;
+ int ret = 0;
+
+ ret = epsc_query_qp(qp, &lqqp);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "epsc_query_qp failed with status %d", ret);
+ return ret;
+ }
+ if (head)
+ *head = lqqp.qp.retry_sq_seq;
+
+ return sq_length(sq, lqqp.qp.retry_sq_seq, sq_sw->last_seq);
+}
+
+int pre_process_wa4074(struct sif_dev *sdev, struct sif_qp *qp)
+{
+ struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx);
+ struct psif_sq_entry *sqe;
+ u16 head;
+ int len;
+
+ len = outstanding_wqes(sdev, qp, &head);
+ if (len <= 0)
+ return -1;
+
+ while (len) {
+ head++;
+ sqe = get_sq_entry(sq, head);
+ set_psif_wr__checksum(&sqe->wr, 0);
+ len--;
+ }
+ return 0;
+}
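+
+/* Example (assuming sq_length() returns the distance between the two
+ * sequence numbers): if query_qp reports retry_sq_seq 0x10 while software
+ * has posted up to last_seq 0x15, outstanding_wqes() returns 5 and
+ * pre_process_wa4074() clears the checksum of WQEs 0x11..0x15.
+ */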
+
+/* QP is in RESET state, its now safe to do a cq_walk and
+ * flush any completions.
+ */
+int post_process_wa4074(struct sif_dev *sdev, struct sif_qp *qp)
+{
+ struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx);
+ struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx);
+ struct psif_query_qp lqqp;
+ bool last_seq_set = false;
+ u16 last_seq, fence_seq;
+ DECLARE_SIF_CQE_POLL(sdev, lcqe);
+ int ret = 0;
+ bool need_gen_fence_completion = true;
+ struct sif_cq *cq = (sq && sq->cq_idx >= 0) ? get_sif_cq(sdev, sq->cq_idx) : NULL;
+ struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index);
+
+
+ /* if flush SQ is in progress, set FLUSH_SQ_IN_FLIGHT.
+ */
+ if (test_bit(FLUSH_SQ_IN_PROGRESS, &sq_sw->flags)) {
+ set_bit(FLUSH_SQ_IN_FLIGHT, &sq_sw->flags);
+ return ret;
+ }
+
+ if (test_and_set_bit(FLUSH_SQ_IN_PROGRESS, &sq_sw->flags)) {
+ set_bit(FLUSH_SQ_IN_FLIGHT, &sq_sw->flags);
+ return ret;
+ }
+
+ if ((sq_sw->last_seq - sq_sw->head_seq) == 0)
+ goto err_post_wa4074;
+
+ /* if SQ has been flushed before, continue to generate
+ * the remaining completions.
+ */
+ if (test_and_set_bit(FLUSH_SQ_FIRST_TIME, &sq_sw->flags)) {
+ sif_log(sdev, SIF_WCE_V, "flush sq not the first time");
+ last_seq = sq_sw->trusted_seq;
+ goto flush_sq_again;
+ }
+
+ ret = epsc_query_qp(qp, &lqqp);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "epsc_query_qp failed, ret %d", ret);
+ goto err_post_wa4074;
+ }
+
+ last_seq = sq_sw->last_seq;
+
+ set_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags);
+
+ sif_log(sdev, SIF_WCE_V, "sq_retry_seq %x sq_seq %x last_seq %x head_seq %x",
+ lqqp.qp.retry_sq_seq, lqqp.qp.sq_seq, sq_sw->last_seq, sq_sw->head_seq);
+
+ /* need_gen_fence_completion is used to flush any cqes in the pipeline.
+ * If this is a good case, no fence completion is needed.
+ * Proceed directly to walk and update the CQE. The good case
+ * is only true if retry_tag_committed == retry_tag_err &&
+ * retry_sq_seq + 1 == sq_seq && !flush_started.
+ */
+
+ need_gen_fence_completion = ((lqqp.qp.retry_tag_committed != lqqp.qp.retry_tag_err) ||
+ (lqqp.qp.retry_sq_seq + 1 != lqqp.qp.sq_seq) ||
+ (lqqp.qp.flush_started));
+
+ if (need_gen_fence_completion) {
+
+ /* This is just a sequence number that we use to flush any cqes in the pipeline.
+ * Before walking the CQ, we need to ensure that we receive a cqe with fence_seq.
+ */
+ fence_seq = sq_sw->head_seq + 1;
+
+ sif_log(sdev, SIF_WCE_V, "fence_seq %x",
+ fence_seq);
+
+ /* Completion fence, this also flushes any cqes in pipeline */
+ ret = sif_gen_sq_flush_cqe(sdev, sq, fence_seq, qp->qp_idx, false);
+ if (ret)
+ sif_log(sdev, SIF_INFO, "sq %d, sif_gen_sq_flush_cqe returned %d",
+ sq->index, ret);
+
+ if (ret == -EAGAIN) {
+ ret = gen_pqp_cqe(&lcqe);
+ if (ret < 0)
+ goto err_post_wa4074;
+
+ ret = poll_cq_waitfor(&lcqe);
+ if (ret < 0)
+ goto err_post_wa4074;
+
+ lcqe.written = false;
+ }
+
+ /* Generate a sync.completion for us on the PQP */
+ ret = gen_pqp_cqe(&lcqe);
+ if (ret < 0) {
+ sif_log(sdev, SIF_INFO, "SQ %d, gen_pqp_cqe ret %d", sq->index, ret);
+ goto err_post_wa4074;
+ }
+ ret = poll_cq_waitfor(&lcqe);
+ if (ret < 0) {
+ sif_log(sdev, SIF_INFO, "SQ %d, poll_cq_waitfor failed, ret %d",
+ sq->index, ret);
+ goto err_post_wa4074;
+ }
+
+ last_seq = cq_walk_wa4074(sdev, qp, &last_seq_set);
+
+ if (!last_seq_set) {
+ sif_log(sdev, SIF_INFO, "failed to generate a completion to cq");
+ goto err_post_wa4074;
+ }
+
+ if (last_seq != fence_seq) {
+ sif_log(sdev, SIF_INFO, "last seq (%x) is different than fenced completion (%x)!",
+ last_seq, fence_seq);
+ /* As the fenced completion cannot be guaranteed to be the last,
+ * software still needs to walk and update the CQ to avoid unexpected
+ * or duplicated completions, even though the last completion in the
+ * CQ is not the generated fenced completion.
+ */
+ }
+
+ sif_log(sdev, SIF_WCE_V, "after: sq_retry_seq %x sq_seq %x last_seq %x head_seq %x",
+ lqqp.qp.retry_sq_seq, lqqp.qp.sq_seq, sq_sw->last_seq, sq_sw->head_seq);
+
+ }
+ last_seq = walk_and_update_cqes(sdev, qp, sq_sw->head_seq + 1, sq_sw->last_seq);
+ sq_sw->trusted_seq = last_seq;
+
+ clear_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags);
+
+ if (GREATER_16(last_seq, sq_sw->last_seq)) {
+ sif_log(sdev, SIF_WCE_V, "last seq %x > sq_sw->last_seq %x\n", last_seq, sq_sw->last_seq);
+ if (!(qp->flags & SIF_QPF_USER_MODE) && (cq->ibcq.comp_handler)) {
+ if (atomic_add_unless(&cq->refcnt, 1, 0)) {
+ sif_log(sdev, SIF_WCE_V, "need to generate an event to cq %d\n", cq->index);
+ cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+ if (atomic_dec_and_test(&cq->refcnt))
+ complete(&cq->cleanup_ok);
+ }
+ }
+ goto check_in_flight_and_return;
+ }
+
+ sif_log(sdev, SIF_WCE_V, "generate completion from %x to %x",
+ last_seq, sq_sw->last_seq);
+flush_sq_again:
+ for (; (!GREATER_16(last_seq, sq_sw->last_seq)); ++last_seq) {
+ sif_log(sdev, SIF_WCE_V, "generate completion %x",
+ last_seq);
+
+ ret = sif_gen_sq_flush_cqe(sdev, sq, last_seq, qp->qp_idx, true);
+ if (ret)
+ sif_log(sdev, SIF_INFO,
+ "sq %d, last_seq %x, sif_gen_sq_flush_cqe returned %d",
+ sq->index, last_seq, ret);
+
+ if (ret == -EAGAIN) {
+ ret = gen_pqp_cqe(&lcqe);
+ if (ret < 0)
+ goto err_post_wa4074;
+
+ ret = poll_cq_waitfor(&lcqe);
+ if (ret < 0)
+ goto err_post_wa4074;
+
+ lcqe.written = false;
+ continue;
+ }
+
+ if (ret < 0)
+ goto err_post_wa4074;
+ }
+
+ /* Generate a sync.completion for us on the PQP itself
+ * to allow us to wait for the whole to complete:
+ */
+ ret = gen_pqp_cqe(&lcqe);
+ if (ret < 0) {
+ sif_log(sdev, SIF_INFO, "SQ %d, gen_pqp_cqe ret %d", sq->index, ret);
+ goto err_post_wa4074;
+ }
+ ret = poll_cq_waitfor(&lcqe);
+ if (ret < 0) {
+ sif_log(sdev, SIF_INFO, "SQ %d, poll_cq_waitfor failed, ret %d",
+ sq->index, ret);
+ goto err_post_wa4074;
+ }
+
+ sif_log(sdev, SIF_INFO_V, "SQ %d: recv'd completion on cq %d seq 0x%x - done, ret %d",
+ sq->index, sq->cq_idx, lcqe.cqe.seq_num, ret);
+ sq_sw->trusted_seq = last_seq;
+
+check_in_flight_and_return:
+ if (test_and_clear_bit(FLUSH_SQ_IN_FLIGHT, &sq_sw->flags)) {
+ sif_log(sdev, SIF_WCE_V, "in-flight:generate completion from %x to %x",
+ last_seq, sq_sw->last_seq);
+ goto flush_sq_again;
+ }
+
+err_post_wa4074:
+ clear_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags);
+ clear_bit(FLUSH_SQ_IN_FLIGHT, &sq_sw->flags);
+ clear_bit(FLUSH_SQ_IN_PROGRESS, &sq_sw->flags);
+ qp->flush_sq_done_wa4074 = true;
+ return ret > 0 ? 0 : ret;
+}
+
+/* This is called from teardown (user modify QP->ERR) as well as
+ * any subsequent WQEs posted to SQ.
+ */
+int sq_flush_wa4074(struct sif_dev *sdev, struct sif_qp *qp)
+{
+ struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx);
+ struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx);
+ struct sif_cq *cq = (sq && sq->cq_idx >= 0) ? get_sif_cq(sdev, sq->cq_idx) : NULL;
+ struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index);
+ u16 last_seq;
+ int flushed = 0;
+ DECLARE_SIF_CQE_POLL(sdev, lcqe);
+ int ret = 0;
+
+ sif_log(sdev, SIF_INFO_V, "last_seq %x head_seq %x",
+ sq_sw->last_seq, sq_sw->head_seq);
+
+ set_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags);
+
+ last_seq = walk_and_update_cqes(sdev, qp, sq_sw->head_seq + 1, sq_sw->last_seq);
+
+ clear_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags);
+
+ if (last_seq > sq_sw->last_seq)
+ goto err_sq_flush;
+
+ for (; last_seq <= sq_sw->last_seq; ++last_seq) {
+
+ ret = sif_gen_sq_flush_cqe(sdev, sq, last_seq, qp->qp_idx, true);
+ if (ret)
+ sif_log(sdev, SIF_INFO,
+ "sq %d, last_seq %x, sif_gen_sq_flush_cqe returned %d",
+ sq->index, last_seq, ret);
+
+ if (ret == -EAGAIN) {
+ ret = gen_pqp_cqe(&lcqe);
+ if (ret < 0)
+ goto err_sq_flush;
+
+ ret = poll_cq_waitfor(&lcqe);
+ if (ret < 0)
+ goto err_sq_flush;
+
+ lcqe.written = false;
+ continue;
+ }
+
+ if (ret < 0)
+ goto err_sq_flush;
+ ++flushed;
+ }
+
+ /* Generate a sync.completion for us on the PQP itself
+ * to allow us to wait for the whole to complete:
+ */
+ ret = gen_pqp_cqe(&lcqe);
+ if (ret < 0) {
+ sif_log(sdev, SIF_INFO, "SQ %d, gen_pqp_cqe ret %d", sq->index, ret);
+ goto err_sq_flush;
+ }
+ ret = poll_cq_waitfor(&lcqe);
+ if (ret < 0) {
+ sif_log(sdev, SIF_INFO, "SQ %d, poll_cq_waitfor failed, ret %d",
+ sq->index, ret);
+ goto err_sq_flush;
+ }
+
+ sif_log(sdev, SIF_INFO_V, "SQ %d: recv'd completion on cq %d seq 0x%x - done, ret %d",
+ sq->index, sq->cq_idx, lcqe.cqe.seq_num, ret);
+
+err_sq_flush:
+ return ret > 0 ? 0 : ret;
+}
+
+/* Walk the CQ, update the cqe from head to end and return the last_seq */
+static u16 walk_and_update_cqes(struct sif_dev *sdev, struct sif_qp *qp, u16 head, u16 end)
+{
+ struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx);
+ struct sif_cq *cq = (sq && sq->cq_idx >= 0) ? get_sif_cq(sdev, sq->cq_idx) : NULL;
+ struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index);
+ volatile struct psif_cq_entry *cqe;
+ u16 last_seq = 0, updated_seq;
+ u32 seqno, polled_value;
+ unsigned long flags = 0;
+ int n = 0;
+
+ updated_seq = head;
+ last_seq = head;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ for (seqno = cq_sw->next_seq;; ++seqno) {
+ struct psif_cq_entry lcqe;
+
+ cqe = get_cq_entry(cq, seqno);
+ polled_value = get_psif_cq_entry__seq_num(cqe);
+
+ if (seqno != polled_value)
+ break;
+
+ if (get_psif_cq_entry__qp(cqe) != qp->qp_idx)
+ continue;
+
+ copy_conv_to_sw(&lcqe, cqe, sizeof(lcqe));
+
+ if (!(lcqe.opcode & IB_WC_RECV)) {
+ last_seq = lcqe.wc_id.sq_id.sq_seq_num;
+ sif_log(sdev, SIF_WCE_V, "last_seq %x updated_seq %x lcqe.seq_num %x",
+ last_seq, updated_seq, lcqe.seq_num);
+ if (last_seq != updated_seq) {
+ lcqe.wc_id.sq_id.sq_seq_num = updated_seq;
+ if (GREATER_16(updated_seq, end)) {
+ /* A scenario might be that an additional CQE
+ * must be generated to flush all the HW
+ * generated completions. Thus, ignore this
+ * cqe when polling.
+ */
+ lcqe.seq_num = ~lcqe.seq_num;
+ sif_log(sdev, SIF_WCE_V, "corrupt: lcqe.seq_num %x",
+ lcqe.seq_num);
+ set_bit(CQ_POLLING_IGNORED_SEQ, &cq_sw->flags);
+ }
+ copy_conv_to_hw(cqe, &lcqe, sizeof(lcqe));
+ }
+ if (!GREATER_16(updated_seq, end))
+ updated_seq++;
+ ++n;
+ }
+ }
+ sif_log(sdev, SIF_WCE_V, "sq/cq %d/%d: %d entries not being pulled yet",
+ sq->index, cq->index, n);
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+ return updated_seq;
+}
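+
+/* Example: with head = head_seq + 1 = 5 and end = last_seq = 7,
+ * walk_and_update_cqes() walks the send completions it finds for this QP
+ * and ensures they carry sq_seq 5, 6 and 7 in order, marks any further
+ * ones to be ignored by polling, and returns 8, which
+ * post_process_wa4074() stores as trusted_seq.
+ */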
+
+/* Walk the CQ and return the last completed sq_seq */
+static u16 cq_walk_wa4074(struct sif_dev *sdev, struct sif_qp *qp, bool *last_seq_set)
+{
+ struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx);
+ struct sif_cq *cq = (sq && sq->cq_idx >= 0) ? get_sif_cq(sdev, sq->cq_idx) : NULL;
+ struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index);
+ volatile struct psif_cq_entry *cqe;
+ u32 seqno, polled_value;
+ unsigned long flags = 0;
+ u16 last_seq = 0, prev_seq = 0;
+ bool prev_seq_set = false;
+ int n = 0;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ for (seqno = cq_sw->next_seq;; ++seqno) {
+ struct psif_cq_entry lcqe;
+
+ cqe = get_cq_entry(cq, seqno);
+ polled_value = get_psif_cq_entry__seq_num(cqe);
+
+ if (seqno != polled_value)
+ break;
+
+ if (get_psif_cq_entry__qp(cqe) != qp->qp_idx)
+ continue;
+
+ copy_conv_to_sw(&lcqe, cqe, sizeof(lcqe));
+
+ if (!(lcqe.opcode & IB_WC_RECV)) {
+ last_seq = lcqe.wc_id.sq_id.sq_seq_num;
+
+ if (!(*last_seq_set))
+ *last_seq_set = true;
+
+ if (unlikely(prev_seq_set && prev_seq >= last_seq))
+ sif_log(sdev, SIF_INFO_V,
+ "sq/cq %d/%d: prev sq_seq (0x%x) >= curr sq_seq (0x%x)",
+ sq->index, cq->index, prev_seq, last_seq);
+
+ prev_seq = last_seq;
+ if (!(prev_seq_set))
+ prev_seq_set = true;
+ n++;
+ }
+ }
+ sif_log(sdev, SIF_WCE_V, "sq/cq %d/%d: %d entries not being pulled yet",
+ sq->index, cq->index, n);
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+ return last_seq;
+}
--- /dev/null
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_r3.h: Special handling specific for psif revision 3 and earlier
+ */
+
+#ifndef _SIF_R3_H
+#define _SIF_R3_H
+
+int sif_r3_init(struct sif_dev *sdev);
+void sif_r3_deinit(struct sif_dev *sdev);
+
+/* WA for #3713 */
+int reset_qp_flush_retry(struct sif_dev *sdev);
+void sif_r3_recreate_flush_qp(struct sif_dev *sdev);
+
+/* WA for #4074 */
+int pre_process_wa4074(struct sif_dev *sdev, struct sif_qp *qp);
+int post_process_wa4074(struct sif_dev *sdev, struct sif_qp *qp);
+int sq_flush_wa4074(struct sif_dev *sdev, struct sif_qp *qp);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_rq.c: Implementation of sif receive queues
+ */
+
+#include <rdma/ib_verbs.h>
+#include "sif_dev.h"
+#include "psif_hw_data.h"
+#include "psif_hw_setget.h"
+#include "sif_dma.h"
+#include "sif_rq.h"
+#include "sif_xrc.h"
+#include "sif_base.h"
+#include "sif_defs.h"
+#include <linux/seq_file.h>
+
+int poll_wait_for_rq_writeback(struct sif_dev *sdev, struct sif_rq *rq)
+{
+ unsigned long timeout = sdev->min_resp_ticks;
+ unsigned long timeout_real = jiffies + timeout;
+ u8 valid;
+
+ sif_log(sdev, SIF_RQ, "enter rq %d", rq->index);
+ do {
+ /* Make sure the update from hw is observed in correct order */
+ smp_rmb();
+ valid = get_psif_rq_hw__valid(&rq->d);
+
+ if (!valid)
+ break;
+
+ if (time_is_after_jiffies(timeout_real))
+ cpu_relax();
+ else {
+ sif_log(sdev, SIF_INFO,
+ "Timeout waiting for write back for RQ %d - still valid",
+ rq->index);
+ return -ETIMEDOUT;
+ }
+ } while (true);
+
+ sif_log(sdev, SIF_RQ, "exit - write-back observed on rq %d", rq->index);
+ return 0;
+}
+
+int alloc_rq(struct sif_dev *sdev, struct sif_pd *pd,
+ u32 entries, u32 sg_entries,
+ struct ib_srq_init_attr *srq_init_attr,
+ bool user_mode)
+{
+ int ret = 0;
+ bool mark_dirty = false;
+ /* Access to receive queue descriptor elements */
+ struct sif_rq *rq;
+ struct sif_rq_sw *rq_sw;
+ volatile struct psif_rq_hw *rq_hw_p;
+ struct psif_rq_sw lrq_sw;
+ struct psif_xrq_hw lrq_hw;
+ int extent_log2;
+ struct psif_rq_entry rqe; /* Receive queue element for size calc only */
+ u32 max_entries;
+ u32 entries_log2;
+ int rq_idx;
+ u64 alloc_sz;
+
+ max_entries = roundup_pow_of_two(entries);
+ entries_log2 = order_base_2(max_entries);
+
+ /* Meaningless with 0 sge */
+ if (!sg_entries)
+ sg_entries = 1;
+ if (sg_entries > 16) {
+ sif_log(sdev, SIF_INFO,
+ "requested %d but sif only supports 16 receive sg entries",
+ sg_entries);
+ return -ENOMEM;
+ }
+
+ /* Max supported number of RQ WRs is 2^14 - 1 */
+ if (entries > 0x3fff) {
+ sif_log(sdev, SIF_INFO,
+ "requested %d entries, but sif only supports %d",
+ entries, 0x3fff);
+ return -ENFILE; /* 4 bit size_log2 field in rqs but highest value not supported (#2965) */
+ }
+
+ rq_idx = sif_alloc_rq_hw_idx(pd);
+
+ if (rq_idx < 0) {
+ sif_log(sdev, SIF_INFO,
+ "unable to allocate a receive queue, consider increasing rq_size");
+ ret = -ENOMEM;
+ return ret;
+ }
+ rq = get_sif_rq(sdev, rq_idx);
+
+ /* Make sure the RQ is software owned: */
+ ret = poll_wait_for_rq_writeback(sdev, rq);
+ if (ret) {
+ mark_dirty = true;
+ goto err_alloc;
+ }
+ rq->index = rq_idx;
+ rq->pd = pd;
+
+ rq_hw_p = &rq->d;
+ rq_sw = get_sif_rq_sw(sdev, rq_idx);
+
+ /* Initialize driver/user space state within sw extent */
+ atomic_set(&rq_sw->length, 0);
+ rq_sw->next_seq = 0;
+
+ rq->entries = max_entries;
+ /* Ref. #2965 */
+ rq->entries_user = (entries_log2 == 0xe ? max_entries - 1 : max_entries);
+ rq->mask = max_entries - 1;
+ rq->extent =
+ roundup_pow_of_two(sizeof(rqe.rqe_id)
+ + sizeof(struct psif_rq_scatter) * sg_entries);
+
+ /* Now recalculate sge space from the extent to offer any extra room "for free" */
+ sg_entries = min((rq->extent - sizeof(rqe.rqe_id)) / sizeof(struct psif_rq_scatter), 16UL);
+ extent_log2 = order_base_2(rq->extent);
+ alloc_sz = max_entries * rq->extent;
+
+ /* Only whole pages must be exposed to user space */
+ if (user_mode && (alloc_sz & ~PAGE_MASK))
+ alloc_sz = (alloc_sz + PAGE_SIZE) & PAGE_MASK;
+ rq->user_mode = user_mode;
+
+ sif_log(sdev, SIF_QP, "RQ:sw 0x%p, hw 0x%p entries %d index %d extent %d max sge %d",
+ rq_sw, rq_hw_p, rq->entries, rq_idx, rq->extent, sg_entries);
+
+ if (alloc_sz <= SIF_MAX_CONT)
+ rq->mem = sif_mem_create_dmacont(sdev, alloc_sz, GFP_KERNEL | __GFP_ZERO, DMA_BIDIRECTIONAL);
+ else
+ rq->mem = sif_mem_create(sdev, alloc_sz >> PMD_SHIFT,
+ alloc_sz, SIFMT_2M, GFP_KERNEL | __GFP_ZERO, DMA_BIDIRECTIONAL);
+ if (!rq->mem) {
+ sif_log(sdev, SIF_INFO, "Failed RQ buffer pool allocation!");
+ ret = -ENOMEM;
+ goto err_alloc;
+ }
+
+ rq->sg_entries = sg_entries;
+ atomic_set(&rq->refcnt, 1);
+
+ /* Initialize hw part of descriptor */
+ memset(&lrq_hw, 0, sizeof(lrq_hw));
+
+ /* For normal RQs we use the valid bit as follows:
+ *
+ * - If the QP is in RESET state, the RQ is invalid.
+ * - The RQ is set to valid as part of transitioning to INIT.
+ * - The RQ is still valid when the QP is in ERROR state
+ * - A modify to RESET resets the valid bit again.
+ */
+
+ lrq_hw.size_log2 = entries_log2;
+ lrq_hw.prefetch_threshold_log2 = 1;
+
+ /* scatter = 0 means a single entry etc. */
+ lrq_hw.scatter = rq->sg_entries - 1;
+ lrq_hw.pd = pd->idx;
+
+ lrq_hw.head_indx = 0;
+ lrq_hw.base_addr = sif_mem_dma(rq->mem, 0);
+ lrq_hw.extent_log2 = extent_log2;
+
+ /* Allocate mmu context without wr_access set */
+ ret = sif_map_ctx(sdev, &rq->mmu_ctx, rq->mem, lrq_hw.base_addr,
+ alloc_sz, false);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "Failed to set mmu context for rq %d",
+ rq->index);
+ goto err_map_ctx;
+ }
+
+ if (srq_init_attr) {
+ /* Request for an SRQ */
+ lrq_hw.valid = 1; /* SRQs are valid for their entire lifetime */
+ lrq_hw.srq = 1;
+ lrq_hw.srq_lim = srq_init_attr->attr.srq_limit;
+ rq->is_srq = true;
+
+ if (srq_init_attr->srq_type == IB_SRQT_XRC) {
+ struct sif_cq *cq = to_scq(srq_init_attr->ext.xrc.cq);
+ struct sif_xrcd *xrcd = to_sxrcd(srq_init_attr->ext.xrc.xrcd);
+ ulong flags;
+
+ rq->cq_idx = cq->index;
+ rq->xrc_domain = lrq_hw.xrc_domain = xrcd->index;
+ lrq_hw.cqd_id = rq->cq_idx;
+ spin_lock_irqsave(&cq->lock, flags);
+ /* We only allow a CQ to be used for one single XSRQ.
+ * This is a violation of the IB standard but one
+ * that probably should not have practical consequences.
+ * See #3521 for details:
+ */
+ if (cq->xsrq) {
+ sif_log(sdev, SIF_INFO,
+ "xsrq %d: cq %d already used with xsrq %d - please use another cq for this xsrq",
+ rq->index, cq->index, cq->xsrq->index);
+ ret = -EBUSY;
+ } else
+ cq->xsrq = rq;
+ spin_unlock_irqrestore(&cq->lock, flags);
+ if (ret)
+ goto err_map_ctx;
+ }
+ }
+
+ /* Get the hw mmu context populated by sif_map_ctx */
+ lrq_hw.mmu_cntx = rq->mmu_ctx.mctx;
+
+ /* Write network byte order hw copy */
+ copy_conv_to_hw(rq_hw_p, &lrq_hw, sizeof(lrq_hw));
+
+ /* Initialize sw part of descriptor */
+ memset(&lrq_sw, 0, sizeof(lrq_sw));
+ lrq_sw.tail_indx = rq_sw->next_seq;
+
+ copy_conv_to_hw(&rq_sw->d, &lrq_sw, sizeof(lrq_sw));
+
+ spin_lock_init(&rq->lock);
+
+ return rq_idx;
+
+err_map_ctx:
+ sif_mem_free(rq->mem);
+err_alloc:
+ if (!mark_dirty)
+ sif_free_rq_hw_idx(pd, rq_idx);
+ return ret;
+}
+
+
+/* Invalidate the RQ cache and flush a desired amount of
+ * the remaining entries in the given receive queue.
+ * @target_qp indicates the value of the local_qp field in the generated
+ * completion. The qp itself would already have been modified to RESET
+ * to avoid any more traffic.
+ *
+ * Workaround #622: PSIF doesn't generate "FLUSHED IN ERROR" completions.
+ * In order to maintain OFED verbs-programming and IB spec. compatibility,
+ * RQEs need to be "flushed in error" when
+ * - the verbs layer modifies the QP to error
+ * - hardware sends an async event after setting the QP in error
+ * - a poll CQ by an IB client (kernel/user) receives an error completion
+ * (responder class A & C) with the QP set to error
+ * - more WQEs are posted by an IB client (kernel/user) while the QP is in error
+ * - the QP is destroyed
+ *
+ * Note: No locking of the RQ is necessary as there are multiple trigger points
+ * for flushing RQEs within the OFED verbs model.
+ */
+int sif_flush_rq(struct sif_dev *sdev, struct sif_rq *rq, struct sif_qp *target_qp,
+ int max_flushed_in_err)
+{
+ int len, real_len;
+ struct sif_rq_sw *rq_sw = get_sif_rq_sw(sdev, rq->index);
+ int ret = 0;
+ u32 head, tail;
+ enum sif_mqp_type mqp_type = SIF_MQP_SW;
+ DECLARE_SIF_CQE_POLL(sdev, lcqe);
+
+ /* If a flush of this RQ is already in progress, just set FLUSH_RQ_IN_FLIGHT. */
+ if (test_bit(FLUSH_RQ_IN_PROGRESS, &rq_sw->flags)) {
+ set_bit(FLUSH_RQ_IN_FLIGHT, &rq_sw->flags);
+ return ret;
+ }
+
+ /* If we lost the race to start the RQ flush,
+ * set FLUSH_RQ_IN_FLIGHT and let the other party do the job.
+ */
+ if (test_and_set_bit(FLUSH_RQ_IN_PROGRESS, &rq_sw->flags)) {
+ set_bit(FLUSH_RQ_IN_FLIGHT, &rq_sw->flags);
+ return ret;
+ }
+
+ if (!sif_feature(disable_rq_flush))
+ len = min(max_flushed_in_err, atomic_read(&rq_sw->length));
+ else
+ len = 0;
+ if (len == 0)
+ goto error;
+
+ sif_log(sdev, SIF_INFO_V, "flushing %d entries out of %d/%d entries remaining",
+ len, atomic_read(&rq_sw->length), rq->entries);
+
+ /* Workaround #622 v2 step 1: ModifyQP to RESET
+ * The QP must be in the RESET state to avoid a race condition.
+ * sif_flush_rq will only be called when the QP is
+ * in ERROR state. For now, keep the same coding style and
+ * check whether the qp flag SIF_QPF_HW_OWNED is clear.
+ * If it is clear, it means that the QP is in the shadowed
+ * software error state (the actual hw state is RESET).
+ *
+ * TBD - Should we add a new PSIF_QP_STATE_SHADOWED_ERROR state?
+ * At least to me that would be more readable.
+ */
+ mutex_lock(&target_qp->lock);
+ /* qp lock must be held to make sure no other thread is trying to do modify_qp_hw to RESET */
+ mqp_type = sif_modify_qp_is_ok(target_qp, target_qp->last_set_state, IB_QPS_RESET, IB_QP_STATE);
+
+ if (mqp_type == SIF_MQP_HW) {
+ struct ib_qp_attr attr = {
+ .qp_state = IB_QPS_ERR
+ };
+
+ ret = modify_qp_hw_wa_qp_retry(sdev, target_qp, &attr, IB_QP_STATE);
+
+ if (ret)
+ sif_log(sdev, SIF_INFO, "qp %d RESET failed, ret %d",
+ target_qp->qp_idx, ret);
+
+ }
+ mutex_unlock(&target_qp->lock);
+
+ /* Workaround #622 v2 step 2: Invalidate RQ
+ * Invalidation of an RQ causes PSIF to flush its caches for that RQ.
+ * If PSIF finds the RQ invalid, it will attempt to fetch it.
+ * It is then required to be valid, otherwise it will be interpreted as an error
+ * by PSIF (see #2134). So software cannot rely upon the completion of the invalidate
+ * to signal that the descriptor can be re-used; instead it will have to
+ * verify by checking the final write-back of the descriptor, which will have
+ * valid set to 0 by PSIF. In the general case we handle this lazily and check before we
+ * try to re-use. The request is posted with no completion requested as we
+ * do not need the completion:
+ */
+ if (!(test_bit(RQ_IS_INVALIDATED, &rq_sw->flags))) {
+ ret = sif_invalidate_rq_hw(sdev, rq->index, PCM_POST);
+ if (ret) {
+ sif_log(sdev, SIF_INFO,
+ "Invalidate rq_hw failed, status %d", ret);
+ goto error;
+ }
+ set_bit(RQ_IS_INVALIDATED, &rq_sw->flags);
+ }
+
+ /* Make sure the RQ is software owned: */
+ ret = poll_wait_for_rq_writeback(sdev, rq);
+ if (ret)
+ goto error;
+
+ /* The RQ is now software owned (after a successful invalidate), so we
+ * should be able to trust rq_hw::head_indx - better than scanning the CQ
+ * for unprocessed elements.
+ * Note that only the lowest 14 bits of the sequence number in head_indx are
+ * valid:
+ */
+flush_rq_again:
+ head = get_psif_rq_hw__head_indx(&rq->d);
+ tail = rq_sw->next_seq;
+ real_len = rq_length(rq, head, tail & ((1 << 14) - 1)) & ((1 << 14) - 1);
+
+ /* Workaround #622 v2 step 3: Check the last completion on the CQ
+ * The rq_sw->length is used to track the length of the queue
+ * as #posted - #completed. If the calculated real_len is
+ * smaller than len, it means that a completion is missing.
+ * Instead of looping through the RQ to find the RQE of the completed wc_id, the
+ * rq_sw->length represents #posted - #completed, and nfixup
+ * represents the remaining completions after the QP moved to RESET.
+ * Thus, the number of flushed-in-error completions that must be generated is
+ * rq_sw->length - nfixup.
+ */
+ if (!(test_bit(FLUSH_RQ_FIRST_TIME, &rq_sw->flags))) {
+ /* We need a flag to differentiate between the first call of
+ * sif_flush_rq and subsequent calls. The race condition where
+ * HW acquires an RWQE but does not generate a completion can
+ * only happen at the first call of sif_flush_rq. This is because
+ * the QP state is moved to RESET.
+ * Besides, if the generated completion arrives later and
+ * FLUSH_RQ_IN_FLIGHT is set, the test real_len < len
+ * might be true.
+ */
+ len = atomic_read(&rq_sw->length);
+ if (real_len < len) {
+ int nfixup;
+ u32 cq_idx = get_psif_qp_core__rcv_cq_indx(&target_qp->d.state);
+ struct sif_cq *cq = rq ? get_sif_cq(sdev, cq_idx) : NULL;
+
+ nfixup = sif_fixup_cqes(cq, NULL, target_qp);
+ sif_log(sdev, SIF_RQ,
+ "RQ %d: updating calculated entries from %d to %d - %d (%d)",
+ rq->index, real_len, len, nfixup, len - nfixup);
+ real_len = len - nfixup;
+ }
+ set_bit(FLUSH_RQ_FIRST_TIME, &rq_sw->flags);
+ }
+
+ /* Now find the actual 32 bit seq.no */
+ head = tail - real_len;
+
+ sif_log(sdev, SIF_RQ,
+ "RQ %d not empty: sz %d, head %d, next_seq %d, %d/%d entries at exit",
+ rq->index, rq->entries, head, tail, len, real_len);
+
+ if (!real_len)
+ goto error;
+
+ /* Workaround #622 v2 step 4: generate flush in error completion
+ * Generate flushed in error completions:
+ * these give no pqp completions but may in theory fail
+ */
+ while (real_len > 0) {
+ sif_log(sdev, SIF_PQP, "rq %d, len %d", rq->index, real_len);
+ ret = sif_gen_rq_flush_cqe(sdev, rq, head, target_qp);
+ if (ret)
+ sif_log(sdev, SIF_INFO, "rq %d, len %d, sif_gen_rq_flush_cqe returned %d",
+ rq->index, real_len, ret);
+ if (ret == -EAGAIN) {
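+ /* -EAGAIN from the PQP: post a sync completion, wait for it to complete, then retry this entry */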
+ ret = gen_pqp_cqe(&lcqe);
+ if (ret < 0)
+ goto error;
+ ret = poll_cq_waitfor(&lcqe);
+ if (ret < 0)
+ goto error;
+ lcqe.written = false;
+ continue;
+ }
+ if (ret < 0)
+ goto error;
+ real_len--;
+ head++;
+ }
+
+ /* Finally generate a sync.completion for us on the PQP itself
+ * to allow us to wait for the whole operation to complete:
+ */
+ ret = gen_pqp_cqe(&lcqe);
+ if (ret < 0) {
+ sif_log(sdev, SIF_INFO, "rq %d, cqe %p gen_pqp_cqe returned %d",
+ rq->index, &lcqe, ret);
+ goto error;
+ }
+
+ ret = poll_cq_waitfor(&lcqe);
+ if (ret < 0) {
+ sif_log(sdev, SIF_INFO, "rq %d, cqe %p poll_cq_waitfor returned %d",
+ rq->index, &lcqe, ret);
+ goto error;
+ }
+
+ sif_log(sdev, SIF_INFO_V, "RQ %d: received completion on cq %d seq 0x%x - done",
+ rq->index, rq->cq_idx, lcqe.cqe.seq_num);
+
+ /* Make sure hardware pointer reflects the flushed situation */
+ set_psif_rq_hw__head_indx(&rq->d, head);
+ wmb();
+
+ /* If FLUSH_RQ_IN_FLIGHT is set, another party is trying to
+ * flush the rq at the same time. The flush should be retried
+ * once, as no more than one asynchronous event will be generated while the
+ * QP is in ERROR state. This takes care of the scenario where the
+ * QP is modified to ERROR explicitly and the asynchronous event is received
+ * at the same time. Nevertheless, the RQ entries may change between
+ * these two events, which can trigger another rq flush.
+ */
+ if (test_and_clear_bit(FLUSH_RQ_IN_FLIGHT, &rq_sw->flags))
+ goto flush_rq_again;
+
+error:
+ clear_bit(FLUSH_RQ_IN_PROGRESS, &rq_sw->flags);
+ return ret > 0 ? 0 : ret;
+}
+
+
+int free_rq(struct sif_dev *sdev, int rq_idx)
+{
+ struct sif_rq *rq;
+ int stat;
+
+ rq = get_sif_rq(sdev, rq_idx);
+ sif_log(sdev, SIF_RQ, "entry %d", rq_idx);
+
+ stat = atomic_dec_and_test(&rq->refcnt);
+ if (!stat) {
+ sif_log(sdev, SIF_RQ, "rq %d still in use - ref.cnt %d",
+ rq_idx, atomic_read(&rq->refcnt));
+ return -EBUSY;
+ }
+
+ sif_release_rq(sdev, rq->index);
+ return 0;
+}
+
+
+void sif_release_rq(struct sif_dev *sdev, int index)
+{
+ struct sif_rq *rq = get_sif_rq(sdev, index);
+ struct sif_pd *pd = rq->pd;
+
+ if (!pd) {
+ sif_log(sdev, SIF_INFO, "Internal error: no pd associated with rq %d", index);
+ return;
+ }
+
+ sif_unmap_ctx(sdev, &rq->mmu_ctx);
+
+ sif_mem_free(rq->mem);
+ sif_clear_rq_sw(sdev, index);
+
+ if (!sif_feature(disable_invalidate_rq))
+ sif_free_rq_hw_idx(pd, index);
+}
+
+void sif_dfs_print_rq_hw(struct seq_file *s, struct sif_dev *sdev, loff_t pos)
+{
+ struct sif_rq *rq;
+ struct sif_rq_sw *rq_sw;
+ volatile struct psif_rq_hw *rq_hw;
+ u32 tail, head;
+ int qlen;
+
+ if (unlikely(pos < 0)) {
+ seq_puts(s, "# Index head sw_tail entries queue_len nmbr_sge next_seq srq_lim\n");
+ return;
+ }
+ rq = get_sif_rq(sdev, pos);
+ rq_hw = &rq->d;
+ rq_sw = get_sif_rq_sw(sdev, pos);
+
+ head = get_psif_rq_hw__head_indx(rq_hw);
+ tail = get_psif_rq_sw__tail_indx(&rq_sw->d);
+ qlen = atomic_read(&rq_sw->length);
+
+ seq_printf(s, "%7llu %5u %8u %8u %9u %8u %8u %7u", pos,
+ head, tail, rq->entries, qlen, rq->sg_entries, rq_sw->next_seq, rq->srq_limit);
+ if (rq->is_srq && rq->xrc_domain)
+ seq_puts(s, "\t[XRC-SRQ]\n");
+ else if (rq->is_srq)
+ seq_puts(s, "\t[SRQ]\n");
+ else
+ seq_puts(s, "\n");
+}
--- /dev/null
+/*
+ * Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_rq.h: Interface to sif receive queues
+ */
+
+#ifndef _SIF_RQ_H
+#define _SIF_RQ_H
+
+struct sif_rq {
+ volatile struct psif_rq_hw d; /* Hardware descriptor */
+ struct ib_srq ibsrq ____cacheline_internodealigned_in_smp; /* Only used if this is an SRQ */
+ spinlock_t lock ____cacheline_internodealigned_in_smp;
+ struct sif_mmu_ctx mmu_ctx;
+ struct sif_pd *pd; /* Ref to owning protection domain */
+ int index;
+ int cq_idx; /* Default compl.queue index to use, if any */
+ bool user_mode; /* Set if this is an RQ to be mapped to user space */
+ bool is_srq; /* Set if this is a shared receive queue */
+ int xrc_domain; /* If != 0: This is an XRC SRQ member of this domain idx */
+ atomic_t refcnt; /* Ref.count for usage as a shared receive queue */
+ u16 entries; /* Allocated entries */
+ u16 entries_user; /* Entries reported to user (entries -1 if max) */
+ u32 sg_entries; /* Max receive scatter/gather configured for this rq */
+ u32 mask; /* entries - 1 for modulo using & */
+ u32 extent;
+ u16 srq_limit;
+ struct sif_mem *mem; /* Allocated queue memory */
+};
+
+static inline struct sif_rq *to_srq(struct ib_srq *ibsrq)
+{
+ return container_of(ibsrq, struct sif_rq, ibsrq);
+}
+
+int poll_wait_for_rq_writeback(struct sif_dev *sdev, struct sif_rq *rq);
+
+/* Allocate a receive queue - if @srq_init_attr is non-NULL
+ * this is a shared receive queue (SRQ)
+ * A return value >= 0 is the index of the receive queue descriptor allocated
+ * otherwise it is -errno
+ */
+int alloc_rq(struct sif_dev *sdev, struct sif_pd *pd,
+ u32 entries, u32 sge_entries,
+ struct ib_srq_init_attr *srq_init_attr,
+ bool user_mode);
+
+/* Invalidate the RQ cache and flush a desired amount of
+ * the remaining entries in the given receive queue.
+ * @target_qp indicates the value of the local_qp field in the generated
+ * completion but is not interpreted by SIF in any way.
+ */
+int sif_flush_rq(struct sif_dev *sdev, struct sif_rq *rq,
+ struct sif_qp *target_qp, int max_flushed_in_err);
+
+int free_rq(struct sif_dev *sdev, int rq_idx);
+
+/* Low level callbacks to release memory for these queues
+ * Called from sif_hiw::handle_invalidate_wc
+ */
+void sif_release_rq(struct sif_dev *sdev, int index);
+
+void sif_dfs_print_rq_hw(struct seq_file *s, struct sif_dev *sdev, loff_t pos);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_sndrcv.c: Implementation of post send/recv logic for SIF
+ */
+#include <linux/sched.h>
+#include <net/checksum.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+#include "sif_dev.h"
+#include "sif_query.h"
+#include "sif_defs.h"
+#include "sif_base.h"
+#include "sif_sndrcv.h"
+#include "sif_qp.h"
+#include "sif_mr.h"
+#include "sif_tqp.h"
+#include "sif_r3.h"
+#include "psif_hw_setget.h"
+#include "sif_checksum.h"
+#include <linux/kgdb.h>
+
+
+/* Handle a NULL terminated array of send work requests */
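+/* The SQ is considered to have an active SQS while the hw sq_next field differs from ~0u */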
+#define SQS_ACTIVE (get_psif_sq_hw__sq_next(&sq->d) != 0xFFFFFFFF)
+int sif_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct sif_dev *sdev = to_sdev(ibqp->device);
+ struct sif_qp *qp = to_sqp(ibqp);
+ struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx);
+ struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx);
+ unsigned long flags;
+ bool doorbell_mode;
+ bool last;
+ u16 first_seq;
+ const int nmbr_wrs_to_bulk_process = 32;
+ int ret = 0;
+ int n;
+
+ sif_log(sdev, SIF_SND, "on qp_idx %d wr 0x%p ibv type %d",
+ qp->qp_idx, wr, wr->opcode);
+
+ if (unlikely(qp->type > PSIF_QP_TRANSPORT_MANSP2)) {
+ sif_log(sdev, SIF_INFO, "Invalid QP type");
+ ret = -EINVAL;
+ goto err_post_send_unlocked;
+ }
+
+ if (unlikely(is_epsa_tunneling_qp(ibqp->qp_type))) {
+ sif_log(sdev, SIF_QP, "epsa tunneling post_send");
+ return sif_epsa_tunneling_post_send(ibqp, wr, bad_wr);
+ }
+
+ /* PSIF does not support SQD. Per IBTA 11.4.1.1, error is only returned
+ * when the QP is in the RESET, INIT or RTR states.
+ */
+ if (unlikely(qp->last_set_state < IB_QPS_RTS)) {
+ sif_log(sdev, SIF_INFO, "Invalid QP state - expected RTS(%d) found %d!",
+ (int)IB_QPS_RTS, qp->last_set_state);
+ ret = -EINVAL;
+ goto err_post_send_unlocked;
+ }
+
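+	/* Post the WR list in bulks of up to nmbr_wrs_to_bulk_process entries, holding sq->lock for each bulk */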
+ while (wr) {
+ /* Workaround #3595: ring doorbell if SQS active */
+ doorbell_mode = qp->flags & SIF_QPF_FORCE_SQ_MODE || SQS_ACTIVE;
+
+ /* Sends on the same send queue must be serialized,
+ * so keep sq->lock around it all
+ */
+ spin_lock_irqsave(&sq->lock, flags);
+ first_seq = sq_sw->last_seq + 1;
+ for (n = 0; wr && n < nmbr_wrs_to_bulk_process; ++n, wr = wr->next) {
+ last = !wr->next || n == (nmbr_wrs_to_bulk_process - 1);
+ ret = sif_post_send_single(ibqp, wr, &doorbell_mode, last, &first_seq);
+ if (ret < 0)
+ goto err_post_send;
+ }
+ spin_unlock_irqrestore(&sq->lock, flags);
+ }
+
+ if ((qp->type != PSIF_QP_TRANSPORT_MANSP1)
+ && (qp->last_set_state == IB_QPS_ERR)) {
+ ret = 0;
+ goto flush_sq_wa4074;
+ }
+
+
+ sif_log(sdev, SIF_SND, "Exit: success");
+ return 0;
+
+err_post_send:
+ spin_unlock_irqrestore(&sq->lock, flags);
+
+err_post_send_unlocked:
+ *bad_wr = wr;
+
+flush_sq_wa4074:
+ if ((qp->type != PSIF_QP_TRANSPORT_MANSP1)
+ && (qp->last_set_state == IB_QPS_ERR)) {
+ if (post_process_wa4074(sdev, qp))
+ sif_log(sdev, SIF_INFO, "failed to flush SQ %d", qp->qp_idx);
+ }
+
+ sif_log(sdev, SIF_SND, "Exit: error %d", ret);
+ return ret;
+
+}
+#undef SQS_ACTIVE
+
+
+/* The copy_from_user function on x86_64 calls might_fault() to verify that
+ * it is not called from interrupt context. However with our use case the memory is guaranteed
+ * to be pinned, so no faults will ever happen.
+ *
+ * TBD: Sparc does not define _copy_from_user - just use copy_from_user for now
+ */
+inline unsigned long sif_copy_from_user(void *to, const void __user *from, unsigned int n)
+{
+#ifdef __x86_64__
+ return _copy_from_user(to, from, n);
+#else
+ return copy_from_user(to, from, n);
+#endif
+}
+
+
+static int copy_sg(struct sif_qp *qp, void *dest, u64 vaddr, u32 len)
+{
+ struct sif_dev *sdev = to_sdev(qp->ibqp.device);
+
+ if (qp->ibqp.uobject) {
+ unsigned long not_copied;
+
+ sif_log(sdev, SIF_SND, "Copy sg len %d from user addr 0x%llx to %p",
+ len, vaddr, dest);
+ not_copied = sif_copy_from_user(dest, (void __user *)vaddr, len);
+ if (not_copied) {
+ sif_log(sdev, SIF_INFO,
+ "copy_from_user: Failed to copy %ld/%d bytes from uaddr %llx",
+ not_copied, len, vaddr);
+ return -EFAULT;
+ }
+ } else {
+ sif_log(sdev, SIF_SND, "Copy sge len %d from kernel addr 0x%llx to %p",
+ len, vaddr, dest);
+ memcpy(dest, (void *)vaddr, len);
+ }
+ return 0;
+}
+
+
+/* Copy the first @sg_cnt sg entries of @wr into the inline space
+ */
+
+/* TBD: Consider cleaning up/unrolling this into one copy
+ * into temp buffer for csumming/cb copy_convert
+ * and one other plain copy into send queue:
+ */
+static int prep_inline_part(struct sif_qp *qp, struct ib_send_wr *wr, int sg_cnt,
+ struct psif_cb *wqe, struct psif_wr_local *la, u32 sqe_seq,
+ bool is_phys_addr)
+{
+ int ret;
+ int wr_len = 0;
+ struct sif_sq *sq;
+ struct psif_sq_entry *sqe;
+ struct psif_key *key;
+
+ /* The collect buffer only supports 256 bytes of inline data; this first part
+ * of the inline data must be handled in host byte order to
+ * make sure the checksum comes out right:
+ */
+ int cb_len = min_t(int, ((qp->max_inline_data + CB_KICK_MASK) & ~CB_KICK_MASK), CB_LENGTH);
+ int space = qp->max_inline_data;
+ int copy = 0;
+ int remaining = -1;
+ int i;
+ u32 len = 0;
+ u64 addr = 0;
+ struct sif_dev *sdev = to_sdev(qp->ibqp.device);
+
+ u8 buf[CB_LENGTH];
+ u8 *dbuf = buf;
+
+ if (wr->send_flags & IB_SEND_IP_CSUM) {
+ /* Cannot use collect-buffer for inline data when offloading */
+ cb_len = 0;
+ }
+
+ sq = get_sif_sq(sdev, qp->qp_idx);
+ sqe = get_sq_entry(sq, sqe_seq);
+
+ sif_log(sdev, SIF_SND, "inline from %d sges, buf at %p sqe at %p", sg_cnt, buf, sqe);
+
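+	/* Copy into the local collect buffer first; once cb_len bytes have been consumed,
+	 * continue copying directly into the SQ entry payload:
+	 */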
+ for (i = 0; i < sg_cnt; ++i) {
+ if (unlikely(remaining >= 0)) {
+ /* Switch to copying directly into send queue
+ * @copy already holds the offset
+ */
+ dbuf = ((u8 *)sqe->payload);
+ if (remaining > 0) {
+ addr += len;
+ len = remaining;
+ remaining = -1;
+ goto do_copy;
+ } else
+ remaining = -1;
+ }
+ len = wr->sg_list[i].length;
+ addr = wr->sg_list[i].addr;
+
+ if (len > 0) {
+ u32 lkey = wr->sg_list[i].lkey;
+
+ key = safe_get_key(sdev, lkey);
+ if (!key || PSIF_DMA_KEY_INVALID == get_psif_key__lkey_state(key)) {
+ sif_log(sdev, SIF_INFO,
+ "Attempt to do inline copying from an invalid MR with lkey %d at addr 0x%llx",
+ lkey, addr);
+ return -EPERM;
+ }
+ }
+
+do_copy:
+ wr_len += len;
+ if (unlikely(dbuf == buf && wr_len >= cb_len)) {
+ remaining = wr_len - cb_len;
+ len -= remaining;
+ wr_len -= remaining;
+ if (remaining)
+ i--; /* Run an extra iter to copy remainder */
+ } else if (unlikely(copy + len > space)) {
+ sif_log(sdev, SIF_INFO,
+ "Inline space exhausted: available %d, copied %d, len %d",
+ space, copy, len);
+ return -ENOMEM;
+ }
+ if (is_phys_addr) {
+ u64 *kva = phys_to_virt(addr);
+
+ sif_log(sdev, SIF_SND,
+ "Phys-addr %llx -> %llx copy %d len %d",
+ addr, (u64)kva, copy, len);
+ memcpy((void *)&dbuf[copy], (void *)kva, len);
+ ret = 0;
+ } else {
+ ret = copy_sg(qp, &dbuf[copy], addr, len);
+ }
+ if (ret < 0)
+ return ret;
+ copy += len;
+ }
+
+ if (buf == dbuf && copy & CB_KICK_MASK) {
+ /* Pad out the misaligned end data */
+ memset(&buf[copy], 0, CB_KICK_ALIGN - (copy & CB_KICK_MASK));
+ }
+
+ sif_log(sdev, SIF_QP, "wr_len is %d bytes, cb_len %d bytes", wr_len, cb_len);
+ if (cb_len > 0) {
+ /* Convert payload twice to get checksum right.
+ * The 32 bit version of the checksumming in PSIF does not
+ * have the property that checksumming of the same data
+ * on different endian hosts yields the same checksum..
+ */
+ copy_conv_to_sw(wqe->payload, buf, cb_len);
+ }
+ wqe->wr.collect_length = min(wr_len, cb_len);
+ return wr_len;
+}
+
+static inline int prep_inline(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe,
+ struct psif_wr_local *la, u32 sqe_seq,
+ bool is_phys_addr)
+{
+ struct sif_dev *sdev = to_sdev(qp->ibqp.device);
+ struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx);
+ int wr_len = prep_inline_part(qp, wr, wr->num_sge, wqe, la, sqe_seq, is_phys_addr);
+
+ if (wr_len < 0)
+ return wr_len;
+ if (wr_len) {
+ /* la must point to the start of the payload in the send queue
+ * to have the whole message available in case of retries:
+ */
+ la->addr = get_sqe_dma(sq, sqe_seq) + offsetof(struct psif_sq_entry, payload);
+ la->lkey = sq->sg_mr->index;
+ }
+ la->length = wr_len;
+ return wr_len;
+}
+
+/* Helper funcs declared below */
+static void prep_atomic(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe);
+static int prep_send(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe,
+ bool inlined, struct psif_wr_local *la, u32 sqe_idx);
+static int prep_send_lso(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe,
+ bool inlined, struct psif_wr_local *la, u32 sqe_idx);
+static int prep_remote_addr(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe);
+
+
+/* Return bypass mode offset or 0 if invalid for post_sends (see below)
+ * (PSIF will take care of rejecting the post)
+ */
+
+inline u64 mr_uv2dma(struct sif_dev *sdev, int idx)
+{
+ struct sif_mr *mr = safe_get_sif_mr(sdev, idx);
+
+ if (mr)
+ return mr->mmu_ctx.uv2dma;
+ return 0;
+}
+
+
+/*
+ * Handle send of a single wr - can be called from any context.
+ *
+ * Use either CB mode or DB mode. In CB mode, wqe is allocated,
+ * written to SQ, SW pointer updated, and finally the wqe is written
+ * to the CB. In DB mode, the wqe is allocated and written to the
+ * SQ. On the last wqe, SW pointer is updated and the doorbell is rung
+ * with the seq number of the first sqe.
+ */
+int sif_post_send_single(struct ib_qp *ibqp, struct ib_send_wr *wr, bool *use_db, bool last, u16 *first_seq)
+{
+ bool inlined = false;
+ u64 csum;
+ struct psif_cb wqe;
+ struct psif_sq_entry *sqe;
+ int cb_len = 0;
+ int cb_len_8 = 0;
+ struct sif_dev *sdev = to_sdev(ibqp->device);
+ struct sif_qp *qp = to_sqp(ibqp);
+ struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx);
+ int ret = 0;
+ u16 head, sq_seq, q_sz;
+ struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx);
+ bool is_ud = qp->type == PSIF_QP_TRANSPORT_UD;
+ struct sif_sq_hdl *wh;
+
+ if (wr->num_sge > sq->sg_entries) {
+ sif_log(sdev, SIF_SND, "attempt to post wr with %d/%d sg entries",
+ wr->num_sge, sq->sg_entries);
+ return -EINVAL;
+ }
+
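+	/* Allocate the next SQ sequence number and compute the queue fill for the overflow check below */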
+ sq_seq = ++sq_sw->last_seq;
+ head = sq_sw->head_seq;
+ q_sz = sq_length(sq, head, sq_seq);
+
+ if (q_sz > sq->entries) {
+ sif_log(sdev, SIF_INFO,
+ "Send Queue %d full - head %d, tail %d, entries %d, sge_entries %u, sq->user_mode: %s, sq->alloc_sz: %llu",
+			sq->index, head, sq_seq, sq->entries, sq->sg_entries,
+ (sq->user_mode) ? "[yes]" : "[no]", sq->mem->size);
+ ret = -EAGAIN;
+ goto fail;
+ }
+
+
+ sqe = get_sq_entry(sq, sq_seq);
+
+ memset(&wqe, 0, sizeof(wqe));
+
+ wqe.wr.tsu_qosl = qp->qosl;
+ wqe.wr.eps_tag = qp->eps_tag;
+
+ ret = prep_remote_addr(qp, wr, &wqe);
+ if (ret)
+ goto fail;
+
+ if (wr->send_flags & IB_SEND_FENCE) /* RC only */
+ wqe.wr.fence = 1;
+
+ if (qp->flags & SIF_QPF_DYNAMIC_MTU)
+ wqe.wr.dynamic_mtu_enable = 1;
+
+ wqe.wr.completion = sq->complete_all;
+ if (wr->send_flags & IB_SEND_SIGNALED)
+ wqe.wr.completion = 1;
+
+ inlined = wr->send_flags & IB_SEND_INLINE;
+
+ if (qp->qp_idx < 4) {
+ /* Field valid for QP0/1 only */
+ wqe.wr.port = qp->port - 1;
+
+ /* and in the work request we must use "real" QP numbers as well */
+ wqe.wr.local_qp = qp->qp_idx & 1;
+ } else
+ wqe.wr.local_qp = qp->qp_idx;
+
+ if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+ wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
+ wqe.wr.imm = cpu_to_be32(wr->ex.imm_data);
+ }
+
+ /* TBD: only set if wr opcode allows it */
+ if (wr->send_flags & IB_SEND_SOLICITED)
+ wqe.wr.se = 1;
+
+ if (wr->send_flags & IB_SEND_IP_CSUM) {
+ wqe.wr.l3_checksum_en = 1;
+ wqe.wr.l4_checksum_en = 1;
+ qp->ipoib_tx_csum_l3++;
+ qp->ipoib_tx_csum_l4++;
+ }
+ switch (wr->opcode) {
+ case IB_WR_LSO:
+ {
+ struct psif_wr_local *la = &wqe.wr.details.send.ud.local_addr;
+
+ if (!supports_offload(qp)) {
+ sif_log(sdev, SIF_INFO,
+ "LSO WR on qp %d which does not support offloading",
+ qp->qp_idx);
+ ret = -EINVAL;
+ goto fail;
+ }
+ ret = prep_send_lso(qp, wr, &wqe, inlined, la, sq_seq);
+ if (ret < 0)
+ goto fail;
+ break;
+ }
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ {
+ struct psif_wr_local *la = (is_ud ?
+ &wqe.wr.details.send.ud.local_addr :
+ &wqe.wr.details.send.uc_rc_xrc.local_addr);
+ ret = prep_send(qp, wr, &wqe, inlined, la, sq_seq);
+ if (ret < 0)
+ goto fail;
+ break;
+ }
+ case IB_WR_RDMA_READ:
+ /* RDMA READ does not support dynamic MTU */
+ wqe.wr.dynamic_mtu_enable = 0;
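+		/* fall through - shares local/remote address setup with RDMA WRITE */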
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ {
+ struct psif_wr_local *la = &wqe.wr.details.rdma.local_addr;
+ struct psif_wr_remote *ra = &wqe.wr.details.rdma.remote_addr;
+
+ ra->addr = wr->wr.rdma.remote_addr;
+ ra->rkey = wr->wr.rdma.rkey;
+
+ ret = prep_send(qp, wr, &wqe, inlined, la, sq_seq);
+ if (ret < 0)
+ goto fail;
+
+ ra->length = ret;
+ break;
+ }
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ prep_atomic(qp, wr, &wqe);
+ break;
+ case IB_WR_SEND_WITH_INV:
+ case IB_WR_RDMA_READ_WITH_INV:
+ sif_log(sdev, SIF_SND, "Opcode not implemented");
+ ret = -EOPNOTSUPP;
+ goto fail;
+ case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+ case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
+ {
+ /* Bug 3844, WA for HW bug 3683 */
+ bool masked_atomics_defeatured = PSIF_REVISION(sdev) <= 3;
+
+ if (masked_atomics_defeatured)
+ sif_log(sdev, SIF_SND, "Opcode not supported");
+ else
+ sif_log(sdev, SIF_SND, "Opcode not yet implemented");
+ ret = -EOPNOTSUPP;
+ goto fail;
+ }
+ default:
+ sif_log(sdev, SIF_SND, "Unsupported opcode");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ sif_log(sdev, SIF_SND,
+ "copied %d bytes inline, num_sgl %d, sqe at %p",
+ wqe.wr.collect_length, wqe.wr.num_sgl, sqe);
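+	/* cb_len_8: WR plus inline data rounded up to 8 bytes (checksum coverage);
+	 * cb_len: rounded up to kick alignment (collect-buffer copy):
+	 */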
+ cb_len_8 = sizeof(struct psif_wr)
+ + ((wqe.wr.collect_length + 7) & ~7);
+ cb_len = sizeof(struct psif_wr)
+ + ((wqe.wr.collect_length + CB_KICK_MASK) & ~CB_KICK_MASK);
+
+ wqe.wr.sq_seq = sq_seq;
+ wqe.wr.tsu_sl = qp->tsl;
+
+ /* Map sqe (repr.by index in sq) to this wr_id */
+ wh = get_sq_hdl(sq, sq_seq);
+ wh->wr_id = wr->wr_id;
+ wh->sq_seq = sq_seq;
+ wh->used = true;
+
+ sif_log(sdev, SIF_SND, "wr_id %llx at tail 0x%x sq_seq_num %d%s",
+ wr->wr_id, sq_seq & sq->mask, wqe.wr.sq_seq, (wqe.wr.completion ? " [req.compl]" : ""));
+
+ /* We can safely checksum any "hole" due to end misalignment + byte swap
+ * towards the end of the inline data
+ * as prep_inline has nil'ed these bytes out:
+ */
+ if (qp->nocsum) {
+ wqe.wr.checksum = qp->magic;
+ } else {
+ csum = csum32_partial(&wqe, cb_len_8, qp->magic);
+ csum = csum32_fold(csum);
+ wqe.wr.checksum = csum;
+ }
+ sif_log(sdev, SIF_SND, "op %s checksum %x cb_len 0x%x",
+ string_enum_psif_wr_type(wqe.wr.op),
+ wqe.wr.checksum, cb_len);
+ sif_logs(SIF_DUMP, write_struct_psif_wr(NULL, 0, &wqe.wr));
+
+ /* First update send queue (any further inline data beyond cb_len
+ * has already been copied in prep_inline:
+ */
+ copy_conv_to_hw(sqe, &wqe, cb_len);
+
+ /* A heuristic mechanism to determine the traffic pattern.
+ * Even though traffic_patterns.mask is also set by handle_wc, no
+ * lock is used. The reason is that the mask is only used to get a "rough"
+ * idea about the underlying traffic pattern without adding latency
+ * in the driver.
+ */
+ qp->traffic_patterns.mask = (qp->traffic_patterns.mask << 1) |
+ HEUR_TX_DIRECTION;
+ sif_log_perf(sdev, SIF_PERF_V, "qp:traffic_pattern %x",
+ qp->traffic_patterns.mask);
+ /* If the traffic pattern shows that it's not latency sensitive,
+ * use SQ mode by ringing the doorbell.
+ * In a latency sensitive traffic pattern, a SEND should
+ * be accompanied by a WC_OPCODE_RECEIVE_SEND. Thus,
+ * a latency sensitive traffic pattern should have
+ * half_of_bits(sizeof(traffic_patterns.submask[n])) set.
+ * The constants 7 and 9 are used below as we allow a
+ * tolerance of one around half_of_bits(sizeof(traffic_patterns.submask[n])).
+ */
+ if (((hweight16(qp->traffic_patterns.submask[0]) < 7) ||
+ (hweight16(qp->traffic_patterns.submask[0]) > 9)) ||
+ ((hweight16(qp->traffic_patterns.submask[1]) < 7) ||
+ (hweight16(qp->traffic_patterns.submask[1]) > 9)))
+ *use_db = true;
+
+ /* Flush writes before updating the sw pointer.
+ * This is necessary to ensure that the SQS does not see
+ * an incomplete entry.
+ * NB! Note that as opposed to software-consumed
+ * queues, this value should point to the last used entry, not the first
+ * unused one:
+ */
+ if (!*use_db || last) {
+ wmb();
+ set_psif_sq_sw__tail_indx(&sq_sw->d, sq_seq);
+ }
+
+ /* Finally write to collect buffer or ring doorbell if last */
+ if (*use_db && last)
+ /* Write doorbell for first WR when we process the last request */
+ sif_doorbell_from_sqe(qp, *first_seq, true);
+ else if (!*use_db)
+ if (sif_cb_write(qp, &wqe.wr, cb_len)) {
+			/* vcb lock busy, convert to db mode */
+ if (last)
+ sif_doorbell_from_sqe(qp, sq_seq, true);
+ else {
+ *use_db = true;
+ *first_seq = sq_seq;
+ }
+ }
+
+ return ret;
+fail:
+ sif_log(sdev, SIF_SND, "Exit: Fail to post_send a WR");
+ sif_logs(SIF_DUMP, write_struct_psif_wr(NULL, 0, &wqe.wr));
+
+ /* Avoid "using" the allocated entry */
+ sq_sw->last_seq--;
+ return ret;
+} /* end sif_post_send_single */
+
+
+static int get_gsi_qp_idx(struct sif_qp *qp)
+{
+ struct sif_dev *sdev = to_sdev(qp->ibqp.device);
+ int pma_qp_idx = sdev->pma_qp_idxs[!!(qp->qp_idx & 2)];
+ struct sif_qp *pma_qp = get_sif_qp(sdev, pma_qp_idx);
+ struct sif_rq_sw *rq_sw;
+ int gsi_qlen, pma_qlen;
+
+ rq_sw = get_sif_rq_sw(sdev, qp->rq_idx);
+ gsi_qlen = atomic_read(&rq_sw->length);
+ rq_sw = get_sif_rq_sw(sdev, pma_qp->rq_idx);
+ pma_qlen = atomic_read(&rq_sw->length);
+
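+	/* Post to whichever of the GSI and PMA proxy RQs currently has the fewer outstanding buffers */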
+ return (gsi_qlen <= pma_qlen) ? qp->qp_idx : pma_qp->qp_idx;
+}
+
+
+int sif_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct sif_qp *qp = to_sqp(ibqp);
+ struct sif_rq *rq;
+ struct sif_dev *sdev = to_sdev(ibqp->device);
+ struct sif_eps *es = &sdev->es[sdev->mbox_epsc];
+ bool need_pma_pxy_qp = eps_version_ge(es, 0, 57)
+ && (qp->qp_idx == 1 || qp->qp_idx == 3);
+
+
+ sif_log(sdev, SIF_RCV, "Enter: wr_id 0x%llx qp_idx %d",
+ wr->wr_id, qp->qp_idx);
+
+ if (need_pma_pxy_qp) {
+ qp = get_sif_qp(sdev, get_gsi_qp_idx(qp));
+ sif_log(sdev, SIF_RCV, "Redirect wr_id 0x%llx to qp_idx %d",
+ wr->wr_id, qp->qp_idx);
+ }
+
+ if (qp->last_set_state == IB_QPS_RESET) {
+ sif_log(sdev, SIF_INFO, "Invalid QP state (IB_QPS_RESET)");
+ return -EINVAL;
+ }
+
+ rq = get_sif_rq(sdev, qp->rq_idx);
+
+ if (wr->num_sge > rq->sg_entries) {
+ sif_log(sdev, SIF_INFO, "qp only supports %d receive sg entries - wr has %d",
+ rq->sg_entries, wr->num_sge);
+ return -ENOMEM;
+ }
+
+ return post_recv(sdev, qp, rq, wr, bad_wr);
+}
+
+
+/* Post a list of receives - can be called from any context */
+int post_recv(struct sif_dev *sdev, struct sif_qp *qp, struct sif_rq *rq,
+ struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr)
+{
+ struct sif_rq_sw *rq_sw = get_sif_rq_sw(sdev, rq->index);
+ int ret = 0;
+ u32 rq_len;
+
+ unsigned long flags;
+
+ if (unlikely(rq->user_mode)) {
+ sif_log(sdev, SIF_INFO,
+ "rq %d: Attempt to use kernel API to post to user mode receive queue",
+ rq->index);
+ return -EINVAL;
+ }
+
+ if (!wr)
+ return ret;
+
+ /* TBD: Revisit locking scheme again later
+ * to allow more parallelism. For now serialize to avoid
+ * having to handle "holes":
+ */
+ spin_lock_irqsave(&rq->lock, flags);
+
+ for (; wr; wr = wr->next) {
+ struct psif_rq_entry *rqe;
+ struct psif_rq_entry lrqe;
+ struct psif_rq_scatter *sge;
+ int i = 0;
+ int rqe_sz = 8 + wr->num_sge*sizeof(struct psif_rq_scatter);
+ int max_rqe_sz = 8 + rq->sg_entries*sizeof(struct psif_rq_scatter);
+
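+		/* Optimistically reserve a slot; roll back with atomic_dec on any failure below */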
+ rq_len = atomic_inc_return(&rq_sw->length);
+ if (rq_len > rq->entries) {
+ sif_log(sdev, SIF_INFO, "queue full - rq %d entries %d len %d",
+ rq->index, rq->entries, rq_len);
+ atomic_dec(&rq_sw->length);
+ ret = -ENOMEM;
+ goto err_post_recv;
+ }
+ if (wr->num_sge > rq->sg_entries) {
+ sif_log(sdev, SIF_INFO, "too many sges - rq %d sges configured %d, sges in wr %d",
+ rq->index, rq->sg_entries, wr->num_sge);
+ atomic_dec(&rq_sw->length);
+ ret = -EINVAL;
+ goto err_post_recv;
+ }
+
+ rqe = get_rq_entry(rq, rq_sw->next_seq++);
+
+ /* On the receive side we use the full wr_id directly */
+ lrqe.rqe_id = wr->wr_id;
+
+ sge = lrqe.scatter;
+ for (i = 0; i < wr->num_sge; i++) {
+ u32 lkey = wr->sg_list[i].lkey;
+
+ sge[i].lkey = lkey;
+ sge[i].base_addr = wr->sg_list[i].addr + mr_uv2dma(sdev, lkey);
+ sge[i].length = wr->sg_list[i].length;
+ sif_log(sdev, SIF_RCV,
+ "sg_adr 0x%llx sg_len %d lkey %d",
+ wr->sg_list[i].addr, wr->sg_list[i].length, lkey);
+ }
+
+ copy_conv_to_hw(rqe, &lrqe, rqe_sz);
+
+ /* As per PRM, unused sges shall be zero, which is endian neutral */
+ if (max_rqe_sz > rqe_sz)
+ memset(rqe->scatter + wr->num_sge, 0, max_rqe_sz - rqe_sz);
+
+ sif_log(sdev, SIF_RCV,
+ " entries %u extent %u RQ %d next_seq %x length %d",
+ rq->entries, rq->extent, rq->index,
+ rq_sw->next_seq, atomic_read(&rq_sw->length));
+ }
+ /* Enforce reordering of new rq entries and tail */
+ wmb();
+ set_psif_rq_sw__tail_indx(&rq_sw->d, rq_sw->next_seq);
+ /* Enforce visibility of rq tail on hw */
+ smp_wmb();
+
+ sif_log(sdev, SIF_RCV, "Exit: success");
+err_post_recv:
+ spin_unlock_irqrestore(&rq->lock, flags);
+ *bad_wr = wr;
+
+ /* WA #622, Check if QP in ERROR, flush RQ */
+ if (!rq->is_srq && is_regular_qp(qp) && qp->last_set_state == IB_QPS_ERR) {
+ if (sif_flush_rq(sdev, rq, qp, atomic_read(&rq_sw->length)))
+ sif_log(sdev, SIF_INFO, "failed to flush RQ %d", rq->index);
+ }
+
+ return ret;
+}
+
+int sif_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct sif_dev *sdev = to_sdev(ibqp->device);
+ struct sif_qp *qp = to_sqp(ibqp);
+ struct psif_epsc_csr_rsp rsp;
+ struct psif_epsc_csr_req req;
+
+ sif_log(sdev, SIF_MC, "qp %d mc gid %llx.%llx lid 0x%x",
+ qp->qp_idx, gid->global.subnet_prefix, gid->global.interface_id, lid);
+
+ memset(&req, 0, sizeof(req));
+ req.opcode = EPSC_MC_ATTACH;
+ req.u.mc.qp = qp->qp_idx;
+ req.u.mc.port = qp->port; /* The EPS uses IB port space */
+ /* union ib_gid contains BE gids and we do copy_convert later.. */
+ req.u.mc.mgid_0 = be64_to_cpu(gid->global.subnet_prefix);
+ req.u.mc.mgid_1 = be64_to_cpu(gid->global.interface_id);
+ return sif_epsc_wr(sdev, &req, &rsp);
+}
+
+int sif_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct sif_dev *sdev = to_sdev(ibqp->device);
+ struct sif_qp *qp = to_sqp(ibqp);
+ struct psif_epsc_csr_rsp rsp;
+ struct psif_epsc_csr_req req;
+
+ sif_log(sdev, SIF_MC, "qp %d mc gid %llx.%llx lid 0x%x",
+ qp->qp_idx, gid->global.subnet_prefix, gid->global.interface_id, lid);
+
+ memset(&req, 0, sizeof(req));
+ req.opcode = EPSC_MC_DETACH;
+ req.u.mc.qp = qp->qp_idx;
+ req.u.mc.port = qp->port; /* The EPS uses IB port space */
+ /* union ib_gid contains BE gids and we do copy_convert later.. */
+ req.u.mc.mgid_0 = be64_to_cpu(gid->global.subnet_prefix);
+ req.u.mc.mgid_1 = be64_to_cpu(gid->global.interface_id);
+ return sif_epsc_wr(sdev, &req, &rsp);
+}
+
+
+/* Workaround to emulate extra send sg entries from software:
+ * We use the available inline space and copy the first fitting
+ * xsg = wr->num_sge - hw_max + 1 entries into this space:
+ */
+static int prep_sw_sg(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe,
+ struct psif_wr_local *la, u32 sqe_seq)
+{
+ struct sif_dev *sdev = to_sdev(qp->ibqp.device);
+ struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx);
+ struct psif_sq_entry *sqe = get_sq_entry(sq, sqe_seq);
+ void *sgl_start = sq_sgl_offset(sq, sqe);
+ struct psif_rq_scatter *sge = sq->tmp_sge;
+ int i;
+ int xsg = wr->num_sge - SIF_HW_MAX_SEND_SGE + 1;
+ int xi = -1;
+ int pi = 0;
+ u32 xcnt = 0;
+ u32 len = 0;
+ int ret;
+ u32 xlen = 0;
+ u64 addr = 0;
+ int space = qp->max_inline_data;
+
+ la->addr = get_sqe_dma(sq, sqe_seq) + sq->sgl_offset;
+ la->lkey = sq->sg_mr->index;
+
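+	/* Slide a window of xsg sg entries over the list to find the first window
+	 * whose combined length fits in the available inline space:
+	 */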
+ for (i = 0; i < wr->num_sge; i++) {
+ if (i == xsg)
+ space -= 256; /* We can no longer use the inline bytes */
+ xlen += wr->sg_list[i].length;
+ sif_log(sdev, SIF_SND, "xsg %d, xlen 0x%x space 0x%x", xsg, xlen, space);
+ if (xcnt < xsg) {
+ xcnt++;
+ if (xcnt < xsg)
+ continue;
+ }
+ if (xlen <= space) {
+ xi = i - xsg + 1;
+ break;
+ }
+ xlen -= wr->sg_list[i - xsg].length;
+ }
+ if (xi < 0) {
+ /* If our worst case calculations are right, this should not happen.. */
+ sif_log(sdev, SIF_INFO, "Failed to find sg entries to collapse into inline space!");
+ return -ENOMEM;
+ }
+ if (xi == 0) {
+ ret = prep_inline_part(qp, wr, xsg, wqe, la, sqe_seq, false);
+ if (ret < 0)
+ return ret;
+ } else {
+ /* TBD: We can consider merging xsg + 1 entries into two
+ * sg entries, one containing the first entries, but for now
+ * keep it simple and just not use the first 256 bytes:
+ */
+ u8 *dbuf = ((u8 *)sqe->payload);
+ int copy = 0;
+
+ for (i = xi; i < xi + xsg; i++) {
+ u32 lkey = wr->sg_list[i].lkey;
+
+ len = wr->sg_list[i].length;
+ addr = wr->sg_list[i].addr;
+ if (len > 0) {
+ struct psif_key *key = safe_get_key(sdev, lkey);
+
+ if (!key || PSIF_DMA_KEY_INVALID == get_psif_key__lkey_state(key)) {
+ sif_log(sdev, SIF_INFO,
+ "Attempt to do inline copying from an invalid MR with lkey %d at addr 0x%llx",
+ wr->sg_list[i].lkey, addr);
+ return -EPERM;
+ }
+ }
+
+ ret = copy_sg(qp, &dbuf[copy], addr, len);
+ if (ret < 0)
+ return ret;
+ copy += len;
+ }
+ }
+
+ la->length = 0;
+ for (i = 0; i < wr->num_sge; i++) {
+ u32 lkey;
+ u32 offset = i ? 256 : 0;
+
+ if (i == xi) {
+ sge[pi].lkey = sq->sg_mr->index;
+ sge[pi].base_addr =
+ get_sqe_dma(sq, sqe_seq) +
+ offsetof(struct psif_sq_entry, payload) + offset;
+ sge[pi].length = xlen;
+ la->length += xlen;
+ i += xsg - 1;
+ sif_log(sdev, SIF_SND,
+ "sg_list[%d]: sge entry: dma addr 0x%llx, len = %d, lkey %d",
+ pi, sge[pi].base_addr, sge[pi].length, sge[pi].lkey);
+ pi++;
+ continue;
+ }
+ lkey = wr->sg_list[i].lkey;
+ sge[pi].base_addr = wr->sg_list[i].addr
+ + mr_uv2dma(sdev, lkey);
+ sge[pi].lkey = wr->sg_list[i].lkey;
+ sge[pi].length = wr->sg_list[i].length;
+ la->length += sge[pi].length;
+ sif_log(sdev, SIF_SND,
+ "sg_list[%d]: sge entry: dma addr 0x%llx, len = %d, lkey %d",
+ pi, sge[pi].base_addr, sge[pi].length, sge[pi].lkey);
+ pi++;
+ }
+ sif_log(sdev, SIF_SND,
+ "ready with sgl_start %p, sg list addr 0x%llx, message len %d, lkey %d, sge %p",
+ sgl_start, la->addr, la->length, la->lkey, sge);
+
+ copy_conv_to_hw(sgl_start, sge,
+ sizeof(struct psif_rq_scatter) * SIF_HW_MAX_SEND_SGE);
+ wqe->wr.num_sgl = SIF_HW_MAX_SEND_SGE - 1;
+ return la->length;
+}
+
+
+static int prep_send(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe,
+ bool inlined, struct psif_wr_local *la, u32 sqe_seq)
+{
+ struct sif_dev *sdev = to_sdev(qp->ibqp.device);
+ int ret = 0;
+ int num_sge;
+ int use_inline_first_sge = 0;
+
+ if (inlined)
+ return prep_inline(qp, wr, wqe, la, sqe_seq, false);
+
+ la->length = 0;
+ num_sge = wr->num_sge;
+ if (num_sge == 0) {
+ sif_log(sdev, SIF_SND, "no sge entries - local_addr left as 0");
+ return 0;
+ }
+ if (!sif_feature(disable_inline_first_sge) && qp->ulp_type == RDS_ULP && num_sge == 2
+ && wr->sg_list[0].length <= qp->max_inline_data) {
+ use_inline_first_sge = 1;
+ }
+
+ if (use_inline_first_sge) {
+ int wr_len;
+ u32 lkey = wr->sg_list[0].lkey;
+ struct sif_mr *mr = safe_get_sif_mr(sdev, lkey);
+ int mem_type = mr ? mr->mem->mem_type : 0;
+ bool is_phys_addr = mem_type != SIFMT_UMEM;
+
+ sif_log(sdev, SIF_SND, "qp_%d handle special case; "
+ "#sge == 2 && sg[0].len == 48 max_inline_data %d, mem_type %d",
+ qp->qp_idx, qp->max_inline_data, mem_type);
+ /* Copy first sge inline */
+ if ((wr->sg_list[0].length + wr->sg_list[1].length) <= qp->max_inline_data) {
+ sif_log(sdev, SIF_SND, "qp_%d Inlining both %d + %d = %d",
+ qp->qp_idx,
+ wr->sg_list[0].length,
+ wr->sg_list[1].length,
+ (wr->sg_list[0].length + wr->sg_list[1].length));
+ return prep_inline(qp, wr, wqe, la, sqe_seq, is_phys_addr);
+ }
+ wr_len = prep_inline_part(qp, wr, 1, wqe, la, sqe_seq, is_phys_addr);
+ if (wr_len < 0)
+ return wr_len;
+ lkey = wr->sg_list[1].lkey;
+ /* Subtract to get address "correct" for hw-usage */
+ la->addr = wr->sg_list[1].addr + mr_uv2dma(sdev, lkey) - wr_len;
+ la->lkey = lkey;
+ la->length = wr_len + wr->sg_list[1].length;
+ num_sge = 1;
+ sif_log(sdev, SIF_SND,
+ "Changed to single sge user addr 0x%llx dma addr 0x%llx, message len %d, key %d collect_len %d wr_len %d",
+ wr->sg_list[1].addr, la->addr, la->length, lkey, wqe->wr.collect_length, wr_len);
+ } else if (num_sge == 1) {
+ /* Single entry S/G list result after inlining */
+ u32 lkey = wr->sg_list[0].lkey;
+
+ la->addr = wr->sg_list[0].addr + mr_uv2dma(sdev, lkey);
+ la->lkey = lkey;
+ la->length += wr->sg_list[0].length;
+ sif_log(sdev, SIF_SND,
+ "single sge user addr 0x%llx dma addr 0x%llx, message len %d, key %d",
+ wr->sg_list[0].addr, la->addr, la->length, lkey);
+ } else if (unlikely(wr->num_sge > SIF_HW_MAX_SEND_SGE)) {
+ return prep_sw_sg(qp, wr, wqe, la, sqe_seq);
+ } else {
+ struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx);
+ struct psif_sq_entry *sqe = get_sq_entry(sq, sqe_seq);
+ void *sgl_start = sq_sgl_offset(sq, sqe);
+ struct psif_rq_scatter *sge = sq->tmp_sge;
+ int i;
+
+ la->addr = get_sqe_dma(sq, sqe_seq) + sq->sgl_offset;
+ la->lkey = sq->sg_mr->index;
+
+ for (i = 0; i < num_sge; i++) {
+ u32 lkey = wr->sg_list[i].lkey;
+
+ sge[i].base_addr = wr->sg_list[i].addr
+ + mr_uv2dma(sdev, lkey);
+ sge[i].lkey = wr->sg_list[i].lkey;
+ sge[i].length = wr->sg_list[i].length;
+ la->length += sge[i].length;
+ sif_log(sdev, SIF_SND,
+ "sg_list[%d]: sge entry: dma addr 0x%llx, len = %d, lkey %d",
+ i, sge[i].base_addr, sge[i].length, sge[i].lkey);
+ }
+ sif_log(sdev, SIF_SND,
+ "ready with sgl_start %p, sg list addr 0x%llx, message len %d, lkey %d, sge %p",
+ sgl_start, la->addr, la->length, la->lkey, sge);
+
+ copy_conv_to_hw(sgl_start, sge,
+ sizeof(struct psif_rq_scatter) * wr->num_sge);
+ ret = la->length;
+ }
+	/* In the hw wr, num_sgl == 0 means a single sg entry, but an input of 0 sges must also yield 0 */
+ wqe->wr.num_sgl = num_sge ? num_sge - 1 : 0;
+ return ret;
+}
+static int prep_send_lso(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe,
+ bool inlined, struct psif_wr_local *la, u32 sqe_seq)
+{
+ struct sif_dev *sdev = to_sdev(qp->ibqp.device);
+ void *sgl_start;
+ int ret = 0;
+ int i;
+ u8 *p8;
+ struct sif_sq *sq;
+ struct psif_sq_entry *sqe;
+ struct psif_rq_scatter *sge;
+ const int stencil_sge = 1;
+
+ sq = get_sif_sq(sdev, qp->qp_idx);
+ sqe = get_sq_entry(sq, sqe_seq);
+ sge = sq->tmp_sge;
+ sgl_start = sq_sgl_offset(sq, sqe);
+
+ if (unlikely(wr->num_sge >= SIF_HW_MAX_SEND_SGE || wr->num_sge < 1)) {
+ sif_log(sdev, SIF_INFO, "attempt to post lso wr with %d/%d sg entries",
+ wr->num_sge, sq->sg_entries);
+ return -EINVAL;
+ }
+
+ wqe->wr.details.send.ud.mss = wr->wr.ud.mss;
+
+ la->addr = get_sqe_dma(sq, sqe_seq) + sq->sgl_offset;
+ la->lkey = sq->sg_mr->index;
+ la->length = 0;
+
+ /* copy stencil to payload-area in send_queue */
+ p8 = (u8 *)wr->wr.ud.header;
+ memcpy((u8 *)sqe->payload, p8, wr->wr.ud.hlen);
+
+ sge[0].base_addr = get_sqe_dma(sq, sqe_seq)
+ + offsetof(struct psif_sq_entry, payload) + mr_uv2dma(sdev, la->lkey);
+ sge[0].lkey = sq->sg_mr->index;
+ sge[0].length = wr->wr.ud.hlen;
+ la->length += sge[0].length;
+
+ sif_log(sdev, SIF_SND,
+ "sg_list[%d]: sge entry: dma addr 0x%llx, len = %d, lkey %d",
+ 0, sge[0].base_addr, sge[0].length, sge[0].lkey);
+
+ for (i = 0; i < wr->num_sge; i++) {
+ u32 lkey = wr->sg_list[i].lkey;
+
+ sge[i+1].base_addr = wr->sg_list[i].addr + mr_uv2dma(sdev, lkey);
+ sge[i+1].lkey = wr->sg_list[i].lkey;
+ sge[i+1].length = wr->sg_list[i].length;
+ la->length += sge[i+1].length;
+ sif_log(sdev, SIF_SND,
+ "sg_list[%d]: sge entry: dma addr 0x%llx, len = %d, lkey %d",
+ i+1, sge[i+1].base_addr, sge[i+1].length, sge[i+1].lkey);
+ }
+ copy_conv_to_hw(sgl_start, sge,
+ sizeof(struct psif_rq_scatter) * (wr->num_sge+1));
+
+ wmb();
+ wqe->wr.num_sgl = wr->num_sge - 1 + stencil_sge;
+ sif_log(sdev, SIF_SND,
+ "num_sgl %d, sqe at %p la ->addr 0x%llx ->lkey %d ->length %d %d", wqe->wr.num_sgl, sqe,
+ la->addr, la->lkey, la->length, la->length-sge[0].length);
+ qp->ipoib_tx_lso_pkt++;
+ qp->ipoib_tx_lso_bytes += (la->length - sge[0].length);
+ return ret;
+}
+
+
+static int prep_remote_addr(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe)
+{
+ struct sif_ah *ah = NULL;
+ struct psif_ah *ah_p;
+ bool is_dr = false;
+ struct sif_dev *sdev = to_sdev(qp->ibqp.device);
+
+ sif_log(sdev, SIF_SND, "");
+ switch (qp->type) {
+ case PSIF_QP_TRANSPORT_UD:
+ if (!wr->wr.ud.ah) {
+ sif_log(sdev, SIF_INFO, "No ah supplied for ud packet");
+ return -EINVAL;
+ }
+ ah = to_sah(wr->wr.ud.ah);
+ ah_p = get_ah(sdev, ah->index);
+ is_dr = get_psif_ah__remote_lid(ah_p) == 0xffff;
+
+ /* Direct routed packets are destined for the SMA at uf 33.
+ * For all other packets this field is ignored by the hw:
+ */
+ if (is_dr)
+ wqe->wr.destuf = 33;
+ wqe->wr.details.send.ud.remote_addr.ah_indx
+ = ah->index;
+ wqe->wr.details.send.ud.qp.qkey = wr->wr.ud.remote_qkey;
+ wqe->wr.details.send.ud.qp.remote_qp = wr->wr.ud.remote_qpn;
+ wqe->wr.ud_pkt = 1;
+ break;
+ case PSIF_QP_TRANSPORT_UC:
+ case PSIF_QP_TRANSPORT_RC:
+ break;
+ case PSIF_QP_TRANSPORT_XRC:
+ wqe->wr.xrc_hdr.xrqd_id = wr->xrc_remote_srq_num;
+ break;
+ default:
+ sif_log(sdev, SIF_INFO,
+ "unhandled transport type %s", string_enum_psif_qp_trans(qp->type));
+ return -EINVAL;
+ }
+ wqe->wr.op = ib2sif_wr_op(wr->opcode, is_dr);
+ return 0;
+}
+
+
+
+static void prep_atomic(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe)
+{
+ struct psif_wr_local *la = &wqe->wr.details.atomic.local_addr;
+ struct psif_wr_remote *ra = &wqe->wr.details.atomic.remote_addr;
+
+ la->addr = wr->sg_list[0].addr;
+ la->lkey = wr->sg_list[0].lkey;
+ la->length = sizeof(long);
+
+ ra->addr = wr->wr.atomic.remote_addr;
+ ra->rkey = wr->wr.atomic.rkey;
+ ra->length = sizeof(long);
+
+ /* Payload order as in IB header */
+ if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+ wqe->payload[0] = cpu_to_be64(wr->wr.atomic.swap);
+ wqe->payload[1] = cpu_to_be64(wr->wr.atomic.compare_add);
+ wqe->wr.collect_length = 16;
+ } else {
+ wqe->payload[0] = cpu_to_be64(wr->wr.atomic.compare_add);
+ wqe->wr.collect_length = 8;
+ }
+}
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_sndrcv.h: Interface to IB send/receive, MAD packet recv and
+ * multicast send/recv
+ */
+
+#ifndef __SIF_SNDRCV_H
+#define __SIF_SNDRCV_H
+
+struct sif_rq;
+struct sif_dev;
+
+int sif_post_send(struct ib_qp *ibqp,
+ struct ib_send_wr *wr, struct ib_send_wr **bad_wr);
+int sif_post_recv(struct ib_qp *ibqp,
+ struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr);
+
+int sif_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+int sif_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int post_recv(struct sif_dev *sdev, struct sif_qp *qp, struct sif_rq *rq,
+ struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr);
+
+/* Send a single wr */
+int sif_post_send_single(struct ib_qp *ibqp, struct ib_send_wr *wr, bool *use_db, bool last, u16 *first_seq);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Vinay Shaw <vinay.shaw@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_spt.c: Experimental implementation of shared use of the OS's page tables.
+ * Default is to use private page tables - shared page tables can be enabled using
+ * a vendor flag. This implementation assumes that physical addresses and DMA addresses
+ * are 1-1, which might not in general be the case if going through an IOMMU.
+ */
+
+#include "sif_mmu.h"
+#include "sif_dev.h"
+#include "sif_base.h"
+#include "sif_dma.h"
+#include "sif_hwi.h"
+#include "sif_spt.h"
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/highmem.h>
+#include <rdma/ib_umem.h>
+
+
+#define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE)
+#define PUD_ALIGN(addr) ALIGN(addr, PUD_SIZE)
+#define PGDIR_ALIGN(addr) ALIGN(addr, PGDIR_SIZE)
+
+
+static void set_ctx_w_page(struct sif_dev *sdev,
+ struct sif_mmu_ctx *ctx,
+ enum psif_table_level level,
+ enum psif_page_size pg_sz, u64 val)
+{
+ struct psif_mmu_cntx *hw_ctx = &ctx->mctx;
+
+ hw_ctx->page_size = pg_sz;
+ hw_ctx->table_ptr = ((val) >> PAGE_SHIFT) & ~PSIF_TABLE_PTR_MASK;
+ hw_ctx->table_level = level;
+ sif_log(sdev, SIF_MMU, "pte 0x%08llx level %d", val, level);
+}
+
+
+static int sif_set_mmu_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *sctx,
+ struct sif_mem *mem, bool write);
+
+int sif_spt_map_gva_ctx(struct sif_dev *sdev,
+ struct sif_mmu_ctx *ctx,
+ struct sif_mem *mem,
+ bool write)
+{
+ int ret;
+
+	if (mem->mem_type != SIFMT_UMEM || !mem->m.u.umem) {
+ sif_log(sdev, SIF_MMU, "Only implemented for user space mappings!");
+ return -EINVAL;
+ }
+
+ ret = sif_set_mmu_ctx(sdev, ctx, mem, write);
+ if (ret)
+ goto mmctx_failed;
+ return 0;
+
+mmctx_failed:
+ return ret;
+}
+
+
+static int sif_set_mmu_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx,
+ struct sif_mem *mem, bool write)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep, pte;
+ u64 start = ctx->base;
+ u64 len = ctx->size;
+ struct psif_mmu_cntx *pctx = &ctx->mctx;
+ int npgds, npuds, npmds, nptes;
+ int ret = 0;
+
+ sif_log(sdev, SIF_MMU, "start 0x%llx len 0x%llx", start, len);
+
+ if (len == 0)
+ goto err;
+
+ pgd = pgd_offset(mem->m.u.umem->mm, start);
+ if (pgd_none(*pgd))
+ goto err;
+
+ ctx->pt = (void *)pgd; /* Misuse pt to save the pointer to avoid going via mm at dealloc time */
+ ctx->mt = SIFMT_ZERO;
+ pud = pud_offset(pgd, start);
+ if (pud_none(*pud))
+ goto err;
+
+ pctx->wr_access = write;
+ pctx->translation_type = MMU_GVA2GPA_MODE;
+
+ npgds = PGDIR_ALIGN(len + (start & ~PGDIR_MASK)) >> PGDIR_SHIFT;
+ npuds = PUD_ALIGN(len + (start & ~PUD_MASK)) >> PUD_SHIFT;
+
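+	/* Choose the lowest table level where the region is covered by a single entry;
+	 * otherwise fall back to the top level (cr3 on x86):
+	 */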
+#ifndef __aarch64__
+ if (pud_large(*pud)) {
+ ptep = (pte_t *) pud;
+ pte = *ptep;
+
+ if (!pte_present(pte)) {
+ sif_log(sdev, SIF_MMU,
+ "Page not present, bugging out..");
+ BUG();
+ goto err;
+ }
+
+ if (npuds == 1) {
+ set_ctx_w_page(sdev, ctx, PAGE_LEVEL2, PAGE_SIZE_IA32E_1GB,
+ pte_val(pte));
+ } else if (npgds == 1)
+ set_ctx_w_page(sdev, ctx, PAGE_LEVEL3, PAGE_SIZE_IA32E_1GB,
+ pgd_val(*pgd));
+#ifdef CONFIG_X86
+ else
+ set_ctx_w_page(sdev, ctx, PAGE_LEVEL4, PAGE_SIZE_IA32E_1GB,
+ read_cr3());
+#endif
+ goto out;
+ }
+#endif /* !__aarch64__ */
+
+ pmd = pmd_offset(pud, start);
+ if (pmd_none(*pmd))
+ goto err;
+
+ npmds = PMD_ALIGN(len + (start & ~PMD_MASK)) >> PMD_SHIFT;
+
+#ifndef __aarch64__
+ if (pmd_large(*pmd)) {
+ ptep = (pte_t *) pmd;
+ pte = *ptep;
+
+ if (!pte_present(pte)) {
+ sif_log(sdev, SIF_MMU,
+ "Page not present, bugging out..");
+ BUG();
+ goto err;
+ }
+
+ if (npmds == 1) {
+ set_ctx_w_page(sdev, ctx, PAGE_LEVEL1, PAGE_SIZE_IA32E_2MB,
+ pte_val(pte));
+ } else if (npuds == 1)
+ set_ctx_w_page(sdev, ctx, PAGE_LEVEL2, PAGE_SIZE_IA32E_2MB,
+ pud_val(*pud));
+ else if (npgds == 1)
+ set_ctx_w_page(sdev, ctx, PAGE_LEVEL3, PAGE_SIZE_IA32E_2MB,
+ pgd_val(*pgd));
+#ifdef CONFIG_X86
+ else
+ set_ctx_w_page(sdev, ctx, PAGE_LEVEL4, PAGE_SIZE_IA32E_2MB,
+ read_cr3());
+#endif
+ goto out;
+ }
+#endif /* !__aarch64__ */
+
+ ptep = pte_offset_map(pmd, start);
+ pte = *ptep;
+ if (!pte_present(pte)) {
+ sif_log(sdev, SIF_MMU, "Page not present, bugging out..");
+ BUG();
+ goto err;
+ }
+
+ nptes = PAGE_ALIGN(len + (start & ~PAGE_MASK)) >> PAGE_SHIFT;
+ if (nptes == 1) {
+ set_ctx_w_page(sdev, ctx, PAGE_LEVEL0, PAGE_SIZE_IA32E_4KB, pte_val(pte));
+ } else if (npmds == 1) {
+ set_ctx_w_page(sdev, ctx, PAGE_LEVEL1, PAGE_SIZE_IA32E_4KB, pmd_val(*pmd));
+ } else if (npuds == 1) {
+ set_ctx_w_page(sdev, ctx, PAGE_LEVEL2, PAGE_SIZE_IA32E_4KB, pud_val(*pud));
+ } else if (npgds == 1) {
+ set_ctx_w_page(sdev, ctx, PAGE_LEVEL3, PAGE_SIZE_IA32E_4KB, pgd_val(*pgd));
+#ifdef CONFIG_X86
+ } else {
+ set_ctx_w_page(sdev, ctx, PAGE_LEVEL4, PAGE_SIZE_IA32E_4KB, read_cr3());
+#endif
+ }
+ goto out;
+err:
+ sif_log(sdev, SIF_MMU, "Error in setting mmu context");
+ ret = -1;
+out:
+ return ret;
+}
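+
+/* Worked example (illustrative only, assuming x86-64 with 4K base pages and
+ * no huge page mappings): for a region with start = 0x7f0000001000 and
+ * len = 0x3000 the counts become
+ *
+ *	nptes = 3, npmds = 1, npuds = 1, npgds = 1
+ *
+ * so the nptes == 1 test fails and the npmds == 1 branch is taken: the
+ * context points at the pmd with table_level PAGE_LEVEL1, and the hardware
+ * continues the walk through the shared OS page table from there.
+ */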
+
+void sif_spt_unmap_gva_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *sctx)
+{
+ u64 start = sctx->base;
+ u64 len = sctx->size;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep, pte;
+
+ int npgds, npuds, npmds, nptes;
+
+ sif_log(sdev, SIF_MMU, "start 0x%llx len 0x%llx", start, len);
+
+ if (len == 0)
+ goto err;
+
+ pgd = (pgd_t *)sctx->pt;
+ if (pgd_none(*pgd)) {
+ sif_log(sdev, SIF_MMU, "Table entry(pgd) already freed");
+ goto out;
+ }
+
+ pud = pud_offset(pgd, start);
+ if (pud_none(*pud)) {
+ sif_log(sdev, SIF_MMU, "Table entry(pud) already freed");
+ goto out;
+ }
+
+ npgds = PGDIR_ALIGN(len + (start & ~PGDIR_MASK)) >> PGDIR_SHIFT;
+ npuds = PUD_ALIGN(len + (start & ~PUD_MASK)) >> PUD_SHIFT;
+
+#ifndef __aarch64__
+ if (pud_large(*pud)) {
+ ptep = (pte_t *) pud;
+ pte = *ptep;
+
+ if (!pte_present(pte)) {
+ sif_log(sdev, SIF_MMU,
+ "Page not present, bugging out..");
+ BUG();
+ goto err;
+ }
+ goto out;
+ }
+#endif /* !__aarch64__ */
+
+ pmd = pmd_offset(pud, start);
+ if (pmd_none(*pmd)) {
+ sif_log(sdev, SIF_MMU, "Table entry(pmd) already freed");
+ goto out;
+ }
+
+ npmds = PMD_ALIGN(len + (start & ~PMD_MASK)) >> PMD_SHIFT;
+
+#ifndef __aarch64__
+ if (pmd_large(*pmd)) {
+ ptep = (pte_t *) pmd;
+ pte = *ptep;
+
+ if (!pte_present(pte)) {
+ sif_log(sdev, SIF_MMU,
+ "Page not present, bugging out..");
+ BUG();
+ goto err;
+ }
+ goto out;
+ }
+#endif /* !__aarch64__ */
+
+ ptep = pte_offset_map(pmd, start);
+ pte = *ptep;
+ if (!pte_present(pte)) {
+ sif_log(sdev, SIF_MMU, "Page not present, bugging out..");
+ BUG();
+ goto err;
+ }
+
+ nptes = PAGE_ALIGN(len + (start & ~PAGE_MASK)) >> PAGE_SHIFT;
+
+ goto out;
+err:
+ sif_log(sdev, SIF_MMU, "Error releasing mmu context");
+out:
+ return;
+}
+
--- /dev/null
+/*
+ * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_spt.h: Experimental (still unsafe)
+ * implementation of direct use of the operating system's
+ * page tables (shared page tables)
+ */
+
+#ifndef _SIF_SPT_H
+#define _SIF_SPT_H
+
+struct sif_dev;
+struct sif_mmu_ctx;
+
+
+#define PSIF_TABLE_PTR_SHIFT 52
+#define PSIF_TABLE_PTR_SIZE (_AC(1, UL) << PSIF_TABLE_PTR_SHIFT)
+#define PSIF_TABLE_PTR_MASK (~(PSIF_TABLE_PTR_SIZE-1))
+
+int sif_spt_map_gva_ctx(struct sif_dev *sdev,
+ struct sif_mmu_ctx *ctx,
+ struct sif_mem *mem,
+ bool write);
+
+void sif_spt_unmap_gva_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_sq.c: Implementation of the send queue side of an IB queue pair
+ */
+
+#include <rdma/ib_verbs.h>
+#include "sif_dev.h"
+#include "sif_base.h"
+#include "sif_defs.h"
+#include "sif_dma.h"
+#include "sif_mmu.h"
+#include "sif_pt.h"
+#include "sif_mr.h"
+#include "sif_sq.h"
+#include "sif_hwi.h"
+#include "psif_hw_setget.h"
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+/* Figure out the minimal space needed in each send queue element
+ * given the input sizes.
+ *
+ * We also use this space to collapse sg entries if we need to emulate more
+ * sg entries in software than what hardware supports.
+ *
+ * TBD: Note that the SQS sometimes checksums more data
+ * (up to 256 bytes depending on max_inline??) which we then cannot use
+ * as sg list data area.
+ * Note also that no sgl is needed in PSIF for the single sg entry case:
+ */
+
+static u32 compute_sq_extent(u32 sge_entries, u32 max_inline_data,
+ u32 *sgl_offset, u32 *min_extent_p,
+ u32 *sgl_size_p, u32 *max_inline_p)
+{
+ u32 hw_sge_entries = min_t(u32, SIF_HW_MAX_SEND_SGE, sge_entries);
+ u32 sgl_size = sge_entries > 1 ? hw_sge_entries * sizeof(struct psif_wr_local) : 0;
+ u32 xsge = sge_entries - hw_sge_entries;
+
+ /* This amount must be reserved for 0-padded inline data due to
+ * restrictions in the SQS:
+ */
+ u32 sqs_headroom = min(256U, ((max_inline_data + 63U) & ~63U));
+ u32 sqs_inline_extra =
+ max_inline_data > sqs_headroom ? max_inline_data - sqs_headroom : 0;
+
+ /* This applies to UD only, with max 4K message size:
+ * Set aside room for inlining of @xsge sg entries.
+ * Average size of an sge entry will be max 256 bytes, add an extra
+ * 256 to handle the case where we cannot use the initial inline space:
+ */
+ u32 xsge_space = !xsge ? 0 : (xsge + 2) * 256;
+
+ u32 min_extent = sizeof(struct psif_wr)
+ + sqs_headroom
+ + max(max(sqs_inline_extra, sgl_size), xsge_space);
+
+ u32 real_extent = roundup_pow_of_two(min_extent);
+
+ if (sgl_offset)
+ *sgl_offset = real_extent - sgl_size;
+ if (sgl_size_p)
+ *sgl_size_p = sgl_size;
+ if (min_extent_p)
+ *min_extent_p = min_extent;
+ if (max_inline_p)
+ *max_inline_p = max_t(int, xsge_space - sqs_headroom, sqs_inline_extra);
+ return real_extent;
+}
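+
+/* Worked example (illustrative only - the psif struct sizes below are
+ * assumptions, not values taken from psif_hw_data): with
+ * sizeof(struct psif_wr) == 64, sizeof(struct psif_wr_local) == 16,
+ * sge_entries = 4 and max_inline_data = 64:
+ *
+ *	hw_sge_entries = 4, sgl_size = 64, xsge = 0
+ *	sqs_headroom = 64, sqs_inline_extra = 0, xsge_space = 0
+ *	min_extent = 64 + 64 + 64 = 192 -> real_extent = 256
+ *	sgl_offset = 256 - 64 = 192
+ */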
+
+
+int sif_alloc_sq(struct sif_dev *sdev, struct sif_pd *pd,
+ struct sif_qp *qp, struct ib_qp_cap *cap,
+ bool user_mode, int wr_hdl_sz)
+{
+ /* Send queues always use the same index as the corresponding QP */
+ int ret = 0;
+ int extent_log2;
+ struct sif_sq *sq;
+ struct sif_sq_sw *sq_sw;
+ struct psif_sq_hw *sq_hw_p;
+ struct psif_sq_rspq *sq_rspq_p;
+ struct psif_sq_sw lsq_sw;
+ struct psif_sq_hw lsq_hw;
+ struct psif_sq_entry sqe;
+
+ u32 min_entries = cap->max_send_wr;
+ u32 max_entries;
+ u32 entries_log2;
+ u32 min_extent;
+ u32 sgl_size;
+ u32 max_inline;
+ u64 alloc_sz;
+ dma_addr_t dma_start;
+ bool need_page_aligned;
+ bool need_wa_4049 = PSIF_REVISION(sdev) <= 3;
+
+
+ max_entries = roundup_pow_of_two(max(2U, min_entries));
+ entries_log2 = order_base_2(max_entries);
+
+ if (entries_log2 > SIF_SW_MAX_SQE_LOG2) {
+ sif_log(sdev, SIF_INFO,
+ "requested %d entries -> %d but sif only supports %d",
+ cap->max_send_wr, max_entries, SIF_SW_MAX_SQE);
+ return -ENFILE; /* Limited by 4 bit size_log2 field in sq desc */
+ }
+
+ sq = get_sif_sq(sdev, qp->qp_idx);
+ sq_sw = get_sif_sq_sw(sdev, qp->qp_idx);
+ sq->index = qp->qp_idx;
+ sq->wr_hdl_sz = wr_hdl_sz;
+
+ /* The IB standard requires ssn = 1 for the first packet on a QP, and
+ * psif uses the send queue sequence number as ssn, so we must initialize
+ * such that the first packet is sent from index 1.
+ * Note also that the psif send queue keeps last_seq == last used sequence
+ * number, rather than next_seq == next sequence number to use.
+ * NB! This applies to the send queue only - all other queues start at index 0!
+ */
+ sq_sw->last_seq = sq_sw->head_seq = 0;
+
+ sq_hw_p = get_sq_hw(sdev, qp->qp_idx);
+
+ sq->entries = max_entries;
+ sq->mask = max_entries - 1;
+ sq->sg_entries = need_wa_4049 ? roundup_pow_of_two(cap->max_send_sge) : cap->max_send_sge;
+
+ sq->extent = compute_sq_extent(sq->sg_entries, cap->max_inline_data,
+ &sq->sgl_offset, &min_extent, &sgl_size, &max_inline);
+
+ qp->max_inline_data = cap->max_inline_data;
+ if (sq->extent > min_extent) {
+ int extra_extent = sq->extent - min_extent;
+
+ if (sq->sg_entries > SIF_HW_MAX_SEND_SGE) {
+ qp->max_inline_data = max_inline + extra_extent;
+ } else if (cap->max_inline_data >= 256) {
+ sif_log(sdev, SIF_QP, "QP %d has room for %d bytes of extra inline space",
+ qp->qp_idx, extra_extent);
+ qp->max_inline_data += extra_extent;
+ }
+ }
+
+ extent_log2 = order_base_2(sq->extent);
+ alloc_sz = max_entries * sq->extent;
+
+ /* Only whole pages may be exposed to user space.
+ * For simplicity we impose the same requirement on reliable QPs, since
+ * their SQs have to be page aligned to ensure proper access from SQ_CMPL:
+ */
+ need_page_aligned = user_mode || reliable_qp(qp->type);
+
+ if (need_page_aligned && (alloc_sz & ~PAGE_MASK))
+ alloc_sz = (alloc_sz + ~PAGE_MASK) & PAGE_MASK;
+ sq->user_mode = user_mode;
+
+ if (alloc_sz <= SIF_MAX_CONT)
+ sq->mem = sif_mem_create_dmacont(sdev, alloc_sz, GFP_KERNEL, DMA_BIDIRECTIONAL);
+ else {
+ alloc_sz = (alloc_sz + ~PMD_MASK) & PMD_MASK;
+ sq->mem = sif_mem_create(sdev, alloc_sz >> PMD_SHIFT,
+ alloc_sz, SIFMT_2M, GFP_KERNEL | __GFP_ZERO,
+ DMA_BIDIRECTIONAL);
+ }
+ if (!sq->mem) {
+ sif_log(sdev, SIF_INFO, "Failed to allocate %llu bytes of SQ buffer pool",
+ alloc_sz);
+ ret = -ENOMEM;
+ goto err_alloc_dma;
+ }
+
+ dma_start = sif_mem_dma(sq->mem, 0);
+
+ sif_log(sdev, SIF_QP, "SQ dma %pad va 0x%p, sz %d, min_extent %d -> extent %d",
+ &dma_start, sif_mem_kaddr(sq->mem, 0), sq->entries, min_extent, sq->extent);
+ sif_log(sdev, SIF_SQ, "SQ wr sz %ld, sgl_offset/sz %d/%d, max_inline %d, max sge %d",
+ sizeof(sqe.wr), sq->sgl_offset, sgl_size,
+ qp->max_inline_data, sq->sg_entries);
+
+ sq->wr_hdl = kzalloc(max_entries * sq->wr_hdl_sz, GFP_KERNEL);
+ if (!sq->wr_hdl) {
+ sif_log(sdev, SIF_INFO, "Failed to allocate wr_hdl table!");
+ ret = -ENOMEM;
+ goto err_alloc_wrid;
+ }
+
+ if (qp->type != PSIF_QP_TRANSPORT_MANSP1 && (qp->max_inline_data || sgl_size)) {
+ /* Allocate a DMA validation entry to be used for sif to access
+ * s/g lists, which we put in the spare space between entries
+ * in the send queue. This MR is also used by the SQS to access
+ * inline data.
+ */
+ sq->sg_mr = alloc_mr(sdev, pd, sq->mem, dma_start, 0);
+ if (IS_ERR(sq->sg_mr)) {
+ ret = PTR_ERR(sq->sg_mr);
+ sif_log(sdev, SIF_INFO, "Failed to allocate lkey for s/g list (%d)",
+ ret);
+ goto err_alloc_sg_mr;
+ }
+ }
+
+ /* Initialize hw part of descriptor */
+ memset(&lsq_hw, 0, sizeof(lsq_hw));
+
+ lsq_hw.size_log2 = entries_log2;
+ lsq_hw.extent_log2 = extent_log2;
+ /* TBD: mmu_context */
+
+ /* See comment above */
+ lsq_hw.last_seq = 0;
+ lsq_hw.base_addr = dma_start;
+ lsq_hw.sq_max_inline = min(256U, qp->max_inline_data);
+ lsq_hw.sq_max_sge = sq->sg_entries - 1;
+
+ /* These are needed for sq mode to work */
+ lsq_hw.sq_next.next_qp_num = 0xffffff;
+ lsq_hw.sq_next.next_null = 0xff;
+
+ /* Allocate mmu context for the send queue - only read access needed
+ * for the queue itself:
+ */
+ ret = sif_map_ctx(sdev, &sq->mmu_ctx, sq->mem, lsq_hw.base_addr,
+ alloc_sz, false);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "Failed to set mmu context for sq %d",
+ sq->index);
+ goto err_map_ctx;
+ }
+
+
+ lsq_hw.mmu_cntx = sq->mmu_ctx.mctx;
+
+ /* Write network byte order copy */
+ copy_conv_to_hw(sq_hw_p, &lsq_hw, sizeof(lsq_hw));
+
+ /* Initialize sw part of descriptor */
+ memset(&lsq_sw, 0, sizeof(lsq_sw));
+
+ copy_conv_to_hw(&sq_sw->d, &lsq_sw, sizeof(lsq_sw));
+
+ spin_lock_init(&sq->lock);
+
+ sq_rspq_p = get_sq_rspq(sdev, qp->qp_idx);
+
+ /* We need to set the (network byte order)
+ * fields next_qp_num and rspq_next to all 1's (see bug 3479)
+ * TBD: This needs to be properly set up in psifapi
+ */
+ sq_rspq_p->something_tbd[0] = (u64)-1;
+ return 0;
+
+ sif_unmap_ctx(sdev, &sq->mmu_ctx);
+err_map_ctx:
+ if (sq->sg_mr)
+ dealloc_mr(sdev, sq->sg_mr);
+err_alloc_sg_mr:
+ kfree(sq->wr_hdl);
+err_alloc_wrid:
+ sif_mem_free(sq->mem);
+err_alloc_dma:
+ return ret;
+}
+
+
+int sif_flush_sqs(struct sif_dev *sdev, struct sif_sq *sq)
+{
+ ulong start_time = jiffies;
+ ulong timeout = start_time + sdev->min_resp_ticks * 2;
+ struct sif_qp *qp = get_sif_qp(sdev, sq->index);
+ bool sqs_idle = false;
+ u32 sq_next;
+ u32 prev_sq_next;
+ struct psif_wr wr;
+ struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, sq->index);
+
+ if (qp->ibqp.xrcd) /* XRC target QPs don't have any valid SQS setup */
+ return 0;
+
+ memset(&wr, 0, sizeof(struct psif_wr));
+ wr.local_qp = sq->index;
+
+ /* Trigger a stop of SQS (rev2 feature) */
+ sif_doorbell_write(qp, &wr, false);
+
+ prev_sq_next = sq_next = get_psif_sq_hw__sq_next(&sq->d);
+
+ sif_log(sdev, SIF_SQ, "Entering sq_hw poll for sq %d: last_seq %d head_seq %d sq_next %x",
+ sq->index, sq_sw->last_seq, sq_sw->head_seq, sq_next);
+ for (;;) {
+ if (!sqs_idle) {
+ sqs_idle = get_psif_sq_hw__destroyed(&sq->d);
+ if (sqs_idle) {
+ rmb(); /* Make sure we observe sq_next after the
+ * destroyed bit has been set
+ */
+ sq_next = get_psif_sq_hw__sq_next(&sq->d);
+ }
+ }
+ if (sqs_idle && sq_next == 0xffffffff)
+ break;
+ if (sq_next != prev_sq_next) {
+ /* Reset timeout */
+ timeout = jiffies + sdev->min_resp_ticks * 2;
+ sif_log(sdev, SIF_INFO_V, "sq %d: sq_next moved from %d -> %d",
+ sq->index, prev_sq_next, sq_next);
+ } else if (time_is_before_jiffies(timeout)) {
+ if (sif_feature(pcie_trigger))
+ force_pcie_link_retrain(sdev);
+ sif_log(sdev, SIF_INFO,
+ "Error: sq %d timed out - waited %d ms for SQ flush. Idle:%d sq_next:%x",
+ sq->index, jiffies_to_msecs(jiffies - start_time), sqs_idle, sq_next);
+ return -ETIMEDOUT;
+ }
+ /* TBD: No sleep necessary as this should be really quick (?) */
+ cpu_relax();
+ prev_sq_next = sq_next;
+ sq_next = get_psif_sq_hw__sq_next(&sq->d);
+ }
+
+ sif_log(sdev, SIF_SQ, " sq %d: done waiting for SQS to finish", sq->index);
+ return 0;
+}
+
+
+void sif_free_sq(struct sif_dev *sdev, struct sif_qp *qp)
+{
+ struct sif_sq *sq;
+ volatile struct psif_sq_hw *sq_hw_p;
+ volatile struct psif_sq_sw *sq_sw_p;
+
+ int index = qp->qp_idx;
+
+ sq = get_sif_sq(sdev, index);
+ sif_log(sdev, SIF_SQ, "idx %d", sq->index);
+
+ sq_sw_p = get_sq_sw(sdev, index);
+ sq_hw_p = &sq->d;
+
+ if (reliable_qp(qp->type) && qp->sq_cmpl_map_valid)
+ sif_sq_cmpl_unmap_sq(sdev, sq);
+
+ sif_unmap_ctx(sdev, &sq->mmu_ctx);
+
+ /* We clear the whole sq field including sq_hw below */
+ sif_clear_sq_sw(sdev, index);
+
+ if (sq->sg_mr)
+ dealloc_mr(sdev, sq->sg_mr);
+
+ sif_mem_free(sq->mem);
+ kfree(sq->wr_hdl);
+ memset(sq, 0, sizeof(struct sif_sq));
+}
+
+
+/* Setup of the root node(s) of a page table mapping all
+ * active send queues:
+ */
+int sif_sq_cmpl_setup(struct sif_table *tp)
+{
+ u32 max_sq_extent = compute_sq_extent(16, sif_max_inline,
+ NULL, NULL, NULL, NULL);
+ struct sif_dev *sdev = tp->sdev;
+
+ tp->ext_sz = SIF_SW_MAX_SQE * max_sq_extent; /* Largest possible send queue */
+ tp->table_sz = (size_t)tp->ext_sz * tp->entry_cnt;
+ tp->sif_base = SIF_SQ_CMPL_START;
+ tp->mem = sif_mem_create_ref(sdev, SIFMT_CS, tp->sif_base, tp->table_sz,
+ GFP_KERNEL);
+
+ sif_log(sdev, SIF_SQ, "ext.sz %d entry cnt %d max sq extent 0x%x tbl.sz 0x%lx",
+ tp->ext_sz, tp->entry_cnt, max_sq_extent, tp->table_sz);
+ return 0;
+}
+
+
+/* Map/unmap the page table of a send queue in the sq_cmpl mapping
+ * The way to map it depends on the map type of the send queue itself:
+ */
+int sif_sq_cmpl_map_sq(struct sif_dev *sdev, struct sif_sq *sq)
+{
+ struct sif_table *sctp = &sdev->ba[sq_cmpl];
+
+ /* Start offset of this send queue in the large virtual sq_cmpl mapping: */
+ u64 virt_base = sctp->mmu_ctx.base + (u64)sq->index * sctp->ext_sz;
+ u64 size = sq->mem->size;
+
+ return sif_map_ctx_part(sdev, &sctp->mmu_ctx, sq->mem, virt_base, size);
+}
+
+
+int sif_sq_cmpl_unmap_sq(struct sif_dev *sdev, struct sif_sq *sq)
+{
+ struct sif_table *sctp = &sdev->ba[sq_cmpl];
+
+ /* Start offset of this send queue in the large virtual sq_cmpl mapping: */
+ u64 virt_base = sctp->mmu_ctx.base + (u64)sq->index * sctp->ext_sz;
+ u64 size = sq->mem->size;
+
+ sif_log(sdev, SIF_SQ, "sq %d, virt_base 0x%llx size 0x%llx", sq->index, virt_base, size);
+ return sif_unmap_gva_ctx_part(sdev, &sctp->mmu_ctx, virt_base, size);
+}
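+
+/* Example (illustrative numbers only): with sctp->mmu_ctx.base == 0 and
+ * sctp->ext_sz == 0x200000, the send queue with index 5 occupies the virtual
+ * range [0xa00000, 0xa00000 + sq->mem->size) in the sq_cmpl mapping, and the
+ * map/unmap calls above operate on exactly that range.
+ */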
+
+
+void sif_dfs_print_sq_hw(struct seq_file *s, struct sif_dev *sdev, loff_t pos)
+{
+ struct sif_sq *sq;
+ int qlen;
+ u32 head, tail;
+ struct psif_sq_hw lhw;
+ struct sif_sq_sw *sq_sw;
+ struct sif_qp *qp;
+ int tsv;
+
+ if (unlikely(pos < 0)) {
+ seq_puts(s, "# N = next_null, T = sq_timestamp_valid, D = sq_done, X = destroyed\n");
+ seq_puts(s, "# [----------------------- sw view ----------------------] [----------- hw view ------------]\n");
+ seq_puts(s, "# Index cq_idx head tail q_sz q_len q_high max_sge inline head tail n.qp N T D X\n");
+ return;
+ }
+ sq = get_sif_sq(sdev, pos);
+ sq_sw = get_sif_sq_sw(sdev, pos);
+ qp = get_sif_qp(sdev, pos);
+
+ /* Check for QP0/1 which are reserved but not initialized */
+ if (sq->entries == 0)
+ return;
+
+ head = sq_sw->head_seq;
+ tail = sq_sw->last_seq;
+ qlen = sq_length(sq, head, tail);
+
+ copy_conv_to_sw(&lhw, &sq->d, sizeof(lhw));
+ tsv = lhw.sq_timestamp_valid;
+
+ seq_printf(s, "%7lld %7d %8d %8d %8d %9d %9d %7d %6d %8d%8d %06x %2x %d %d %d\n",
+ pos,
+ sq->cq_idx, head, tail, sq->entries, qlen, sq->max_outstanding,
+ sq->sg_entries, qp->max_inline_data,
+ get_psif_sq_sw__tail_indx(&sq_sw->d), lhw.last_seq,
+ lhw.sq_next.next_qp_num, lhw.sq_next.next_null,
+ tsv, lhw.sq_done, lhw.destroyed);
+}
+
+
+void sif_dfs_print_sq_cmpl(struct seq_file *s, struct sif_dev *sdev, loff_t pos)
+{
+ struct sif_sq *sq;
+ struct sif_qp *qp;
+ struct sif_table *sctp = &sdev->ba[sq_cmpl];
+ u64 virt_base;
+ dma_addr_t val;
+ u64 pte_cnt, i;
+ dma_addr_t dma_start;
+ struct sif_mmu_ctx *ctx = &sctp->mmu_ctx;
+
+ if (unlikely(pos < 0)) {
+ u64 table_ptr = sif_pt_dma_root(ctx->pt);
+
+ seq_printf(s, "# - mmu_cntx: root %016llx, level %d\n",
+ table_ptr, sctp->mmu_ctx.mctx.table_level);
+ seq_puts(s, "# Index psif vaddr #pages @pte[0] pte[0..]\n");
+ return;
+ }
+ sq = get_sif_sq(sdev, pos);
+ qp = get_sif_qp(sdev, pos);
+ virt_base = sctp->mmu_ctx.base + (u64)sq->index * sctp->ext_sz;
+
+ /* Check for QP0/1 which are reserved but not initialized */
+ if (sq->entries == 0)
+ return;
+
+ /* Only QPs with multipacket support are mapped here: */
+ if (!reliable_qp(qp->type))
+ return;
+
+ if (sif_pt_entry(ctx->pt, virt_base, &dma_start, &val))
+ return;
+
+ pte_cnt = 1; /* TBD: read the correct value to report all pages the pt refers to */
+ seq_printf(s, " %6lld %016llx %6lld @%pad: [", pos, virt_base, pte_cnt, &dma_start);
+ for (i = 0; i < pte_cnt; i++) {
+ if (i > 0)
+ seq_puts(s, ",");
+ seq_printf(s, "%pad", &val);
+ }
+ seq_puts(s, "]\n");
+}
+
+
+bool multipacket_qp(enum psif_qp_trans type)
+{
+ switch (type) {
+ case PSIF_QP_TRANSPORT_RC:
+ case PSIF_QP_TRANSPORT_UC:
+ case PSIF_QP_TRANSPORT_XRC:
+ return true;
+ default:
+ return false;
+ }
+}
+
+
+bool reliable_qp(enum psif_qp_trans type)
+{
+ return
+ type == PSIF_QP_TRANSPORT_RC ||
+ type == PSIF_QP_TRANSPORT_XRC;
+}
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_sq.h: Implementation of the send queue side of an IB queue pair
+ */
+
+#ifndef __SIF_SQ_H
+#define __SIF_SQ_H
+
+struct sif_sq_hdl {
+ u64 wr_id; /* Stored work id */
+ u32 sq_seq; /* Extra sanity checks */
+ bool used;
+};
+
+
+struct sif_sq {
+ volatile struct psif_sq_hw d; /* Hardware descriptor */
+ /* Serializes access to sq_sw->last_seq (alloc of new sqes): */
+ spinlock_t lock ____cacheline_internodealigned_in_smp;
+ struct sif_mmu_ctx mmu_ctx;
+ int index; /* Send queue index (same as the qp index) */
+ int cq_idx; /* Default send compl.queue index to use */
+ u32 sg_entries; /* Max send scatter/gather configured for this sq */
+ u16 entries;
+ u16 mask; /* entries - 1 for modulo using & */
+ u16 max_outstanding; /* Longest observed send queue len */
+ u8 complete_all; /* Gets or'ed into completion bit in WRs */
+ u32 extent;
+ u32 sgl_offset; /* Offset from start of the sqe where the sgl starts */
+ bool user_mode; /* Set if this is an SQ to be mapped to user space */
+ struct sif_mem *mem; /* Allocated queue memory */
+ void *wr_hdl; /* map from sq entry index to wr_id + optional bookkeeping */
+ int wr_hdl_sz; /* Sz of each elem. in wr_hdl - PQP and std send path uses different sizes */
+ struct sif_mr *sg_mr; /* DMA val.entry for the sge list when in the send queue */
+ struct psif_rq_scatter tmp_sge[16]; /* Temp.storage for buildup of LE sge list */
+};
+
+
+/* Lookup function for the handle for a particular request: */
+static inline struct sif_sq_hdl *get_sq_hdl(struct sif_sq *sq, u32 seq)
+{
+ return (struct sif_sq_hdl *)(sq->wr_hdl + sq->wr_hdl_sz * (seq & sq->mask));
+}
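+
+/* Typical usage (illustrative sketch - the local variable names are made up):
+ * record the caller's wr_id when a new sqe with sequence number sq_seq is
+ * allocated, and look it up again at completion time:
+ *
+ *	struct sif_sq_hdl *wh = get_sq_hdl(sq, sq_seq);
+ *
+ *	wh->wr_id = wr->wr_id;
+ *	wh->sq_seq = sq_seq;
+ *	wh->used = true;
+ */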
+
+int sif_sq_cmpl_setup(struct sif_table *tp);
+
+int sif_alloc_sq(struct sif_dev *sdev, struct sif_pd *pd,
+ struct sif_qp *qp, struct ib_qp_cap *cap,
+ bool user_mode, int sq_hdl_sz);
+
+void sif_free_sq(struct sif_dev *sdev, struct sif_qp *qp);
+
+int sif_flush_sqs(struct sif_dev *sdev, struct sif_sq *sq);
+
+int sif_sq_cmpl_map_sq(struct sif_dev *sdev, struct sif_sq *sq);
+int sif_sq_cmpl_unmap_sq(struct sif_dev *sdev, struct sif_sq *sq);
+
+/* Line printers for debugfs files */
+void sif_dfs_print_sq_hw(struct seq_file *s, struct sif_dev *sdev, loff_t pos);
+void sif_dfs_print_sq_cmpl(struct seq_file *s, struct sif_dev *sdev, loff_t pos);
+
+bool multipacket_qp(enum psif_qp_trans type);
+bool reliable_qp(enum psif_qp_trans type);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_srq.c: Interface to shared receive queues for SIF
+ */
+
+#include <rdma/ib_verbs.h>
+#include "sif_dev.h"
+#include "sif_qp.h"
+#include "sif_srq.h"
+#include "sif_base.h"
+#include "sif_defs.h"
+#include "sif_sndrcv.h"
+
+struct ib_srq *sif_create_srq(struct ib_pd *ibpd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata)
+{
+ int rq_idx;
+ struct sif_dev *sdev = to_sdev(ibpd->device);
+ struct sif_rq *rq;
+ ulong user_flags = 0;
+ int ret = 0;
+ bool user_mode = udata != NULL;
+
+ if (sif_feature(disable_srq))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (udata) {
+ struct sif_create_srq_ext cmd;
+
+ ret = ib_copy_from_udata(&cmd, udata, sizeof(cmd));
+ if (ret)
+ goto err_create_srq;
+ user_flags = cmd.flags;
+
+ if (sif_vendor_enable(SVF_kernel_mode, user_flags))
+ user_mode = false;
+ }
+
+ sif_log(sdev, SIF_SRQ, "%s", (user_mode ? "(user)" : "(kernel)"));
+
+ rq_idx = alloc_rq(sdev, to_spd(ibpd), srq_init_attr->attr.max_wr,
+ srq_init_attr->attr.max_sge, srq_init_attr, user_mode);
+ if (rq_idx < 0) {
+ ret = rq_idx;
+ goto err_create_srq;
+ }
+
+ rq = get_sif_rq(sdev, rq_idx);
+
+ if (udata) {
+ struct sif_create_srq_resp_ext resp;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.index = rq_idx;
+ resp.extent = rq->extent;
+ ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
+ if (ret)
+ goto err_udata;
+ }
+
+ srq_init_attr->attr.max_wr = rq->entries_user;
+
+ return &rq->ibsrq;
+err_udata:
+ free_rq(sdev, rq->index);
+err_create_srq:
+ return ERR_PTR(ret);
+}
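+
+/* From the verbs consumer side an SRQ that ends up in sif_create_srq() is
+ * created with the standard API (illustrative sketch, the attribute values
+ * are arbitrary):
+ *
+ *	struct ib_srq_init_attr init_attr = {
+ *		.attr = { .max_wr = 256, .max_sge = 4 },
+ *	};
+ *	struct ib_srq *srq = ib_create_srq(pd, &init_attr);
+ *
+ *	if (IS_ERR(srq))
+ *		return PTR_ERR(srq);
+ */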
+
+#define ARM_SRQ_HOLDOFF (10 + jiffies)
+
+static int sif_arm_srq(struct sif_dev *sdev, struct sif_rq *srq, u32 srq_limit)
+{
+ int ret;
+ struct psif_wr wr;
+ struct psif_cq_entry *cqe;
+ DECLARE_SIF_CQE_POLL_WITH_RR_PQP(sdev, lcqe);
+ struct sif_pqp *pqp = lcqe.pqp;
+
+ if (unlikely(!pqp))
+ return -EAGAIN;
+
+ memset(&wr, 0, sizeof(struct psif_wr));
+
+ wr.completion = 1;
+ wr.op = PSIF_WR_SET_SRQ_LIM;
+ wr.details.su.srq_lim = srq_limit;
+ wr.details.su.u2.rq_id = srq->index;
+
+try_again:
+ if (time_is_after_jiffies((unsigned long)atomic64_read(&pqp->qp->arm_srq_holdoff_time))) {
+ cpu_relax();
+ goto try_again;
+ }
+
+ atomic64_set(&pqp->qp->arm_srq_holdoff_time, ARM_SRQ_HOLDOFF);
+ pqp->qp->srq_idx = srq->index;
+
+ ret = sif_pqp_poll_wr(sdev, &wr, &lcqe);
+ if (ret < 0) {
+ sif_log(sdev, SIF_INFO, "pqp request failed with errno %d", ret);
+ return ret;
+ }
+
+ cqe = &lcqe.cqe;
+ if (cqe->status != PSIF_WC_STATUS_SUCCESS) {
+ sif_log(sdev, SIF_INFO, "failed with status %s(%d) for cq_seq %d",
+ string_enum_psif_wc_status(cqe->status), cqe->status, cqe->seq_num);
+ return -EIO;
+ }
+
+ srq->srq_limit = srq_limit;
+
+ return 0;
+}
+
+int sif_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+ struct sif_dev *sdev = to_sdev(ibsrq->device);
+ struct sif_rq *srq = to_srq(ibsrq);
+ u16 srq_limit;
+ int ret;
+
+ if (attr_mask & IB_SRQ_MAX_WR) {
+ sif_log(sdev, SIF_SRQ, "SRQ_MAX_WR not supported");
+ return -EINVAL;
+ }
+
+ if (attr_mask & IB_SRQ_LIMIT) {
+ srq_limit = attr->srq_limit & 0x3fff;
+ if (srq_limit >= srq->entries)
+ return -EINVAL;
+
+ ret = sif_arm_srq(sdev, srq, srq_limit);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
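+
+/* The SRQ limit is armed by the consumer via the standard verbs call
+ * (illustrative sketch, the limit value is arbitrary):
+ *
+ *	struct ib_srq_attr attr = { .srq_limit = 16 };
+ *	int ret = ib_modify_srq(srq, &attr, IB_SRQ_LIMIT);
+ *
+ * which lands in sif_modify_srq() above and, via sif_arm_srq(), posts a
+ * PSIF_WR_SET_SRQ_LIM request on a privileged QP.
+ */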
+
+int sif_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+ struct sif_rq *srq = to_srq(ibsrq);
+
+ attr->max_wr = srq->entries;
+ attr->max_sge = srq->sg_entries;
+ attr->srq_limit = srq->srq_limit;
+
+ return 0;
+}
+
+int sif_destroy_srq(struct ib_srq *ibsrq)
+{
+ int sts;
+ struct sif_dev *sdev = to_sdev(ibsrq->device);
+ struct sif_rq *rq = to_srq(ibsrq);
+
+ sif_log(sdev, SIF_SRQ, "rq %d", rq->index);
+
+ if (atomic_read(&rq->refcnt) > 1)
+ return -EBUSY;
+
+ /* An SRQ cannot be flushed with flushed-in-error completions
+ * as we don't know which completion queue to generate
+ * the flushed-in-error completions for, and this should be fine
+ * from a standards perspective:
+ * IB spec refs: 10.2.9.4, 11.2.3.4.
+ */
+ sts = sif_invalidate_rq_hw(sdev, rq->index, PCM_WAIT);
+ if (sts) {
+ sif_log(sdev, SIF_INFO,
+ "Invalidate rq_hw failed");
+ }
+
+ return free_rq(sdev, rq->index);
+}
+
+int sif_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr)
+{
+ struct sif_dev *sdev = to_sdev(ibsrq->device);
+ struct sif_rq *rq = to_srq(ibsrq);
+
+ sif_logi(ibsrq->device, SIF_SRQ, "rq %d (SRQ)", rq->index);
+
+ return post_recv(sdev, NULL, rq, recv_wr, bad_recv_wr);
+}
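+
+/* Receive buffers are posted to the SRQ through the standard verbs call
+ * (illustrative sketch - dma_addr, len, lkey and cookie are placeholders):
+ *
+ *	struct ib_sge sge = { .addr = dma_addr, .length = len, .lkey = lkey };
+ *	struct ib_recv_wr wr = { .wr_id = cookie, .sg_list = &sge, .num_sge = 1 };
+ *	struct ib_recv_wr *bad_wr;
+ *	int ret = ib_post_srq_recv(srq, &wr, &bad_wr);
+ */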
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_srq.h: Interface to internal Shared receive queue logic for SIF
+ */
+
+#ifndef __SIF_SRQ_H
+#define __SIF_SRQ_H
+
+struct ib_srq *sif_create_srq(struct ib_pd *ibpd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata);
+int sif_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr,
+ enum ib_srq_attr_mask srq_attr_mask, struct ib_udata *udata);
+int sif_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr);
+int sif_destroy_srq(struct ib_srq *ibsrq);
+
+int sif_post_srq_recv(struct ib_srq *ibsrq,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Wei Lin Guay <wei.lin.guay@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_tqp.c: Implementation of EPSA tunneling QP for SIF
+ */
+#include <linux/sched.h>
+#include <rdma/ib_verbs.h>
+#include "sif_tqp.h"
+#include "psif_hw_setget.h"
+#include "sif_defs.h"
+
+/*
+ * This is a host-EPSA mailbox function that is called via ib_post_send().
+ * The conditions and assumptions are:
+ * 1. qp_type == IB_QPT_EPSA_TUNNELING.
+ * 2. opcode == IB_WR_SEND_WITH_IMM.
+ * 3. Only a receive completion - no send completion - will be generated.
+ * 4. Only the first wr sge will be handled.
+ * 5. wr.ex.imm_data is the EPSA number (EPSA_N).
+ */
+int sif_epsa_tunneling_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct psif_epsc_csr_req req;
+ struct psif_epsc_csr_rsp rsp;
+ struct sif_dev *sdev = to_sdev(ibqp->device);
+
+ /* The status of the epsa mailbox communication is reported in the receive cq: */
+ struct sif_cq *cq = to_scq(ibqp->recv_cq);
+ struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index);
+ volatile struct psif_cq_entry *cqe;
+ struct psif_cq_entry lcqe;
+ u32 seqno;
+ int ret;
+
+ memset(&req, 0, sizeof(req));
+ memset(&rsp, 0, sizeof(rsp));
+
+ req.uf = 0;
+ req.opcode = EPSC_A_COMMAND;
+ req.u.epsa_cmd.cmd = EPSA_GENERIC_CMD;
+ req.u.epsa_cmd.length = wr->sg_list[0].length;
+ req.u.epsa_cmd.host_addr = wr->sg_list[0].addr;
+ req.u.epsa_cmd.key = wr->sg_list[0].lkey;
+
+ if (wr->ex.imm_data > 3) {
+ sif_log(sdev, SIF_INFO, "Invalid EPSA number in imm_data - failing to post this WR");
+ return -EINVAL;
+ }
+
+ sif_log(sdev, SIF_SND, "len %d host addr 0x%llx key 0x%x",
+ req.u.epsa_cmd.length, req.u.epsa_cmd.host_addr, req.u.epsa_cmd.key);
+
+ ret = sif_eps_wr(sdev, u32_to_mbox(wr->ex.imm_data), &req, &rsp);
+
+ seqno = cq_sw->next_seq;
+ cqe = get_cq_entry(cq, seqno);
+
+ memset(&lcqe, 0, sizeof(lcqe));
+ /* construct the required info for WC during poll_cq.
+ * As for now include the wr_id, mailbox status, qp_num, and status:
+ */
+ lcqe.seq_num = seqno;
+ lcqe.wc_id.rq_id = wr->wr_id;
+ lcqe.vendor_err = rsp.status;
+ lcqe.qp = ibqp->qp_num;
+ lcqe.status = ret == 0 ? PSIF_WC_STATUS_SUCCESS : PSIF_WC_STATUS_GENERAL_ERR;
+
+ copy_conv_to_hw(cqe, &lcqe, sizeof(*cqe));
+
+ return ret;
+}
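+
+/* A consumer drives this mailbox path with an ordinary send posted on the
+ * tunneling QP (illustrative sketch - dma_addr, len, lkey, cookie and epsa_n
+ * are placeholders; epsa_n selects the EPSA and must be <= 3):
+ *
+ *	struct ib_sge sge = { .addr = dma_addr, .length = len, .lkey = lkey };
+ *	struct ib_send_wr wr = {
+ *		.wr_id = cookie,
+ *		.sg_list = &sge,
+ *		.num_sge = 1,
+ *		.opcode = IB_WR_SEND_WITH_IMM,
+ *		.ex.imm_data = epsa_n,
+ *	};
+ *	struct ib_send_wr *bad_wr;
+ *	int ret = ib_post_send(qp, &wr, &bad_wr);
+ *
+ * The outcome is reported as a completion on the QP's receive CQ only.
+ */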