From: Knut Omang Date: Wed, 25 May 2016 09:01:11 +0000 (+0200) Subject: sif driver initial commit part 3 X-Git-Tag: v4.1.12-92~148^2~18 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=7e0b60f6dc9f1cd5d22d10bcc75d6c6521b8a36e;p=users%2Fjedix%2Flinux-maple.git sif driver initial commit part 3 sif_pt.c: SIF (private) page table management sif_pt.h: SIF (private) page table management. sif_qp.c: Implementation of IB queue pair logic for sif sif_qp.h: Interface to internal IB queue pair logic for sif sif_query.c: SIF implementation of some of IB query APIs sif_query.h: SIF implementation of some of IB query APIs sif_r3.c: Special handling specific for psif revision 3 and earlier sif_r3.h: Special handling specific for psif revision 3 and earlier sif_rq.c: Implementation of sif receive queues sif_rq.h: Interface to sif receive queues sif_sndrcv.c: Implementation of post send/recv logic for SIF sif_sndrcv.h: Interface to IB send/receive, MAD packet recv and sif_spt.c: Experimental implementation of shared use of the OS's page tables. sif_spt.h: Experimental (still unsafe) sif_sq.c: Implementation of the send queue side of an IB queue pair sif_sq.h: Implementation of the send queue side of an IB queue pair sif_srq.c: Interface to shared receive queues for SIF sif_srq.h: Interface to internal Shared receive queue logic for SIF sif_tqp.c: Implementation of EPSA tunneling QP for SIF Signed-off-by: Knut Omang --- diff --git a/drivers/infiniband/hw/sif/sif_pt.c b/drivers/infiniband/hw/sif/sif_pt.c new file mode 100644 index 000000000000..e6f314a9ba7c --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_pt.c @@ -0,0 +1,1408 @@ +/* + * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_pt.c: SIF (private) page table management + */ + +#include +#include +#include +#include "sif_dev.h" +#include "sif_mr.h" +#include "sif_mem.h" +#include "sif_pt.h" +#include "sif_base.h" + +/* A kmem_cache to allocate the nodes in the rb_trees */ +static struct kmem_cache *pt_page_cache; + +static inline void *sif_pt_cache_alloc(struct sif_dev *sdev, gfp_t flags) +{ +#ifdef CONFIG_NUMA + void *n; + + n = kmem_cache_alloc_node(pt_page_cache, flags, sdev->pdev->dev.numa_node); + if (n) + return n; + + sif_log(sdev, SIF_INFO, "Warning: unable to allocate mem on numa node %d", + sdev->pdev->dev.numa_node); +#endif + return kmem_cache_alloc(pt_page_cache, flags); +} + + +/* Declared below */ +static int init_top(struct sif_pt *pt, u64 vstart, int npages); + + +int sif_pt_init(void) +{ + pt_page_cache = KMEM_CACHE(sif_pt_page, 0); + if (!pt_page_cache) + return -ENOMEM; + sif_log0(SIF_INFO, "order PAGE_SIZE = %d", order_base_2(PAGE_SIZE)); + return 0; +} + +void sif_pt_exit(void) +{ + kmem_cache_destroy(pt_page_cache); +} + +/* some utilities: */ + +/* Find the optimal page size (represented by the leaf level) + * to use based on device capabilities, configuration and a max_shift + * value (typically based on continuousness of memory. + * The result is adjusted with the address pair of a corresponding virtual + * address and dma address to ensure that it is possible to create a mapping at that + * level. 
pte_ext_shift is set to the number bits to shift increment between + * each valid pte (For the odd sized leaf pages) + * Assumes vaddr and dma_add. + */ +int find_optimal_leaf_level(struct sif_dev *sdev, u32 max_shift, + u64 vaddr, u64 dma_addr, u64 size, + u8 *leaf_level, u8 *pte_ext_shift) +{ + u32 shift, adj_page_shift, page_shift; + unsigned long smallest_misalign; + u32 bits = sizeof(dma_addr_t) << 3; + + /* Page size not supported by device configuration + * TBD: Remove (Should not happen unless a programming error) + */ + if (sdev->mi.page_shift > max_shift) { + sif_log(sdev, SIF_INFO, + "Failed to find a valid leaf level (page_shift %d, max_shift %d)", + sdev->mi.page_shift, max_shift); + return -EINVAL; + } + + *leaf_level = 0; + *pte_ext_shift = 0; + shift = sdev->mi.page_shift; + + switch (shift) { + case 12: + /* Device configured for Intel page sizes: + * In x86 mode for PSIF 2.1 only 4K base page size is supported + */ + if (max_shift < 21) + break; + *leaf_level = 1; + if (max_shift < 30) + break; + *leaf_level = 2; + break; + case 13: /* Device configured for Sparc page sizes */ + if (max_shift < 16) + break; + *pte_ext_shift = 3; /* 64K base page - only populate every 8th leaf entry */ + if (max_shift < 19) + break; + *pte_ext_shift = 6; /* 512K base page - only populate every 64th leaf entry */ + if (max_shift < 22) + break; + *leaf_level = 1; + *pte_ext_shift = 0; + if (max_shift < 25) + break; + *pte_ext_shift = 3; /* Fits 32M pages at level 1 - every 8th 4M entry */ + if (max_shift < 28) + break; + *pte_ext_shift = 6; /* Fits 256M pages at level 1 - every 64th 4M entry */ + if (max_shift < 31) + break; + *leaf_level = 2; + *pte_ext_shift = 0; /* Fits 2GB pages at level 2 */ + if (max_shift < 34) + break; + *pte_ext_shift = 3; /* Fits 16GB pages at level 2 - every 8th 2GB entry */ + if (max_shift < 37) + break; + break; + default: + BUG(); + } + if (*leaf_level) { + page_shift = shift + (*leaf_level * sdev->mi.level_shift); + smallest_misalign = (dma_addr ^ vaddr) & ((1 << page_shift) - 1); + if (smallest_misalign & ~PAGE_MASK) { + sif_log(sdev, SIF_INFO, + "Failed to create page table: misaligned VA/DMA (0x%lx) dma 0x%llx vaddr 0x%llx", + smallest_misalign, dma_addr, vaddr); + return -EINVAL; + } + + if (smallest_misalign) { + adj_page_shift = find_first_bit(&smallest_misalign, bits); + *leaf_level = (adj_page_shift - shift) / sdev->mi.level_shift; + sif_log(sdev, SIF_PT, + "misaligned VA/DMA adj: leaf_level %d, page_shift %d, smallest_misalign 0x%lx, adj_page_shift %d", + *leaf_level, + page_shift, smallest_misalign, adj_page_shift); + page_shift = adj_page_shift; + } + /* TBD: Remove - just for debugging */ + if (*leaf_level > 3) { + sif_log(sdev, SIF_INFO, + "haywire leaf level %d - should not be possible - setting safe value 0", + *leaf_level); + *leaf_level = 0; + return -EINVAL; + } + if (*leaf_level) { + /* Check if we can do equally well with a lower level pointer */ + int size_order = order_base_2(size); + int size_shift = page_shift - size_order; + + if (size_shift < 0) + goto out; + sif_log(sdev, SIF_PT, "order %d page_shift %d size_shift %d", + size_order, page_shift, size_shift); + if (size_shift > 0) { + u32 new_leaf_level = + ((page_shift - size_shift + sdev->mi.level_shift - 1 - shift) + / sdev->mi.level_shift); + sif_log(sdev, SIF_PT, "new_leaf_level %d", new_leaf_level); + if (new_leaf_level < *leaf_level) { + *leaf_level = new_leaf_level; + sif_log(sdev, SIF_PT, + "size_shift %d, size adjusted leaf_level %d", + size_shift, *leaf_level); + } + } + } + 
} +out: + sif_log(sdev, SIF_PT, "shift %d leaf_level %d", shift, *leaf_level); + return 0; +} + +/* Find the aligned size of a region within a certain page alignment size + * (eg. the number of pages of size @alignment needed to address (start,len)) + */ +u64 aligned_size(u64 start, u64 len, u64 alignment) +{ + u64 mask = alignment - 1; + u64 aligned_start = start & ~mask; + u64 aligned_end = (start + len + mask) & ~mask; + + return aligned_end - aligned_start; +} + +/* Find the union of the two ranges including non-overlapped parts */ +static u64 merge_ranges(u64 start1, u64 size1, u64 start2, u64 size2, u64 *new_size) +{ + u64 new_start = min(start1, start2); + u64 new_end = max(start1 + size1, start2 + size2); + *new_size = new_end - new_start; + return new_start; +} + +static u32 level_to_pageshift(struct sif_pt *pt, int level) +{ + struct sif_mem_info *mi = &pt->sdev->mi; + + level++; + if (level < 0 || level > 4) + sif_log(pt->sdev, SIF_INFO, "level %d", level); + BUG_ON(level < 0 || level > 4); + return mi->page_shift + mi->level_shift * level; +} + +static u64 level_to_pagesize(struct sif_pt *pt, int level) +{ + return (1ull << level_to_pageshift(pt, level)); +} + +static u64 level_to_pagemask(struct sif_pt *pt, int level) +{ + return (level_to_pagesize(pt, level) - 1); +} + + +u32 sif_pt_page_shift(struct sif_pt *pt) +{ + return level_to_pageshift(pt, pt->leaf_level - 1); +} + +/* Find the required page table memory need in number of + * pt->page_table_page sized pages + * If pt->fixed_top, calculate space for a final page for each of the levels + * even if only one entry is necessary. + * + * NB! Sets pt->top_level as a side effect + */ +static u32 table_mem_need(struct sif_pt *pt, u64 vstart, u64 mapsize) +{ + u64 aligned_size_pte; + u64 aligned_size_pmd; + u64 aligned_size_pud; + u64 aligned_size_pgd; + u64 aligned_size_pml4; + u64 psz; + int nptes, npmds, npuds, npgds, pte_pages; + int pshift; + /* If we need to guarantee that the top node remains the same, we must build + * a max level page table + */ + int single = pt->fixed_top ? 1 : 0; + struct sif_dev *sdev = pt->sdev; + + /* Determine what setup to use for the kmem object based on the initial mapsize: + * We use 4K pages for now, and set sg_size to the number of pages needed to + * support mapsize + the full chain of pages if we need a 4-level table: + */ + psz = sdev->mi.page_size; + aligned_size_pte = aligned_size(vstart, mapsize, psz); + psz <<= sdev->mi.level_shift; + aligned_size_pmd = aligned_size(vstart, mapsize, psz); + psz <<= sdev->mi.level_shift; + aligned_size_pud = aligned_size(vstart, mapsize, psz); + psz <<= sdev->mi.level_shift; + aligned_size_pgd = aligned_size(vstart, mapsize, psz); + psz <<= sdev->mi.level_shift; + aligned_size_pml4 = aligned_size(vstart, mapsize, psz); + + sif_log(pt->sdev, SIF_MMU, "aligned lengths: pte %llx pmd %llx pud %llx pgd %llx pml4 %llx", + aligned_size_pte, aligned_size_pmd, aligned_size_pud, + aligned_size_pgd, aligned_size_pml4); + + pshift = sdev->mi.page_shift + sdev->mi.level_shift; + nptes = aligned_size_pmd >> pshift; + pshift += sdev->mi.level_shift; + npmds = nptes > 1 ? aligned_size_pud >> pshift : single; + pshift += sdev->mi.level_shift; + npuds = npmds > 1 ? aligned_size_pgd >> pshift : single; + pshift += sdev->mi.level_shift; + npgds = npuds > 1 ? aligned_size_pml4 >> pshift : single; + + pte_pages = pt->leaf_level ? 
0 : nptes; + + sif_log(pt->sdev, SIF_MMU, "npgds %d, npuds %d, npmds: %d, pte_pages %d", + npgds, npuds, npmds, pte_pages); + + pt->top_level = single ? 3 : (npgds ? 3 : (npuds ? 2 : (npmds ? 1 : 0))); + return pte_pages + npmds + npuds + npgds; +} + +/* Find page table entry index for the pte referring + * the page starting at vaddr at level @level + */ +static inline int sif_pte_index(struct sif_dev *sdev, u64 vaddr, u64 page_shift) +{ + return (vaddr >> page_shift) & (sdev->mi.ptes_per_page - 1); +} + + + + +static void pt_free_page(struct sif_pt *pt, struct sif_pt_page *n) +{ + list_add_tail(&n->list, &pt->freelist); + n->parent = NULL; + n->vaddr = 0; +} + + +/* Destructor callback for kref */ +static void sif_pt_release(struct kref *kref) +{ + struct sif_pt *pt = container_of(kref, struct sif_pt, refcnt); + struct list_head *np; + struct list_head *npp; + struct sif_pt_page *n; + + sif_log(pt->sdev, SIF_MMU_V, "at %p", pt); + + if (pt->top) + pt_free_page(pt, pt->top); + + /* Actual cleanup */ + list_for_each_safe(np, npp, &pt->freelist) { + n = list_entry(np, struct sif_pt_page, list); + kfree(n); + } + if (pt->m.sg_size) + sif_kmem_free(pt->sdev, &pt->m); + kfree(pt); +} + + +/* Create a sif_page_table object and if mapsize > 0, + * map the range starting at @sg to a map with start at virtual + * address @vstart and size @mapsize and the number of bits to use in each page + * in page_shift. The object can later be resized using sif_pt_extend/sif_pt_shrink: + * Set @modifiable to allow the table to be extended and shrinked + * Set @fixed_top to have pt guarantee that the top node remains constant + * in which case it will always be a level 4 tree. + */ +struct sif_pt *sif_pt_create(struct sif_dev *sdev, struct scatterlist *sg, + u64 vstart, size_t size, u32 page_shift, + bool modifiable, bool fixed_top) +{ + int ret = 0; + int i; + dma_addr_t dma_start = sg ? sg_dma_address(sg) : 0; + struct sif_pt *pt = sif_kmalloc(sdev, sizeof(*pt), GFP_KERNEL | __GFP_ZERO); + + if (!pt) + return NULL; + + /* sub-page misalignment in vstart must correspond with + * misalignment in dma address but sg entries are page aligned: + */ + dma_start += vstart & ~PAGE_MASK; + + sif_log(sdev, SIF_MMU, "vstart %llx, size %lx, page_shift %d%s", vstart, size, + page_shift, (modifiable ? " (modifiable)" : "")); + pt->sdev = sdev; + pt->fixed_top = fixed_top; + pt->modifiable = modifiable; + + ret = find_optimal_leaf_level(sdev, page_shift, + vstart, dma_start, size, + &pt->leaf_level, &pt->pte_ext_shift); + if (ret) + goto extend_failed; + + pt->page_shift = sdev->mi.page_shift + pt->leaf_level * sdev->mi.level_shift; + pt->ptes_per_page = 1 << sdev->mi.level_shift; + + for (i = 0; i < PT_LEVELS; i++) + pt->pmd[i] = RB_ROOT; + kref_init(&pt->refcnt); + mutex_init(&pt->lock); + INIT_LIST_HEAD(&pt->freelist); + + ret = sif_pt_extend(pt, sg, vstart, size); + if (ret < 0) + goto extend_failed; + return pt; + +extend_failed: + kfree(pt); + return NULL; +} + + +struct sif_pt *sif_pt_create_for_mem(struct sif_mem *mem, + u64 vstart, u32 page_shift, bool modifiable, bool fixed_top) +{ + int ret = 0; + int i; + struct sif_dev *sdev = mem->sdev; + struct sif_pt *pt = sif_kmalloc(sdev, sizeof(*pt), GFP_KERNEL | __GFP_ZERO); + size_t size = mem->size; + + if (!pt) + return NULL; + + sif_log(sdev, SIF_MMU, "vstart %llx, size %lx, page_shift %d%s", vstart, size, + page_shift, (modifiable ? 
" (modifiable)" : "")); + pt->sdev = sdev; + pt->fixed_top = fixed_top; + pt->modifiable = modifiable; + ret = find_optimal_leaf_level(sdev, page_shift, + vstart, sif_mem_dma(mem, 0), size, + &pt->leaf_level, &pt->pte_ext_shift); + if (ret) + goto extend_failed; + + pt->page_shift = sdev->mi.page_shift + pt->leaf_level * sdev->mi.level_shift; + pt->ptes_per_page = 1 << sdev->mi.level_shift; + + for (i = 0; i < PT_LEVELS; i++) + pt->pmd[i] = RB_ROOT; + kref_init(&pt->refcnt); + mutex_init(&pt->lock); + INIT_LIST_HEAD(&pt->freelist); + + ret = sif_pt_extend_with_mem(pt, mem, vstart); + if (ret < 0) + goto extend_failed; + return pt; + +extend_failed: + kfree(pt); + return NULL; +} + + +/* Create an empty, extendable sif page table object */ +struct sif_pt *sif_pt_create_empty(struct sif_dev *sdev, u64 vstart, enum sif_mem_type map_mt) +{ + u32 page_shift = sdev->mi.page_shift; + struct sif_pt *pt; + int ret; + + if (map_mt == SIFMT_2M) + page_shift += sdev->mi.level_shift; + + pt = sif_pt_create(sdev, NULL, vstart, 0, page_shift, true, map_mt == SIFMT_CS); + if (!pt) + return NULL; + + if (map_mt == SIFMT_CS) { + /* Allocate an empty top page table page to get an address to send to PSIF: */ + pt->top_level = 3; + ret = init_top(pt, 0, 1); + if (ret) { + sif_kmem_free(pt->sdev, &pt->m); + return NULL; + } + } + return pt; +} + + +/* DMA address of root pointer of page table */ +dma_addr_t sif_pt_dma_root(struct sif_pt *pt) +{ + return pt->top ? sg_dma_address(pt->top->page) : 0; +} + +/* SIF level of root pointer */ +u8 sif_pt_root_table_level(struct sif_pt *pt) +{ + return pt->top_level + 1; +} + + +/* Create sif_pt_page objects for @npages new pages for the page list in @sgl + * and insert them into the freelist: + */ +static int add_pages_to_freelist(struct sif_pt *pt, struct scatterlist *sgl, int npages) +{ + struct scatterlist *sg; + struct sif_pt_page *n; + int i; + + for_each_sg(sgl, sg, npages, i) { + n = sif_pt_cache_alloc(pt->sdev, GFP_KERNEL | __GFP_ZERO); + if (!n) + return -ENOMEM; + sif_log(pt->sdev, SIF_MMU_V, "i = %d: sg %p", i, sg); + n->page = sg; + list_add_tail(&n->list, &pt->freelist); + } + return 0; +} + + +/* TBD: Consider allocating more than a single page at a time from @m object + * as sif_kmem_find_sg_list is O(n) where n is the number of sg arrays in @m. 
+ */ +static struct sif_pt_page *pt_alloc_page(struct sif_pt *pt, u64 vaddr) +{ + int ret; + struct scatterlist *sg; + struct sif_pt_page *n; + + if (list_empty(&pt->freelist)) { + ret = sif_kmem_extend(pt->sdev, &pt->m, PAGE_SIZE, GFP_KERNEL); + if (ret < 0) + goto failed; + sg = sif_kmem_find_sg_idx(&pt->m, ret); + ret = add_pages_to_freelist(pt, sg, 1); + if (ret < 0) + goto failed; + } + + n = list_first_entry(&pt->freelist, struct sif_pt_page, list); + list_del(&n->list); + n->vaddr = vaddr; + return n; +failed: + return ERR_PTR(ret); +} + + + +static struct sif_pt_page *replace_top(struct sif_pt *pt, u64 vaddr) +{ + /* insert a new top node, put the old one into the + * empty rbtree for this level, and link the old top node from + * the new top: + */ + u64 aligned_vaddr, top_pagesize; + u64 pt_shift, ptv; + u64 *pmd; + int i; + struct sif_pt_page *ep; + struct sif_dev *sdev = pt->sdev; + + if (pt->top->usecnt == 1) { + /* Top node not used, just reuse with different va */ + pt->top->vaddr = vaddr; + return pt->top; + } + + pt->top->usecnt--; + /* Loop until we have a top node that spans vaddr */ + do { + int level = pt->top_level; + struct rb_root *root = &pt->pmd[level]; + struct rb_node **np = &root->rb_node; + + top_pagesize = level_to_pagesize(pt, ++pt->top_level); + aligned_vaddr = pt->top->vaddr & ~(top_pagesize - 1); + + rb_link_node(&pt->top->node, NULL, np); + rb_insert_color(&pt->top->node, root); + ep = pt->top; + pt->top = pt_alloc_page(pt, aligned_vaddr); + if (IS_ERR(pt->top)) { + ep = pt->top; + pt->top = NULL; + return ep; + } + + ep->parent = pt->top; + pmd = sg_virt(pt->top->page); + pt_shift = level_to_pageshift(pt, level); + i = sif_pte_index(sdev, ep->vaddr, pt_shift); + ptv = sg_dma_address(ep->page) | PT_PAGE_PRESENT; + sif_log(sdev, SIF_MMU_V, "level %d: pmd[%d](%p) = %llx", level, i, &pmd[i], ptv); + BUG_ON(pmd[i] != 0); + pmd[i] = ptv; + pt->top->usecnt++; + + sif_log(sdev, SIF_MMU, + "New top node at dma addr %pad level %d - aligned at %llx, page sz. %llx", + &sg_dma_address(pt->top->page), pt->top_level, aligned_vaddr, top_pagesize); + } while (vaddr < aligned_vaddr || vaddr >= aligned_vaddr + top_pagesize); + + return NULL; +} + + + +/* Find the page table page at level whose first entry references the sif virtual address @vaddr + * @vaddr assumed to be aligned to the appropriate alignment for the level. 
+ * If the page does not exist, allocate a new one and add it: + */ +static struct sif_pt_page *find_insert_page(struct sif_pt *pt, u8 level, u64 vaddr) +{ + struct rb_root *root = &pt->pmd[level]; + struct rb_node **np = &root->rb_node; + struct rb_node *parent = NULL; + struct sif_pt_page *ep; + struct sif_dev *sdev = pt->sdev; + + sif_log(sdev, SIF_MMU, "level %d vaddr %llx", level, vaddr); + if (level == pt->top_level) { + if (likely(vaddr == pt->top->vaddr)) + return pt->top; + + /* (possibly recursively) build up a new top node that spans both + * the old tree and the new subtree: + */ + ep = replace_top(pt, vaddr); + if (ep) + return ep; + } + + while (*np) { + ep = container_of(*np, struct sif_pt_page, node); + parent = *np; + if (vaddr < ep->vaddr) + np = &((*np)->rb_left); + else if (vaddr > ep->vaddr) + np = &((*np)->rb_right); + else { + sif_log(sdev, SIF_PT, + "Level %d: Found page at vaddr %llx with dma addr %pad", + level, ep->vaddr, &sg_dma_address(ep->page)); + return ep; + } + } + + /* Allocate and insert a new node into the tree */ + ep = pt_alloc_page(pt, vaddr); + if (IS_ERR(ep)) + return ep; + + sif_log(sdev, SIF_PT, "Allocated new pt page for vaddr %llx with dma addr %pad", + vaddr, &sg_dma_address(ep->page)); + + rb_link_node(&ep->node, parent, np); + rb_insert_color(&ep->node, root); + return ep; +} + + +/* Find an element in the tree for the given level, return NULL if it does not + * exist: + */ +static struct sif_pt_page *find_page(struct sif_pt *pt, u8 level, u64 vaddr) +{ + struct rb_root *root; + struct rb_node *n; + struct rb_node *parent = NULL; + struct sif_pt_page *ep; + + if (level == pt->top_level) + return pt->top; + + root = &pt->pmd[level]; + n = root->rb_node; + + sif_log(pt->sdev, SIF_MMU_V, "level %d vaddr %llx", level, vaddr); + while (n) { + ep = container_of(n, struct sif_pt_page, node); + parent = n; + if (vaddr < ep->vaddr) + n = n->rb_left; + else if (vaddr > ep->vaddr) + n = n->rb_right; + else + return ep; + } + return NULL; +} + + +static inline struct sif_pt_page *next_page(struct sif_pt_page *p) +{ + struct rb_node *node = rb_next(&p->node); + + if (node) + return container_of(node, struct sif_pt_page, node); + else + return NULL; +} + +static inline struct sif_pt_page *prev_page(struct sif_pt_page *p) +{ + struct rb_node *node = rb_prev(&p->node); + + if (node) + return container_of(node, struct sif_pt_page, node); + else + return NULL; +} + +static inline struct sif_pt_page *first_page(struct sif_pt *pt, int level) +{ + struct rb_node *node = rb_first(&pt->pmd[level]); + + if (node) + return container_of(node, struct sif_pt_page, node); + else + return NULL; +} + +static inline struct sif_pt_page *last_page(struct sif_pt *pt, int level) +{ + struct rb_node *node = rb_last(&pt->pmd[level]); + + if (node) + return container_of(node, struct sif_pt_page, node); + else + return NULL; +} + + +/* Create the page table tree from the given vaddr upwards, until + * we reach an existsting node or find the top node. 
Update use counts on the + * involved nodes: + */ +static struct sif_pt_page *find_next(struct sif_pt *pt, u8 level, u64 vaddr) +{ + u64 vaddr_up = 0; + struct sif_pt_page *pt_page_start = find_insert_page(pt, level, vaddr); + struct sif_pt_page *pt_page; + struct sif_pt_page *pt_parent; + struct sif_dev *sdev = pt->sdev; + int i; + + if (pt_page_start == pt->top || IS_ERR(pt_page_start)) + return pt_page_start; + + sif_log(sdev, SIF_MMU_V, "level %d vaddr %llx", level, vaddr); + + pt_page = pt_page_start; + for (;;) { + u64 pt_shift, ptv; + u64 *pmd; + + pt_shift = level_to_pageshift(pt, level); + pt_parent = pt_page->parent; + level++; + if (pt_parent) { + /* We found an existing node - rest of the tree upwards is ok */ + break; + } + vaddr_up = vaddr & ~level_to_pagemask(pt, level); + if (level == pt->top_level && vaddr_up == pt->top->vaddr) { + sif_log(sdev, SIF_PT, "found top at level %d", level); + pt_parent = pt->top; + } else { + sif_log(sdev, SIF_PT, "searching at level %d/%d from vaddr %llx", + level, pt->top_level, vaddr_up); + pt_parent = find_insert_page(pt, level, vaddr_up); + } + + if (IS_ERR(pt_parent)) + return pt_parent; + + pt_page->parent = pt_parent; + + /* Set page pointer in parent */ + pmd = sg_virt(pt_parent->page); + i = sif_pte_index(sdev, vaddr, pt_shift); + ptv = sg_dma_address(pt_page->page) | PT_PAGE_PRESENT; + sif_log(sdev, SIF_MMU_V, "level %d: pmd[%d](%p) = %llx", level, i, &pmd[i], ptv); + WARN_ON(pmd[i] != 0); + pmd[i] = ptv; + + pt_parent->usecnt++; + if (pt_parent == pt->top || pt_parent->usecnt > 1) + break; + pt_page = pt_parent; + vaddr = vaddr_up; + } + return pt_page_start; +} + + +static int populate_pt(struct sif_pt *pt, struct scatterlist *sg, + u64 vstart, size_t size) +{ + int level = pt->leaf_level; + u64 va, vend, incr; + u64 pt_shift = level_to_pageshift(pt, level-1); /* page shift for the level below us */ + u64 page_flags = PT_PAGE_PRESENT; + struct sif_dev *sdev = pt->sdev; + u64 small_page_misalign; + u64 large_page_misalign = 0; + off_t sg_offset; /* Running page aligned offset within the current sg */ + + /* If level > 0 we must set the PS bit to indicate that this is a leaf node + * We also have two levels of alignment to consider: + */ + if (level > 0) { + small_page_misalign = vstart & level_to_pagemask(pt, level - 2); + large_page_misalign = (vstart & level_to_pagemask(pt, level - 1)) - small_page_misalign; + page_flags |= PT_PAGE_PS; + } else + small_page_misalign = (vstart & level_to_pagemask(pt, level - 1)); + + + /* Populate the table at level @level - assuming no overlap */ + vend = vstart + size; + va = vstart & ~level_to_pagemask(pt, level - 1); + + /* Depending on alignment we might need to point to a DMA address + * way ahead of the first sg, but aligned to the first small page size: + */ + sg_offset = -large_page_misalign; + incr = level_to_pagesize(pt, level - 1) << pt->pte_ext_shift; + + sif_log(sdev, SIF_PT, + "level %d mis (0x%llx,0x%llx) vstart %llx -> %llx size %lx pte_ext_shift %d, incr 0x%llx sg_offset %#lx", + level, small_page_misalign, large_page_misalign, vstart, va, size, + pt->pte_ext_shift, incr, sg_offset); + + while (va < vend) { + struct sif_pt_page *pt_page; + u64 *pmd; + int i; + u64 va_up = va & ~level_to_pagemask(pt, level); + + pt_page = find_next(pt, level, va_up); + if (IS_ERR(pt_page)) + return PTR_ERR(pt_page); + + pmd = sg_virt(pt_page->page); + i = sif_pte_index(sdev, va, pt_shift); + for (; i < sdev->mi.ptes_per_page && va < vend; i++) { + u64 ptv; + + if (!sg) { + sif_log(sdev, SIF_INFO, + 
"##### pt at %p: level %d: failed to find next sg at va %llx (vstart,size) = (%llx,%lx))", + pt, level, va, vstart, size); + return -EIO; + } + ptv = (sg_dma_address(sg) + sg_offset) | page_flags; + WARN_ON(pmd[i] != 0); + sif_log(sdev, SIF_PT_V, "va %llx: level %d: pmd[%d](%p) = %llx", + va, level, i, &pmd[i], ptv); + pmd[i] = ptv; + pt_page->usecnt++; + va += incr; + sg_offset += incr; + /* At this point size might be the end aligned size at this level so + * make sure to terminate at the end of the sg list: + */ + while (sg && sg_offset >= sg_dma_len(sg)) { + if (incr > sdev->mi.page_size) + sif_log(sdev, SIF_PT_VV, + "sg_offset %#lx sg->length %x sg_dma_len(sg) %x", + sg_offset, sg->length, sg_dma_len(sg)); + sg_offset -= sg_dma_len(sg); + sg = sg_next(sg); + } + /* Note that we must handle both small incr in large pages and opposite! */ + if (unlikely(sg_offset && sg_offset < incr)) + return 0; /* We're done - vend in the middle of a higher level page */ + } + } + + return 0; +} + + +/* sif_mem iterator based page table population - needed for special types */ +static int populate_pt_from_mem(struct sif_pt *pt, struct sif_mem *mem, u64 vstart, bool fast_path) +{ + u8 level = pt->leaf_level; + u64 va, vend, incr; + u64 pt_shift = level_to_pageshift(pt, level-1); /* page shift for the level below us */ + u64 page_flags = PT_PAGE_PRESENT; + struct sif_mem_iter mi; + struct sif_dev *sdev = pt->sdev; + u64 small_page_misalign; + u64 large_page_misalign = 0; + off_t sg_offset; /* Running page aligned offset within the current sg */ + + /* If level > 0 we must set the PS bit to indicate that this is a leaf node + * We also have two levels of alignment to consider: + */ + if (level > 0) { + small_page_misalign = vstart & level_to_pagemask(pt, level - 2); + large_page_misalign = (vstart & level_to_pagemask(pt, level - 1)) - small_page_misalign; + page_flags |= PT_PAGE_PS; + } else + small_page_misalign = (vstart & level_to_pagemask(pt, level - 1)); + + /* Populate the table at level @level - assuming no overlap */ + vend = vstart + mem->size; + va = vstart & ~level_to_pagemask(pt, level - 1); + + /* Depending on alignment we might need to point to a DMA address + * way ahead of the first sg, but aligned to the first small page size: + */ + sg_offset = -large_page_misalign; + incr = level_to_pagesize(pt, level - 1) << pt->pte_ext_shift; + sif_mem_iter_init(mem, &mi); + + sif_log(sdev, SIF_PT, + "level %d mis (0x%llx,0x%llx) vstart %llx -> %llx size %llx pte_ext_shift %d, incr 0x%llx sg_offset %#lx", + level, small_page_misalign, large_page_misalign, vstart, va, mem->size, + pt->pte_ext_shift, incr, sg_offset); + + while (va < vend) { + struct sif_pt_page *pt_page; + u64 *pmd; + int i; + u64 va_up = va & ~level_to_pagemask(pt, level); + + pt_page = find_next(pt, level, va_up); + if (IS_ERR(pt_page)) + return PTR_ERR(pt_page); + + pmd = sg_virt(pt_page->page); + i = sif_pte_index(sdev, va, pt_shift); + for (; i < sdev->mi.ptes_per_page && va < vend; i++) { + u64 ptv; + + ptv = (sif_mem_iter_dma(&mi) + sg_offset) | page_flags; + BUG_ON(!(ptv & ~0x81)); + sif_log(sdev, SIF_PT_V, "level %d: pmd[%d](%p) = %llx", level, i, &pmd[i], ptv); + pmd[i] = ptv; + if (!fast_path) + pt_page->usecnt++; + va += incr; + sg_offset += incr; + if (va < vend) { + int ret = sif_mem_iter_advance(&mi, sg_offset); + + if (ret) { + sif_log(sdev, SIF_MMU_V, "No page for vaddr %llx", va); + return ret; + } + sg_offset = 0; + } + } + } + + return 0; +} + + +/* (safe) observe leaf node of page table at @vaddr */ +int 
sif_pt_entry(struct sif_pt *pt, u64 vaddr, dma_addr_t *entry, dma_addr_t *val) +{ + int ret = 0; + struct sif_pt_page *p; + struct sif_dev *sdev = pt->sdev; + u64 *pmd; + u64 pt_shift; + u64 va_up; + u8 level; + int i, ip; + + mutex_lock(&pt->lock); + level = pt->leaf_level; + va_up = vaddr & ~level_to_pagemask(pt, level); + pt_shift = level_to_pageshift(pt, level-1); + p = find_page(pt, level, va_up); + if (p) { + pmd = sg_virt(p->page); + i = sif_pte_index(sdev, vaddr, pt_shift); + *val = pmd[i]; + pmd = sg_virt(p->parent->page); + ip = sif_pte_index(sdev, va_up, level_to_pageshift(pt, level)); + *entry = pmd[ip]; + sif_log(sdev, SIF_MMU_V, + "Page at vaddr %llx, lookup vaddr %llx at index %d: entry(idx = %d): %pad, value: %pad", + va_up, vaddr, i, ip, entry, val); + } else { + sif_log(sdev, SIF_MMU_V, "Page at vaddr %llx not found", va_up); + ret = -EINVAL; + } + mutex_unlock(&pt->lock); + return ret; +} + + +/* Remove a reference to the given remove_addr from page @p, + * if refcnt == 0, return page to freelist + * and (if at leaf level) return the next page in the rb_tree, otherwise return + * the same page. + * + */ +static struct sif_pt_page *remove_page_ref(struct sif_pt *pt, struct sif_pt_page *p, + u64 remove_addr, u8 level) +{ + struct sif_pt_page *np = p; + u64 *pmd = sg_virt(p->page); + int index = sif_pte_index(pt->sdev, remove_addr, level_to_pageshift(pt, level-1)); + u64 dma_addr = sg_dma_address(p->page); + + BUG_ON(p->usecnt < 1); + pmd[index] = 0; + + p->usecnt--; + sif_log(pt->sdev, SIF_PT_VV, + "level %d: index = %d ps = %d, page - dma at 0x%llx - use count %d", + level, index, level_to_pageshift(pt, level-1), dma_addr, p->usecnt); + if (!p->usecnt) { + if (p->parent) + remove_page_ref(pt, p->parent, p->vaddr, level + 1); + else + BUG_ON(p != pt->top); + if (level == pt->leaf_level) + np = next_page(p); + if (pt->top != p) /* We dont use the rbtree for the top node */ + rb_erase(&p->node, &pt->pmd[level]); + else + pt->top = NULL; /* So we can check if removal is needed in sif_pt_release() */ + pt_free_page(pt, p); + } + return np; +} + +/* size of each sg list used to maintain page table pages + * when fixed_top is set (currently only used by the sq_cmpl table) + * We want it reasonably large as we index in constant time into the list + * but use a linear scan to navigate the chain of lists + */ +#define FIXED_TOP_SG_SIZE 0x1000 + +static int init_top(struct sif_pt *pt, u64 vstart, int npages) +{ + u64 aligned_vaddr = vstart & ~(level_to_pagesize(pt, pt->top_level) - 1); + int ret; + size_t sg_size = pt->fixed_top ? 
FIXED_TOP_SG_SIZE : max(npages, 1); + + /* Single pte table necessary for WA for Bug #4096 */ + if (pt->top_level < pt->leaf_level) { + sif_log(pt->sdev, SIF_PT_V, "Adjusting top level %d -> %d", + pt->top_level, pt->leaf_level); + pt->top_level = pt->leaf_level; + } + + ret = sif_kmem_init(pt->sdev, &pt->m, sg_size, (u64)npages << PAGE_SHIFT, + PAGE_SHIFT, GFP_KERNEL, DMA_TO_DEVICE); + if (ret < 0) + return ret; + + if (add_pages_to_freelist(pt, pt->m.sg, pt->m.sg_max)) + return ret; + + /* Create the top node of the page table: */ + pt->top = pt_alloc_page(pt, aligned_vaddr); + if (unlikely(IS_ERR(pt->top))) { + int ret = PTR_ERR(pt->top); + + pt->top = NULL; + return ret; + } + sif_log(pt->sdev, SIF_PT_V, + "Created top node at kva %p, dma addr %pad level %d for vstart %llx - aligned at %llx", + sg_virt(pt->top->page), &sg_dma_address(pt->top->page), + pt->top_level, vstart, aligned_vaddr); + + if (pt->modifiable) { + /* avoid that this node gets freed if all mappings are removed */ + pt->top->usecnt++; + } + return 0; +} + + +inline void reinit_top(struct sif_pt *pt, u64 vstart) +{ + u64 aligned_vaddr = vstart & ~(level_to_pagesize(pt, pt->top_level) - 1); + + sif_log(pt->sdev, SIF_PT_V, + "Reused top node at dma addr %pad level %d for vstart %llx - aligned at %llx", + &sg_dma_address(pt->top->page), pt->top_level, vstart, aligned_vaddr); + pt->top->vaddr = aligned_vaddr; +} + + +static u64 recalc_vstart(struct sif_pt *pt) +{ + struct sif_dev *sdev = pt->sdev; + struct sif_pt_page *p = first_page(pt, pt->leaf_level); + u64 page_shift = level_to_pageshift(pt, pt->leaf_level - 1); + int i; + + if (p) { + u64 *pmd = sg_virt(p->page); + + for (i = 0; i < sdev->mi.ptes_per_page; i++) + if (pmd[i]) { + u64 nvaddr = p->vaddr + (i << page_shift); + u64 delta_sz = nvaddr - pt->vstart; + + sif_log(sdev, SIF_PT_V, "vstart %llx -> %llx (vsize %llx -> %llx)", + pt->vstart, nvaddr, pt->vsize, pt->vsize - delta_sz); + pt->vsize -= delta_sz; + return nvaddr; + } + } + pt->vsize = 0; + pt->vstart = 0; + return 0; +} + +static u64 recalc_size(struct sif_pt *pt) +{ + struct sif_dev *sdev = pt->sdev; + struct sif_pt_page *p = last_page(pt, pt->leaf_level); + u64 page_shift = level_to_pageshift(pt, pt->leaf_level - 1); + int i; + + if (p) { + u64 *pmd = sg_virt(p->page); + + for (i = sdev->mi.ptes_per_page - 1; i >= 0; i--) + if (pmd[i]) { + u64 nend = p->vaddr + ((i+1) << page_shift); + u64 nvsize = nend - pt->vstart; + + sif_log(sdev, SIF_MMU_V, "vstart at %llx, size %llx -> %llx", + pt->vstart, pt->vsize, nvsize); + return nvsize; + } + } + pt->vsize = 0; + pt->vstart = 0; + return 0; +} + + + +/* Extend a page table at DMA address @vstart with the list starting at @sg with size @size */ +int sif_pt_extend(struct sif_pt *pt, struct scatterlist *sg, u64 vstart, size_t size) +{ + int ret = 0; + u32 npages; + u64 page_mask = level_to_pagesize(pt, pt->leaf_level - 1) - 1; + u64 new_start; + u64 new_size; + + if (!size) + return 0; + + sif_log(pt->sdev, SIF_MMU, "** vstart %llx size %lx page size %llx leaf_level %d **", + vstart, size, page_mask + 1, pt->leaf_level); + mutex_lock(&pt->lock); + + /* Calculate a good size of each sg table in the kmem object: */ + if (!pt->top) { + /* This is a blank pt - allocate and set up the initial structures */ + npages = table_mem_need(pt, vstart, size); + + ret = init_top(pt, vstart, npages); + if (ret) + goto kmem_ext_failed; + + new_start = vstart; + new_size = size; + } else if (pt->vsize == 0) { + new_start = vstart; + new_size = size; + reinit_top(pt, vstart); + } 
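+	/* Table already has mappings: only a modifiable page table may be extended;
+	 * the old and new ranges are merged below.
+	 */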
else { + if (!pt->modifiable) { + sif_log(pt->sdev, SIF_INFO, "error: Attempt to modify an unmodifiable page table"); + return -EINVAL; + } + new_start = merge_ranges(pt->vstart, pt->vsize, vstart, size, &new_size); + sif_log(pt->sdev, SIF_MMU_V, "new_start %llx new_size %llx **", + new_start, new_size); + } + + kref_get(&pt->refcnt); + + ret = populate_pt(pt, sg, vstart, size); + if (ret) + goto populate_failed; + + /* sync the whole table memory to make sure the changes are reflected: + * TBD: Optimize to only sync the parts that have actually been modified. + * With this code we will potentially sync a long page freelist as well: + */ + dma_sync_sg_for_device(pt->sdev->ib_dev.dma_device, pt->m.sg, pt->m.sg_max, DMA_TO_DEVICE); + + pt->vstart = new_start; + pt->vsize = new_size; + mutex_unlock(&pt->lock); + return ret; +populate_failed: + kref_put(&pt->refcnt, sif_pt_release); +kmem_ext_failed: + sif_kmem_free(pt->sdev, &pt->m); + mutex_unlock(&pt->lock); + return ret; +} + + + +/* Extend a page table at DMA address @vstart with the contents of @mem */ +int sif_pt_extend_with_mem(struct sif_pt *pt, struct sif_mem *mem, u64 vstart) +{ + int ret = 0; + u32 npages; + u64 page_mask = level_to_pagesize(pt, pt->leaf_level - 1) - 1; + u64 new_start; + u64 new_size; + size_t size = mem->size; + + if (!size) + return 0; + + sif_log(pt->sdev, SIF_MMU, "** vstart %llx size %lx page size %llx leaf level %d **", + vstart, size, page_mask + 1, pt->leaf_level); + mutex_lock(&pt->lock); + + /* Calculate a good size of each sg table in the kmem object: */ + if (!pt->top) { + /* This is a blank pt - allocate and set up the initial structures */ + npages = table_mem_need(pt, vstart, size); + + ret = init_top(pt, vstart, npages); + if (ret) + goto kmem_ext_failed; + + new_start = vstart; + new_size = size; + } else if (!pt->modifiable) { + sif_log(pt->sdev, SIF_INFO, "error: Attempt to modify an unmodifiable page table"); + return -EINVAL; + } else if (pt->vsize == 0) { + new_start = vstart; + new_size = size; + reinit_top(pt, vstart); + } else { + new_start = merge_ranges(pt->vstart, pt->vsize, vstart, size, &new_size); + sif_log(pt->sdev, SIF_MMU_V, "new_start %llx new_size %llx **", + new_start, new_size); + } + + kref_get(&pt->refcnt); + + ret = populate_pt_from_mem(pt, mem, vstart, false); + + /* sync the whole table memory to make sure the changes are reflected: + * TBD: Optimize to only sync the parts that have actually been modified. 
+ * With this code we will potentially sync a long page freelist as well: + */ + dma_sync_sg_for_device(pt->sdev->ib_dev.dma_device, pt->m.sg, pt->m.sg_max, DMA_TO_DEVICE); + + pt->vstart = new_start; + pt->vsize = new_size; + mutex_unlock(&pt->lock); + return ret; + +kmem_ext_failed: + sif_kmem_free(pt->sdev, &pt->m); + mutex_unlock(&pt->lock); + return ret; +} + + +/* Shrink a page table to no longer contain DMA address start @sg and size @size */ +int sif_pt_free_part(struct sif_pt *pt, u64 vstart, size_t size) +{ + struct sif_pt_page *p; + int level = pt->leaf_level; + u64 va = vstart & ~level_to_pagemask(pt, level - 1); + u64 va_up = va & ~level_to_pagemask(pt, level); + u64 vend = vstart + size; + u64 page_size; + int ret = 0; + + sif_log(pt->sdev, SIF_PT_V, "** vstart %llx -> %llx, size %lx **", vstart, va, size); + + page_size = level_to_pagesize(pt, level - 1); + mutex_lock(&pt->lock); + p = find_page(pt, level, va_up); + if (!p) { + sif_log(pt->sdev, SIF_INFO, "vaddr %llx not found at level %d", + va_up, level); + ret = -EINVAL; /* va not mapped */ + goto failed; + } + + while (va < vend && p) { + p = remove_page_ref(pt, p, va, level); + if (!p) + break; + if (va < p->vaddr) + va = p->vaddr; + else + va += page_size; + } + if (vstart == pt->vstart) { + pt->vsize -= size; + pt->vstart += size; + if (size == pt->vsize) + pt->vstart = pt->vsize = 0; + else + pt->vstart = recalc_vstart(pt); + } + if (vend == pt->vstart + pt->vsize) { + pt->vsize -= size; + if (size == pt->vsize) + pt->vstart = pt->vsize = 0; + else + pt->vsize = recalc_size(pt); + } + + /* sync the whole table memory to make sure the changes are reflected: + * TBD: Optimize to only sync the parts that have actually been modified. + * With this code we will potentially sync a long page freelist as well: + */ + dma_sync_sg_for_device(pt->sdev->ib_dev.dma_device, pt->m.sg, pt->m.sg_max, DMA_TO_DEVICE); + + mutex_unlock(&pt->lock); + return kref_put(&pt->refcnt, sif_pt_release); + +failed: + mutex_unlock(&pt->lock); + return ret; +} + +/* Free remaining mappings */ +int sif_pt_free(struct sif_pt *pt) +{ + int ret = 0; + + if (pt->vsize) { + int ref = atomic_read(&pt->refcnt.refcount); + + if (ref == 2) + ret = sif_pt_free_part(pt, pt->vstart, pt->vsize); + else { + sif_log(pt->sdev, SIF_MMU_V, "failed - vstart %llx, sz %llx, refcnt %d", + pt->vstart, pt->vsize, ref); + return -EBUSY; + } + } + if (!ret) { + sif_log(pt->sdev, SIF_MMU_V, "refcnt %d", atomic_read(&pt->refcnt.refcount) - 1); + ret = kref_put(&pt->refcnt, sif_pt_release); + if (!ret) + return -EBUSY; + ret = 0; + } + return ret; +} + + + +/* Remap the (remappable) page table to be used starting at vstart for the range of mem */ +int sif_pt_remap_for_mem(struct sif_pt *pt, struct sif_mem *mem, u32 page_shift, + u64 vstart) +{ + /* We optimize the case where @vstart is aligned in a way that allows + * the page table to be reused directly. For now we just handle the case where + * the old and new vaddr and the size is the same, which is the case for RDS, + * our main use case for FMR at this stage. 
+ * For all other cases, we just do a full cycle of free/extend_with_mem: + */ + int ret = 0; + + if (pt->vstart != vstart || pt->vsize != mem->size || pt->page_shift != page_shift) { + ret = sif_pt_free_part(pt, pt->vstart, pt->vsize); + if (ret) + return ret; + ret = sif_pt_extend_with_mem(pt, mem, vstart); + return ret; + } + + sif_log(pt->sdev, SIF_MMU_V, "** vstart %llx size %llx **", vstart, mem->size); + mutex_lock(&pt->lock); + + /* Fast path: Repopulate ptes directly - all ref.cnts are kept as is: */ + + ret = populate_pt_from_mem(pt, mem, vstart, true); + + /* sync the whole table memory to make sure the changes are reflected: + * TBD: Optimize to only sync the parts that have actually been modified. + * With this code we will potentially sync a long page freelist as well: + */ + if (!ret) + dma_sync_sg_for_device(pt->sdev->ib_dev.dma_device, pt->m.sg, pt->m.sg_max, DMA_TO_DEVICE); + mutex_unlock(&pt->lock); + return ret; +} + + +/* Called from debugfs key file - caller assumes this function will + * finish the line in the file: + */ +void sif_pt_dfs_print(struct seq_file *s, struct sif_dev *sdev, loff_t pos) +{ + /* First figure out if a pt object exists for this key, + * we only care about MR keys here yet: + */ + struct sif_pt *pt; + struct sif_mr *mr = safe_get_sif_mr(sdev, pos); + + pt = mr ? mr->mmu_ctx.pt : NULL; + if (!pt) { + seq_puts(s, "\n"); + return; + } + + seq_printf(s, " %3d %3d %4lld\n", + pt->top_level, pt->leaf_level, pt->m.size >> pt->m.page_shift); +} diff --git a/drivers/infiniband/hw/sif/sif_pt.h b/drivers/infiniband/hw/sif/sif_pt.h new file mode 100644 index 000000000000..e62a91e9fb14 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_pt.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_pt.h: SIF (private) page table management. + * API for managing a sif specific page table which can be referenced from + * multiple MMU contexts. + */ + +#ifndef _SIF_PT_H +#define _SIF_PT_H +#include +#include +#include "sif_mem.h" + +struct seq_file; + +/* rb_tree entries to track virtual addresses + * in this page table. + */ +struct sif_pt_page { + struct rb_node node; /* Linkage for pt->pmd */ + struct list_head list; /* Linkage for freelist */ + struct scatterlist *page; /* Pointer to info on the used page within pt->m */ + struct sif_pt_page *parent; /* Pointer to the parent page in the page table */ + u64 vaddr; /* Virtual address mapped by the page table page */ + u32 usecnt; /* Number of entries in use in the referred pt page */ +}; + + +/* Number of page table page levels we support: + * This level uses 0 = pte pages, 1 = pmd pages, 2 = pud pages, 3 = pgdir pages + * This equals psif_table_level - 1 as we do not represent the pages themselves. + * + * Example: Corresponding page_shift will then eg be 12 (4K pages) for level -1 and 21 (2M) + * for level 1 for the default x86 case. For Sparc, several level 0 page sizes are + * supported, which gives multiple alternatives for the lowest level. 
+ */ +#define PT_LEVELS 4 + +/* Lower bits with special meaning + * from the Intel page table spec + */ +#define PT_PAGE_PRESENT 0x1 /* Page is present */ +#define PT_PAGE_PS 0x80 /* If set (at level >= 0) page is a leaf pointer even at level > 0 */ +#define PT_PAGE_SHIFT 12 /* Number of insignificant bits in a sif page table pointer */ + +/* SIF driver representation of a generic + * driver maintained page table. + * + * Note that the base leaf page size is + * based on the "theoretical" smallest page, eg with 2M pages it will be 4K = shift 12. + * Whether that size is actually used is then determined by leaf_level. + */ +struct sif_pt { + struct sif_dev *sdev; /* Device this mapping is valid for */ + bool fixed_top; /* If set, pt guarantees that the top node remains constant */ + bool modifiable; /* Set if this page table should support modification */ + u8 top_level; /* Page table level of top node, 0 means no table */ + u8 leaf_level; /* Page table level of leaf node */ + u8 pte_ext_shift; /* Only populate every (1 << pte_ext_shift) pte */ + u16 ptes_per_page; /* #ptes per page table page - also defines size of the pt page */ + u32 page_shift; /* Base leaf page shift in use for this table */ + u64 vstart; /* Start of the mapping in VA as seen from SIF */ + u64 vsize; /* Extent of the mapping (including any holes) */ + struct sif_pt_page *top;/* Top level page table page exposed to sif */ + struct mutex lock; /* Protects modifications to the page table data structure */ + struct kref refcnt; /* Keep track of users of this page table */ + struct sif_kmem m; /* DMA mapped store for page table memory */ + struct rb_root pmd[PT_LEVELS];/* Pr.level lookup table from offset to page table page */ + struct list_head freelist; /* list of DMA mapped pt pages not currently in use */ +}; + + +/* Called from sif_init/exit to set up/clean up global data structures */ +int sif_pt_init(void); +void sif_pt_exit(void); + +/* Called from debugfs key file */ +void sif_pt_dfs_print(struct seq_file *s, struct sif_dev *sdev, loff_t pos); + +/* Create a referenced sif page table object with an empty top level page */ +struct sif_pt *sif_pt_create_empty(struct sif_dev *sdev, u64 vstart, enum sif_mem_type map_mt); + +/* Create a sif page table object of size @mapsize using memory referenced by @sg + * with SIF virtual address starting at @vstart, which must be aligned at a page + * size boundary compatible with page sizes used by the memory type used by the backing store + * @map_mt. Assuming sg is a valid (possibly chained) scatterlist long enough to provide + * backing for @mapsize. + * Set @modifiable to allow the table to be extended and shrinked + * Set @fixed_top to have pt guarantee that the top node remains constant + * in which case it will always be a level 4 tree. + */ +struct sif_pt *sif_pt_create(struct sif_dev *sdev, struct scatterlist *sg, + u64 vstart, size_t mapsize, + u32 page_shift, bool modifiable, bool fixed_top); + +/* Create a sif page table from a mem object: + * Set @fixed_top to prepare for a table where the top node is fixed: + * (will always be a level 4 tree) + */ +struct sif_pt *sif_pt_create_for_mem(struct sif_mem *mem, u64 vstart, + u32 page_shift, bool modifiable, bool fixed_top); + +/* Remap the (remappable) page table to be used starting at vstart for the range of mem + * eg. replace the current mapping with a new one, preserving the top node + * (but possibly reuse at a different level!) 
+ */ +int sif_pt_remap_for_mem(struct sif_pt *pt, struct sif_mem *mem, + u32 page_shift, u64 vstart); + +/* Extend a page table at DMA address @vstart with the list starting at @sg with size @size */ +int sif_pt_extend(struct sif_pt *pt, struct scatterlist *sg, u64 vstart, size_t size); + +/* Extend a page table at DMA address @vstart with the contents of @mem */ +int sif_pt_extend_with_mem(struct sif_pt *pt, struct sif_mem *mem, u64 vstart); + +/* DMA address of root pointer of page table */ +dma_addr_t sif_pt_dma_root(struct sif_pt *pt); + +/* SIF level of root pointer */ +u8 sif_pt_root_table_level(struct sif_pt *pt); + +/* Leaf page shift (number of bits within page) of this page table */ +u32 sif_pt_page_shift(struct sif_pt *pt); + +/* Observe leaf node of page table at @vaddr */ +int sif_pt_entry(struct sif_pt *pt, u64 vaddr, dma_addr_t *entry, dma_addr_t *val); + +/* free a part of the page table and dereference */ +int sif_pt_free_part(struct sif_pt *pt, u64 vstart, size_t size); + +/* Free this page table. If more than one reference has been created (using sif_pt_extend) + * return -EBUSY, e.g. this call can be used parenthetic with sif_pt_create, but not if + * mapping has been referenced more than once, in which case sif_pt_free_part must be called + * with identical start, size as with extend to clean up properly before a final sif_pt_free: + */ +int sif_pt_free(struct sif_pt *pt); + +/* Div. utilities: */ + +/* Find the aligned size of a region within a certain page alignment size + * (eg. the number of pages of size @alignment needed to address (start,len)) + */ +u64 aligned_size(u64 start, u64 len, u64 alignment); + +/* Find the optimal page size (represented by leaf level) + * to use based on device capabilities, configuration and a max_shift + * value (typically based on continuousness of memory: + * The result is adjusted with the address pair of a corresponding virtual + * address and dma address to ensure that it is possible to create a mapping at that + * level. pte_extent is set to the number bits to shift increment between + * each valid pte (For the odd sized leaf pages) + */ +int find_optimal_leaf_level(struct sif_dev *sdev, u32 max_shift, + u64 vaddr, u64 dma_addr, u64 size, + u8 *leaf_level, u8 *pte_ext_shift); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_qp.c b/drivers/infiniband/hw/sif/sif_qp.c new file mode 100644 index 000000000000..7c293d426ee8 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_qp.c @@ -0,0 +1,2441 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_qp.c: Implementation of IB queue pair logic for sif + */ + +#include +#include +#include "sif_dev.h" +#include "sif_defs.h" +#include "sif_qp.h" +#include "sif_ah.h" +#include "sif_sq.h" +#include "sif_pqp.h" +#include "sif_dma.h" +#include "sif_user.h" +#include "sif_base.h" +#include "sif_mr.h" +#include "sif_xrc.h" +#include "sif_query.h" +#include "sif_hwi.h" +#include "sif_user.h" +#include "psif_hw_data.h" +#include "psif_hw_setget.h" +#include "psif_hw_csr.h" +#include "sif_ibcq.h" +#include "sif_sndrcv.h" +#include +#include + +/* Work-around for bz 3646 */ +static unsigned char bug_3646_conv_table[32] = { + 0, + 18, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, +}; + +static int reset_qp(struct sif_dev *sdev, struct sif_qp *qp); + +static int sif_create_pma_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct sif_qp_init_attr sif_attr); + +static int poll_wait_for_qp_writeback(struct sif_dev *sdev, struct sif_qp *qp) +{ + unsigned long timeout = sdev->min_resp_ticks; + unsigned long timeout_real = jiffies + timeout; + enum psif_qp_state state = PSIF_QP_STATE_INIT; + + sif_log(sdev, SIF_QP, "enter qp %d", qp->qp_idx); + do { + /* Make sure the update from hw is observed in correct order */ + smp_rmb(); + state = get_psif_qp_core__state(&qp->d.state); + + if (state == PSIF_QP_STATE_RESET) + break; + + if (time_is_before_jiffies(timeout_real)) + cond_resched(); + else { + sif_log(sdev, SIF_INFO, + "Timeout waiting for write back for QP %d - last state %s", + qp->qp_idx, string_enum_psif_qp_state(state)); + + if (unlikely(sif_debug_mask & SIF_QP_V)) { + struct psif_query_qp lqqp; + int ret; + + ret = epsc_query_qp(qp, &lqqp); + if (ret) + sif_log(sdev, SIF_QP_V, + "Unable to retrieve qp state for qp %d from epsc, status %d", + qp->qp_idx, ret); + else + sif_logs(SIF_QP_V, write_struct_psif_query_qp(NULL, 0, &lqqp)); + } + + return -ETIMEDOUT; + } + } while (true); + + sif_log(sdev, SIF_QP, "exit - write-back observed on qp %d", qp->qp_idx); + return 0; +} + +static int send_epsa_proxy_qp_sq_key(struct sif_dev *sdev, u32 lkey, + int qpnum, + enum psif_mbox_type eps_num) +{ + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + int ret; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_A_COMMAND; + req.u.epsa_cmd.cmd = EPSA_GET_PROXY_QP_SQ_KEY; + req.u.epsa_cmd.key = lkey; + req.u.epsa_cmd.qpnum = qpnum; + ret = sif_eps_wr(sdev, eps_num, &req, &rsp); + + return ret; +} + +struct sif_qp *create_qp(struct sif_dev *sdev, + struct ib_qp_init_attr *init_attr, + struct sif_qp_init_attr *sif_attr) +{ + struct sif_qp *qp, *rqp = NULL; + struct sif_sq *sq; + struct psif_qp qpi; + struct sif_rq *rq = NULL; + struct sif_pd *pd = sif_attr->pd; + + int ret = 0; + int rq_idx = -1; + int request_qpn = -1; + int index; + bool mark_dirty = false; + struct sif_cq *send_cq = NULL; + struct sif_cq *recv_cq = NULL; + u32 flags = init_attr->create_flags; + u32 max_sge; + int min_tso_inline; + + if (init_attr->send_cq) + send_cq = to_scq(init_attr->send_cq); + if (init_attr->recv_cq) + recv_cq = to_scq(init_attr->recv_cq); + + /* Software need to support more than max hw send sge for UD - see #1883 */ + max_sge = + sif_attr->qp_type == PSIF_QP_TRANSPORT_UD ? 
SIF_SW_MAX_UD_SEND_SGE : SIF_HW_MAX_SEND_SGE; + + if (init_attr->cap.max_send_sge > max_sge) { + sif_log(sdev, SIF_INFO, "illegal max send sge %d, SIF only supports %d", + init_attr->cap.max_send_sge, max_sge); + return ERR_PTR(-EINVAL); + } + + if (init_attr->cap.max_inline_data > sif_max_inline) { + sif_log(sdev, SIF_INFO, + "%d bytes of inline data requested - supported max %u - this limit is defined by module parameter max_inline", + init_attr->cap.max_inline_data, sif_max_inline); + return ERR_PTR(-EINVAL); + } + + if (init_attr->qp_type <= IB_QPT_GSI) { + /* IB verbs port numbers start at 1 while psif starts w/port 0 */ + int qpn = init_attr->qp_type + ((init_attr->port_num - 1) << 1); + int ok = atomic_add_unless(&sdev->sqp_usecnt[qpn], 1, 1); + + if (!ok) { + sif_log(sdev, SIF_INFO, + "Attempt to create QP %d for port %d more than once", + init_attr->qp_type, init_attr->port_num); + return ERR_PTR(-EBUSY); + } + request_qpn = qpn; + sif_log(sdev, SIF_QP, "Requested qp %d, port %d", + init_attr->qp_type, init_attr->port_num); + } + + /* Allow allocation of qp 0/1 */ + index = request_qpn >= 0 ? request_qpn : sif_alloc_qp_idx(pd); + if (index < 0) { + rqp = ERR_PTR(-ENOMEM); + sif_log(sdev, SIF_QP, "sif_alloc_qp_idx failed"); + goto err_alloc_index; + } + qp = get_sif_qp(sdev, index); + + /* Set this temporarily - needed by reporting of qp write-back check */ + qp->qp_idx = index; + /* + * We add a sge (with the stencil) when sending with TSO. The stencil is stored at + * the beginning of the inline-area. TSO implies checksumming which again has + * a requirement that no inline can be used. It is therefore necessary to check that we have at least + * 64 bytes of inline-buffering. + */ + min_tso_inline = 64; + if ((flags & IB_QP_CREATE_IPOIB_UD_LSO) && + init_attr->cap.max_inline_data < min_tso_inline) { + sif_log(sdev, SIF_INFO, + "Create LSO QP; qp_%d max_sge %d inline_size %d qp_type %d; modifing max_inline_size to %d", + index, init_attr->cap.max_send_sge, init_attr->cap.max_inline_data, + init_attr->qp_type, min_tso_inline); + init_attr->cap.max_inline_data = min_tso_inline; + } + + if (init_attr->qp_type == IB_QPT_RC || init_attr->qp_type == IB_QPT_XRC_INI) { + /* Required in anticipation of Atomics use */ + init_attr->cap.max_inline_data = max(init_attr->cap.max_inline_data, 16U); + } + + /* Now, before we can write the QP state - we must ensure that any previous usage + * has been completed (the writeback after modify_qp to RESET happens asynchronously + * after the modify_qp request completes. 
+ */ + ret = poll_wait_for_qp_writeback(sdev, qp); + if (ret) { + /* Dont release this desc as it is probably not safe to use anymore */ + mark_dirty = true; + rqp = ERR_PTR(ret); + goto err_lazy_wb; + } + + memset(qp, 0, sizeof(struct sif_qp)); + qp->qp_idx = index; + qp->ulp_type = sif_attr->ulp_type; + + if (qp->ulp_type == RDS_ULP) { + int new_max_inline = CB_LENGTH; /* collectbuffer_length is max 256 */ + + sif_log(sdev, SIF_QP, + "Create QP; qp_%d max_sge %d inline_size %d qp_type %d; modifing max_inline_size to %d", + index, init_attr->cap.max_send_sge, init_attr->cap.max_inline_data, + init_attr->qp_type, new_max_inline); + init_attr->cap.max_inline_data = new_max_inline; + } + + if (init_attr->qp_type <= IB_QPT_GSI) { + qp->port = init_attr->port_num; + if (init_attr->qp_type == IB_QPT_SMI) + qp->flags |= SIF_QPF_SMI; + else if (init_attr->qp_type == IB_QPT_GSI) + qp->flags |= SIF_QPF_GSI; + } else { + /* Let port 1 be default: init_attr->port_num is only valid for qp 0/1 */ + qp->port = 1; + } + + qp->last_set_state = IB_QPS_RESET; + qp->tracked_state = IB_QPS_RESET; + qp->mtu = IB_MTU_4096; + qp->type = sif_attr->qp_type; + + /* TBD: Optimize this log to a single stmt */ + if (send_cq) + sif_log(sdev, SIF_QP, "qpn %d, qp 0x%p send cq %d (type %s) port %d, pd %d", + index, qp, send_cq->index, string_enum_psif_qp_trans(qp->type), + qp->port, pd->idx); + else + sif_log(sdev, SIF_QP, "qpn %d, qp 0x%p [no send cq] (type %s) port %d, pd %d", + index, qp, string_enum_psif_qp_trans(qp->type), qp->port, pd->idx); + + /* The PQP does not have any receive queue, neither does the XRC qp + * where RQs are selected per work request via wr.xrc_hdr.xrqd_id + */ + if (is_regular_qp(qp)) { + if (init_attr->srq) { + rq = to_srq(init_attr->srq); + if (atomic_add_unless(&rq->refcnt, 1, 0)) { + rq_idx = rq->index; + sif_log(sdev, SIF_QP, "Connected qp %d to SRQ %d", + index, rq_idx); + } else { + sif_log(sdev, SIF_INFO, + "failed to connect qp %d to SRQ %d, rq invalid", + index, rq_idx); + rqp = ERR_PTR(-ENODEV); + goto err_rq_fail; + } + } else { + rq_idx = alloc_rq(sdev, pd, init_attr->cap.max_recv_wr, + init_attr->cap.max_recv_sge, NULL, + sif_attr->user_mode); + if (rq_idx >= 0) + rq = get_sif_rq(sdev, rq_idx); + } + if (rq_idx < 0) { + rqp = ERR_PTR(rq_idx); + goto err_rq_fail; + } + + /* Adjust requested values based on what we got: */ + init_attr->cap.max_recv_wr = rq->entries_user; + } + qp->rq_idx = rq_idx; + + if (rq && !init_attr->srq) { + /* Check/update max sge cap: */ + if (rq->sg_entries > init_attr->cap.max_recv_sge) { + sif_log(sdev, SIF_QP, "recv sge adjusted (%d -> %d)", + init_attr->cap.max_recv_sge, rq->sg_entries); + init_attr->cap.max_recv_sge = rq->sg_entries; + } + + /* Store cq reference for cleanup purposes */ + if (recv_cq) + rq->cq_idx = recv_cq->index; + } + + + /* sq always gets same index as QP.. */ + ret = sif_alloc_sq(sdev, pd, qp, &init_attr->cap, + sif_attr->user_mode, sif_attr->sq_hdl_sz); + if (ret < 0) { + rqp = ERR_PTR(ret); + goto err_sq_fail; + } + + /* Store send completion queue index default since + * for psif send cq number is a parameter in the work request + */ + sq = get_sif_sq(sdev, qp->qp_idx); + sq->cq_idx = send_cq ? send_cq->index : (u32)-1; /* XRC recv only */ + sq->complete_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR ? 
1 : 0; + + /* Adjust requested values based on what we got: */ + init_attr->cap.max_send_wr = sq->entries; + + /* Initialization of qp state via local copy */ + memset(&qpi, 0, sizeof(struct psif_qp)); + + if (multipacket_qp(qp->type)) { + qpi.state.sq_clog2_extent = order_base_2(sq->extent); + qpi.state.sq_clog2_size = order_base_2(sq->entries); + } + qpi.state.retry_sq_seq = 0; + qpi.state.state = ib2sif_qp_state(IB_QPS_RESET); + qpi.state.pd = pd->idx; + if (!sif_feature(zero_magic)) { + qp->magic = prandom_u32(); + qpi.state.magic = qp->magic; + } + qpi.state.transport_type = qp->type; + if (qp->type == PSIF_QP_TRANSPORT_XRC && init_attr->xrcd) + qpi.state.xrc_domain = to_sxrcd(init_attr->xrcd)->index; + qpi.state.rq_indx = rq_idx; + qpi.state.rq_is_srq = !!init_attr->srq || (init_attr->qp_type == IB_QPT_XRC_TGT); + qpi.state.send_cq_indx = send_cq ? send_cq->index : (u32)-1; + qpi.state.rcv_cq_indx = recv_cq ? recv_cq->index : (u32)-1; + + qpi.state.mstate = APM_MIGRATED; + qpi.state.path_mtu = ib2sif_path_mtu(qp->mtu); + /* Last acked psn must be initialized to one less than xmit_psn + * and it is a 24 bit value. See issue #1011 + */ + qpi.state.xmit_psn = 0; + qpi.state.last_acked_psn = 0xffffff; + qpi.state.qosl = qp->qosl = sif_attr->qosl; + + /* See #2402/#2770 */ + if (sif_feature(infinite_rnr)) { + qpi.state.rnr_retry_init = 7; + qpi.state.rnr_retry_count = 7; + qpi.state.min_rnr_nak_time = 26; /* Bug 3646, this is about 160 us */ + } + + if (flags & IB_QP_NO_CSUM) + qpi.state.no_checksum = 1; + + if (sif_attr->proxy != SIFPX_OFF) { + /* This is a proxy QP */ + qpi.state.proxy_qp_enable = 1; + qp->eps_tag |= EPS_TAG_FROM_HOST; + ret = send_epsa_proxy_qp_sq_key(sdev, sq->sg_mr->index, + qp->qp_idx, + proxy_to_mbox(sif_attr->proxy)); + if (ret) + sif_log(sdev, SIF_QP, "send_epsa_proxy_qp_sq_key failed"); + } + + if (sif_attr->user_mode) + qp->flags |= SIF_QPF_USER_MODE; + + if (flags & IB_QP_CREATE_IPOIB_UD_LSO) { + qp->flags |= SIF_QPF_IPOIB; + qpi.state.ipoib_enable = 1; + qpi.state.ipoib = 1; + } + + /* PSIF extensions */ + if (flags & IB_QP_CREATE_EOIB) { + qp->flags |= SIF_QPF_EOIB; + qpi.state.eoib_enable = 1; + qpi.state.eoib = 1; + qpi.state.eoib_type = EOIB_QKEY_ONLY; + } + if (flags & IB_QP_CREATE_RSS) + qpi.state.rss_enable = 1; + if (flags & IB_QP_CREATE_HDR_SPLIT) + qpi.state.hdr_split_enable = 1; + if (flags & IB_QP_CREATE_RCV_DYNAMIC_MTU) + qpi.state.rcv_dynamic_mtu_enable = 1; + if (flags & IB_QP_CREATE_SND_DYNAMIC_MTU) + qpi.state.send_dynamic_mtu_enable = 1; + + /* according to ib_verbs.h init_attr->port_num is only valid for QP0/1 */ + if (init_attr->qp_type <= IB_QPT_GSI) + qpi.path_a.port = init_attr->port_num - 1; + + sif_log(sdev, SIF_QP, "qp %d path_a.port = %d", qp->qp_idx, qpi.path_a.port); + + /* Write composed entry to shared area */ + copy_conv_to_hw(&qp->d, &qpi, sizeof(struct psif_qp)); + + mutex_init(&qp->lock); /* TBD: Sync scheme! */ + + /* Users should see qp 0/1 even though qp 0/1 is mapped to qp 2/3 for + * port 2 + */ + qp->ibqp.qp_num = qp->qp_idx > 3 ? qp->qp_idx : (qp->qp_idx & 0x1); + + /* For the priv. 
QP types we need to set some other elements in the + * ib verbs struct as well + */ + if (qp->type == PSIF_QP_TRANSPORT_MANSP1) { + qp->ibqp.device = &sdev->ib_dev; + qp->ibqp.qp_num = qp->qp_idx; + qp->ibqp.qp_type = IB_QPT_UD; + } + + qp->flush_sq_done_wa4074 = false; + + ret = sif_dfs_add_qp(sdev, qp); + if (ret) + goto err_dfs_qp; + /* initialize the sychronization between destroy qp and event handling.*/ + init_completion(&qp->can_destroy); + + /* a qp can only be destroyed if refcnt == 0.*/ + atomic_set(&qp->refcnt, 1); + + return qp; + +err_dfs_qp: + sif_free_sq(sdev, qp); +err_sq_fail: + if (rq && !rq->is_srq) + free_rq(sdev, rq_idx); +err_rq_fail: +err_lazy_wb: + if (!mark_dirty) + sif_free_qp_idx(pd, qp->qp_idx); +err_alloc_index: + return rqp; +} + +/* PMA proxy QP */ +static int sif_create_pma_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct sif_qp_init_attr sif_attr) +{ + struct ib_qp *ret = NULL; + struct sif_dev *sdev; + struct sif_pd *pd; + struct sif_qp *qp; + + sdev = to_sdev(ibpd->device); + pd = to_spd(ibpd); + /* Let's override IB_QPT_GSI by IB_QPT_UD*/ + init_attr->qp_type = IB_QPT_UD; + + qp = create_qp(sdev, init_attr, &sif_attr); + + if (IS_ERR(qp)) { + /* Convert interior error to right type: */ + ret = (struct ib_qp *)qp; + goto err_create_qp; + } + qp->flags |= SIF_QPF_PMA_PXY; + qp->port = init_attr->port_num; + sdev->pma_qp_idxs[qp->port - 1] = qp->qp_idx; + + /* Make dfs and query_qp happy: */ + qp->ibqp.device = &sdev->ib_dev; + qp->ibqp.pd = &sdev->pd->ibpd; + + /* Set back IB_QPT_GSI */ + init_attr->qp_type = IB_QPT_GSI; + + sif_log(sdev, SIF_QP, "Exit: success 0x%p proxy qp %d - real qp %d", + &qp->ibqp, qp->ibqp.qp_num, qp->qp_idx); + return qp->qp_idx; + +err_create_qp: + sif_log(sdev, SIF_QP, "Exit: failed"); + return 0; +} + +struct ib_qp *sif_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct sif_dev *sdev; + struct sif_qp *qp; + struct sif_pd *pd; + struct sif_xrcd *xrcd = NULL; + struct ib_qp *ret = NULL; + enum ib_qp_create_flags flags = init_attr->create_flags; + ulong user_flags = 0; + + struct sif_qp_init_attr sif_attr = { + .qp_type = ib2sif_qp_type(init_attr->qp_type), + .user_mode = udata != NULL, + .sq_hdl_sz = sizeof(struct sif_sq_hdl), + }; + + + /* First we need to locate the device pointer - + * if this is an XRC QP ibpd will be NULL: + */ + if (init_attr->qp_type == IB_QPT_XRC_TGT) { + if (!init_attr->xrcd) { + sif_log0(SIF_INFO, "Error: missing XRC domain for XRC qp"); + return ERR_PTR(-EINVAL); + } + + xrcd = to_sxrcd(init_attr->xrcd); + sdev = to_sdev(init_attr->xrcd->device); + + pd = xrcd->pd; + } else { + sdev = to_sdev(ibpd->device); + pd = to_spd(ibpd); + } + + sif_attr.pd = pd; + + sif_log(sdev, SIF_QP, "Enter qp_type %d%s", init_attr->qp_type, + (udata ? " (user call)" : "")); + + /* TBD: How to handle this? */ + if (flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + sif_log(sdev, SIF_QP, "flag IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK set (ignored)"); + + if (flags & IB_QP_CREATE_PROXY) { + /* We don't know the actual EPSA to use here but QPs dont care */ + sif_attr.proxy = SIFPX_EPSA_1; + } + + /* TBD: Verify that user params such as the send cq are authorized?? 
*/ + if (!xrcd && !init_attr->send_cq) { + sif_log(sdev, SIF_INFO, "No send completion queue specified"); + ret = ERR_PTR(-EINVAL); + goto err_create_qp; + } + + if (!xrcd && !init_attr->recv_cq) { + sif_log(sdev, SIF_INFO, "No receive completion queue specified"); + ret = ERR_PTR(-EINVAL); + goto err_create_qp; + } + + if (udata && init_attr->qp_type <= IB_QPT_GSI) { + sif_log(sdev, SIF_INFO, "Attempt to create SMI/GSI QP %d from user space", + init_attr->qp_type); + return ERR_PTR(-EINVAL); + } + + if (udata) { + struct sif_create_qp_ext cmd; + int rv = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + + if (rv) { + ret = ERR_PTR(rv); + goto err_create_qp; + } + user_flags = cmd.flags; + if (sif_vendor_enable(proxy_mode, user_flags)) + sif_attr.proxy = cmd.proxy; + + if (sif_vendor_enable(SVF_kernel_mode, user_flags)) + sif_attr.user_mode = false; + + if (sif_vendor_enable(tsu_qosl, user_flags)) + sif_attr.qosl = QOSL_LOW_LATENCY; + + if (sif_vendor_enable(no_checksum, user_flags)) { + /* update the init_attr->create_flags directly. + * This will allow the same code path if umem can pass this as a + * create_qp flag via struct ibv_qp_init_attr_ex in the future: + */ + init_attr->create_flags |= IB_QP_NO_CSUM; + } + } + + /* TBD: check init_attr params against device cap-limits */ + /* TBD update ib_qp_cap? */ + if (sif_vendor_enable(dynamic_mtu, user_flags)) { + /* TBD - check the device capabilities to determine whether to + * create qp with the support of send/receive dynamic MTU. + */ + init_attr->create_flags |= IB_QP_CREATE_RCV_DYNAMIC_MTU; + init_attr->create_flags |= IB_QP_CREATE_SND_DYNAMIC_MTU; + } + + /* best effort to determine the ULP caller. */ + if (!sif_attr.user_mode) + sif_attr.ulp_type = sif_find_kernel_ulp_caller(); + + qp = create_qp(sdev, init_attr, &sif_attr); + + if (IS_ERR(qp)) { + /* Convert interior error to right type: */ + ret = (struct ib_qp *)qp; + goto err_create_qp; + } else { + sif_log(sdev, SIF_QP, "Exit: success 0x%p ib qp %d - real qp %d%s", + &qp->ibqp, qp->ibqp.qp_num, qp->qp_idx, + (sif_attr.user_mode ? " (user mode)" : "")); + } + + qp->qosl = sif_attr.qosl; + qp->nocsum = init_attr->create_flags & IB_QP_NO_CSUM; + + + + if (sif_vendor_enable(dynamic_mtu, user_flags)) { + /* TBD - dynamic mtu flag should only be set during modify_qp in CM + * or OOB establishment. It is only set if remote dynamic_mtu_supported && + * local dynamic_send_mtu_supported. As create_qp should not be in + * the critical path, split this code from the setting of + * IB_QP_CREATE_RCV_DYNAMIC_MTU and IB_QP_CREATE_SND_DYNAMIC_MTU flags + * to remind ourself that this need to be implemented separately. + */ + sif_log(sdev, SIF_QP, "Enabling forced dynamic MTU for qp %d", qp->qp_idx); + qp->flags |= SIF_QPF_DYNAMIC_MTU; + } + + if (sif_vendor_enable(SQ_mode, user_flags)) { + sif_log(sdev, SIF_QP, "Enabling forced SQ mode for qp %d", qp->qp_idx); + qp->flags |= SIF_QPF_FORCE_SQ_MODE; + } + + if (udata) { + struct sif_create_qp_resp_ext resp; + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + int rv; + + memset(&resp, 0, sizeof(resp)); + resp.qp_idx = qp->qp_idx; + resp.sq_extent = sq->extent; + resp.sq_sgl_offset = sq->sgl_offset; + resp.sq_mr_idx = sq->sg_mr ? 
sq->sg_mr->index : 0; + resp.sq_dma_handle = sif_mem_dma(sq->mem, 0); + if (init_attr->qp_type != IB_QPT_XRC_INI && init_attr->qp_type != IB_QPT_XRC_TGT) { + /* XRC qps do not have any rq */ + struct sif_rq *rq = get_sif_rq(sdev, qp->rq_idx); + + resp.rq_idx = qp->rq_idx; + resp.rq_extent = rq->extent; + } + + resp.magic = get_psif_qp_core__magic(&qp->d.state); + rv = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rv) { + ret = ERR_PTR(rv); + goto err_udata; + } + } + /* Support for PMA_PXY QP bug #3357 */ + if (init_attr->qp_type == IB_QPT_GSI + && eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 57)) { + int pma_qp_idx = sif_create_pma_qp(ibpd, init_attr, sif_attr); + + if (!pma_qp_idx) + sif_log(sdev, SIF_INFO, "Create PMA_PXY qp %d port %d failed", + qp->qp_idx, init_attr->port_num); + } + + return &qp->ibqp; +err_udata: + destroy_qp(sdev, qp); +err_create_qp: + sif_log(sdev, SIF_QP, "Exit: failed"); + return ret; +} + + +/* Modify qp implementation related: */ + + +enum sif_mqp_type sif_modify_qp_is_ok(struct sif_qp *qp, enum ib_qp_state cur_state, + enum ib_qp_state next_state, enum ib_qp_attr_mask mask) +{ + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + enum ib_qp_type type = qp->ibqp.qp_type; + int ret; + enum rdma_link_layer ll = IB_LINK_LAYER_INFINIBAND; + + /* PSIF treats XRC just as any other RC QP */ + if (type == IB_QPT_XRC_INI || type == IB_QPT_XRC_TGT) + type = IB_QPT_RC; + ret = ((qp->type == PSIF_QP_TRANSPORT_MANSP1 || is_epsa_tunneling_qp(type)) ? 1 : + ib_modify_qp_is_ok(cur_state, next_state, type, mask, ll)); + if (!ret) + return SIF_MQP_ERR; + switch (cur_state) { + case IB_QPS_RESET: + if (qp->tracked_state == IB_QPS_SQD) + qp->tracked_state = IB_QPS_RESET; + return SIF_MQP_SW; + case IB_QPS_INIT: + if (next_state == IB_QPS_INIT || next_state == IB_QPS_RESET || + next_state == IB_QPS_ERR) + return SIF_MQP_SW; + /* else fall-through */ + case IB_QPS_RTS: + /* TBD: Elim.hack to behave like mlx on this: */ + if (unlikely(qp->tracked_state == IB_QPS_SQD && + next_state != IB_QPS_RESET && next_state != IB_QPS_ERR)) + return SIF_MQP_ERR; + if (unlikely(next_state == IB_QPS_SQD)) { + qp->tracked_state = next_state; /* To fail on future transitions */ + return SIF_MQP_IGN; /* Allow, but ignore as MLX does */ + } + /* else fall-through */ + case IB_QPS_RTR: + if (unlikely(next_state == IB_QPS_SQD)) + return SIF_MQP_ERR; + return SIF_MQP_HW; + case IB_QPS_SQE: + return SIF_MQP_HW; + case IB_QPS_ERR: + /* Bug #3933 WA for HW bug 3928 + * For this specific transition, modify qp must be done based + * on current qp ownership (towards HW only if HW owned) + */ + return (PSIF_REVISION(sdev) <= 3) + && !(qp->flags & SIF_QPF_HW_OWNED) ? 
+ SIF_MQP_SW : SIF_MQP_HW; + default: + return SIF_MQP_IGN; + } +} + + + +static int modify_qp_sw(struct sif_dev *sdev, struct sif_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask); +static int modify_qp_hw(struct sif_dev *sdev, struct sif_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask); + + + +int modify_qp_hw_wa_qp_retry(struct sif_dev *sdev, struct sif_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask) +{ + struct ib_qp_attr mod_attr = { + .qp_state = IB_QPS_ERR + }; + + bool need_wa_3713 = PSIF_REVISION(sdev) <= 3 + && IS_PSIF(sdev) + && qp_attr_mask & IB_QP_STATE && qp_attr->qp_state == IB_QPS_RESET; + + /* WA for duplicate CQEs */ + bool need_wa_4074 = PSIF_REVISION(sdev) <= 3 + && (qp->type != PSIF_QP_TRANSPORT_MANSP1) + && qp_attr_mask & IB_QP_STATE && qp_attr->qp_state == IB_QPS_ERR + && IS_PSIF(sdev); + + int ret = 0; + + if (need_wa_3713 || need_wa_4074) { + if (qp->type != PSIF_QP_TRANSPORT_MANSP1) + ret = pre_process_wa4074(sdev, qp); + + if (ret) { + if (ret != -1) + sif_log(sdev, SIF_INFO, "Failed to pre-process WA4074, ret - %d", ret); + } + } + + if (need_wa_3713) { + /* Workaround for bug #3713 part 2 - see #3714 */ + ret = modify_qp_hw(sdev, qp, &mod_attr, IB_QP_STATE); + if (ret) + sif_log(sdev, SIF_INFO, "implicit modify qp %d to ERR failed - ignoring", + qp->qp_idx); + } + + ret = modify_qp_hw(sdev, qp, qp_attr, qp_attr_mask); + + if (need_wa_3713 || need_wa_4074) { + struct ib_qp_attr attr = { + .qp_state = IB_QPS_RESET + }; + + if (need_wa_4074) { + ret = modify_qp_hw(sdev, qp, &attr, IB_QP_STATE); + if (ret) { + sif_log(sdev, SIF_INFO, "qp %d RESET failed, ret %d", qp->qp_idx, ret); + goto err_modify_qp_wa; + } + /* Restore QP SW state to ERROR */ + qp->last_set_state = qp->tracked_state = IB_QPS_ERR; + } + + qp->flags &= ~SIF_QPF_HW_OWNED; + + if (qp->type != PSIF_QP_TRANSPORT_MANSP1) + ret = post_process_wa4074(sdev, qp); + + if (ret) + sif_log(sdev, SIF_INFO, "Failed to post-process WA #4074 %d", ret); + } +err_modify_qp_wa: + + return ret; +} + +int notify_epsc_pma_qp(struct sif_dev *sdev, int qp_idx, short port) +{ + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + int ret = -1; + + if (eps_version_ge(es, 0, 57)) { + memset(&req, 0, sizeof(req)); + memset(&rsp, 0, sizeof(rsp)); + req.opcode = EPSC_SET; + req.u.set.data.op = EPSC_QUERY_PMA_REDIRECT_QP; + req.u.set.data.index = port; + req.u.set.data.value = qp_idx; + + ret = sif_epsc_wr_poll(sdev, &req, &rsp); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to configure epsc PMA_PXY QP\n"); + return ret; + } + return ret; + } else + return -EINVAL; +} + +int sif_modify_qp(struct ib_qp *ibqp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_udata *udata) +{ + struct sif_qp *qp = to_sqp(ibqp); + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_qp *pma_qp = NULL; + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + int ret = 0; + bool need_pma_pxy_qp = eps_version_ge(es, 0, 57) + && (qp_attr->qp_state != IB_QPS_RTS) + && (qp->qp_idx == 1 || qp->qp_idx == 3); + + if (need_pma_pxy_qp) { + pma_qp = get_sif_qp(sdev, sdev->pma_qp_idxs[!!(qp->qp_idx & 2)]); + ret = modify_qp(sdev, pma_qp, qp_attr, qp_attr_mask, true, udata); + if (ret) + sif_log(sdev, SIF_INFO, "Modify PMA_PXY QP %d failed", + pma_qp->qp_idx); + else if (qp_attr->qp_state == IB_QPS_RTR) { + ret = notify_epsc_pma_qp(sdev, pma_qp->qp_idx, pma_qp->port); + if (ret) + sif_log(sdev, SIF_INFO, "Notify epsc PMA_PXY QP %d failed", + pma_qp->qp_idx); + } + } 
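+ /* Any PMA proxy QP transition has been handled above; now apply the requested state change to the QP the caller asked to modify. */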
+ + return modify_qp(sdev, qp, qp_attr, qp_attr_mask, + true, udata); +} + + +int modify_qp(struct sif_dev *sdev, struct sif_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask, + bool fail_on_same_state, struct ib_udata *udata) +{ + int ret = 0; + struct ib_qp *ibqp = &qp->ibqp; + enum ib_qp_state cur_state, new_state; + enum sif_mqp_type mqp_type = SIF_MQP_IGN; + + sif_log(sdev, SIF_QP, "Enter: qpn %d qp_idx %d mask 0x%x", + ibqp->qp_num, qp->qp_idx, qp_attr_mask); + + /* WA #622, RQ flush from error completion in userspace */ + if (udata && is_regular_qp(qp)) { + struct sif_modify_qp_ext cmd; + struct sif_rq *rq = get_sif_rq(sdev, qp->rq_idx); + + ret = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + if (ret) { + sif_log(sdev, SIF_INFO, "ib_copy_from_udata failed, sts %d, qp %d, size %ld", + ret, qp->qp_idx, sizeof(cmd)); + return ret; + } + + switch (cmd.flush) { + case FLUSH_RQ: + ret = sif_flush_rq(sdev, rq, qp, rq->entries); + if (ret) + sif_log(sdev, SIF_INFO, "failed to flush RQ %d", + rq->index); + return ret; + case FLUSH_SQ: + ret = post_process_wa4074(sdev, qp); + if (ret) + sif_log(sdev, SIF_INFO, "failed to flush SQ %d", qp->qp_idx); + return ret; + default: + break; + } + } + + mutex_lock(&qp->lock); + + cur_state = qp_attr_mask & IB_QP_CUR_STATE ? + qp_attr->cur_qp_state : qp->last_set_state; + + new_state = qp_attr_mask & IB_QP_STATE ? qp_attr->qp_state : cur_state; + + if (!fail_on_same_state && cur_state == qp_attr->qp_state) { + /* Silently ignore.. (used at destroy time) */ + goto sif_mqp_ret; + } + + mqp_type = sif_modify_qp_is_ok(qp, cur_state, new_state, qp_attr_mask); + switch (mqp_type) { + case SIF_MQP_SW: + ret = modify_qp_sw(sdev, qp, qp_attr, qp_attr_mask); + break; + case SIF_MQP_HW: + ret = modify_qp_hw_wa_qp_retry(sdev, qp, qp_attr, qp_attr_mask); + break; + case SIF_MQP_IGN: + break; + case SIF_MQP_ERR: + default: + sif_log(sdev, SIF_INFO, "illegal state change from %d to %d for qp %d", + cur_state, new_state, qp->qp_idx); + ret = -EINVAL; + } + +sif_mqp_ret: + if (!ret && !(mqp_type == SIF_MQP_IGN)) { + /* TBD: Is this needed? 
*/ + qp_attr->cur_qp_state = new_state; + } + + /* QP ownership flag must be updated before releasing + * the lock in order to avoid race conditions + */ + switch (new_state) { + case IB_QPS_RESET: + qp->flags &= ~SIF_QPF_HW_OWNED; + break; + case IB_QPS_RTR: + qp->flags |= SIF_QPF_HW_OWNED; + break; + default: + /* No extra actions needed */ + break; + } + + mutex_unlock(&qp->lock); + + if (ret) + return ret; + + /* Bug #3933 - WA for HW bug 3928 + * enable/disable the HW ownership QP flag + */ + switch (new_state) { + case IB_QPS_ERR: + if (is_regular_qp(qp)) { + struct sif_rq *rq = get_sif_rq(sdev, qp->rq_idx); + + /* WA #3850: if SRQ, generate LAST_WQE event */ + if (rq->is_srq && qp->ibqp.event_handler) { + struct ib_event ibe = { + .device = &sdev->ib_dev, + .event = IB_EVENT_QP_LAST_WQE_REACHED, + .element.qp = &qp->ibqp + }; + + qp->ibqp.event_handler(&ibe, qp->ibqp.qp_context); + } else if (rq && !rq->is_srq) { + /* WA #622: if regular RQ, flush */ + ret = sif_flush_rq(sdev, rq, qp, rq->entries); + if (ret) { + sif_log(sdev, SIF_INFO, "failed to flush RQ %d", + rq->index); + return ret; + } + } + } + break; + case IB_QPS_RESET: + /* clean all state associated with this QP */ + ret = reset_qp(sdev, qp); + break; + default: + /* No extra actions needed */ + break; + } + return ret; +} + + +static void set_qp_path_hw(struct sif_qp *qp, struct psif_epsc_csr_modify_qp *mct, + struct ib_qp_attr *qp_attr, int qp_attr_mask, bool alternate) +{ + struct psif_qp_path *path; + struct ib_ah_attr *ah_attr; + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + struct psif_csr_modify_qp_ctrl *ctrl_attr = &mct->ctrl; + u8 ipd = 0; + + /* IBV_QP_ALT_PATH: Set the alternative path via + * alt_ah_attr, alt_pkey_index, alt_port_num and + * alt_timeout. + */ + if (alternate) { + ctrl_attr->alt_path = 1; + path = &mct->data.alternate_path; + ah_attr = &qp_attr->alt_ah_attr; + path->pkey_indx = qp_attr->alt_pkey_index; + path->local_ack_timeout = qp_attr->alt_timeout; + path->port = qp_attr->alt_port_num - 1; + sif_log(sdev, SIF_QP, "Alternate pkey_indx %d local_ack_timeout %d, port %d", + qp_attr->alt_pkey_index, qp_attr->alt_timeout, qp_attr->alt_port_num + 1); + } else { + ctrl_attr->prim_path = 1; + /* TBD: Does this belong here? */ + ctrl_attr->pkey_index = 1; + path = &mct->data.primary_path; + ah_attr = &qp_attr->ah_attr; + path->pkey_indx = qp->pkey_index; + /* Use the value set by IB_QP_PORT: */ + path->port = qp->port - 1; + sif_log(sdev, SIF_QP, "Primary pkey_indx %d local_ack_timeout %d, port %d", + qp_attr->pkey_index, qp_attr->timeout, qp_attr->port_num + 1); + } + path->sl = ah_attr->sl; + path->remote_lid = ah_attr->dlid; + path->local_lid_path = ah_attr->src_path_bits; + + path->loopback = + (sdev->port[path->port].lid | path->local_lid_path) == ah_attr->dlid ? + LOOPBACK : NO_LOOPBACK; + + /* sif_calc_ipd does not set ipd if it fails. In that case, ipd = 0. */ + sif_calc_ipd(sdev, qp->port, (enum ib_rate) ah_attr->static_rate, &ipd); + path->ipd = ipd; + + if (ah_attr->ah_flags & IB_AH_GRH) { + path->use_grh = USE_GRH; + path->remote_gid_0 = cpu_to_be64(ah_attr->grh.dgid.global.subnet_prefix); + path->remote_gid_1 = cpu_to_be64(ah_attr->grh.dgid.global.interface_id); + path->flowlabel = ah_attr->grh.flow_label; + path->hoplmt = ah_attr->grh.hop_limit; + /* TBD: ah_attr->grh.sgid_index?
*/ + + sif_log(sdev, SIF_QP, " - with grh dgid %llx.%llx", + ah_attr->grh.dgid.global.subnet_prefix, + ah_attr->grh.dgid.global.interface_id); + } + + if (qp_attr_mask & IB_QP_TIMEOUT) { + path->local_ack_timeout = qp_attr->timeout; + sif_log(sdev, SIF_QP, " - with timeout %d", qp_attr->timeout); + } + + sif_log(sdev, SIF_QP, "local_lid_path %d, remote_lid %d %s, QP(ipd):%d %s", + path->local_lid_path, path->remote_lid, (path->loopback ? "(loopback)" : ""), + path->ipd, (alternate ? "(alternate)" : "")); +} + +static int modify_qp_hw(struct sif_dev *sdev, struct sif_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask) +{ + struct psif_epsc_csr_rsp resp; + struct psif_epsc_csr_req req; + struct psif_epsc_csr_modify_qp *mct = &req.u.modify_qp; + struct psif_csr_modify_qp_ctrl *ctrl_attr = &mct->ctrl; + struct psif_csr_modify_qp_ctrl *cmd = &mct->ctrl; + int ret = 0; + + memset(&req, 0, sizeof(req)); + + req.opcode = EPSC_MODIFY_QP; + + cmd->cmd = QP_CMD_MODIFY; + + if (qp->qp_idx <= 3) { + /* sif requires "real" QP numbers in modify_qp */ + cmd->qp_num = qp->qp_idx & 1; + cmd->port_num = qp->qp_idx >> 1; + } else + cmd->qp_num = qp->qp_idx; + + if (qp_attr_mask & IB_QP_STATE) { + ctrl_attr->qp_state = 1; + mct->data.state = ib2sif_qp_state(qp_attr->qp_state); + } + + if (qp->last_set_state == IB_QPS_INIT && qp_attr->qp_state == IB_QPS_RTR) { + /* Bug #3933 - WA for HW bug 3928 + * QP hw state must be set to INIT before modify_qp_hw to RTR + */ + volatile struct psif_qp *qps; + + qps = &qp->d; + set_psif_qp_core__state(&qps->state, PSIF_QP_STATE_INIT); + + /* For INIT -> RTR the rest of the attrs are set directly in the descriptor: */ + ret = modify_qp_sw(sdev, qp, qp_attr, qp_attr_mask & ~IB_QP_STATE); + + /* Flag to the FW that this is the PQP */ + if (qp->type == PSIF_QP_TRANSPORT_MANSP1) + req.flags |= EPSC_FL_PQP; + if (ret) + goto err_modify_qp; + else + goto ok_modify_qp_sw; + } + + if (qp_attr_mask & IB_QP_CUR_STATE) { + ctrl_attr->use_current_state = 1; + cmd->current_state = ib2sif_qp_state(qp_attr->cur_qp_state); + + /* TBD: Remove this sanity check later: */ + if (qp_attr->cur_qp_state != qp->last_set_state) + sif_log(sdev, SIF_QP, + "** WARNING: possible state inconsistency (user %d, driver %d)", + qp->last_set_state, qp_attr->cur_qp_state); + } + + if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) { + /* TBD: Needed? */ + sif_log(sdev, SIF_QP, + "IB_QP_EN_SQD_ASYNC_NOTIFY needed!"); + goto err_modify_qp; + } + + if (qp_attr_mask & IB_QP_ACCESS_FLAGS) { + /* TBD: qp_rcv_cap must be set and the whole struct psif_qp_rcv_cap + * must be set if any of it's values are modified.. + * - must keep driver copies of this + */ + + /* TBD: (qp_attr->qp_access_flags & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; ? */ + mct->data.rdma_rd_enable = + (qp_attr->qp_access_flags & IB_ACCESS_REMOTE_READ) ? 1 : 0; + mct->data.rdma_wr_enable = + (qp_attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + mct->data.atomic_enable = + (qp_attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; + /* IB_ACCESS_MW_BIND not supported (?) */ + } + + /* This section must be before IB_QP_AV */ + if (qp_attr_mask & IB_QP_PKEY_INDEX) { + /* TBD: Argument check on index value ? 
*/ + qp->pkey_index = qp_attr->pkey_index; + } + + /* This section must be before IB_QP_AV */ + if (qp_attr_mask & IB_QP_PORT) { + if (qp_attr->port_num < 1 || qp_attr->port_num > 2) { + sif_log(sdev, SIF_INFO, "Modify port: Illegal port %d specified for qp %d", + qp_attr->port_num, qp->qp_idx); + ret = -EINVAL; + goto err_modify_qp; + } + sif_log(sdev, SIF_QP, "Modify port to %d for qp %d", + qp_attr->port_num, qp->qp_idx); + qp->port = qp_attr->port_num; + } + + if (qp_attr_mask & IB_QP_QKEY) { + ctrl_attr->qkey = 1; + mct->data.rx_qkey = qp_attr->qkey; + + sif_log(sdev, SIF_QP, "Assign QKEY 0x%x for qp %d", + qp_attr->qkey, qp->qp_idx); + + } + + if (qp_attr_mask & IB_QP_AV) + set_qp_path_hw(qp, mct, qp_attr, qp_attr_mask, false); + + if (qp_attr_mask & IB_QP_PATH_MTU) { + if (!ib_legal_path_mtu(qp_attr->path_mtu)) { + sif_log(sdev, SIF_INFO, "Illegal MTU encoding %d", qp_attr->path_mtu); + ret = EINVAL; + goto err_modify_qp; + } + ctrl_attr->path_mtu = 1; + if ((qp->type == PSIF_QP_TRANSPORT_RC) && sif_feature(force_rc_2048_mtu)) { + if (qp_attr->path_mtu > IB_MTU_2048) + qp_attr->path_mtu = IB_MTU_2048; + } + mct->data.path_mtu = ib2sif_path_mtu(qp_attr->path_mtu); + qp->mtu = qp_attr->path_mtu; + } + + if (qp_attr_mask & IB_QP_TIMEOUT) { + ctrl_attr->local_ack_timeout = 1; + if (!(qp_attr_mask & (IB_QP_AV|IB_QP_ALT_PATH))) + mct->data.primary_path.local_ack_timeout = qp_attr->timeout; + } + + if (qp_attr_mask & IB_QP_RETRY_CNT) { + ctrl_attr->error_retry_count = 1; + mct->data.error_retry_count = qp_attr->retry_cnt; + } + + if (qp_attr_mask & IB_QP_RNR_RETRY) { + ctrl_attr->rnr_retry_count = 1; + mct->data.rnr_retry_count = qp_attr->rnr_retry; + } + + if (qp_attr_mask & IB_QP_RQ_PSN) { + /* expected receive PSN */ + ctrl_attr->expected_psn = 1; + mct->data.expected_psn = qp_attr->rq_psn; + } + + if (qp_attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + /* This is the sending side */ + ctrl_attr->max_outstanding = 1; + if (qp_attr->max_rd_atomic == 0) { + sif_log(sdev, SIF_QP, + "IB_QP_MAX_QP_RD_ATOMIC value 0 incrementing to 1"); + qp_attr->max_rd_atomic = 1; + } + if (qp_attr->max_rd_atomic > 16 || qp_attr->max_rd_atomic < 0) { + /* As per IBTA 9.4.4 & 11.2.4.2 */ + sif_log(sdev, SIF_INFO, + "IB_QP_MAX_QP_RD_ATOMIC value %u out of range", + qp_attr->max_rd_atomic); + ret = -EINVAL; + goto err_modify_qp; + } + mct->data.max_outstanding = qp_attr->max_rd_atomic; + } + + if (qp_attr_mask & IB_QP_ALT_PATH) { + if (qp_attr->alt_port_num < 1 || qp_attr->alt_port_num > 2) { + sif_log(sdev, SIF_INFO, "Illegal alternate port %d specified for qp %d", + qp_attr->alt_port_num, qp->qp_idx); + ret = -EINVAL; + goto err_modify_qp; + } + set_qp_path_hw(qp, mct, qp_attr, qp_attr_mask, true); + } + + if (qp_attr_mask & IB_QP_MIN_RNR_TIMER) { + ctrl_attr->min_rnr_nak_time = 1; + mct->data.min_rnr_nak_time = sif_feature(force_wa_3646) ? 
+ bug_3646_conv_table[qp_attr->min_rnr_timer & 0x1F] : + qp_attr->min_rnr_timer & 0x1F; + } + + if (qp_attr_mask & IB_QP_SQ_PSN) { + /* Send packet sequence number */ + ctrl_attr->xmit_psn = 1; + mct->data.xmit_psn = qp_attr->sq_psn; + } + + if (qp_attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + /* Currently hard coded to 16 in psif */ + if (unlikely(qp_attr->max_dest_rd_atomic > 16)) { + sif_log(sdev, SIF_QP, + "IB_QP_MAX_DEST_RD_ATOMIC value %u out of range - psif supports 16 as a hard coded value", + qp_attr->max_dest_rd_atomic); + goto err_modify_qp; + } else if (qp_attr->max_dest_rd_atomic < 16) { + sif_log(sdev, SIF_QP, + "IB_QP_MAX_DEST_RD_ATOMIC value %u ignored - psif supports 16 as a hard coded value", + qp_attr->max_dest_rd_atomic); + } + } + + if (qp_attr_mask & IB_QP_PATH_MIG_STATE) { + ctrl_attr->mig_state = 1; + mct->data.mstate = ib2sif_mig_state(qp_attr->path_mig_state); + } + + if (qp_attr_mask & IB_QP_CAP) { + sif_log(sdev, SIF_QP, "IB_QP_CAP not supported by PSIF"); + goto err_modify_qp; + } + + if (qp_attr_mask & IB_QP_DEST_QPN) { + /* Since this is only valid from the init state which is + * owned by software anyway, we set it directly from software + * (see issues #929, #1027) + */ + qp->remote_qp = qp_attr->dest_qp_num; + set_psif_qp_core__remote_qp(&qp->d.state, qp_attr->dest_qp_num); + sif_log(sdev, SIF_QP, "Modified remote qp (hw), qp_idx: %d, value %d\n", + qp->qp_idx, qp_attr->dest_qp_num); + } + +ok_modify_qp_sw: + + /* + * On modify to RTR, we set the TSU SL (tsl), because we have + * port # and sl present in the QP state at this point. + */ + if ((qp_attr_mask & IB_QP_STATE) && (qp_attr->qp_state == IB_QPS_RTR)) { + int sl = get_psif_qp_path__sl(&qp->d.path_a); + int port = qp->port - 1; + enum psif_tsu_qos qosl = qp->qosl; + + if (cmd->qp_num == 0) + qp->tsl = sdev->qp0_tsl[qp->port - 1]; + else if (qp->type == PSIF_QP_TRANSPORT_MANSP1) + qp->tsl = sdev->pqp_rcn_tsl[qp->port - 1]; + else + qp->tsl = sdev->sl2tsl[sl][port][(int)qosl]; + + set_psif_qp_core__tsl(&qp->d.state, qp->tsl); + + /* Tell user-lib about tsl to use */ + if (qp->flags & SIF_QPF_USER_MODE) { + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx); + + sq_sw->tsl = qp->tsl; + } + + sif_log(sdev, SIF_TSL, + "%s qp_idx: %d with sl: %d, port: %d, qosl: %s tsl: %d", + qp->type == PSIF_QP_TRANSPORT_MANSP1 ? 
"privileged" : "regular", + qp->qp_idx, sl, qp->port, string_enum_psif_tsu_qos(qosl) + 5, qp->tsl); + } + + { + struct sif_eps_cqe lcqe; + u16 seq_num; + + lcqe.rsp = &resp; + init_completion(&lcqe.cmpl); + + ret = sif_post_epsc_wr(sdev, &req, &seq_num, &lcqe, true); + if (ret) + goto err_modify_qp; + + if (reliable_qp(qp->type) + && (qp_attr_mask & IB_QP_STATE)) { + if ((qp->last_set_state == IB_QPS_INIT) + && (qp_attr->qp_state == IB_QPS_RTR)) { + /* Map the new send queue into the global sq_cmpl PSIF + * only address map, see #944 + */ + ret = sif_sq_cmpl_map_sq(sdev, get_sif_sq(sdev, qp->qp_idx)); + if (ret) + goto err_modify_qp; + + qp->sq_cmpl_map_valid = true; + + } else if ((qp->sq_cmpl_map_valid) + && (qp_attr->qp_state == IB_QPS_RESET)) { + /* Unmap the send queue from the global sq_cmpl PSIF */ + ret = sif_sq_cmpl_unmap_sq(sdev, get_sif_sq(sdev, qp->qp_idx)); + if (ret) + goto err_modify_qp; + + qp->sq_cmpl_map_valid = false; + } + } + + ret = sif_epsc_waitfor(sdev, seq_num, &lcqe); + if (ret) + goto err_modify_qp; + } + + if (resp.status != EPSC_SUCCESS) { + sif_log(sdev, SIF_INFO, "qp %d failed with status %s", + qp->qp_idx, string_enum_psif_epsc_csr_status(resp.status)); + goto err_modify_qp; + } + + /* sif_logs(SIF_DUMP, write_struct_psif_qp(0, 1, (const struct psif_qp *)&qp->d)); */ + sif_log(sdev, SIF_QP, "qp %d done QP state %d -> %d", + qp->qp_idx, qp->last_set_state, + (qp_attr_mask & IB_QP_STATE ? qp_attr->qp_state : qp->last_set_state)); + + if (qp_attr_mask & IB_QP_STATE) + qp->last_set_state = qp_attr->qp_state; + + return ret; + +err_modify_qp: + if (resp.status == EPSC_MODIFY_INVALID_QP_STATE) + ret = -ESPIPE; + + if (!ret) + ret = -EINVAL; + if (qp_attr_mask & IB_QP_STATE) + sif_log(sdev, SIF_QPE, + "qp %d failed - mask 0x%x cur.state %d, requested state %d, ret %d", + qp->qp_idx, qp_attr_mask, qp->last_set_state, + qp_attr->qp_state, + ret); + else + sif_log(sdev, SIF_QPE, "qp %d failed - mask 0x%x no state trans requested, ret %d", + qp->qp_idx, qp_attr_mask, ret); + + sif_logs(SIF_DUMP, write_struct_psif_qp(NULL, 1, (const struct psif_qp *)&qp->d)); + return ret; +} + + +static void set_qp_path_sw(struct sif_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, bool alternate) +{ + volatile struct psif_qp_path *path; + struct ib_ah_attr *ah_attr; + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + unsigned int local_lid_path; + u8 psif_port; + u8 ipd = 0; + + if (alternate) { + path = &qp->d.path_b; + ah_attr = &qp_attr->alt_ah_attr; + set_psif_qp_path__pkey_indx(path, qp_attr->alt_pkey_index); + set_psif_qp_path__local_ack_timeout(path, qp_attr->alt_timeout); + set_psif_qp_path__port(path, qp_attr->alt_port_num - 1); + } else { + path = &qp->d.path_a; + ah_attr = &qp_attr->ah_attr; + set_psif_qp_path__pkey_indx(path, qp->pkey_index); + /* Use the value set by IB_QP_PORT: */ + set_psif_qp_path__port(path, qp->port - 1); + } + set_psif_qp_path__sl(path, ah_attr->sl); + + if (ah_attr->ah_flags & IB_AH_GRH) { + set_psif_qp_path__use_grh(path, USE_GRH); + set_psif_qp_path__remote_gid_0(path, cpu_to_be64(ah_attr->grh.dgid.global.subnet_prefix)); + set_psif_qp_path__remote_gid_1(path, cpu_to_be64(ah_attr->grh.dgid.global.interface_id)); + set_psif_qp_path__flowlabel(path, ah_attr->grh.flow_label); + set_psif_qp_path__hoplmt(path, ah_attr->grh.hop_limit); + /* TBD: ah_attr->grh.sgid_index? 
*/ + + sif_log(sdev, SIF_QP, " - with grh dgid %llx.%llx", + be64_to_cpu(path->remote_gid_0), + be64_to_cpu(path->remote_gid_1)); + } + + if (qp_attr_mask & IB_QP_TIMEOUT) { + set_psif_qp_path__local_ack_timeout(path, qp_attr->timeout); + sif_log(sdev, SIF_QP, " - with timeout %d", qp_attr->timeout); + } + + qp->remote_lid = ah_attr->dlid; + set_psif_qp_path__remote_lid(path, ah_attr->dlid); + local_lid_path = ah_attr->src_path_bits; + psif_port = get_psif_qp_path__port(path); + set_psif_qp_path__local_lid_path(path, local_lid_path); + set_psif_qp_path__loopback(path, + (sdev->port[psif_port].lid | local_lid_path) == ah_attr->dlid ? + LOOPBACK : NO_LOOPBACK); + + /* sif_calc_ipd do not set ipd if sif_calc_ipd failed. In that case, ipd = 0.*/ + sif_calc_ipd(sdev, qp->port, (enum ib_rate) ah_attr->static_rate, &ipd); + set_psif_qp_path__ipd(path, ipd); + + sif_log(sdev, SIF_QP, "port %d lid %d(%#x) local_lid_path %d(%#x) remote_lid %d(%#x)", + ah_attr->port_num, + sdev->port[psif_port].lid, + sdev->port[psif_port].lid, + ah_attr->src_path_bits, + ah_attr->src_path_bits, + ah_attr->dlid, + ah_attr->dlid); + + sif_log(sdev, SIF_QP, "(path_%c) psif_port %d, remote_lid %d(%#x) %s", + (alternate ? 'b' : 'a'), + psif_port, + get_psif_qp_path__remote_lid(path), get_psif_qp_path__remote_lid(path), + (get_psif_qp_path__loopback(path) == LOOPBACK ? "(loopback)" : "(not loopback)")); +} + +static int modify_qp_sw(struct sif_dev *sdev, struct sif_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask) +{ + int ret = 0; + volatile struct psif_qp *qps; + struct sif_rq *rq = NULL; + + if (qp->rq_idx >= 0) + rq = get_sif_rq(sdev, qp->rq_idx); + + qps = &qp->d; + + if ((qp_attr_mask & IB_QP_STATE) + && (qp->last_set_state == IB_QPS_RESET) + && (qp_attr->qp_state == IB_QPS_INIT)) { + set_psif_qp_core__bytes_received(&qps->state, 0); + set_psif_qp_core__committed_received_psn(&qps->state, 0); + set_psif_qp_core__expected_psn(&qps->state, 0); + set_psif_qp_core__last_committed_msn(&qps->state, 0); + set_psif_qp_core__last_received_outstanding_msn(&qps->state, 0); + set_psif_qp_core__msn(&qps->state, 0); /* According to Brian 11.9.2012 */ + set_psif_qp_core__scatter_indx(&qps->state, 0); + set_psif_qp_core__spin_hit(&qps->state, 0); + set_psif_qp_core__sq_seq(&qps->state, 1); + set_psif_qp_core__srq_pd(&qps->state, 0); + } + + if (qp_attr_mask & IB_QP_CUR_STATE && qp_attr->cur_qp_state != qp->last_set_state) { + sif_log(sdev, SIF_INFO, + "Error: current state %d - user expected %d", + qp->last_set_state, qp_attr->cur_qp_state); + ret = -EINVAL; + goto err_modify_qp; + } + + /* Bug #3933 - WA for HW bug 3928 + * ibv_query_qp might report wrong state when in state IBV_QPS_ERR + * QP hw state keeps in RESET for modify_qp_sw to INIT or ERR states + */ + if (qp_attr_mask & IB_QP_STATE) + if ((qp_attr->qp_state != IB_QPS_INIT && qp_attr->qp_state != IB_QPS_ERR) + || (PSIF_REVISION(sdev) > 3)) + set_psif_qp_core__state(&qps->state, ib2sif_qp_state(qp_attr->qp_state)); + + if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) { + sif_log(sdev, SIF_INFO, + "IB_QP_EN_SQD_ASYNC_NOTIFY needed!"); + ret = -EINVAL; + goto err_modify_qp; + } + + if (qp_attr_mask & IB_QP_ACCESS_FLAGS) { + + set_psif_qp_core__rdma_rd_enable(&qps->state, + ((qp_attr->qp_access_flags & IB_ACCESS_REMOTE_READ) + ? 1 : 0)); + set_psif_qp_core__rdma_wr_enable(&qps->state, + ((qp_attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) + ? 1 : 0)); + set_psif_qp_core__atomic_enable(&qps->state, + ((qp_attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) + ? 
1 : 0)); + } + + /* This section must be before IB_QP_AV */ + if (qp_attr_mask & IB_QP_PKEY_INDEX) { + volatile struct psif_qp_path *path = &qp->d.path_a; + + /* TBD: Argument check on index value ? */ + qp->pkey_index = qp_attr->pkey_index; + set_psif_qp_path__pkey_indx(path, qp->pkey_index); + sif_log(sdev, SIF_QP, "pkey_indx in primary path set to %d", qp->pkey_index); + + } + + /* This section must be before IB_QP_AV */ + if (qp_attr_mask & IB_QP_PORT) { + if (qp_attr->port_num < 1 || qp_attr->port_num > 2) { + sif_log(sdev, SIF_INFO, "Modify port: Illegal port %d specified for qp %d", + qp_attr->port_num, qp->qp_idx); + ret = -EINVAL; + goto err_modify_qp; + } + sif_log(sdev, SIF_QP, "Modify port to %d for qp %d", + qp_attr->port_num, qp->qp_idx); + qp->port = qp_attr->port_num; + } + + if (qp_attr_mask & IB_QP_QKEY) { + + /* Set the 'ipoib' and 'ipoib_enable' fields for UD QPs with the IPoIB QKey */ + /* TBD: The IPoIB QKEY value is hardcoded. We need to figured out how ask the + * driver to ask the FW for this value + */ + if (qp_attr->qkey == 0x00000b1b) { + set_psif_qp_core__ipoib(&qps->state, 1); + set_psif_qp_core__ipoib_enable(&qps->state, 1); + } + + set_psif_qp_core__qkey(&qps->state, qp_attr->qkey); + + sif_log(sdev, SIF_QP, "Assign QKEY 0x%x for qp %d", + qp_attr->qkey, qp->qp_idx); + } + + if (qp_attr_mask & IB_QP_AV) + set_qp_path_sw(qp, qp_attr, qp_attr_mask, false); + + if (qp_attr_mask & IB_QP_PATH_MTU) { + if (!ib_legal_path_mtu(qp_attr->path_mtu)) { + sif_log(sdev, SIF_INFO, "Illegal MTU encoding %d", qp_attr->path_mtu); + ret = EINVAL; + goto err_modify_qp; + } + if ((qp->type == PSIF_QP_TRANSPORT_RC) && sif_feature(force_rc_2048_mtu)) { + if (qp_attr->path_mtu > IB_MTU_2048) + qp_attr->path_mtu = IB_MTU_2048; + } + sif_log(sdev, SIF_QP, "Modify path_mtu to %d for qp %d", + qp_attr->path_mtu, qp->qp_idx); + set_psif_qp_core__path_mtu(&qps->state, + ib2sif_path_mtu(qp_attr->path_mtu)); + qp->mtu = qp_attr->path_mtu; + } + + if (!(qp_attr_mask & (IB_QP_AV|IB_QP_ALT_PATH))) { + /* Set these values also if a path does not get set */ + if (qp_attr_mask & IB_QP_TIMEOUT) + set_psif_qp_path__local_ack_timeout(&qps->path_a, qp_attr->timeout); + } + + if (qp_attr_mask & IB_QP_RETRY_CNT) { + set_psif_qp_core__error_retry_init(&qps->state, qp_attr->retry_cnt); + set_psif_qp_core__error_retry_count(&qps->state, qp_attr->retry_cnt); + } + + if (qp_attr_mask & IB_QP_RNR_RETRY) { + int rnr_value = qp_attr->retry_cnt; + + set_psif_qp_core__rnr_retry_init(&qps->state, rnr_value); + set_psif_qp_core__rnr_retry_count(&qps->state, qp_attr->rnr_retry); + } + + if (qp_attr_mask & IB_QP_RQ_PSN) + set_psif_qp_core__expected_psn(&qps->state, qp_attr->rq_psn); + + if (qp_attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + /* This is the sending side */ + if (unlikely(qp_attr->max_rd_atomic > 16)) { + sif_log(sdev, SIF_QP, + "IB_QP_MAX_QP_RD_ATOMIC value %u out of range - psif supports no more than 16", + qp_attr->max_rd_atomic); + qp_attr->max_rd_atomic = 16; + } + set_psif_qp_core__max_outstanding(&qps->state, qp_attr->max_rd_atomic); + } + + if (qp_attr_mask & IB_QP_ALT_PATH) { + if (qp_attr->alt_port_num < 1 || qp_attr->alt_port_num > 2) { + sif_log(sdev, SIF_INFO, "Illegal alternate port %d specified for qp %d", + qp_attr->alt_port_num, qp->qp_idx); + ret = -EINVAL; + goto err_modify_qp; + } + set_qp_path_sw(qp, qp_attr, qp_attr_mask, true); + } + + if (qp_attr_mask & IB_QP_MIN_RNR_TIMER) + set_psif_qp_core__min_rnr_nak_time(&qps->state, + bug_3646_conv_table[qp_attr->min_rnr_timer & 0x1F]); + + if 
(qp_attr_mask & IB_QP_SQ_PSN) { + /* last_acked_psn must be 1 less (modulo 24 bit) than xmit_psn + * (see issue #1011) + */ + u32 prev = qp_attr->sq_psn == 0 ? 0xFFFFFF : qp_attr->sq_psn - 1; + + set_psif_qp_core__xmit_psn(&qps->state, qp_attr->sq_psn); + set_psif_qp_core__last_acked_psn(&qps->state, prev); + } + + if (qp_attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + /* Currently hard coded to 16 in psif */ + if (unlikely(qp_attr->max_dest_rd_atomic > 16)) { + sif_log(sdev, SIF_INFO, + "IB_QP_MAX_DEST_RD_ATOMIC value %u out of range - psif supports 16 as a hard coded value", + qp_attr->max_dest_rd_atomic); + ret = -EINVAL; + goto err_modify_qp; + } else if (qp_attr->max_dest_rd_atomic < 16) { + sif_log(sdev, SIF_QP, + "IB_QP_MAX_DEST_RD_ATOMIC value %u ignored - psif supports 16 as a hard coded value", + qp_attr->max_dest_rd_atomic); + } + } + + if (qp_attr_mask & IB_QP_PATH_MIG_STATE) + set_psif_qp_core__mstate(&qps->state, + ib2sif_mig_state(qp_attr->path_mig_state)); + + if (qp_attr_mask & IB_QP_CAP) { + sif_log(sdev, SIF_INFO, "resizing QP not implemented"); + sif_log(sdev, SIF_INFO, "IB_QP_CAP needed!"); + ret = -EOPNOTSUPP; + goto err_modify_qp; + } + + if (qp_attr_mask & IB_QP_DEST_QPN) { + set_psif_qp_core__remote_qp(&qps->state, qp_attr->dest_qp_num); + sif_log(sdev, SIF_QP, "Modified remote qp (sw), local qp_idx: %d, remote_qp %d\n", + qp->qp_idx, qp_attr->dest_qp_num); + } + + /* Set the valid bit whenever we transition to INIT */ + if (rq && !rq->is_srq && qp_attr_mask & IB_QP_STATE && qp_attr->qp_state == IB_QPS_INIT) + set_psif_rq_hw__valid(&rq->d, 1); + + sif_log(sdev, SIF_QP, "qp %d done QP state %d -> %d", + qp->qp_idx, qp->last_set_state, + (qp_attr_mask & IB_QP_STATE ? qp_attr->qp_state : qp->last_set_state)); + + if (qp_attr_mask & IB_QP_STATE) + qp->last_set_state = qp_attr->qp_state; + + return ret; +err_modify_qp: + return ret; +} + + +static int sif_query_qp_sw(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); +static int sif_query_qp_hw(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); + +int sif_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + bool use_hw = false; + struct sif_qp *qp = to_sqp(ibqp); + struct sif_dev *sdev = to_sdev(ibqp->device); + + sif_logi(ibqp->device, SIF_QP, "last_set_state %d", qp->last_set_state); + + switch (qp->last_set_state) { + case IB_QPS_RESET: + case IB_QPS_INIT: + break; + default: + /* Bug #3933 - WA for HW bug 3928 + * ibv_query_qp might report wrong state when in state IBV_QPS_ERR + * Query must be done based on current ownership (towards HW only if HW owned) + */ + if (PSIF_REVISION(sdev) <= 3 || qp->flush_sq_done_wa4074) + use_hw = (qp->flags & SIF_QPF_HW_OWNED); + else + use_hw = true; + break; + } + + return use_hw ? 
+ sif_query_qp_hw(ibqp, qp_attr, qp_attr_mask, qp_init_attr) : + sif_query_qp_sw(ibqp, qp_attr, qp_attr_mask, qp_init_attr); +} + +enum ib_qp_state get_qp_state(struct sif_qp *qp) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct ib_qp_init_attr init_attr; + struct ib_qp_attr attr; + + memset(&attr, 0, sizeof(attr)); + memset(&init_attr, 0, sizeof(init_attr)); + + if (sif_query_qp(ibqp, &attr, IB_QP_STATE, &init_attr)) { + sif_logi(ibqp->device, SIF_INFO, + "query_qp failed for qp %d", ibqp->qp_num); + return -1; + } + return attr.qp_state; +} + +static void get_qp_path_sw(struct sif_qp *qp, struct ib_qp_attr *qp_attr, bool alternate) +{ + volatile struct psif_qp_path *path; + struct ib_ah_attr *ah_attr; + enum psif_use_grh use_grh; + volatile struct psif_qp_path *alt_path; + struct ib_ah_attr *alt_ah_attr; + + alt_path = &qp->d.path_b; + alt_ah_attr = &qp_attr->alt_ah_attr; + path = &qp->d.path_a; + ah_attr = &qp_attr->ah_attr; + + ah_attr->sl = get_psif_qp_path__sl(path); + use_grh = get_psif_qp_path__use_grh(path); + + if (use_grh == USE_GRH) { + ah_attr->ah_flags |= IB_AH_GRH; + ah_attr->grh.dgid.global.subnet_prefix = get_psif_qp_path__remote_gid_0(path); + ah_attr->grh.dgid.global.interface_id = get_psif_qp_path__remote_gid_1(path); + ah_attr->grh.flow_label = get_psif_qp_path__flowlabel(path); + ah_attr->grh.hop_limit = get_psif_qp_path__hoplmt(path); + /* TBD: ah_attr->grh.sgid_index? */ + } + + qp_attr->pkey_index = get_psif_qp_path__pkey_indx(path); + qp_attr->timeout = get_psif_qp_path__local_ack_timeout(path); + + ah_attr->port_num = get_psif_qp_path__port(path); + ah_attr->dlid = get_psif_qp_path__remote_lid(path); + ah_attr->src_path_bits = get_psif_qp_path__local_lid_path(path); + + alt_ah_attr->port_num = get_psif_qp_path__port(alt_path); + alt_ah_attr->dlid = get_psif_qp_path__remote_lid(alt_path); + alt_ah_attr->src_path_bits = get_psif_qp_path__local_lid_path(alt_path); +} + + + +static int sif_query_qp_sw(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_qp *qp = to_sqp(ibqp); + volatile struct psif_qp *qps = &qp->d; + struct sif_rq *rq = NULL; + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + int ret = 0; + + if (qp->type != PSIF_QP_TRANSPORT_XRC) + rq = get_sif_rq(sdev, qp->rq_idx); + + /* Mellanox almost completely ignores the mask on both + * input and output and reports all attributes regardlessly.. + * as opposed to what man ibv_query_qp indicates. + * Since this behavour is utilized by a.o. qperf + * we probably have no other meaningful choice than + * to report back everything even with mask 0. + */ + sif_log(sdev, SIF_QP, "qp_attr_mask 0x%x", qp_attr_mask); + + memset(qp_init_attr, 0, sizeof(struct ib_qp_init_attr)); + memset(qp_attr, 0, sizeof(struct ib_qp_attr)); + + qp_attr->qp_state = qp_attr->cur_qp_state = qp->last_set_state; + qp_attr->qp_access_flags |= + get_psif_qp_core__rdma_rd_enable(&qps->state) ? IB_ACCESS_REMOTE_READ : 0; + qp_attr->qp_access_flags |= + get_psif_qp_core__rdma_wr_enable(&qps->state) ? IB_ACCESS_REMOTE_WRITE : 0; + qp_attr->qp_access_flags |= + get_psif_qp_core__atomic_enable(&qps->state) ? 
IB_ACCESS_REMOTE_ATOMIC : 0; + + qp_attr->pkey_index = get_psif_qp_path__pkey_indx(&qps->path_a); + qp_attr->port_num = qp->port; + qp_attr->qkey = get_psif_qp_core__qkey(&qps->state); + get_qp_path_sw(qp, qp_attr, qp_attr_mask & IB_QP_ALT_PATH); + + qp_attr->path_mtu = sif2ib_path_mtu(get_psif_qp_core__path_mtu(&qps->state)); + qp_attr->timeout = get_psif_qp_path__local_ack_timeout(&qps->path_a); + qp_attr->retry_cnt = get_psif_qp_core__error_retry_count(&qps->state); + qp_attr->rnr_retry = get_psif_qp_core__rnr_retry_count(&qps->state); + qp_attr->rq_psn = get_psif_qp_core__expected_psn(&qps->state); + qp_attr->min_rnr_timer = get_psif_qp_core__min_rnr_nak_time(&qps->state); + qp_attr->sq_psn = get_psif_qp_core__xmit_psn(&qps->state); + qp_attr->path_mig_state = sif2ib_mig_state(get_psif_qp_core__mstate(&qps->state)); + qp_attr->dest_qp_num = get_psif_qp_core__remote_qp(&qps->state); + + /* TBD: Revisit this: This value is currently hard coded to 16 in psif */ + qp_attr->max_dest_rd_atomic = 16; + + qp_init_attr->port_num = qp->port; + if (rq) { + if (rq->is_srq) + qp_init_attr->srq = &rq->ibsrq; + qp_init_attr->cap.max_recv_wr = rq->entries_user; + qp_init_attr->cap.max_recv_sge = rq->sg_entries; + } + qp_init_attr->cap.max_send_wr = sq->entries; + qp_init_attr->cap.max_send_sge = sq->sg_entries; + qp_init_attr->cap.max_inline_data = qp->max_inline_data; + + /* TBD: What to do with this: + * IB_QP_MAX_QP_RD_ATOMIC = (1<<13), + */ + return ret; +} + +static void get_qp_path_hw(struct psif_query_qp *qqp, struct ib_qp_attr *qp_attr, bool alternate) +{ + struct psif_qp_path *path; + struct ib_ah_attr *ah_attr; + enum psif_use_grh use_grh; + struct psif_qp_path *alt_path; + struct ib_ah_attr *alt_ah_attr; + + alt_path = &qqp->alternate_path; + alt_ah_attr = &qp_attr->alt_ah_attr; + path = &qqp->primary_path; + ah_attr = &qp_attr->ah_attr; + + ah_attr->sl = path->sl; + use_grh = path->use_grh; + + if (use_grh == USE_GRH) { + ah_attr->ah_flags |= IB_AH_GRH; + ah_attr->grh.dgid.global.subnet_prefix = path->remote_gid_0; + ah_attr->grh.dgid.global.interface_id = path->remote_gid_1; + ah_attr->grh.flow_label = path->flowlabel; + ah_attr->grh.hop_limit = path->hoplmt; + /* TBD: ah_attr->grh.sgid_index? 
*/ + } + qp_attr->pkey_index = path->pkey_indx; + qp_attr->timeout = path->local_ack_timeout; + qp_attr->port_num = path->port + 1; + + qp_attr->alt_pkey_index = alt_path->pkey_indx; + qp_attr->alt_timeout = alt_path->local_ack_timeout; + qp_attr->alt_port_num = alt_path->port + 1; + + + + ah_attr->port_num = path->port + 1; + ah_attr->dlid = path->remote_lid; + ah_attr->src_path_bits = path->local_lid_path; + + alt_ah_attr->port_num = alt_path->port + 1; + alt_ah_attr->dlid = alt_path->remote_lid; + alt_ah_attr->src_path_bits = alt_path->local_lid_path; +} + +u64 sif_qqp_dma_addr(struct sif_dev *sdev, struct sif_qp *qps) +{ + struct sif_table *tp = &sdev->ba[qp]; + u64 offset = qps->qp_idx * tp->ext_sz + offsetof(struct sif_qp, qqp); + + if (tp->mmu_ctx.mt == SIFMT_BYPASS) + return sif_mem_dma(tp->mem, offset); + else if (!epsc_gva_permitted(sdev)) + return sif_mem_dma(tp->mem, offset); + else + return tp->mmu_ctx.base + offset; +} + +/* Internal query qp implementation - updates the local query qp state for this QP */ +int epsc_query_qp(struct sif_qp *sqp, struct psif_query_qp *lqqp) +{ + int ret; + struct psif_epsc_csr_rsp cqe; + struct psif_epsc_csr_req req; + struct psif_csr_modify_qp_ctrl *cmd = &req.u.query_qp.ctrl; + struct sif_dev *sdev = to_sdev(sqp->ibqp.device); + + /* This function can potentially use the same qqp data structure reentrant + * but we dont care as we know that EPSC operations gets sequenced + */ + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_QUERY_QP; + cmd->cmd = QP_CMD_QUERY; + if (sqp->qp_idx <= 3) { + cmd->qp_num = sqp->qp_idx & 1; + cmd->port_num = sqp->qp_idx >> 1; + } else + cmd->qp_num = sqp->qp_idx; + req.u.query_qp.address = sif_qqp_dma_addr(sdev, sqp); + + if (!epsc_gva_permitted(sdev)) + req.u.query_qp.mmu_cntx = sif_mmu_ctx_passthrough(true); + else + req.u.query_qp.mmu_cntx = sdev->ba[qp].mmu_ctx.mctx; + ret = sif_epsc_wr_poll(sdev, &req, &cqe); + + /* Copy data irrespective of how the EPSC operation went */ + if (eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 31)) + copy_conv_to_sw(lqqp, &sqp->qqp, sizeof(*lqqp)); + else + memcpy(lqqp, &sqp->qqp, sizeof(*lqqp)); + + return ret; +} + + +static int sif_query_qp_hw(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + int ret = 0; + struct sif_qp *qp = to_sqp(ibqp); + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_rq *rq = NULL; + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct psif_query_qp lqqp; + + /* Take QP lock to avoid any race condition on updates to last_set_state: */ + mutex_lock(&qp->lock); + + ret = epsc_query_qp(qp, &lqqp); + if (!ret) + qp->last_set_state = sif2ib_qp_state(lqqp.qp.state); + mutex_unlock(&qp->lock); + + if (ret) + return ret; + + if (qp->type != PSIF_QP_TRANSPORT_XRC) + rq = get_sif_rq(sdev, qp->rq_idx); + + /* Mellanox almost completely ignores the mask on both + * input and output and reports all attributes regardlessly.. + * as opposed to what man ibv_query_qp indicates. + * Since this behavour is utilized by a.o. qperf + * we probably have no other meaningful choice than + * to report back everything even with mask 0. 
+ */ + sif_log(sdev, SIF_QP|SIF_DUMP, "qp %d, qp_attr_mask 0x%x", qp->qp_idx, qp_attr_mask); + sif_logs(SIF_DUMP, write_struct_psif_query_qp(NULL, 0, &lqqp)); + + + memset(qp_init_attr, 0, sizeof(struct ib_qp_init_attr)); + memset(qp_attr, 0, sizeof(struct ib_qp_attr)); + + qp_attr->qp_state = qp_attr->cur_qp_state = qp->last_set_state; + qp_attr->qp_access_flags |= lqqp.qp.rdma_rd_enable ? IB_ACCESS_REMOTE_READ : 0; + qp_attr->qp_access_flags |= lqqp.qp.rdma_wr_enable ? IB_ACCESS_REMOTE_WRITE : 0; + qp_attr->qp_access_flags |= lqqp.qp.atomic_enable ? IB_ACCESS_REMOTE_ATOMIC : 0; + + qp_attr->pkey_index = lqqp.primary_path.pkey_indx; + qp_attr->port_num = lqqp.primary_path.port + 1; + qp_attr->qkey = lqqp.qp.qkey; + get_qp_path_hw(&lqqp, qp_attr, qp_attr_mask & IB_QP_ALT_PATH); + + qp_attr->path_mtu = sif2ib_path_mtu(lqqp.qp.path_mtu); + qp_attr->timeout = lqqp.primary_path.local_ack_timeout; + qp_attr->retry_cnt = lqqp.qp.error_retry_count; + qp_attr->rnr_retry = lqqp.qp.rnr_retry_count; + qp_attr->rq_psn = lqqp.qp.expected_psn; + qp_attr->min_rnr_timer = lqqp.qp.min_rnr_nak_time; + qp_attr->sq_psn = lqqp.qp.xmit_psn; + qp_attr->path_mig_state = sif2ib_mig_state(lqqp.qp.mstate); + qp_attr->dest_qp_num = lqqp.qp.remote_qp; + + /* TBD: Revisit this: This value is currently hard coded to 16 in psif */ + qp_attr->max_dest_rd_atomic = 16; + + qp_init_attr->port_num = qp->port; /* TBD: Use primary path info here as well? */ + + if (rq) { + if (rq->is_srq) + qp_init_attr->srq = &rq->ibsrq; + qp_init_attr->cap.max_recv_wr = rq->entries_user; + qp_init_attr->cap.max_recv_sge = rq->sg_entries; + } + qp_init_attr->cap.max_send_wr = sq->entries; + qp_init_attr->cap.max_send_sge = sq->sg_entries; + qp_init_attr->cap.max_inline_data = qp->max_inline_data; + + /* TBD: What to do with these.. + * IB_QP_MAX_QP_RD_ATOMIC = (1<<13), + */ + return ret; +} + + +int sif_destroy_qp(struct ib_qp *ibqp) +{ + struct sif_qp *qp = to_sqp(ibqp); + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + bool need_pma_pxy_qp = eps_version_ge(es, 0, 57) + && (qp->qp_idx == 1 || qp->qp_idx == 3); + + sif_log(sdev, SIF_QP, "qp_num %d", ibqp->qp_num); + + /* Destroy PMA_PXY QPs associated with QP1/3 */ + if (need_pma_pxy_qp) { + struct sif_qp *pma_qp = NULL; + int pma_qp_idx; + int ret; + + pma_qp_idx = sdev->pma_qp_idxs[!!(qp->qp_idx & 2)]; + pma_qp = get_sif_qp(sdev, pma_qp_idx); + + /* clearing epsc PMA_PXY QP redirection */ + ret = notify_epsc_pma_qp(sdev, -1, qp->port); + if (ret) + sif_log(sdev, SIF_QP, + "Failed to clear epsc PMA_PXY redirect for qp_num %d", pma_qp_idx); + destroy_qp(sdev, pma_qp); + } + + return destroy_qp(sdev, qp); +} + + +int destroy_qp(struct sif_dev *sdev, struct sif_qp *qp) +{ + int ret; + int index = qp->qp_idx; + struct sif_pd *pd = qp->ibqp.pd ?
to_spd(qp->ibqp.pd) : to_sxrcd(qp->ibqp.xrcd)->pd; + struct ib_qp_attr mod_attr = { + .qp_state = IB_QPS_RESET + }; + struct sif_rq *rq = NULL; + bool reuse_ok = true; + + /* See bug #3496 */ + if (sif_feature(no_multipacket_qp_reuse)) { + switch (qp->type) { + case PSIF_QP_TRANSPORT_UD: + case PSIF_QP_TRANSPORT_MANSP1: + reuse_ok = true; + break; + default: + reuse_ok = false; + break; + } + } + + sif_log(sdev, SIF_QP, "## Enter qp_idx %d", index); + + if (is_regular_qp(qp)) + rq = get_sif_rq(sdev, qp->rq_idx); + + /* make sure event handling is performed before reset the qp.*/ + if (atomic_dec_and_test(&qp->refcnt)) + complete(&qp->can_destroy); + wait_for_completion(&qp->can_destroy); + + /* Modify to reset causes an implicit reset_qp() if state is RESET */ + ret = modify_qp(sdev, qp, &mod_attr, IB_QP_STATE, false, NULL); + if (ret) + sif_log(sdev, SIF_INFO, "modify qp %d to RESET failed, sts %d", index, ret); + + if (!(qp->flags & SIF_QPF_USER_MODE)) { + int nfixup; + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + u32 cq_idx = get_psif_qp_core__rcv_cq_indx(&qp->d.state); + struct sif_cq *send_cq = (sq && sq->cq_idx >= 0) ? get_sif_cq(sdev, sq->cq_idx) : NULL; + struct sif_cq *recv_cq = rq ? get_sif_cq(sdev, cq_idx) : NULL; + + if (send_cq) { + nfixup = sif_fixup_cqes(send_cq, sq, qp); + if (nfixup < 0) { + sif_log(sdev, SIF_INFO, + "sif_fixup_cqes: on qp %d send cq %d failed with error %d", + qp->qp_idx, sq->cq_idx, nfixup); + goto fixup_failed; + } + sif_log(sdev, SIF_QP, "sif_fixup_cqes: fixed %d CQEs in sq.cq %d", + nfixup, sq->cq_idx); + } + if (recv_cq && recv_cq != send_cq) { + nfixup = sif_fixup_cqes(recv_cq, sq, qp); + if (nfixup < 0) { + sif_log(sdev, SIF_INFO, + "sif_fixup_cqes: on qp %d recv cq %d failed with error %d", + qp->qp_idx, cq_idx, nfixup); + goto fixup_failed; + } + sif_log(sdev, SIF_QP, "sif_fixup_cqes: fixed %d CQEs in rq.cq %d", + nfixup, cq_idx); + + } + } + +fixup_failed: + if (qp->qp_idx < 4) { + /* Special QP cleanup */ + int ok = atomic_add_unless(&sdev->sqp_usecnt[qp->qp_idx], -1, 0); + + if (!ok) { + sif_log(sdev, SIF_INFO, + "Attempt to destroy an uncreated QP %d", qp->qp_idx); + return -EINVAL; + } + } + + sif_dfs_remove_qp(qp); + + sif_free_sq(sdev, qp); + + if (rq) { + ret = free_rq(sdev, qp->rq_idx); + if (ret && (ret != -EBUSY || !rq->is_srq)) + return ret; + } + + if (index > 3 && reuse_ok) + sif_free_qp_idx(pd, index); + + sif_log(sdev, SIF_QP, "## Exit success (qp_idx %d)", index); + return 0; +} + +/* Set this QP back to the initial state + * (called by modify_qp after a successful modify to reset + */ +static int reset_qp(struct sif_dev *sdev, struct sif_qp *qp) +{ + volatile struct psif_qp *qps = &qp->d; + struct sif_rq *rq = NULL; + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + bool need_wa_3713 = 0; + + /* Bring down order needed by rev2 according to bug #3480 */ + int ret = poll_wait_for_qp_writeback(sdev, qp); + + if (ret) + goto failed; + + if (is_regular_qp(qp)) + rq = get_sif_rq(sdev, qp->rq_idx); + + /* WA 3713 special handling */ + need_wa_3713 = (PSIF_REVISION(sdev) <= 3) + && IS_PSIF(sdev) /* Next check if there is a retry outstanding */ + && !qp->flush_sq_done_wa4074 + && (get_psif_qp_core__retry_tag_committed(&qp->d.state) != + get_psif_qp_core__retry_tag_err(&qp->d.state)) + && (qp->qp_idx != sdev->flush_qp); + + if (need_wa_3713) { + ret = reset_qp_flush_retry(sdev); + if (ret < 0) + sif_log(sdev, SIF_INFO, "Flush_retry special handling failed with ret %d", ret); + + } + + + /* if the send queue scheduler is running, 
wait for + * it to terminate: + */ + ret = sif_flush_sqs(sdev, sq); + if (ret) + goto failed; + + sif_logs(SIF_DUMP, + write_struct_psif_qp(NULL, 1, (struct psif_qp *)&qp->d)); + +failed: + if (ret) { + /* TBD: Debug case - should never fail? */ + if (qp->type != PSIF_QP_TRANSPORT_MANSP1) + return ret; + } + + /* Reset the SQ pointers */ + if (!qp->ibqp.xrcd) { + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx); + + memset(sq_sw, 0, sizeof(*sq_sw)); + set_psif_sq_sw__tail_indx(&sq_sw->d, 0); + set_psif_sq_hw__last_seq(&sq->d, 0); + set_psif_sq_hw__destroyed(&sq->d, 0); + } + + /* Invalidate the RQ and set it in a consistent state for reuse */ + if (rq && !rq->is_srq) { + struct sif_rq_sw *rq_sw = get_sif_rq_sw(sdev, rq->index); + + if (!(test_bit(RQ_IS_INVALIDATED, &rq_sw->flags))) { + ret = sif_invalidate_rq_hw(sdev, rq->index, PCM_POST); + if (ret) { + sif_log(sdev, SIF_INFO, + "Invalidate rq_hw failed, status %d", ret); + return ret; + } + set_bit(RQ_IS_INVALIDATED, &rq_sw->flags); + } + + /* Make sure the RQ is sofware owned: */ + ret = poll_wait_for_rq_writeback(sdev, rq); + if (ret) + return ret; + + /* Reset pointers */ + memset(rq_sw, 0, sizeof(*rq_sw)); + set_psif_rq_hw__head_indx(&rq->d, 0); + } + + mb(); + + if (multipacket_qp(qp->type) && IS_PSIF(sdev) && PSIF_REVISION(sdev) > 2) { + int i; + int loop_count = 1; + + /* bz #3794: WA for HW bug 3198, VAL issuing read to uninitialized DMA VT entry */ + if (qp->type == PSIF_QP_TRANSPORT_UC && PSIF_REVISION(sdev) <= 3) + loop_count = 64; + + /* Invalidate the SGL cache (mapped to the qp type) + * TBD: We can consider a posted inv.req and check lazy upon reuse + */ + + for (i = 0; i < loop_count; ++i) { + ret = sif_invalidate_qp(sdev, qp->qp_idx, PCM_WAIT); + if (ret) { + sif_log(sdev, SIF_INFO, + "Invalidate SGL cache failed"); + return ret; + } + cpu_relax(); + } + } + + /* Reset counters to same values used at QP create + * Last acked psn must be initialized to one less than xmit_psn + * and it is a 24 bit value. 
See issue #1011 + */ + set_psif_qp_core__xmit_psn(&qps->state, 0); + set_psif_qp_core__last_acked_psn(&qps->state, 0xffffff); + qp->flush_sq_done_wa4074 = false; + + return ret; +} + + + +void sif_dfs_print_qp(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + struct sif_qp *qp; + volatile struct psif_qp *qps; + struct psif_qp lqps; + + if (unlikely(pos < 0)) { + seq_puts(s, "Index\tState\tRecvCQ\tSendCQ\tRQ\tRemQP\tType\n"); + return; + } + + qp = get_sif_qp(sdev, pos); + qps = &qp->d; + copy_conv_to_sw(&lqps, qps, sizeof(struct psif_qp)); + + if (pos <= 3 && atomic_read(&sdev->sqp_usecnt[pos]) != 1) + return; + + seq_printf(s, "%llu\t%d\t", pos, qp->last_set_state); + + if (qp->rq_idx == -1) + seq_puts(s, "[none]"); + else + seq_printf(s, "%u", lqps.state.rcv_cq_indx); + + seq_printf(s, "\t%u\t", lqps.state.send_cq_indx); + + if (qp->rq_idx == -1) + seq_puts(s, "[none]"); + else + seq_printf(s, "%u", lqps.state.rq_indx); + + seq_printf(s, "\t%u", lqps.state.remote_qp); + seq_printf(s, "\t%s", string_enum_psif_qp_trans(lqps.state.transport_type)+18); + if (lqps.state.proxy_qp_enable) + seq_puts(s, "\t[proxy]\n"); + else if (is_epsa_tunneling_qp(qp->ibqp.qp_type)) + seq_puts(s, "\t[EPSA tunneling]\n"); + else if (qp->ulp_type == RDS_ULP) + seq_puts(s, "\t[RDS]\n"); + else if (qp->ulp_type == IPOIB_CM_ULP) + seq_puts(s, "\t[IPOIB_CM]\n"); + else if (qp->flags & SIF_QPF_EOIB) + seq_puts(s, "\t[EoIB]\n"); + else if (qp->flags & SIF_QPF_IPOIB) + seq_puts(s, "\t[IPoIB]\n"); + else if (qp->flags & SIF_QPF_NO_EVICT) + seq_puts(s, "\t[no_evict]\n"); + else if (qp->flags & SIF_QPF_FLUSH_RETRY) + seq_puts(s, "\t[flush_retry]\n"); + else if (qp->flags & SIF_QPF_KI_STENCIL) + seq_puts(s, "\t[ki_stencil]\n"); + else if (qp->flags & SIF_QPF_PMA_PXY) + if (qp->port == 1) + seq_puts(s, "\t[PMA_PXY_QP_P1]\n"); + else + seq_puts(s, "\t[PMA_PXY_QP_P2]\n"); + else if (qp->flags & SIF_QPF_SMI) + if (qp->port == 1) + seq_puts(s, "\t[SMI_QP_P1]\n"); + else + seq_puts(s, "\t[SMI_QP_P2]\n"); + else if (qp->flags & SIF_QPF_GSI) + if (qp->port == 1) + seq_puts(s, "\t[GSI_QP_P1]\n"); + else + seq_puts(s, "\t[GSI_QP_P2]\n"); + else + seq_puts(s, "\n"); +} + +void sif_dfs_print_ipoffload(struct seq_file *s, struct sif_dev *sdev, loff_t pos) +{ + struct sif_qp *qp; + + if (unlikely(pos < 0)) { + seq_printf(s, "#%7s %10s %21s %21s %21s\n", + "", "TX csum", "---- RX l3_csum ----", "---- RX l4_csum ----", + "-------- LSO --------"); + seq_printf(s, "#%7s %10s %10s %10s %10s %10s %10s %10s\n", + "Index", "", "ok", "err", "ok", "err", "pkt", "bytes"); + return; + } + + qp = get_sif_qp(sdev, pos); + + if (qp->flags & SIF_QPF_IPOIB || qp->flags & SIF_QPF_EOIB) { + if (pos <= 3 && atomic_read(&sdev->sqp_usecnt[pos]) != 1) + return; + + seq_printf(s, "%8llu ", pos); + seq_printf(s, "%10llu ", + qp->ipoib_tx_csum_l3); + seq_printf(s, "%10llu %10llu ", + qp->ipoib_rx_csum_l3_ok, qp->ipoib_rx_csum_l3_err); + seq_printf(s, "%10llu %10llu ", + qp->ipoib_rx_csum_l4_ok, qp->ipoib_rx_csum_l4_err); + seq_printf(s, "%10llu %10llu\n", + qp->ipoib_tx_lso_pkt, qp->ipoib_tx_lso_bytes); + } +} + +bool has_srq(struct sif_dev *sdev, struct sif_qp *qp) +{ + struct sif_rq *rq = has_rq(qp) ? get_sif_rq(sdev, qp->rq_idx) : NULL; + + return rq && rq->is_srq; +} diff --git a/drivers/infiniband/hw/sif/sif_qp.h b/drivers/infiniband/hw/sif/sif_qp.h new file mode 100644 index 000000000000..0ab36abd3804 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_qp.h @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. 
All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_qp.h: Interface to internal IB queue pair logic for sif + */ + +#ifndef __SIF_QP_H +#define __SIF_QP_H +#include "psif_hw_data.h" +#include "sif_rq.h" +#include "sif_sq.h" +#include "sif_ibqp.h" + +struct sif_dev; +struct seq_file; +struct sif_sq; +struct sif_rq; + +#define CB_LENGTH 256 +#define CB_KICK_ALIGN 64 +#define CB_KICK_MASK (CB_KICK_ALIGN - 1) + +enum sif_qp_flags { + SIF_QPF_EOIB = 0x1, + SIF_QPF_IPOIB = 0x2, + SIF_QPF_FORCE_SQ_MODE = 0x1000, /* Set by vendor specific flag to enforce use of SQ mode */ + SIF_QPF_NO_EVICT = 0x2000, /* Special fake qp with do_not_evict set (see #3552) */ + SIF_QPF_KI_STENCIL = 0x4000, /* Special stencil qp set up for efficient key invalidates */ + SIF_QPF_DYNAMIC_MTU = 0x8000, /* Set by vendor specific flag to enforce use of dynamic MTU */ + SIF_QPF_FLUSH_RETRY = 0x10000, /* Special fake rc qp to flush retry (see #3714) */ + SIF_QPF_USER_MODE = 0x20000, /* User (udata != NULL) and not kernel verbs */ + SIF_QPF_PMA_PXY = 0x100000, /* Performance management interface QP type */ + SIF_QPF_SMI = 0x200000, /* Subnet management interface QP type */ + SIF_QPF_GSI = 0x400000, /* General services interface QP type */ + SIF_QPF_HW_OWNED = 0x1000000,/* Indicates HW ownership */ +}; + +struct dentry; + +/* + * TBD - not suitable for kernel.org: + * As for now, the stack unwind is done at sif_create_qp() within sif driver. + * Picking UEK version 4.1.12 as a starting point to have this, + * as UEK kernel has ib_create_qp->ib_create_qp_ex. + * Thus, set it to 4 based on what is implemented in Oracle Kernel + * to retrieve the ULP. +*/ +#define STACK_UNWIND_LEVEL 4 +/* + * sif_create_qp = __builtin_return_address(0) + * ib_create_qp = __builtin_return_address(1) + * ib_create_qp_ex = __builtin_return_address(2) + * if (rdma_cm) + * rdma_create_qp = __builtin_return_address(3) + * ULP = __builtin_return_address(4) +*/ + +/* The enum to determine what is the ULP caller + */ +enum kernel_ulp_type { + OTHER_ULP = 0, + RDS_ULP = 1, + IPOIB_CM_ULP = 2, + IPOIB_ULP = 3, +}; + +struct sif_qp_init_attr { + struct sif_pd *pd; + enum psif_qp_trans qp_type; + enum sif_proxy_type proxy; + enum psif_tsu_qos qosl; + enum kernel_ulp_type ulp_type; /* the ulp caller hint */ + bool user_mode; + int sq_hdl_sz; +}; + +struct sif_qp { + volatile struct psif_qp d; /* Hardware QPSC entry */ + struct ib_qp ibqp ____cacheline_internodealigned_in_smp; + + /* Data area for query_qp results: */ + struct psif_query_qp qqp ____cacheline_internodealigned_in_smp; + + /* Pack the members used in critical path in as few cache lines as possible */ + union { + u16 submask[2]; + u32 mask; + } traffic_patterns; /* heuristic mask to determine the traffic pattern */ + enum kernel_ulp_type ulp_type; /* the ulp caller hint */ + atomic_t refcnt; /* qp refcnt to sync between destroy qp and event handling. 
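+				 * destroy_qp() drops its reference and waits on can_destroy before tearing the QP down.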
*/ + struct completion can_destroy; /* use to synchronize destroy qp with event handling */ + struct mutex lock ____cacheline_internodealigned_in_smp; + int qp_idx; /* qp and sq index */ + int rq_idx; + u32 max_inline_data; /* Requested max inline for this QP */ + + /* Next 6 members are copy from the qp state */ + u32 remote_qp; + u32 magic; + bool nocsum; + enum psif_tsu_qos qosl; + u8 tsl; + u16 remote_lid; + + u16 eps_tag; /* Value to use for the eps_tag field (proxy_qp) */ + short port; /* IB port number (= sif port# + 1) */ + u32 flags; + enum ib_qp_state last_set_state; + enum psif_qp_trans type; /* PSIF transport type set up for this QP */ + + /* The following members are not used in critical path */ + u16 pkey_index; /* Default PKEY index as set by IB_QP_PKEY */ + enum ib_mtu mtu; /* Currently set mtu */ + enum ib_qp_state tracked_state; /* TBD: This is stupid: Make SQD fail as MLX for SQD */ + struct dentry *dfs_qp; /* Raw qp dump debugfs handle - used by sif_debug.c */ + bool sq_cmpl_map_valid; + + int srq_idx; /* WA #3952: Track SRQ for modify_srq(used only for pQP) */ + atomic64_t arm_srq_holdoff_time;/* Wait-time,if the pQP is held for a prev modify_srq */ + + bool flush_sq_done_wa4074; /* WA #4074: Track if QP state changes are already applied */ + + u64 ipoib_tx_csum_l3; + u64 ipoib_tx_csum_l4; + u64 ipoib_rx_csum_l3_ok; + u64 ipoib_rx_csum_l3_err; + u64 ipoib_rx_csum_l4_ok; + u64 ipoib_rx_csum_l4_err; + u64 ipoib_tx_lso_pkt; + u64 ipoib_tx_lso_bytes; +}; + + +/* Definition of PSIF EPSA tunneling QP using IB_QPT_RESERVED1 */ +#define IB_QPT_EPSA_TUNNELING IB_QPT_RESERVED1 + +/* Command used to invalidate a collect buffer by writing to offset 0xff8 */ +#define PSIF_WR_CANCEL_CMD_BE 0xff00000000000000ULL + +/* HEURISTIC BITS used for TX/RX direction. 
*/ +#define HEUR_RX_DIRECTION (~1ULL) +#define HEUR_TX_DIRECTION (1ULL) + +static inline bool supports_offload(struct sif_qp *qp) +{ + return qp->flags & (SIF_QPF_EOIB | SIF_QPF_IPOIB); +} + +static inline int psif_supported_trans(enum psif_qp_trans type) +{ + return type != PSIF_QP_TRANSPORT_RSVD1; +} + +static inline bool is_regular_qp(struct sif_qp *qp) +{ + return (qp->type != PSIF_QP_TRANSPORT_MANSP1 && + qp->type != PSIF_QP_TRANSPORT_XRC); +} + +static inline bool is_epsa_tunneling_qp(enum ib_qp_type type) +{ + return type == IB_QPT_EPSA_TUNNELING; +} + +static inline struct sif_qp *to_sqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct sif_qp, ibqp); +} + +struct sif_qp *create_qp(struct sif_dev *sdev, + struct ib_qp_init_attr *init_attr, + struct sif_qp_init_attr *sif_attr); + +int destroy_qp(struct sif_dev *sdev, struct sif_qp *qp); + + +int modify_qp(struct sif_dev *sdev, struct sif_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask, + bool fail_on_same_state, struct ib_udata *udata); + +enum ib_qp_state get_qp_state(struct sif_qp *qp); + +/* Line printers for debugfs files */ +void sif_dfs_print_qp(struct seq_file *s, struct sif_dev *sdev, loff_t pos); +void sif_dfs_print_ipoffload(struct seq_file *s, struct sif_dev *sdev, loff_t pos); + +/* SIF specific type of handling of a modify QP operation: + * + */ +enum sif_mqp_type { + SIF_MQP_ERR, /* Illegal transition */ + SIF_MQP_SW, /* Software handled transition */ + SIF_MQP_HW, /* Hardware handled transition */ + SIF_MQP_IGN, /* Silently ignored transition req */ + SIF_MQP_MAX +}; + +u64 sif_qqp_dma_addr(struct sif_dev *sdev, struct sif_qp *qps); + +/* Internal query qp implementation - stores a host order query qp state in lqqp */ +int epsc_query_qp(struct sif_qp *qp, struct psif_query_qp *lqqp); + +/* EPSC configuration to forward PMA responses to the remapped qp_idx */ +int notify_epsc_pma_qp(struct sif_dev *sdev, int qp_idx, short port); + +enum sif_mqp_type sif_modify_qp_is_ok( + struct sif_qp *qp, + enum ib_qp_state cur_state, + enum ib_qp_state next_state, + enum ib_qp_attr_mask mask +); + +static inline enum psif_mbox_type proxy_to_mbox(enum sif_proxy_type proxy) +{ + switch (proxy) { + case SIFPX_EPSA_1: + return MBOX_EPSA0; + case SIFPX_EPSA_2: + return MBOX_EPSA1; + case SIFPX_EPSA_3: + return MBOX_EPSA2; + case SIFPX_EPSA_4: + return MBOX_EPSA3; + default: + break; + } + return (enum psif_mbox_type) -1; +} + +int modify_qp_hw_wa_qp_retry(struct sif_dev *sdev, struct sif_qp *qp, + struct ib_qp_attr *qp_attr, int qp_attr_mask); + +static inline bool has_rq(struct sif_qp *qp) +{ + return qp->rq_idx >= 0; +} + +bool has_srq(struct sif_dev *sdev, struct sif_qp *qp); + +static inline bool ib_legal_path_mtu(enum ib_mtu mtu) +{ + return (mtu >= IB_MTU_256) && (mtu <= IB_MTU_4096); +} + + +#endif diff --git a/drivers/infiniband/hw/sif/sif_query.c b/drivers/infiniband/hw/sif/sif_query.c new file mode 100644 index 000000000000..dcb03e39b0d5 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_query.c @@ -0,0 +1,384 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_query.c: SIF implementation of some of IB query APIs + */ +#include +#include +#include +#include "sif_dev.h" +#include "sif_query.h" +#include "sif_defs.h" +#include "sif_qp.h" + +int epsc_query_device(struct sif_dev *sdev, struct psif_epsc_device_attr *ldev) +{ + int ret; + struct psif_epsc_csr_rsp cqe; + struct psif_epsc_csr_req req; + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + + memset(&req, 0, sizeof(req)); + /* MMU context nil - passthrough */ + req.opcode = EPSC_QUERY_DEVICE; + req.u.query_hw.address = + (u64)es->data_dma_hdl + offsetof(struct sif_epsc_data, dev); + req.u.query_hw.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx; + ret = sif_epsc_wr(sdev, &req, &cqe); + + /* Copy data irrespective of how the EPSC operation went */ + if (eps_version_ge(es, 0, 31)) + copy_conv_to_sw(ldev, &es->data->dev, sizeof(*ldev)); + else + memcpy(ldev, &es->data->dev, sizeof(*ldev)); + + return ret; +} + +int sif_query_device(struct ib_device *ibdev, struct ib_device_attr *props) +{ + int ret; + struct sif_dev *sdev = to_sdev(ibdev); + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + struct psif_epsc_device_attr ldev; + + ret = epsc_query_device(sdev, &ldev); + if (ret) + return ret; + + memset(props, 0, sizeof(*props)); + /* TBD: x.y.z - 16 bit per sublevel - we use x.y.0 for now */ + props->fw_ver = (u64)es->ver.fw_major << 32 | (u64)es->ver.fw_minor << 16; + props->sys_image_guid = cpu_to_be64(ldev.sys_image_guid); + props->max_mr_size = ~0ull; + props->page_size_cap = 0xfffffe00; /* TBD: Sensible value? Use what Mellanox uses */ + props->vendor_id = ldev.vendor_id; + props->vendor_part_id = ldev.vendor_part_id; + props->hw_ver = ldev.hw_ver; + props->max_qp = sdev->ba[qp].entry_cnt; /* TBD: min(ldev.max_qp, sdev->ba[qp].entry_cnt) */ + props->max_qp_wr = min_t(u32, SIF_SW_MAX_SQE, ldev.max_srq_wr); /* Max on _any_ work queue */ + props->device_cap_flags = + IB_DEVICE_BAD_PKEY_CNTR | + IB_DEVICE_BAD_QKEY_CNTR | + IB_DEVICE_AUTO_PATH_MIG | + IB_DEVICE_CURR_QP_STATE_MOD | + IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_UD_IP_CSUM | + IB_DEVICE_UD_TSO | + IB_DEVICE_XRC | + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + + /* returns max_sge SIF_HW_MAX_SEND_SGE -1 for IPoIB connected mode. + */ + props->max_sge = (sif_find_kernel_ulp_caller() == IPOIB_CM_ULP) ? 
+ SIF_HW_MAX_SEND_SGE - 1 : SIF_HW_MAX_SEND_SGE; + props->max_sge_rd = ldev.max_sge_rd; + props->max_cq = sdev->ba[cq_sw].entry_cnt; + props->max_cqe = SIF_SW_MAX_CQE; + /* Make sure we never fill the CQ completely on rev 1-3 - Bug #3657 */ + if (PSIF_REVISION(sdev) <= 3) + props->max_cqe = SIF_SW_MAX_CQE - 1; + props->max_mr = sdev->ba[key].entry_cnt; + props->max_pd = SIF_MAX_PD_INDEX - 1; /* 0 not used, limited by hw field size */ + props->max_qp_rd_atom = ldev.max_qp_rd_atom; + props->max_ee_rd_atom = ldev.max_ee_rd_atom; + props->max_res_rd_atom = props->max_qp_rd_atom * sdev->ba[qp].entry_cnt; + props->max_qp_init_rd_atom = ldev.max_qp_init_rd_atom; + props->max_ee_init_rd_atom = ldev.max_ee_init_rd_atom; + props->atomic_cap = ldev.atomic_cap; + props->max_ee = ldev.max_ee; + props->max_rdd = ldev.max_rdd; + props->max_mw = ldev.max_mw; + props->max_raw_ipv6_qp = min_t(u32, ldev.max_raw_ipv6_qp, props->max_qp); + props->max_raw_ethy_qp = min_t(u32, ldev.max_raw_ethy_qp, props->max_qp); + props->max_mcast_grp = ldev.max_mcast_grp; + props->max_mcast_qp_attach = ldev.max_mcast_qp_attach; + props->max_total_mcast_qp_attach = ldev.max_total_mcast_qp_attach; + props->max_ah = sdev->ba[ah].entry_cnt; + props->max_fmr = props->max_mr; + props->max_map_per_fmr = 0x7ffff000; /* Should be props->max_mr_size but that breaks ibv_devinfo */ + props->max_srq = sdev->ba[rq_hw].entry_cnt; + props->max_srq_wr = ldev.max_srq_wr; + props->max_srq_sge = ldev.max_srq_sge; + props->max_pkeys = ldev.max_pkeys; + props->local_ca_ack_delay = ldev.local_ca_ack_delay; + return ret; +} + + + +static int epsc_query_port(struct sif_dev *sdev, u8 port, struct psif_epsc_port_attr *lpa) +{ + int ret; + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + struct psif_epsc_csr_rsp cqe; + struct psif_epsc_csr_req req; + const u8 psif_port = port - 1; /* sif port index starts at 0 */ + struct psif_epsc_port_attr *ps; + + if (port > 2) { + sif_log(sdev, SIF_INFO, "error: request for port %d while PSIF has only 2 ports", + port); + return -EINVAL; + } + + ps = &es->data->port[psif_port]; + + memset(&req, 0, sizeof(req)); + req.opcode = psif_port == PORT_1 ? 
EPSC_QUERY_PORT_1 : EPSC_QUERY_PORT_2; + req.u.query_hw.address = + (u64)es->data_dma_hdl + offsetof(struct sif_epsc_data, port[psif_port]); + req.u.query_hw.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx; + + ret = sif_epsc_wr(sdev, &req, &cqe); + + /* Copy data irrespective of how the EPSC operation went */ + if (eps_version_ge(es, 0, 31)) + copy_conv_to_sw(lpa, ps, sizeof(*lpa)); + else + memcpy(lpa, ps, sizeof(*lpa)); + + if (!ret) + sif_log(sdev, SIF_VERBS, "port %d lid %d sm_lid %d seq 0x%llx", + port, lpa->lid, lpa->sm_lid, cqe.seq_num); + else + sif_log(sdev, SIF_INFO, "error: port %d seq 0x%llx failed with status %s (ret = %d)", + port, cqe.seq_num, string_enum_psif_epsc_csr_status(cqe.status), + ret); + return ret; +} + +int sif_calc_ipd(struct sif_dev *sdev, u8 port, enum ib_rate static_rate, u8 *ipd) +{ + int path = ib_rate_to_mult(static_rate); + int link, ret; + struct ib_port_attr lpa; + + if (static_rate == IB_RATE_PORT_CURRENT) { + *ipd = 0; + return 0; + } + + if (unlikely(path < 0)) { + sif_log(sdev, SIF_INFO, " Invalid static rate = %x\n", + path); + return -EINVAL; + } + + ret = sif_query_port(&sdev->ib_dev, port, &lpa); + if (unlikely(ret != 0)) { + sif_log(sdev, SIF_INFO, "Failed to query port %u\n", port); + return ret; + } + /* 2^active_width * active_speed */ + link = (1 << lpa.active_width)*lpa.active_speed; + + if (path >= link) + *ipd = 0; + else + *ipd = (link/path)-1; + return 0; +} + + +int sif_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) +{ + int ret; + struct sif_dev *sdev = to_sdev(ibdev); + struct psif_epsc_port_attr lpa; + + ret = epsc_query_port(sdev, port, &lpa); + memset(props, 0, sizeof(*props)); + props->state = lpa.state; + props->max_mtu = IB_MTU_4096; + props->active_mtu = lpa.active_mtu; + props->gid_tbl_len = lpa.gid_tbl_len; + props->port_cap_flags = lpa.port_cap_flags; + props->max_msg_sz = lpa.max_msg_sz; + props->bad_pkey_cntr = lpa.bad_pkey_cntr; + props->qkey_viol_cntr = lpa.qkey_viol_cntr; + props->pkey_tbl_len = lpa.pkey_tbl_len; + props->lid = lpa.lid; + props->sm_lid = lpa.sm_lid; + props->lmc = lpa.lmc; + props->max_vl_num = lpa.max_vl_num; + props->sm_sl = lpa.sm_sl; + props->subnet_timeout = lpa.subnet_timeout; + props->init_type_reply = lpa.init_type_reply; + props->active_width = lpa.active_width; + props->active_speed = lpa.active_speed; + props->phys_state = lpa.phys_state; + + /* Cache values */ + sdev->port[port - 1] = *props; + return ret; +} + +int sif_query_gid(struct ib_device *ibdev, u8 port_num, int index, union ib_gid *gid) +{ + int ret = 0; + ulong log_class = SIF_VERBS; + struct sif_dev *sdev = to_sdev(ibdev); + struct psif_epsc_csr_rsp cqe; + struct psif_epsc_csr_req req; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_QUERY_GID; + req.u.query_table.port = port_num; + req.u.query_table.index = index; + ret = sif_epsc_wr(sdev, &req, &cqe); + if (ret) + return ret; + + /* Apparently clients expect to get GIDs in network byte order + * which requires an extra swap here: + */ + gid->global.subnet_prefix = be64_to_cpu(cqe.data); + gid->global.interface_id = be64_to_cpu(cqe.info); + + if (ret) + log_class = SIF_INFO; + sif_logi(ibdev, log_class, + " port_num %d, GID Table index %d - > %llx.%llx", + port_num, index, gid->global.subnet_prefix, gid->global.interface_id); + return ret; +} + + +int sif_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + int ret = 0; + struct sif_dev *sdev = to_sdev(ibdev); + struct psif_epsc_csr_rsp cqe; + struct psif_epsc_csr_req req; + 
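+	/* Query the EPSC for the pkey at (port, index); the key is returned in the low 16 bits of cqe.data */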
+ memset(&req, 0, sizeof(req)); + req.opcode = EPSC_QUERY_PKEY; + req.u.query_table.port = port; + req.u.query_table.index = index; + ret = sif_epsc_wr(sdev, &req, &cqe); + if (ret) { + sif_log(sdev, SIF_INFO, "port %u index %u: Failed with status %d", port, index, ret); + return ret; + } + *pkey = (u16)cqe.data; + sif_logi(ibdev, SIF_VERBS_V, "port %u index %u -> key 0x%x", + port, index, *pkey); + return ret; +} + + +/* Called from sif_modify_device when IB_DEVICE_MODIFY_EXTENDED is set + * PSIF specific extension bits defined in sif_verbs.h + */ +static int sif_modify_device_extended(struct sif_dev *sdev, struct ib_device_modify *device_modify, + struct psif_epsc_csr_req *req) +{ + struct sif_device_modify *dm = + container_of(device_modify, struct sif_device_modify, ib); + + /* TBD: Simplifying firmware support? */ + sif_log(sdev, SIF_INFO, "uf %d eoib_ctrl %x eoib_data %x (not implemented)", + dm->uf, dm->eoib_ctrl, dm->eoib_data); + return -EOPNOTSUPP; +} + + +int sif_modify_device(struct ib_device *ibdev, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + int ret = 0; + struct sif_dev *sdev = to_sdev(ibdev); + struct psif_epsc_csr_rsp cqe; + struct psif_epsc_csr_req req; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_MODIFY_DEVICE; + if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) { + req.u.device.modify_mask |= PSIF_DEVICE_MODIFY_SYS_IMAGE_GUID; + sif_logi(ibdev, SIF_VERBS, "sys_image_guid = 0x%llx", + device_modify->sys_image_guid); + req.u.device.sys_image_guid = device_modify->sys_image_guid; + } + if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) { + req.u.device.modify_mask |= PSIF_DEVICE_MODIFY_NODE_DESC; + sif_logi(ibdev, SIF_VERBS, "node_desc = %s", + device_modify->node_desc); + strncpy(req.u.device.node_desc, device_modify->node_desc, + ARRAY_SIZE(req.u.device.node_desc)-1); + strncpy(ibdev->node_desc, device_modify->node_desc, + ARRAY_SIZE(ibdev->node_desc)-1); + } + + /** PSIF specific extensions (sif_verbs.h) **/ + if (device_modify_mask & IB_DEVICE_MODIFY_EXTENDED) + ret = sif_modify_device_extended(sdev, device_modify, &req); + + ret = sif_epsc_wr(sdev, &req, &cqe); + if (ret) + sif_log(sdev, SIF_INFO, "Failed with status %d", ret); + return ret; +} + +int sif_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props) +{ + int ret = 0; + struct sif_dev *sdev = to_sdev(ibdev); + struct psif_epsc_csr_rsp cqe; + struct psif_epsc_csr_req req; + + sif_logi(ibdev, SIF_VERBS, + "via eps - port %d mask %x init_type %d, set mask %x, clr mask %x", + port, port_modify_mask, + props->init_type, + props->set_port_cap_mask, + props->clr_port_cap_mask); + + memset(&req, 0, sizeof(req)); + /* TBD: Why both port and different op for port 1 and 2? 
*/ + req.u.port.port = port; + if (port == 1) + req.opcode = EPSC_MODIFY_PORT_1; + else if (port == 2) + req.opcode = EPSC_MODIFY_PORT_2; + else { + /* No such port */ + ret = -EINVAL; + goto out; + } + + /* TBD: Check later on if we can let this mask straight through 1-1 */ + if (port_modify_mask & IB_PORT_SHUTDOWN) + req.u.port.modify_mask |= PSIF_PORT_SHUTDOWN; + if (port_modify_mask & IB_PORT_INIT_TYPE) { + req.u.port.modify_mask |= PSIF_PORT_INIT_TYPE; + req.u.port.init_type = props->init_type; + } + if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR) + req.u.port.modify_mask |= PSIF_PORT_RESET_QKEY_CNTR; + if (port_modify_mask & (1<<4)) + req.u.port.modify_mask |= PSIF_PORT_RESET_PKEY_CNTR; + req.u.port.set_port_cap_mask = props->set_port_cap_mask; + req.u.port.clr_port_cap_mask = props->clr_port_cap_mask; + ret = sif_epsc_wr(sdev, &req, &cqe); + if (ret) + sif_log(sdev, SIF_INFO, "Failed with status %d", ret); +out: + return ret; +} + + diff --git a/drivers/infiniband/hw/sif/sif_query.h b/drivers/infiniband/hw/sif/sif_query.h new file mode 100644 index 000000000000..fc1fe8766e79 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_query.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_query.h: SIF implementation of some of IB query APIs + */ + +#ifndef _SIF_QUERY_H +#define _SIF_QUERY_H +#include "psif_hw_data.h" +#include "sif_epsc.h" +#include "sif_fwa.h" + +/* Max size of firmware version info */ +#define MAX_FW_VERSION_INFO_SZ 4096 + +/* DMA mapped structure to receive query data in + * We only need one of these and we protect user access to + * it with sif_epsc->lock + */ + +struct sif_epsc_data { + struct psif_epsc_device_attr dev; + struct psif_epsc_port_attr port[2]; + struct psif_epsc_log_stat log; + + /* fixed buffer space for special FWA client needs */ + char fw_version[MAX_FW_VERSION_INFO_SZ]; /* Data area for firmware version info */ + char flash[MAX_FWA_NL_PAYLOAD]; /* Data area for flash support */ + char epsc_cli[MAX_FWA_NL_PAYLOAD]; /* Data area for EPSC CLI response*/ + char vimm_agent[MAX_FWA_NL_PAYLOAD]; /* Data area for VIMM agent */ + char log_data_area[0]; /* Data area will be allocated right after this struct */ +}; + +int sif_query_device(struct ib_device *ibdev, struct ib_device_attr *props); + +int sif_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props); +int sif_query_gid(struct ib_device *ibdev, u8 port_num, int index, union ib_gid *gid); +int sif_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey); + +int sif_calc_ipd(struct sif_dev *sdev, u8 port, enum ib_rate static_rate, + u8 *ipd); + +int sif_modify_device(struct ib_device *ibdev, + int device_modify_mask, + struct ib_device_modify *device_modify); + +int sif_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props); + +/* Populate ldev with host endian query_device info requested from the epsc */ +int epsc_query_device(struct sif_dev *sdev, struct psif_epsc_device_attr *ldev); + + +static inline bool epsc_gva_permitted(struct sif_dev *sdev) +{ + /* None of the planned SIBS versions supports GVA2GPA for EPSC mappings */ + return !IS_SIBS(sdev) && sdev->pdev->revision 
!= 2 && !sif_feature(passthrough_query_qp); +} + +static inline bool eps_version_ge(struct sif_eps *es, u16 major, u16 minor) +{ + return EPSC_API_VERSION(es->ver.epsc_major, es->ver.epsc_minor) >= + EPSC_API_VERSION(major, minor); +} + +static inline bool eps_fw_version_ge(struct sif_eps *es, u16 major, u16 minor) +{ + return EPSC_API_VERSION(es->ver.fw_major, es->ver.fw_minor) >= + EPSC_API_VERSION(major, minor); +} + +static inline bool eps_fw_version_lt(struct sif_eps *es, u16 major, u16 minor) +{ + return EPSC_API_VERSION(es->ver.fw_major, es->ver.fw_minor) < + EPSC_API_VERSION(major, minor); +} + + +#endif diff --git a/drivers/infiniband/hw/sif/sif_r3.c b/drivers/infiniband/hw/sif/sif_r3.c new file mode 100644 index 000000000000..0dcd7118b1f6 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_r3.c @@ -0,0 +1,880 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_r3.c: Special handling specific for psif revision 3 and earlier + */ +#include "sif_dev.h" +#include "sif_r3.h" +#include "sif_base.h" +#include "sif_query.h" +#include "sif_qp.h" +#include "sif_ibqp.h" +#include "sif_sndrcv.h" +#include "sif_ibcq.h" +#include "sif_defs.h" +#include "psif_hw_setget.h" + +/* Declared below: */ +static void sif_hw_free_flush_qp(struct sif_dev *sdev); +static int sif_hw_allocate_flush_qp(struct sif_dev *sdev); +static int sif_hw_allocate_dne_qp(struct sif_dev *sdev); +static void sif_hw_free_dne_qp(struct sif_dev *sdev); + +static int outstanding_wqes(struct sif_dev *sdev, struct sif_qp *qp, u16 *head); +static u16 cq_walk_wa4074(struct sif_dev *sdev, struct sif_qp *qp, bool *last_seq_set); +static u16 walk_and_update_cqes(struct sif_dev *sdev, struct sif_qp *qp, u16 head, u16 end); + +int sif_r3_init(struct sif_dev *sdev) +{ + int ret; + bool dne_qp_alloc = false; + + if (eps_fw_version_lt(&sdev->es[sdev->mbox_epsc], 0, 58)) { + ret = sif_hw_allocate_dne_qp(sdev); + if (ret) + return ret; + dne_qp_alloc = true; + } + + /* Init the flush_retry qp lock */ + mutex_init(&sdev->flush_lock); + ret = sif_hw_allocate_flush_qp(sdev); + if (ret) + goto flush_retry_failed; + + return 0; +flush_retry_failed: + if (dne_qp_alloc) + sif_hw_free_dne_qp(sdev); + return ret; +} + + +void sif_r3_deinit(struct sif_dev *sdev) +{ + sif_hw_free_flush_qp(sdev); + if (eps_fw_version_lt(&sdev->es[sdev->mbox_epsc], 0, 58)) + sif_hw_free_dne_qp(sdev); +} + + +static int sif_hw_allocate_dne_qp(struct sif_dev *sdev) +{ + int ret; + u32 idx = sif_alloc_qp_idx(sdev->pd); + struct sif_qp *qp; + struct psif_qp lqp; + struct psif_query_qp lqqp; + + if (idx < 0) { + sif_log(sdev, SIF_INFO, "Unable to reserve QP index for the do-not-evict qp"); + return -ENOMEM; + } + sdev->dne_qp = idx; + qp = get_sif_qp(sdev, idx); + /* Make dfs and query_qp happy: */ + qp->qp_idx = idx; + qp->ibqp.device = &sdev->ib_dev; + qp->ibqp.pd = &sdev->pd->ibpd; + qp->rq_idx = -1; + qp->last_set_state = IB_QPS_RTS; + qp->flags = SIF_QPF_NO_EVICT; + mutex_init(&qp->lock); + + memset(&lqp, 0, sizeof(struct psif_qp)); + + lqp.state.do_not_evict = 1; + lqp.state.timeout_time = 0xffffffffffffULL; /* 48 bits */ + lqp.state.state = PSIF_QP_STATE_RTS; + lqp.state.timer_running = 1; + lqp.state.transport_type = 
PSIF_QP_TRANSPORT_RC; + + /* Write composed entry to shared area */ + copy_conv_to_hw(&qp->d, &lqp, sizeof(struct psif_qp)); + + /* Do a query_qp to make PSIF fill it's cache with it + *- we dont care about the results from the query other than + * that the operation succeeds: + */ + ret = epsc_query_qp(qp, &lqqp); + if (ret) { + sif_log(sdev, SIF_INFO, "query_qp failed with status %d", ret); + return ret; + } + ret = sif_dfs_add_qp(sdev, qp); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to allocate do-not-evict qp, index %d", idx); + return ret; + } + sif_log(sdev, SIF_INFO, "Allocated do-not-evict qp, index %d", idx); + return 0; +} + + + +static void sif_hw_free_dne_qp(struct sif_dev *sdev) +{ + if (sdev->dne_qp) { + /* Modify it to reset via error to flush it out. + * We cannot use destroy_qp since it is not a "fully configured" QP: + */ + struct sif_qp *qp = get_sif_qp(sdev, sdev->dne_qp); + struct ib_qp_attr mod_attr = { + .qp_state = IB_QPS_RESET, + }; + modify_qp_hw_wa_qp_retry(sdev, qp, &mod_attr, IB_QP_STATE); + sif_dfs_remove_qp(qp); + sif_free_qp_idx(sdev->pd, sdev->dne_qp); + sdev->dne_qp = 0; + } +} + + +static int sif_hw_allocate_flush_qp(struct sif_dev *sdev) +{ + int ret = 0; + struct sif_qp *qp = NULL; + struct sif_cq *cq = NULL; + + struct ib_qp_init_attr init_attr = { + .event_handler = NULL, + .srq = NULL, + .cap = { + .max_send_wr = 64, + .max_recv_wr = 64, + .max_send_sge = 1, + .max_recv_sge = 1, + }, + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + }; + + struct sif_qp_init_attr sif_attr = { + .pd = sdev->pd, + .qp_type = ib2sif_qp_type(init_attr.qp_type), + .user_mode = NULL, + .sq_hdl_sz = sizeof(struct sif_sq_hdl), + .qosl = QOSL_LOW_LATENCY, + }; + + enum ib_qp_attr_mask qp_attr_mask = + IB_QP_STATE | + IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS; + + struct ib_qp_attr qp_attr = { + .qp_state = IB_QPS_INIT, + .pkey_index = 0, + .port_num = 1, + .qp_access_flags = + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_ATOMIC, + }; + + struct ib_port_attr lpa; + + /* No QPs when running in limited mode */ + if (sdev->limited_mode) + return 0; + + ret = sif_query_port(&sdev->ib_dev, 1, &lpa); + if (unlikely(ret)) { + sif_log(sdev, SIF_INFO, "Failed to query port 1"); + goto err_query_port; + } + + /* CQ */ + cq = create_cq(sdev->pd, + init_attr.cap.max_send_wr + init_attr.cap.max_recv_wr, + 1, SIFPX_OFF, false); + if (IS_ERR(cq)) { + sif_log(sdev, SIF_INFO, "Failed to create CQ for flush_retry QP"); + return -EINVAL; + } + init_attr.send_cq = &cq->ibcq; + init_attr.recv_cq = &cq->ibcq; + cq->ibcq.device = &sdev->ib_dev; /* Make destroy cq happy */ + + /* QP */ + qp = create_qp(sdev, &init_attr, &sif_attr); + if (IS_ERR(qp)) { + sif_log(sdev, SIF_INFO, "Failed to create flush_retry QP"); + ret = -EINVAL; + goto err_create_qp; + } + + sif_log(sdev, SIF_QP, "Exit: success flush_retry qp 0x%p ib qp %d - real qp %d", + &qp->ibqp, qp->ibqp.qp_num, qp->qp_idx); + + + /* Make query & modify qp happy */ + qp->ibqp.qp_num = qp->qp_idx; + qp->ibqp.device = &sdev->ib_dev; + qp->ibqp.pd = &sdev->pd->ibpd; + qp->ibqp.qp_type = init_attr.qp_type; + qp->type = sif_attr.qp_type; + qp->port = 1; + qp->flags = SIF_QPF_FLUSH_RETRY; + + ret = sif_modify_qp(&qp->ibqp, &qp_attr, qp_attr_mask, NULL); + if (ret) { + sif_log(sdev, SIF_INFO, "modify_qp to init failed with status %d", ret); + goto err_modify_qp; + } + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IB_QPS_RTR; + qp_attr.path_mtu = IB_MTU_2048; + qp_attr.dest_qp_num = 
qp->qp_idx; + qp_attr.rq_psn = 0; + qp_attr.max_dest_rd_atomic = 1; + qp_attr.min_rnr_timer = 1; + qp_attr.ah_attr.dlid = lpa.lid; + qp_attr.ah_attr.port_num = 1; + qp_attr_mask = + IB_QP_STATE | + IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER; + + ret = sif_modify_qp(&qp->ibqp, &qp_attr, qp_attr_mask, NULL); + if (ret) { + sif_log(sdev, SIF_INFO, "modify_qp to RTR failed with status %d", ret); + goto err_modify_qp; + } + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.sq_psn = 0; + qp_attr.timeout = 6; + qp_attr.retry_cnt = 7; + qp_attr.rnr_retry = 7; + qp_attr.max_rd_atomic = 1; + qp_attr_mask = + IB_QP_STATE | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_SQ_PSN | + IB_QP_MAX_QP_RD_ATOMIC; + + ret = sif_modify_qp(&qp->ibqp, &qp_attr, qp_attr_mask, NULL); + if (ret) { + sif_log(sdev, SIF_INFO, "modify_qp to RTS failed with status %d", ret); + goto err_modify_qp; + } + + sdev->flush_qp = qp->qp_idx; + sif_log(sdev, SIF_INFO, "Allocated flush-retry qp, index %d", sdev->flush_qp); + + return ret; + +err_modify_qp: + destroy_qp(sdev, qp); +err_create_qp: + destroy_cq(cq); +err_query_port: + sdev->flush_qp = 0; + sif_log(sdev, SIF_INFO, "Allocated flush-retry qp failed"); + + return ret; +} + +static void sif_hw_free_flush_qp(struct sif_dev *sdev) +{ + struct sif_qp *qp = NULL; + struct sif_sq *sq = NULL; + struct sif_cq *cq = NULL; + + if (sdev->flush_qp) { + qp = get_sif_qp(sdev, sdev->flush_qp); + sq = get_sif_sq(sdev, sdev->flush_qp); + cq = get_sif_cq(sdev, sq->cq_idx); + + destroy_qp(sdev, qp); + destroy_cq(cq); + sdev->flush_qp = 0; + + sif_log(sdev, SIF_QP, "destroy_qp %d success", qp->qp_idx); + } +} + +void sif_r3_recreate_flush_qp(struct sif_dev *sdev) +{ + /* For simplicity we just destroy the old + * and allocate a new flush_retry qp. 
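+	 * flush_lock serializes this with reset_qp_flush_retry().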
+ */ + mutex_lock(&sdev->flush_lock); + sif_hw_free_flush_qp(sdev); + sif_hw_allocate_flush_qp(sdev); + mutex_unlock(&sdev->flush_lock); +} + +int reset_qp_flush_retry(struct sif_dev *sdev) +{ + struct sif_qp *qp = NULL; + struct psif_query_qp lqqp; + + struct ib_send_wr *sbad_wr; + struct ib_send_wr snd_wr = { + .wr_id = 0x1, + .sg_list = NULL, + .opcode = IB_WR_SEND, + .num_sge = 0, /* ZERO byte */ + .next = NULL, + }; + struct ib_recv_wr *rbad_wr; + struct ib_recv_wr rcv_wr = { + .wr_id = 0x2, + .sg_list = NULL, + .next = NULL, + .num_sge = 0, + }; + + struct sif_rq *rq = NULL; + struct sif_cq *cq = NULL; + + int ret = 0; + int rte, rtc; + int count; + unsigned long timeout = sdev->min_resp_ticks; + unsigned long timeout_real; + + /* Get access to the flush_retry QP */ + mutex_lock(&sdev->flush_lock); + + if (!sdev->flush_qp) { + sif_log(sdev, SIF_INFO, "special handling WA_3713 failed: flush_qp does not exist"); + ret = -EINVAL; + goto err_flush_qp; + } + + qp = get_sif_qp(sdev, sdev->flush_qp); + + /* Query flush_retry QP */ + ret = epsc_query_qp(qp, &lqqp); + if (ret) { + sif_log(sdev, SIF_INFO, "epsc_query_qp failed with status %d", ret); + goto fail; + } + + /* Store retry_tag_err and retry_tag_committed */ + rte = lqqp.qp.retry_tag_err; + rtc = lqqp.qp.retry_tag_committed; + + /* Post one zero byte send */ + ret = sif_post_send(&qp->ibqp, &snd_wr, &sbad_wr); + if (ret) { + sif_log(sdev, SIF_INFO, "sif_post_send failed with status %d", ret); + goto fail; + } + + timeout_real = jiffies + timeout; + while (rte == lqqp.qp.retry_tag_err || rtc == lqqp.qp.retry_tag_committed) { + if (time_is_after_jiffies(timeout_real)) { + cond_resched(); + ret = epsc_query_qp(qp, &lqqp); + if (ret) { + sif_log(sdev, SIF_INFO, "epsc_query_qp failed with status %d", ret); + goto fail; + } + } else { + sif_log(sdev, SIF_INFO, "Timeout waiting for flush retry"); + ret = -ETIMEDOUT; + goto fail; + } + } + + /* Post an RQE to the RQ */ + ret = sif_post_recv(&qp->ibqp, &rcv_wr, &rbad_wr); + if (ret) { + sif_log(sdev, SIF_INFO, "sif_post_recv failed with status %d", ret); + goto fail; + } + + /* Poll out the completions of the CQ */ + rq = get_sif_rq(sdev, qp->rq_idx); + cq = get_sif_cq(sdev, rq->cq_idx); + + count = 0; + timeout_real = jiffies + timeout; + while (count < 2) { + struct ib_wc wcs[2]; + int sts = sif_poll_cq(&cq->ibcq, 2, wcs); + + if (sts < 0) { + sif_log(sdev, SIF_INFO, "sif_poll_cq failed with status %d", sts); + ret = sts; + goto fail; + } else + count += sts; + + if (time_is_after_jiffies(timeout_real)) + cond_resched(); + else { + sif_log(sdev, SIF_INFO, "Timeout waiting for completions"); + for (sts = 0; sts < count; sts++) + sif_log(sdev, SIF_INFO, "wr_id %lld status %d opcode %d", + wcs[sts].wr_id, wcs[sts].status, wcs[sts].opcode); + goto fail; + } + } + + mutex_unlock(&sdev->flush_lock); + return ret; +fail: + sif_hw_free_flush_qp(sdev); + sif_hw_allocate_flush_qp(sdev); + mutex_unlock(&sdev->flush_lock); + return ret; + +err_flush_qp: + mutex_unlock(&sdev->flush_lock); + return ret; +} + +static int outstanding_wqes(struct sif_dev *sdev, struct sif_qp *qp, u16 *head) +{ + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx); + struct psif_query_qp lqqp; + int ret = 0; + + ret = epsc_query_qp(qp, &lqqp); + if (ret) { + sif_log(sdev, SIF_INFO, "epsc_query_qp failed with status %d", ret); + return ret; + } + if (head) + *head = lqqp.qp.retry_sq_seq; + + return sq_length(sq, lqqp.qp.retry_sq_seq, sq_sw->last_seq); +} + +int 
pre_process_wa4074(struct sif_dev *sdev, struct sif_qp *qp) +{ + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct psif_sq_entry *sqe; + u16 head; + int len; + + len = outstanding_wqes(sdev, qp, &head); + if (len <= 0) + return -1; + + while (len) { + head++; + sqe = get_sq_entry(sq, head); + set_psif_wr__checksum(&sqe->wr, 0); + len--; + } + return 0; +} + +/* QP is in RESET state, its now safe to do a cq_walk and + * flush any completions. + */ +int post_process_wa4074(struct sif_dev *sdev, struct sif_qp *qp) +{ + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx); + struct psif_query_qp lqqp; + bool last_seq_set = false; + u16 last_seq, fence_seq; + DECLARE_SIF_CQE_POLL(sdev, lcqe); + int ret = 0; + bool need_gen_fence_completion = true; + struct sif_cq *cq = (sq && sq->cq_idx >= 0) ? get_sif_cq(sdev, sq->cq_idx) : NULL; + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index); + + + /* if flush SQ is in progress, set FLUSH_SQ_IN_FLIGHT. + */ + if (test_bit(FLUSH_SQ_IN_PROGRESS, &sq_sw->flags)) { + set_bit(FLUSH_SQ_IN_FLIGHT, &sq_sw->flags); + return ret; + } + + if (test_and_set_bit(FLUSH_SQ_IN_PROGRESS, &sq_sw->flags)) { + set_bit(FLUSH_SQ_IN_FLIGHT, &sq_sw->flags); + return ret; + } + + if ((sq_sw->last_seq - sq_sw->head_seq) == 0) + goto err_post_wa4074; + + /* if SQ has been flushed before, continue to generate + * the remaining completions. + */ + if (test_and_set_bit(FLUSH_SQ_FIRST_TIME, &sq_sw->flags)) { + sif_log(sdev, SIF_WCE_V, "flush sq not the first time"); + last_seq = sq_sw->trusted_seq; + goto flush_sq_again; + } + + ret = epsc_query_qp(qp, &lqqp); + if (ret) { + sif_log(sdev, SIF_INFO, "epsc_query_qp failed, ret %d", ret); + goto err_post_wa4074; + } + + last_seq = sq_sw->last_seq; + + set_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags); + + sif_log(sdev, SIF_WCE_V, "sq_retry_seq %x sq_seq %x last_seq %x head_seq %x", + lqqp.qp.retry_sq_seq, lqqp.qp.sq_seq, sq_sw->last_seq, sq_sw->head_seq); + + /* need_gen_fence_completion is used to flush any cqes in the pipeline. + * If this is a good case, no fence completion is needed. + * Proceed directly to walk and update the CQE. The good case + * is only true if retry_tag_committed == retry_tag_err && + * retry_sq_seq + 1 == sq_seq && !flush_started. + */ + + need_gen_fence_completion = ((lqqp.qp.retry_tag_committed != lqqp.qp.retry_tag_err) || + (lqqp.qp.retry_sq_seq + 1 != lqqp.qp.sq_seq) || + (lqqp.qp.flush_started)); + + if (need_gen_fence_completion) { + + /* This is just a sequence number that we use to flush any cqes in the pipeline. + * Before walking the CQ, we need to ensure that we receive a cqe with fence_seq. 
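+		 * The fence is generated as a flush CQE carrying head_seq + 1, posted via sif_gen_sq_flush_cqe().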
+ */ + fence_seq = sq_sw->head_seq + 1; + + sif_log(sdev, SIF_WCE_V, "fence_seq %x", + fence_seq); + + /* Completion fence, this also flushes any cqes in pipeline */ + ret = sif_gen_sq_flush_cqe(sdev, sq, fence_seq, qp->qp_idx, false); + if (ret) + sif_log(sdev, SIF_INFO, "sq %d, sif_gen_sq_flush_cqe returned %d", + sq->index, ret); + + if (ret == -EAGAIN) { + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) + goto err_post_wa4074; + + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) + goto err_post_wa4074; + + lcqe.written = false; + } + + /* Generate a sync.completion for us on the PQP */ + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "SQ %d, gen_pqp_cqe ret %d", sq->index, ret); + goto err_post_wa4074; + } + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "SQ %d, poll_cq_waitfor failed, ret %d", + sq->index, ret); + goto err_post_wa4074; + } + + last_seq = cq_walk_wa4074(sdev, qp, &last_seq_set); + + if (!last_seq_set) { + sif_log(sdev, SIF_INFO, "failed to generate a completion to cq"); + goto err_post_wa4074; + } + + if (last_seq != fence_seq) { + sif_log(sdev, SIF_INFO, "last seq (%x) is different than fenced completion (%x)!", + last_seq, fence_seq); + /* As the Fenced completion cannot be guaranteed to be the last, software still needs to + * walk and update the CQ to avoid unexpected completion/duplicated completion + * even thought the last completion is the CQ is not generated fenced completion. + */ + } + + sif_log(sdev, SIF_WCE_V, "after: sq_retry_seq %x sq_seq %x last_seq %x head_seq %x", + lqqp.qp.retry_sq_seq, lqqp.qp.sq_seq, sq_sw->last_seq, sq_sw->head_seq); + + } + last_seq = walk_and_update_cqes(sdev, qp, sq_sw->head_seq + 1, sq_sw->last_seq); + sq_sw->trusted_seq = last_seq; + + clear_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags); + + if (GREATER_16(last_seq, sq_sw->last_seq)) { + sif_log(sdev, SIF_WCE_V, "last seq %x > sq_sw->last_seq %x\n", last_seq, sq_sw->last_seq); + if (!(qp->flags & SIF_QPF_USER_MODE) && (cq->ibcq.comp_handler)) { + if (atomic_add_unless(&cq->refcnt, 1, 0)) { + sif_log(sdev, SIF_WCE_V, "need to generate an event to cq %d\n", cq->index); + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + if (atomic_dec_and_test(&cq->refcnt)) + complete(&cq->cleanup_ok); + } + } + goto check_in_flight_and_return; + } + + sif_log(sdev, SIF_WCE_V, "generate completion from %x to %x", + last_seq, sq_sw->last_seq); +flush_sq_again: + for (; (!GREATER_16(last_seq, sq_sw->last_seq)); ++last_seq) { + sif_log(sdev, SIF_WCE_V, "generate completion %x", + last_seq); + + ret = sif_gen_sq_flush_cqe(sdev, sq, last_seq, qp->qp_idx, true); + if (ret) + sif_log(sdev, SIF_INFO, + "sq %d, last_seq %x, sif_gen_sq_flush_cqe returned %d", + sq->index, last_seq, ret); + + if (ret == -EAGAIN) { + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) + goto err_post_wa4074; + + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) + goto err_post_wa4074; + + lcqe.written = false; + continue; + } + + if (ret < 0) + goto err_post_wa4074; + } + + /* Generate a sync.completion for us on the PQP itself + * to allow us to wait for the whole to complete: + */ + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "SQ %d, gen_pqp_cqe ret %d", sq->index, ret); + goto err_post_wa4074; + } + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "SQ %d, poll_cq_waitfor failed, ret %d", + sq->index, ret); + goto err_post_wa4074; + } + + sif_log(sdev, SIF_INFO_V, "SQ %d: recv'd completion on cq %d seq 0x%x - done, ret %d", + sq->index, sq->cq_idx, 
lcqe.cqe.seq_num, ret); + sq_sw->trusted_seq = last_seq; + +check_in_flight_and_return: + if (test_and_clear_bit(FLUSH_SQ_IN_FLIGHT, &sq_sw->flags)) { + sif_log(sdev, SIF_WCE_V, "in-flight:generate completion from %x to %x", + last_seq, sq_sw->last_seq); + goto flush_sq_again; + } + +err_post_wa4074: + clear_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags); + clear_bit(FLUSH_SQ_IN_FLIGHT, &sq_sw->flags); + clear_bit(FLUSH_SQ_IN_PROGRESS, &sq_sw->flags); + qp->flush_sq_done_wa4074 = true; + return ret = ret > 0 ? 0 : ret; +} + +/* This is called from teardown (user modify QP->ERR) as well as + * any subsequent WQEs posted to SQ. + */ +int sq_flush_wa4074(struct sif_dev *sdev, struct sif_qp *qp) +{ + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx); + struct sif_cq *cq = (sq && sq->cq_idx >= 0) ? get_sif_cq(sdev, sq->cq_idx) : NULL; + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index); + u16 last_seq; + int flushed = 0; + DECLARE_SIF_CQE_POLL(sdev, lcqe); + int ret = 0; + + sif_log(sdev, SIF_INFO_V, "last_seq %x head_seq %x", + sq_sw->last_seq, sq_sw->head_seq); + + set_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags); + + last_seq = walk_and_update_cqes(sdev, qp, sq_sw->head_seq + 1, sq_sw->last_seq); + + clear_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags); + + if (last_seq > sq_sw->last_seq) + goto err_sq_flush; + + for (; last_seq <= sq_sw->last_seq; ++last_seq) { + + ret = sif_gen_sq_flush_cqe(sdev, sq, last_seq, qp->qp_idx, true); + if (ret) + sif_log(sdev, SIF_INFO, + "sq %d, last_seq %x, sif_gen_sq_flush_cqe returned %d", + sq->index, last_seq, ret); + + if (ret == -EAGAIN) { + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) + goto err_sq_flush; + + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) + goto err_sq_flush; + + lcqe.written = false; + continue; + } + + if (ret < 0) + goto err_sq_flush; + ++flushed; + } + + /* Generate a sync.completion for us on the PQP itself + * to allow us to wait for the whole to complete: + */ + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "SQ %d, gen_pqp_cqe ret %d", sq->index, ret); + goto err_sq_flush; + } + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "SQ %d, poll_cq_waitfor failed, ret %d", + sq->index, ret); + goto err_sq_flush; + } + + sif_log(sdev, SIF_INFO_V, "SQ %d: recv'd completion on cq %d seq 0x%x - done, ret %d", + sq->index, sq->cq_idx, lcqe.cqe.seq_num, ret); + +err_sq_flush: + return ret = ret > 0 ? 0 : ret; +} + +/* Walk the CQ, update the cqe from head to end and return the last_seq */ +static u16 walk_and_update_cqes(struct sif_dev *sdev, struct sif_qp *qp, u16 head, u16 end) +{ + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct sif_cq *cq = (sq && sq->cq_idx >= 0) ? 
get_sif_cq(sdev, sq->cq_idx) : NULL; + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index); + volatile struct psif_cq_entry *cqe; + u16 last_seq = 0, updated_seq; + u32 seqno, polled_value; + unsigned long flags = 0; + int n = 0; + + updated_seq = head; + last_seq = head; + + spin_lock_irqsave(&cq->lock, flags); + + for (seqno = cq_sw->next_seq;; ++seqno) { + struct psif_cq_entry lcqe; + + cqe = get_cq_entry(cq, seqno); + polled_value = get_psif_cq_entry__seq_num(cqe); + + if (seqno != polled_value) + break; + + if (get_psif_cq_entry__qp(cqe) != qp->qp_idx) + continue; + + copy_conv_to_sw(&lcqe, cqe, sizeof(lcqe)); + + if (!(lcqe.opcode & IB_WC_RECV)) { + last_seq = lcqe.wc_id.sq_id.sq_seq_num; + sif_log(sdev, SIF_WCE_V, "last_seq %x updated_seq %x lcqe.seq_num %x", + last_seq, updated_seq, lcqe.seq_num); + if (last_seq != updated_seq) { + lcqe.wc_id.sq_id.sq_seq_num = updated_seq; + if (GREATER_16(updated_seq, end)) { + /* A scenario might be that an additional CQE + * must be generated to flush all the HW + * generated completions. Thus, igore the polling the cqe. + */ + lcqe.seq_num = ~lcqe.seq_num; + sif_log(sdev, SIF_WCE_V, "corrupt: lcqe.seq_num %x", + lcqe.seq_num); + set_bit(CQ_POLLING_IGNORED_SEQ, &cq_sw->flags); + } + copy_conv_to_hw(cqe, &lcqe, sizeof(lcqe)); + } + if (!GREATER_16(updated_seq, end)) + updated_seq++; + ++n; + } + } + sif_log(sdev, SIF_WCE_V, "sq/cq %d/%d: %d entries not being pulled yet", + sq->index, cq->index, n); + + spin_unlock_irqrestore(&cq->lock, flags); + return updated_seq; +} + +/* Walk the CQ and return the last completed sq_seq */ +static u16 cq_walk_wa4074(struct sif_dev *sdev, struct sif_qp *qp, bool *last_seq_set) +{ + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct sif_cq *cq = (sq && sq->cq_idx >= 0) ? get_sif_cq(sdev, sq->cq_idx) : NULL; + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index); + volatile struct psif_cq_entry *cqe; + u32 seqno, polled_value; + unsigned long flags = 0; + u16 last_seq = 0, prev_seq = 0; + bool prev_seq_set = false; + int n = 0; + + spin_lock_irqsave(&cq->lock, flags); + + for (seqno = cq_sw->next_seq;; ++seqno) { + struct psif_cq_entry lcqe; + + cqe = get_cq_entry(cq, seqno); + polled_value = get_psif_cq_entry__seq_num(cqe); + + if (seqno != polled_value) + break; + + if (get_psif_cq_entry__qp(cqe) != qp->qp_idx) + continue; + + copy_conv_to_sw(&lcqe, cqe, sizeof(lcqe)); + + if (!(lcqe.opcode & IB_WC_RECV)) { + last_seq = lcqe.wc_id.sq_id.sq_seq_num; + + if (!(*last_seq_set)) + *last_seq_set = true; + + if (unlikely(prev_seq_set && prev_seq >= last_seq)) + sif_log(sdev, SIF_INFO_V, + "sq/cq %d/%d: prev sq_seq (0x%x) >= curr sq_seq (0x%x)", + sq->index, cq->index, prev_seq, last_seq); + + prev_seq = last_seq; + if (!(prev_seq_set)) + prev_seq_set = true; + n++; + } + } + sif_log(sdev, SIF_WCE_V, "sq/cq %d/%d: %d entries not being pulled yet", + sq->index, cq->index, n); + + spin_unlock_irqrestore(&cq->lock, flags); + return last_seq; +} diff --git a/drivers/infiniband/hw/sif/sif_r3.h b/drivers/infiniband/hw/sif/sif_r3.h new file mode 100644 index 000000000000..6fffc755952f --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_r3.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_r3.h: Special handling specific for psif revision 3 and earlier + */ + +#ifndef _SIF_R3_H +#define _SIF_R3_H + +int sif_r3_init(struct sif_dev *sdev); +void sif_r3_deinit(struct sif_dev *sdev); + +/* WA for #3713 */ +int reset_qp_flush_retry(struct sif_dev *sdev); +void sif_r3_recreate_flush_qp(struct sif_dev *sdev); + +/* WA for #4074 */ +int pre_process_wa4074(struct sif_dev *sdev, struct sif_qp *qp); +int post_process_wa4074(struct sif_dev *sdev, struct sif_qp *qp); +int sq_flush_wa4074(struct sif_dev *sdev, struct sif_qp *qp); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_rq.c b/drivers/infiniband/hw/sif/sif_rq.c new file mode 100644 index 000000000000..da406db58711 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_rq.c @@ -0,0 +1,545 @@ +/* + * Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_rq.c: Implementation of sif receive queues + */ + +#include +#include "sif_dev.h" +#include "psif_hw_data.h" +#include "psif_hw_setget.h" +#include "sif_dma.h" +#include "sif_rq.h" +#include "sif_xrc.h" +#include "sif_base.h" +#include "sif_defs.h" +#include + +int poll_wait_for_rq_writeback(struct sif_dev *sdev, struct sif_rq *rq) +{ + unsigned long timeout = sdev->min_resp_ticks; + unsigned long timeout_real = jiffies + timeout; + u8 valid; + + sif_log(sdev, SIF_RQ, "enter rq %d", rq->index); + do { + /* Make sure the update from hw is observed in correct order */ + smp_rmb(); + valid = get_psif_rq_hw__valid(&rq->d); + + if (!valid) + break; + + if (time_is_after_jiffies(timeout_real)) + cpu_relax(); + else { + sif_log(sdev, SIF_INFO, + "Timeout waiting for write back for RQ %d - still valid", + rq->index); + return -ETIMEDOUT; + } + } while (true); + + sif_log(sdev, SIF_RQ, "exit - write-back observed on rq %d", rq->index); + return 0; +} + +int alloc_rq(struct sif_dev *sdev, struct sif_pd *pd, + u32 entries, u32 sg_entries, + struct ib_srq_init_attr *srq_init_attr, + bool user_mode) +{ + int ret = 0; + bool mark_dirty = false; + /* Access to receive queue descriptor elements */ + struct sif_rq *rq; + struct sif_rq_sw *rq_sw; + volatile struct psif_rq_hw *rq_hw_p; + struct psif_rq_sw lrq_sw; + struct psif_xrq_hw lrq_hw; + int extent_log2; + struct psif_rq_entry rqe; /* Receive queue element for size calc only */ + u32 max_entries; + u32 entries_log2; + int rq_idx; + u64 alloc_sz; + + max_entries = roundup_pow_of_two(entries); + entries_log2 = order_base_2(max_entries); + + /* Meaningless with 0 sge */ + if (!sg_entries) + sg_entries = 1; + if (sg_entries > 16) { + sif_log(sdev, SIF_INFO, + "requested %d but sif only supports 16 receive sg entries", + sg_entries); + return -ENOMEM; + } + + /* Max supporter nmbr of RQ WRs are 2^14 - 1 */ + if (entries > 0x3fff) { + sif_log(sdev, SIF_INFO, + "requested %d entries, but sif only supports %d", + entries, 0x3fff); + return -ENFILE; /* 4 bit size_log2 field in rqs but highest value not supported (#2965) */ + } + + rq_idx = sif_alloc_rq_hw_idx(pd); + + if (rq_idx < 0) { + sif_log(sdev, SIF_INFO, + "unable to allocate a receive queue, consider increasing rq_size"); + ret = -ENOMEM; + return ret; + } + rq = get_sif_rq(sdev, 
rq_idx); + + /* Make sure the RQ is sofware owned: */ + ret = poll_wait_for_rq_writeback(sdev, rq); + if (ret) { + mark_dirty = true; + goto err_alloc; + } + rq->index = rq_idx; + rq->pd = pd; + + rq_hw_p = &rq->d; + rq_sw = get_sif_rq_sw(sdev, rq_idx); + + /* Initialize driver/user space state within sw extent */ + atomic_set(&rq_sw->length, 0); + rq_sw->next_seq = 0; + + rq->entries = max_entries; + /* Ref. #2965 */ + rq->entries_user = (entries_log2 == 0xe ? max_entries - 1 : max_entries); + rq->mask = max_entries - 1; + rq->extent = + roundup_pow_of_two(sizeof(rqe.rqe_id) + + sizeof(struct psif_rq_scatter) * sg_entries); + + /* Now recalculate sge space from the extent to offer any extra room "for free" */ + sg_entries = min((rq->extent - sizeof(rqe.rqe_id)) / sizeof(struct psif_rq_scatter), 16UL); + extent_log2 = order_base_2(rq->extent); + alloc_sz = max_entries * rq->extent; + + /* Only whole pages must be exposed to user space */ + if (user_mode && (alloc_sz & ~PAGE_MASK)) + alloc_sz = (alloc_sz + PAGE_SIZE) & PAGE_MASK; + rq->user_mode = user_mode; + + sif_log(sdev, SIF_QP, "RQ:sw 0x%p, hw 0x%p entries %d index %d extent %d max sge %d", + rq_sw, rq_hw_p, rq->entries, rq_idx, rq->extent, sg_entries); + + if (alloc_sz <= SIF_MAX_CONT) + rq->mem = sif_mem_create_dmacont(sdev, alloc_sz, GFP_KERNEL | __GFP_ZERO, DMA_BIDIRECTIONAL); + else + rq->mem = sif_mem_create(sdev, alloc_sz >> PMD_SHIFT, + alloc_sz, SIFMT_2M, GFP_KERNEL | __GFP_ZERO, DMA_BIDIRECTIONAL); + if (!rq->mem) { + sif_log(sdev, SIF_INFO, "Failed RQ buffer pool allocation!"); + ret = -ENOMEM; + goto err_alloc; + } + + rq->sg_entries = sg_entries; + atomic_set(&rq->refcnt, 1); + + /* Initialize hw part of descriptor */ + memset(&lrq_hw, 0, sizeof(lrq_hw)); + + /* For normal RQs we use the valid bit as follows: + * + * - If the QP is in RESET state, the RQ is invalid. + * - The RQ is set to valid as part of transitioning to INIT. + * - The RQ is still valid when the QP is in ERROR state + * - A modify to RESET resets the valid bit again. + */ + + lrq_hw.size_log2 = entries_log2; + lrq_hw.prefetch_threshold_log2 = 1; + + /* scatter = 0 means a single entry etc. 
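+	 * For instance, an RQ configured with the maximum of sg_entries = 16
+	 * is encoded as scatter = 15 (see the assignment just below).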
*/ + lrq_hw.scatter = rq->sg_entries - 1; + lrq_hw.pd = pd->idx; + + lrq_hw.head_indx = 0; + lrq_hw.base_addr = sif_mem_dma(rq->mem, 0); + lrq_hw.extent_log2 = extent_log2; + + /* Allocate mmu context without wr_access set */ + ret = sif_map_ctx(sdev, &rq->mmu_ctx, rq->mem, lrq_hw.base_addr, + alloc_sz, false); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to set mmu context for rq %d", + rq->index); + goto err_map_ctx; + } + + if (srq_init_attr) { + /* Request for an SRQ */ + lrq_hw.valid = 1; /* SRQs are valid for their entire lifetime */ + lrq_hw.srq = 1; + lrq_hw.srq_lim = srq_init_attr->attr.srq_limit; + rq->is_srq = true; + + if (srq_init_attr->srq_type == IB_SRQT_XRC) { + struct sif_cq *cq = to_scq(srq_init_attr->ext.xrc.cq); + struct sif_xrcd *xrcd = to_sxrcd(srq_init_attr->ext.xrc.xrcd); + ulong flags; + + rq->cq_idx = cq->index; + rq->xrc_domain = lrq_hw.xrc_domain = xrcd->index; + lrq_hw.cqd_id = rq->cq_idx; + spin_lock_irqsave(&cq->lock, flags); + /* We only allow a CQ to be used for one single XSRQ + * This is a violation of the IB standard but one + * that probably should not have practical conseqences: + * See #3521 for details: + */ + if (cq->xsrq) { + sif_log(sdev, SIF_INFO, + "xsrq %d: cq %d already used with xsrq %d - please use another cq for this xsrq", + rq->index, cq->index, cq->xsrq->index); + ret = -EBUSY; + } else + cq->xsrq = rq; + spin_unlock_irqrestore(&cq->lock, flags); + if (ret) + goto err_map_ctx; + } + } + + /* Get the hw mmu context populated by sif_map_ctx */ + lrq_hw.mmu_cntx = rq->mmu_ctx.mctx; + + /* Write network byte order hw copy */ + copy_conv_to_hw(rq_hw_p, &lrq_hw, sizeof(lrq_hw)); + + /* Initialize sw part of descriptor */ + memset(&lrq_sw, 0, sizeof(lrq_sw)); + lrq_sw.tail_indx = rq_sw->next_seq; + + copy_conv_to_hw(&rq_sw->d, &lrq_sw, sizeof(lrq_sw)); + + spin_lock_init(&rq->lock); + + return rq_idx; + +err_map_ctx: + sif_mem_free(rq->mem); +err_alloc: + if (!mark_dirty) + sif_free_rq_hw_idx(pd, rq_idx); + return ret; +} + + +/* Invalidate the RQ cache and flush a desired amount of + * the remaining entries in the given receive queue. + * @target_qp indicates the value of the local_qp field in the generated + * completion. The qp itself would already have been modified to RESET + * to avoid any more traffic; + * + * Workaround #622: PSIF doesn't generate "FLUSHED IN ERROR" completions. + * In order to maintain OFED verbs-programming and IB spec. compatibility, + * RQEs needs to be "flushed in error" when + * - Verbs layer modifies QP to error + * - Hardware sends an async event, after setting the QP in error + * - Poll CQ on IB client(kernel/user) receives an error completion + * (Responder class A & C) with QP set to error + * - More WQEs are posted by IB client(kernel/user) when QP in error + * - QP is destroyed + * + * Note: No locking of the RQ is neccessary as there are multiple trigger points + * for flushing RQEs within OFED verbs model. + */ +int sif_flush_rq(struct sif_dev *sdev, struct sif_rq *rq, struct sif_qp *target_qp, + int max_flushed_in_err) +{ + int len, real_len; + struct sif_rq_sw *rq_sw = get_sif_rq_sw(sdev, rq->index); + int ret = 0; + u32 head, tail; + enum sif_mqp_type mqp_type = SIF_MQP_SW; + DECLARE_SIF_CQE_POLL(sdev, lcqe); + + /* if flush RQ is in progress, set FLUSH_RQ_IN_FLIGHT. 
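+	 * The thread that owns FLUSH_RQ_IN_PROGRESS observes this bit when it
+	 * is done and restarts the flush itself (see the test_and_clear_bit of
+	 * FLUSH_RQ_IN_FLIGHT and the flush_rq_again label further down).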
+ */ + if (test_bit(FLUSH_RQ_IN_PROGRESS, &rq_sw->flags)) { + set_bit(FLUSH_RQ_IN_FLIGHT, &rq_sw->flags); + return ret; + } + + /* if race condition happened while trying to flush RQ, + * set the FLUSH_RQ_IN_FLIGHT, and let the other party does the job. + */ + if (test_and_set_bit(FLUSH_RQ_IN_PROGRESS, &rq_sw->flags)) { + set_bit(FLUSH_RQ_IN_FLIGHT, &rq_sw->flags); + return ret; + } + + if (!sif_feature(disable_rq_flush)) + len = min(max_flushed_in_err, atomic_read(&rq_sw->length)); + else + len = 0; + if (len == 0) + goto error; + + sif_log(sdev, SIF_INFO_V, "flushing %d entries out of %d/%d entries remaining", + len, atomic_read(&rq_sw->length), rq->entries); + + /* Workaround #622 v2 step 1: ModifyQP to RESET + * The QP must be in the RESET state to avoid race condition. + * sif_flush_rq will only be called when the QP is + * in ERROR state. As for now, keeping the same coding style to + * check whether the qp flags SIF_QPF_HW_OWNED is clear. + * If it is clear, it means that the QP is in the shadowed + * software error state (actual hw state is in RESET). + * + * TBD - Should we add new PSIF_QP_STATE_SHADOWED_ERROR state, + * at least to me it is more readable? + */ + mutex_lock(&target_qp->lock); + /* qp lock must be held to make sure not other thread is trying to do modify_qp_hw to RESET */ + mqp_type = sif_modify_qp_is_ok(target_qp, target_qp->last_set_state, IB_QPS_RESET, IB_QP_STATE); + + if (mqp_type == SIF_MQP_HW) { + struct ib_qp_attr attr = { + .qp_state = IB_QPS_ERR + }; + + ret = modify_qp_hw_wa_qp_retry(sdev, target_qp, &attr, IB_QP_STATE); + + if (ret) + sif_log(sdev, SIF_INFO, "qp %d RESET failed, ret %d", + target_qp->qp_idx, ret); + + } + mutex_unlock(&target_qp->lock); + + /* Workaround #622 v2 step 2: Invalidate RQ + * Invalidation of an RQ causes PSIF to flush it's caches for that RQ. + * If PSIF finds the RQ invalid, it will attempt to fetch it. + * It is then required to be valid (otherwise it will be interpreted as an error + * by PSIF (see #2134). So software cannot rely upon the completion of the invalidate + * to signal that the descriptor can be re-used, instead it will have to + * verify by checking the final write-back of the descriptor, which will have + * valid set to 0 by PSIF. In the general case we handle this lazy and check before we + * try to re-use. The request is posted with no completion requested as we + * do not need the completion: + */ + if (!(test_bit(RQ_IS_INVALIDATED, &rq_sw->flags))) { + ret = sif_invalidate_rq_hw(sdev, rq->index, PCM_POST); + if (ret) { + sif_log(sdev, SIF_INFO, + "Invalidate rq_hw failed, status %d", ret); + goto error; + } + set_bit(RQ_IS_INVALIDATED, &rq_sw->flags); + } + + /* Make sure the RQ is sofware owned: */ + ret = poll_wait_for_rq_writeback(sdev, rq); + if (ret) + goto error; + + /* The RQ is now software owned and the (after a successful invalidate) so we + * should be able to trust rq_hw::head_indx - better than scanning the CQ + * for unprocessed elements: + * Note that only the lowest 14 bits of the sequence number in head_indx is + * valid: + */ +flush_rq_again: + head = get_psif_rq_hw__head_indx(&rq->d); + tail = rq_sw->next_seq; + real_len = rq_length(rq, head, tail & ((1 << 14) - 1)) & ((1 << 14) - 1); + + /* Workaround #622 v2 step 3: Check the last completion on the CQ + * The rq_sw->length is used to track the length of a queue + * with #posted - #completed. If the calculated real_len is + * smaller than the len, it means that a completion is missing. 
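+	 * For example: with 8 RQEs posted and 3 completions polled,
+	 * rq_sw->length is 5; if the head/tail arithmetic only accounts for 4
+	 * outstanding entries, one RWQE was consumed by HW without a polled
+	 * completion, and sif_fixup_cqes() below counts the completions still
+	 * pending in the CQ so that only len - nfixup flush-in-error CQEs are
+	 * generated.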
+ * Instead of loooping RQ to find rqe of the completed wc_id, the + * rq_sw->length represents the #posted - #completed, and nfixup + * represents the remaining completions after the QP moved to RESET. + * Thus, the number of flush-in error that must be generated is + * rq_sw->length - nfixup. + */ + if (!(test_bit(FLUSH_RQ_FIRST_TIME, &rq_sw->flags))) { + /* need to use a flag to differentiate between the first call of + * sif_flush_rq or the subsequent call. The race condition where + * HW acquired a RWQE but does not generate a completion can + * only happen at the first call of sif_flush_rq. This is because + * the QP state is moved to RESET. + * Besides, if the generated completion arrived later and + * FLUSH_RQ_IN_FLIGHT is set, the test of real_len < len + * might be true. + */ + len = atomic_read(&rq_sw->length); + if (real_len < len) { + int nfixup; + u32 cq_idx = get_psif_qp_core__rcv_cq_indx(&target_qp->d.state); + struct sif_cq *cq = rq ? get_sif_cq(sdev, cq_idx) : NULL; + + nfixup = sif_fixup_cqes(cq, NULL, target_qp); + sif_log(sdev, SIF_RQ, + "RQ %d: updating calculated entries from %d to %d - %d (%d)", + rq->index, real_len, len, nfixup, len - nfixup); + real_len = len - nfixup; + } + set_bit(FLUSH_RQ_FIRST_TIME, &rq_sw->flags); + } + + /* Now find the actual 32 bit seq.no */ + head = tail - real_len; + + sif_log(sdev, SIF_RQ, + "RQ %d not empty: sz %d, head %d, next_seq %d, %d/%d entries at exit", + rq->index, rq->entries, head, tail, len, real_len); + + if (!real_len) + goto error; + + /* Workaround #622 v2 step 4: generate flush in error completion + * Generate flushed in error completions: + * these give no pqp completions but may in theory fail + */ + while (real_len > 0) { + sif_log(sdev, SIF_PQP, "rq %d, len %d", rq->index, real_len); + ret = sif_gen_rq_flush_cqe(sdev, rq, head, target_qp); + if (ret) + sif_log(sdev, SIF_INFO, "rq %d, len %d, sif_gen_rq_flush_cqe returned %d", + rq->index, real_len, ret); + if (ret == -EAGAIN) { + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) + goto error; + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) + goto error; + lcqe.written = false; + continue; + } + if (ret < 0) + goto error; + real_len--; + head++; + } + + /* Finally generate a sync.completion for us on the PQP itself + * to allow us to wait for the whole to complete: + */ + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "rq %d, cqe %p gen_pqp_cqe returned %d", + rq->index, &lcqe, ret); + goto error; + } + + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "rq %d, cqe %p poll_cq_waitfor returned %d", + rq->index, &lcqe, ret); + goto error; + } + + sif_log(sdev, SIF_INFO_V, "RQ %d: received completion on cq %d seq 0x%x - done", + rq->index, rq->cq_idx, lcqe.cqe.seq_num); + + /* Make sure hardware pointer reflects the flushed situation */ + set_psif_rq_hw__head_indx(&rq->d, head); + wmb(); + + /* if FLUSH_RQ_IN_FLIGHT is set, it means another party is trying to + * flush the rq at the same time. This should be retried + * once as no more than one asynchronous event will be generated if + * QP is in ERROR state. This is to take care of a scenario where + * QP is modified to ERROR explicitly and at the same time received + * the asynchronous event. Nevertheless, the RQ entry changes in between + * of these two scenario that can trigger flush rq. + */ + if (test_and_clear_bit(FLUSH_RQ_IN_FLIGHT, &rq_sw->flags)) + goto flush_rq_again; + +error: + clear_bit(FLUSH_RQ_IN_PROGRESS, &rq_sw->flags); + return ret = ret > 0 ? 
0 : ret; +} + + +int free_rq(struct sif_dev *sdev, int rq_idx) +{ + struct sif_rq *rq; + int stat; + + rq = get_sif_rq(sdev, rq_idx); + sif_log(sdev, SIF_RQ, "entry %d", rq_idx); + + stat = atomic_dec_and_test(&rq->refcnt); + if (!stat) { + sif_log(sdev, SIF_RQ, "rq %d still in use - ref.cnt %d", + rq_idx, atomic_read(&rq->refcnt)); + return -EBUSY; + } + + sif_release_rq(sdev, rq->index); + return 0; +} + + +void sif_release_rq(struct sif_dev *sdev, int index) +{ + struct sif_rq *rq = get_sif_rq(sdev, index); + struct sif_pd *pd = rq->pd; + + if (!pd) { + sif_log(sdev, SIF_INFO, "Internal error: no pd associated with rq %d", index); + return; + } + + sif_unmap_ctx(sdev, &rq->mmu_ctx); + + sif_mem_free(rq->mem); + sif_clear_rq_sw(sdev, index); + + if (!sif_feature(disable_invalidate_rq)) + sif_free_rq_hw_idx(pd, index); +} + +void sif_dfs_print_rq_hw(struct seq_file *s, struct sif_dev *sdev, loff_t pos) +{ + struct sif_rq *rq; + struct sif_rq_sw *rq_sw; + volatile struct psif_rq_hw *rq_hw; + u32 tail, head; + int qlen; + + if (unlikely(pos < 0)) { + seq_puts(s, "# Index head sw_tail entries queue_len nmbr_sge next_seq srq_lim\n"); + return; + } + rq = get_sif_rq(sdev, pos); + rq_hw = &rq->d; + rq_sw = get_sif_rq_sw(sdev, pos); + + head = get_psif_rq_hw__head_indx(rq_hw); + tail = get_psif_rq_sw__tail_indx(&rq_sw->d); + qlen = atomic_read(&rq_sw->length); + + seq_printf(s, "%7llu %5u %8u %8u %9u %8u %8u %7u", pos, + head, tail, rq->entries, qlen, rq->sg_entries, rq_sw->next_seq, rq->srq_limit); + if (rq->is_srq & rq->xrc_domain) + seq_puts(s, "\t[XRC-SRQ]\n"); + else if (rq->is_srq) + seq_puts(s, "\t[SRQ]\n"); + else + seq_puts(s, "\n"); +} diff --git a/drivers/infiniband/hw/sif/sif_rq.h b/drivers/infiniband/hw/sif/sif_rq.h new file mode 100644 index 000000000000..be8bb21fadc2 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_rq.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_rq.h: Interface to sif receive queues + */ + +#ifndef _SIF_RQ_H +#define _SIF_RQ_H + +struct sif_rq { + volatile struct psif_rq_hw d; /* Hardware descriptor */ + struct ib_srq ibsrq ____cacheline_internodealigned_in_smp; /* Only used if this is an SRQ */ + spinlock_t lock ____cacheline_internodealigned_in_smp; + struct sif_mmu_ctx mmu_ctx; + struct sif_pd *pd; /* Ref to owning protection domain */ + int index; + int cq_idx; /* Default compl.queue index to use, if any */ + bool user_mode; /* Set if this is an RQ to be mapped to user space */ + bool is_srq; /* Set if this is a shared receive queue */ + int xrc_domain; /* If != 0: This is an XRC SRQ member of this domain idx */ + atomic_t refcnt; /* Ref.count for usage as a shared receive queue */ + u16 entries; /* Allocated entries */ + u16 entries_user; /* Entries reported to user (entries -1 if max) */ + u32 sg_entries; /* Max receive scatter/gather configured for this rq */ + u32 mask; /* entries - 1 for modulo using & */ + u32 extent; + u16 srq_limit; + struct sif_mem *mem; /* Allocated queue memory */ +}; + +static inline struct sif_rq *to_srq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct sif_rq, ibsrq); +} + +int poll_wait_for_rq_writeback(struct sif_dev *sdev, struct sif_rq *rq); + +/* Allocate a receive queue - if @srq_init_attr is non-nil + * this is a shared receive queue (SRQ) + * A return value >= 0 is the index of the receive queue descriptor allocated + * otherwise it is -errno + */ +int alloc_rq(struct sif_dev *sdev, struct sif_pd *pd, + u32 entries, u32 sge_entries, + struct ib_srq_init_attr *srq_init_attr, + bool user_mode); + +/* Invalidate the RQ cache and flush a desired amount of + * the remaining entries in the given receive queue. + * @target_qp indicates the value of the local_qp field in the generated + * completion but is not interpreted by SIF in any way. + */ +int sif_flush_rq(struct sif_dev *sdev, struct sif_rq *rq, + struct sif_qp *target_qp, int max_flushed_in_err); + +int free_rq(struct sif_dev *sdev, int rq_idx); + +/* Low level callbacks to release memory for these queues + * Called from sif_hiw::handle_invalidate_wc + */ +void sif_release_rq(struct sif_dev *sdev, int index); + +void sif_dfs_print_rq_hw(struct seq_file *s, struct sif_dev *sdev, loff_t pos); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_sndrcv.c b/drivers/infiniband/hw/sif/sif_sndrcv.c new file mode 100644 index 000000000000..c2afdab16da0 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_sndrcv.c @@ -0,0 +1,1152 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_sndrcv.c: Implementation of post send/recv logic for SIF + */ +#include +#include +#include +#include +#include + +#include "sif_dev.h" +#include "sif_query.h" +#include "sif_defs.h" +#include "sif_base.h" +#include "sif_sndrcv.h" +#include "sif_qp.h" +#include "sif_mr.h" +#include "sif_tqp.h" +#include "sif_r3.h" +#include "psif_hw_setget.h" +#include "sif_checksum.h" +#include + + +/* Handle a NULL terminated array of send work requests */ +#define SQS_ACTIVE (get_psif_sq_hw__sq_next(&sq->d) != 0xFFFFFFFF) +int sif_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_qp *qp = to_sqp(ibqp); + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx); + unsigned long flags; + bool doorbell_mode; + bool last; + u16 first_seq; + const int nmbr_wrs_to_bulk_process = 32; + int ret = 0; + int n; + + sif_log(sdev, SIF_SND, "on qp_idx %d wr 0x%p ibv type %d", + qp->qp_idx, wr, wr->opcode); + + if (unlikely(qp->type > PSIF_QP_TRANSPORT_MANSP2)) { + sif_log(sdev, SIF_INFO, "Invalid QP type"); + ret = -EINVAL; + goto err_post_send_unlocked; + } + + if (unlikely(is_epsa_tunneling_qp(ibqp->qp_type))) { + sif_log(sdev, SIF_QP, "epsa tunneling post_send"); + return sif_epsa_tunneling_post_send(ibqp, wr, bad_wr); + } + + /* PSIF does not support SQD. Per IBTA 11.4.1.1, error is only returned + * when the QP is in the RESET, INIT or RTR states. + */ + if (unlikely(qp->last_set_state < IB_QPS_RTS)) { + sif_log(sdev, SIF_INFO, "Invalid QP state - expected RTS(%d) found %d!", + (int)IB_QPS_RTS, qp->last_set_state); + ret = -EINVAL; + goto err_post_send_unlocked; + } + + while (wr) { + /* Workaround #3595: ring doorbell if SQS active */ + doorbell_mode = qp->flags & SIF_QPF_FORCE_SQ_MODE || SQS_ACTIVE; + + /* We need to serialize sends on the same send queue + * so we need to keep sq->lock around it all + */ + spin_lock_irqsave(&sq->lock, flags); + first_seq = sq_sw->last_seq + 1; + for (n = 0; wr && n < nmbr_wrs_to_bulk_process; ++n, wr = wr->next) { + last = !wr->next || n == (nmbr_wrs_to_bulk_process - 1); + ret = sif_post_send_single(ibqp, wr, &doorbell_mode, last, &first_seq); + if (ret < 0) + goto err_post_send; + } + spin_unlock_irqrestore(&sq->lock, flags); + } + + if ((qp->type != PSIF_QP_TRANSPORT_MANSP1) + && (qp->last_set_state == IB_QPS_ERR)) { + ret = 0; + goto flush_sq_wa4074; + } + + + sif_log(sdev, SIF_SND, "Exit: success"); + return 0; + +err_post_send: + spin_unlock_irqrestore(&sq->lock, flags); + +err_post_send_unlocked: + *bad_wr = wr; + +flush_sq_wa4074: + if ((qp->type != PSIF_QP_TRANSPORT_MANSP1) + && (qp->last_set_state == IB_QPS_ERR)) { + if (post_process_wa4074(sdev, qp)) + sif_log(sdev, SIF_INFO, "failed to flush SQ %d", qp->qp_idx); + } + + sif_log(sdev, SIF_SND, "Exit: error %d", ret); + return ret; + +} +#undef SQS_ACTIVE + + +/* The copy_from_user function on x86_64 calls might_fault() to verify that + * it is not called from interrupt context. However with our use case the memory is guaranteed + * to be pinned, so no faults will ever happen. 
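+ * (In this driver the copy may run under sq->lock with interrupts off, see
+ * sif_post_send, which is presumably why the might_fault() check is best
+ * avoided even though no fault can actually occur.)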
+ * + * TBD: Sparc does not define _copy_from_user - just use copy_from _user for now + */ +inline unsigned long sif_copy_from_user(void *to, const void __user *from, unsigned int n) +{ +#ifdef __x86_64__ + return _copy_from_user(to, from, n); +#else + return copy_from_user(to, from, n); +#endif +} + + +static int copy_sg(struct sif_qp *qp, void *dest, u64 vaddr, u32 len) +{ + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + + if (qp->ibqp.uobject) { + unsigned long not_copied; + + sif_log(sdev, SIF_SND, "Copy sg len %d from user addr 0x%llx to %p", + len, vaddr, dest); + not_copied = sif_copy_from_user(dest, (void __user *)vaddr, len); + if (not_copied) { + sif_log(sdev, SIF_INFO, + "copy_from_user: Failed to copy %ld/%d bytes from uaddr %llx", + not_copied, len, vaddr); + return -EFAULT; + } + } else { + sif_log(sdev, SIF_SND, "Copy sge len %d from kernel addr 0x%llx to %p", + len, vaddr, dest); + memcpy(dest, (void *)vaddr, len); + } + return 0; +} + + +/* Copy the first @sg_cnt sg entries of @wr into the inline space + */ + +/* TBD: Consider cleaning up/unrolling this into one copy + * into temp buffer for csumming/cb copy_convert + * and one other plain copy into send queue: + */ +static int prep_inline_part(struct sif_qp *qp, struct ib_send_wr *wr, int sg_cnt, + struct psif_cb *wqe, struct psif_wr_local *la, u32 sqe_seq, + bool is_phys_addr) +{ + int ret; + int wr_len = 0; + struct sif_sq *sq; + struct psif_sq_entry *sqe; + struct psif_key *key; + + /* collect buffer only supports 256 byte inlined, this first part + * of the inline data must be handled in host byte order to + * make sure the checksum gets right: + */ + int cb_len = min_t(int, ((qp->max_inline_data + CB_KICK_MASK) & ~CB_KICK_MASK), CB_LENGTH); + int space = qp->max_inline_data; + int copy = 0; + int remaining = -1; + int i; + u32 len = 0; + u64 addr = 0; + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + + u8 buf[CB_LENGTH]; + u8 *dbuf = buf; + + if (wr->send_flags & IB_SEND_IP_CSUM) { + /* Cannot use collect-buffer for inline data when offloading */ + cb_len = 0; + } + + sq = get_sif_sq(sdev, qp->qp_idx); + sqe = get_sq_entry(sq, sqe_seq); + + sif_log(sdev, SIF_SND, "inline from %d sges, buf at %p sqe at %p", sg_cnt, buf, sqe); + + for (i = 0; i < sg_cnt; ++i) { + if (unlikely(remaining >= 0)) { + /* Switch to copying directly into send queue + * @copy already holds the offset + */ + dbuf = ((u8 *)sqe->payload); + if (remaining > 0) { + addr += len; + len = remaining; + remaining = -1; + goto do_copy; + } else + remaining = -1; + } + len = wr->sg_list[i].length; + addr = wr->sg_list[i].addr; + + if (len > 0) { + u32 lkey = wr->sg_list[i].lkey; + + key = safe_get_key(sdev, lkey); + if (!key || PSIF_DMA_KEY_INVALID == get_psif_key__lkey_state(key)) { + sif_log(sdev, SIF_INFO, + "Attempt to do inline copying from an invalid MR with lkey %d at addr 0x%llx", + lkey, addr); + return -EPERM; + } + } + +do_copy: + wr_len += len; + if (unlikely(dbuf == buf && wr_len >= cb_len)) { + remaining = wr_len - cb_len; + len -= remaining; + wr_len -= remaining; + if (remaining) + i--; /* Run an extra iter to copy remainder */ + } else if (unlikely(copy + len > space)) { + sif_log(sdev, SIF_INFO, + "Inline space exhausted: available %d, copied %d, len %d", + space, copy, len); + return -ENOMEM; + } + if (is_phys_addr) { + u64 *kva = phys_to_virt(addr); + + sif_log(sdev, SIF_SND, + "Phys-addr %llx -> %llx copy %d len %d", + addr, (u64)kva, copy, len); + memcpy((void *)&dbuf[copy], (void *)kva, len); + ret = 0; + } else { + ret = 
copy_sg(qp, &dbuf[copy], addr, len); + } + if (ret < 0) + return ret; + copy += len; + } + + if (buf == dbuf && copy & CB_KICK_MASK) { + /* Pad out the misaligned end data */ + memset(&buf[copy], 0, CB_KICK_ALIGN - (copy & CB_KICK_MASK)); + } + + sif_log(sdev, SIF_QP, "wr_len is %d bytes, cb_len %d bytes", wr_len, cb_len); + if (cb_len > 0) { + /* Convert payload twice to get checksum right. + * The 32 bit version of the checksumming in PSIF does not + * have the property that checksumming of the same data + * on different endian hosts yields the same checksum.. + */ + copy_conv_to_sw(wqe->payload, buf, cb_len); + } + wqe->wr.collect_length = min(wr_len, cb_len); + return wr_len; +} + +static inline int prep_inline(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe, + struct psif_wr_local *la, u32 sqe_seq, + bool is_phys_addr) +{ + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + int wr_len = prep_inline_part(qp, wr, wr->num_sge, wqe, la, sqe_seq, is_phys_addr); + + if (wr_len < 0) + return wr_len; + if (wr_len) { + /* la must point to the start of the payload in the send queue + * to have the whole message available in case of retries: + */ + la->addr = get_sqe_dma(sq, sqe_seq) + offsetof(struct psif_sq_entry, payload); + la->lkey = sq->sg_mr->index; + } + la->length = wr_len; + return wr_len; +} + +/* Helper funcs declared below */ +static void prep_atomic(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe); +static int prep_send(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe, + bool inlined, struct psif_wr_local *la, u32 sqe_idx); +static int prep_send_lso(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe, + bool inlined, struct psif_wr_local *la, u32 sqe_idx); +static int prep_remote_addr(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe); + + +/* Return bypass mode offset or 0 if invalid for post_sends (see below) + * (PSIF will take care of rejecting the post) + */ + +inline u64 mr_uv2dma(struct sif_dev *sdev, int idx) +{ + struct sif_mr *mr = safe_get_sif_mr(sdev, idx); + + if (mr) + return mr->mmu_ctx.uv2dma; + return 0; +} + + +/* + * Handle send of a single wr - can be called from any context. + * + * Use either CB mode or DB mode. In CB mode, wqe is allocated, + * written to SQ, SW pointer updated, and finally the wqe is written + * to the CB. In DB mode, the wqe is allocated and written to the + * SQ. On the last wqe, SW pointer is updated and the doorbell is rung + * with the seq number of the first sqe. 
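+ * A rough example of DB mode with a list of three WRs: if last_seq was 4,
+ * sqes 5, 6 and 7 are built in the SQ, the sw tail index is updated when
+ * the last one is posted, and a single doorbell is then rung with
+ * first_seq = 5.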
+ */ +int sif_post_send_single(struct ib_qp *ibqp, struct ib_send_wr *wr, bool *use_db, bool last, u16 *first_seq) +{ + bool inlined = false; + u64 csum; + struct psif_cb wqe; + struct psif_sq_entry *sqe; + int cb_len = 0; + int cb_len_8 = 0; + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_qp *qp = to_sqp(ibqp); + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + int ret = 0; + u16 head, sq_seq, q_sz; + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp->qp_idx); + bool is_ud = qp->type == PSIF_QP_TRANSPORT_UD; + struct sif_sq_hdl *wh; + + if (wr->num_sge > sq->sg_entries) { + sif_log(sdev, SIF_SND, "attempt to post wr with %d/%d sg entries", + wr->num_sge, sq->sg_entries); + return -EINVAL; + } + + sq_seq = ++sq_sw->last_seq; + head = sq_sw->head_seq; + q_sz = sq_length(sq, head, sq_seq); + + if (q_sz > sq->entries) { + sif_log(sdev, SIF_INFO, + "Send Queue %d full - head %d, tail %d, entries %d, sge_entries %u, sq->user_mode: %s, sq->alloc_sz: %llu", + sq->cq_idx, head, sq_seq, sq->entries, sq->sg_entries, + (sq->user_mode) ? "[yes]" : "[no]", sq->mem->size); + ret = -EAGAIN; + goto fail; + } + + + sqe = get_sq_entry(sq, sq_seq); + + memset(&wqe, 0, sizeof(wqe)); + + wqe.wr.tsu_qosl = qp->qosl; + wqe.wr.eps_tag = qp->eps_tag; + + ret = prep_remote_addr(qp, wr, &wqe); + if (ret) + goto fail; + + if (wr->send_flags & IB_SEND_FENCE) /* RC only */ + wqe.wr.fence = 1; + + if (qp->flags & SIF_QPF_DYNAMIC_MTU) + wqe.wr.dynamic_mtu_enable = 1; + + wqe.wr.completion = sq->complete_all; + if (wr->send_flags & IB_SEND_SIGNALED) + wqe.wr.completion = 1; + + inlined = wr->send_flags & IB_SEND_INLINE; + + if (qp->qp_idx < 4) { + /* Field valid for QP0/1 only */ + wqe.wr.port = qp->port - 1; + + /* and in the work request we must use "real" QP numbers as well */ + wqe.wr.local_qp = qp->qp_idx & 1; + } else + wqe.wr.local_qp = qp->qp_idx; + + if (wr->opcode == IB_WR_SEND_WITH_IMM || + wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { + wqe.wr.imm = cpu_to_be32(wr->ex.imm_data); + } + + /* TBD: only set if wr opcode allows it */ + if (wr->send_flags & IB_SEND_SOLICITED) + wqe.wr.se = 1; + + if (wr->send_flags & IB_SEND_IP_CSUM) { + wqe.wr.l3_checksum_en = 1; + wqe.wr.l4_checksum_en = 1; + qp->ipoib_tx_csum_l3++; + qp->ipoib_tx_csum_l4++; + } + switch (wr->opcode) { + case IB_WR_LSO: + { + struct psif_wr_local *la = &wqe.wr.details.send.ud.local_addr; + + if (!supports_offload(qp)) { + sif_log(sdev, SIF_INFO, + "LSO WR on qp %d which does not support offloading", + qp->qp_idx); + ret = -EINVAL; + goto fail; + } + ret = prep_send_lso(qp, wr, &wqe, inlined, la, sq_seq); + if (ret < 0) + goto fail; + break; + } + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + { + struct psif_wr_local *la = (is_ud ? 
+ &wqe.wr.details.send.ud.local_addr : + &wqe.wr.details.send.uc_rc_xrc.local_addr); + ret = prep_send(qp, wr, &wqe, inlined, la, sq_seq); + if (ret < 0) + goto fail; + break; + } + case IB_WR_RDMA_READ: + /* RDMA READ does not support dynamic MTU */ + wqe.wr.dynamic_mtu_enable = 0; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + { + struct psif_wr_local *la = &wqe.wr.details.rdma.local_addr; + struct psif_wr_remote *ra = &wqe.wr.details.rdma.remote_addr; + + ra->addr = wr->wr.rdma.remote_addr; + ra->rkey = wr->wr.rdma.rkey; + + ret = prep_send(qp, wr, &wqe, inlined, la, sq_seq); + if (ret < 0) + goto fail; + + ra->length = ret; + break; + } + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + prep_atomic(qp, wr, &wqe); + break; + case IB_WR_SEND_WITH_INV: + case IB_WR_RDMA_READ_WITH_INV: + sif_log(sdev, SIF_SND, "Opcode not implemented"); + ret = -EOPNOTSUPP; + goto fail; + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: + { + /* Bug 3844, WA for HW bug 3683 */ + bool masked_atomics_defeatured = PSIF_REVISION(sdev) <= 3; + + if (masked_atomics_defeatured) + sif_log(sdev, SIF_SND, "Opcode not supported"); + else + sif_log(sdev, SIF_SND, "Opcode not yet implemented"); + ret = -EOPNOTSUPP; + goto fail; + } + default: + sif_log(sdev, SIF_SND, "Unsupported opcode"); + ret = -EINVAL; + goto fail; + } + + sif_log(sdev, SIF_SND, + "copied %d bytes inline, num_sgl %d, sqe at %p", + wqe.wr.collect_length, wqe.wr.num_sgl, sqe); + cb_len_8 = sizeof(struct psif_wr) + + ((wqe.wr.collect_length + 7) & ~7); + cb_len = sizeof(struct psif_wr) + + ((wqe.wr.collect_length + CB_KICK_MASK) & ~CB_KICK_MASK); + + wqe.wr.sq_seq = sq_seq; + wqe.wr.tsu_sl = qp->tsl; + + /* Map sqe (repr.by index in sq) to this wr_id */ + wh = get_sq_hdl(sq, sq_seq); + wh->wr_id = wr->wr_id; + wh->sq_seq = sq_seq; + wh->used = true; + + sif_log(sdev, SIF_SND, "wr_id %llx at tail 0x%x sq_seq_num %d%s", + wr->wr_id, sq_seq & sq->mask, wqe.wr.sq_seq, (wqe.wr.completion ? " [req.compl]" : "")); + + /* We can safely checksum any "hole" due to end misalignment + byte swap + * towards the end of the inline data + * as prep_inline has nil'ed these bytes out: + */ + if (qp->nocsum) { + wqe.wr.checksum = qp->magic; + } else { + csum = csum32_partial(&wqe, cb_len_8, qp->magic); + csum = csum32_fold(csum); + wqe.wr.checksum = csum; + } + sif_log(sdev, SIF_SND, "op %s checksum %x cb_len 0x%x", + string_enum_psif_wr_type(wqe.wr.op), + wqe.wr.checksum, cb_len); + sif_logs(SIF_DUMP, write_struct_psif_wr(NULL, 0, &wqe.wr)); + + /* First update send queue (any further inline data beyond cb_len + * has already been copied in prep_inline: + */ + copy_conv_to_hw(sqe, &wqe, cb_len); + + /* A heuristic mechanism to determine the traffic pattern. */ + /* Even though traffic_patterns.mask is being set by handle_wc, no + * lock is used.The reason is that the mask is used to get a "rough" + * idea about the underlying traffic pattern without adding latency + * in the driver. + */ + qp->traffic_patterns.mask = (qp->traffic_patterns.mask << 1) | + HEUR_TX_DIRECTION; + sif_log_perf(sdev, SIF_PERF_V, "qp:traffic_pattern %x", + qp->traffic_patterns.mask); + /* If the traffic pattern shows that it's not latency sensitive, + * use SQ mode by ringing the doorbell. + * In a latency sensitive traffic pattern, a SEND should + * be accompanied by a WC_OPCODE_RECEIVE_SEND. Thus, + * a latency sensitve traffic pattern should have + * half_of_bits(sizeof(traffic_patterns.submask[n)) set. 
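+ * As a rough illustration: each 16-bit submask records the direction of
+ * the most recent postings/completions, so a strict request/response
+ * pattern (every send matched by a receive) has a population count of
+ * about 8, while a one-way streaming sender drives it towards 16.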
+ * The constant 7 and 9 are used below as we are adding one + * to half_of_bits(sizeof(traffic_patterns.submask[n])) + * as the tolerance. + */ + if (((hweight16(qp->traffic_patterns.submask[0]) < 7) || + (hweight16(qp->traffic_patterns.submask[0]) > 9)) || + ((hweight16(qp->traffic_patterns.submask[1]) < 7) || + (hweight16(qp->traffic_patterns.submask[1]) > 9))) + *use_db = true; + + /* Flush writes before updating the sw pointer, + * This is necessary to ensure that the sqs do not see + * an incomplete entry. + * NB! Note that as opposed to software consuming + * queues this value should point to the last used entry, not the first + * unused: + */ + if (!*use_db || last) { + wmb(); + set_psif_sq_sw__tail_indx(&sq_sw->d, sq_seq); + } + + /* Finally write to collect buffer or ring doorbell if last */ + if (*use_db && last) + /* Write doorbell for first WR when we process the last request */ + sif_doorbell_from_sqe(qp, *first_seq, true); + else if (!*use_db) + if (sif_cb_write(qp, &wqe.wr, cb_len)) { + /*vcb lock busy, convert to db mode */ + if (last) + sif_doorbell_from_sqe(qp, sq_seq, true); + else { + *use_db = true; + *first_seq = sq_seq; + } + } + + return ret; +fail: + sif_log(sdev, SIF_SND, "Exit: Fail to post_send a WR"); + sif_logs(SIF_DUMP, write_struct_psif_wr(NULL, 0, &wqe.wr)); + + /* Avoid "using" the allocated entry */ + sq_sw->last_seq--; + return ret; +} /* end sif_post_send_single */ + + +static int get_gsi_qp_idx(struct sif_qp *qp) +{ + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + int pma_qp_idx = sdev->pma_qp_idxs[!!(qp->qp_idx & 2)]; + struct sif_qp *pma_qp = get_sif_qp(sdev, pma_qp_idx); + struct sif_rq_sw *rq_sw; + int gsi_qlen, pma_qlen; + + rq_sw = get_sif_rq_sw(sdev, qp->rq_idx); + gsi_qlen = atomic_read(&rq_sw->length); + rq_sw = get_sif_rq_sw(sdev, pma_qp->rq_idx); + pma_qlen = atomic_read(&rq_sw->length); + + return (gsi_qlen <= pma_qlen) ? qp->qp_idx : pma_qp->qp_idx; +} + + +int sif_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct sif_qp *qp = to_sqp(ibqp); + struct sif_rq *rq; + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + bool need_pma_pxy_qp = eps_version_ge(es, 0, 57) + && (qp->qp_idx == 1 || qp->qp_idx == 3); + + + sif_log(sdev, SIF_RCV, "Enter: wr_id 0x%llx qp_idx %d", + wr->wr_id, qp->qp_idx); + + if (need_pma_pxy_qp) { + qp = get_sif_qp(sdev, get_gsi_qp_idx(qp)); + sif_log(sdev, SIF_RCV, "Redirect wr_id 0x%llx to qp_idx %d", + wr->wr_id, qp->qp_idx); + } + + if (qp->last_set_state == IB_QPS_RESET) { + sif_log(sdev, SIF_INFO, "Invalid QP state (IB_QPS_RESET)"); + return -EINVAL; + } + + rq = get_sif_rq(sdev, qp->rq_idx); + + if (wr->num_sge > rq->sg_entries) { + sif_log(sdev, SIF_INFO, "qp only supports %d receive sg entries - wr has %d", + rq->sg_entries, wr->num_sge); + return -ENOMEM; + } + + return post_recv(sdev, qp, rq, wr, bad_wr); +} + + +/* Post a list of receives - can be called from any context */ +int post_recv(struct sif_dev *sdev, struct sif_qp *qp, struct sif_rq *rq, + struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr) +{ + struct sif_rq_sw *rq_sw = get_sif_rq_sw(sdev, rq->index); + int ret = 0; + u32 rq_len; + + unsigned long flags; + + if (unlikely(rq->user_mode)) { + sif_log(sdev, SIF_INFO, + "rq %d: Attempt to use kernel API to post to user mode receive queue", + rq->index); + return -EINVAL; + } + + if (!wr) + return ret; + + /* TBD: Revisit locking scheme again later + * to allow more parallelism. 
For now serialize to avoid + * having to handle "holes": + */ + spin_lock_irqsave(&rq->lock, flags); + + for (; wr; wr = wr->next) { + struct psif_rq_entry *rqe; + struct psif_rq_entry lrqe; + struct psif_rq_scatter *sge; + int i = 0; + int rqe_sz = 8 + wr->num_sge*sizeof(struct psif_rq_scatter); + int max_rqe_sz = 8 + rq->sg_entries*sizeof(struct psif_rq_scatter); + + rq_len = atomic_inc_return(&rq_sw->length); + if (rq_len > rq->entries) { + sif_log(sdev, SIF_INFO, "queue full - rq %d entries %d len %d", + rq->index, rq->entries, rq_len); + atomic_dec(&rq_sw->length); + ret = -ENOMEM; + goto err_post_recv; + } + if (wr->num_sge > rq->sg_entries) { + sif_log(sdev, SIF_INFO, "too many sges - rq %d sges configured %d, sges in wr %d", + rq->index, rq->sg_entries, wr->num_sge); + atomic_dec(&rq_sw->length); + ret = -EINVAL; + goto err_post_recv; + } + + rqe = get_rq_entry(rq, rq_sw->next_seq++); + + /* On the receive side we use the full wr_id directly */ + lrqe.rqe_id = wr->wr_id; + + sge = lrqe.scatter; + for (i = 0; i < wr->num_sge; i++) { + u32 lkey = wr->sg_list[i].lkey; + + sge[i].lkey = lkey; + sge[i].base_addr = wr->sg_list[i].addr + mr_uv2dma(sdev, lkey); + sge[i].length = wr->sg_list[i].length; + sif_log(sdev, SIF_RCV, + "sg_adr 0x%llx sg_len %d lkey %d", + wr->sg_list[i].addr, wr->sg_list[i].length, lkey); + } + + copy_conv_to_hw(rqe, &lrqe, rqe_sz); + + /* As per PRM, unused sges shall be zero, which is endian neutral */ + if (max_rqe_sz > rqe_sz) + memset(rqe->scatter + wr->num_sge, 0, max_rqe_sz - rqe_sz); + + sif_log(sdev, SIF_RCV, + " entries %u extent %u RQ %d next_seq %x length %d", + rq->entries, rq->extent, rq->index, + rq_sw->next_seq, atomic_read(&rq_sw->length)); + } + /* Enforce reordering of new rq entries and tail */ + wmb(); + set_psif_rq_sw__tail_indx(&rq_sw->d, rq_sw->next_seq); + /* Enforce visibility of rq tail on hw */ + smp_wmb(); + + sif_log(sdev, SIF_RCV, "Exit: success"); +err_post_recv: + spin_unlock_irqrestore(&rq->lock, flags); + *bad_wr = wr; + + /* WA #622, Check if QP in ERROR, flush RQ */ + if (!rq->is_srq && is_regular_qp(qp) && qp->last_set_state == IB_QPS_ERR) { + if (sif_flush_rq(sdev, rq, qp, atomic_read(&rq_sw->length))) + sif_log(sdev, SIF_INFO, "failed to flush RQ %d", rq->index); + } + + return ret; +} + +int sif_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_qp *qp = to_sqp(ibqp); + struct psif_epsc_csr_rsp rsp; + struct psif_epsc_csr_req req; + + sif_log(sdev, SIF_MC, "qp %d mc gid %llx.%llx lid 0x%x", + qp->qp_idx, gid->global.subnet_prefix, gid->global.interface_id, lid); + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_MC_ATTACH; + req.u.mc.qp = qp->qp_idx; + req.u.mc.port = qp->port; /* The EPS uses IB port space */ + /* union ib_gid contains BE gids and we do copy_convert later.. 
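+	 * (hence the be64_to_cpu conversions just below, keeping the gid
+	 * halves in CPU order within the request)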
*/ + req.u.mc.mgid_0 = be64_to_cpu(gid->global.subnet_prefix); + req.u.mc.mgid_1 = be64_to_cpu(gid->global.interface_id); + return sif_epsc_wr(sdev, &req, &rsp); +} + +int sif_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct sif_dev *sdev = to_sdev(ibqp->device); + struct sif_qp *qp = to_sqp(ibqp); + struct psif_epsc_csr_rsp rsp; + struct psif_epsc_csr_req req; + + sif_log(sdev, SIF_MC, "qp %d mc gid %llx.%llx lid 0x%x", + qp->qp_idx, gid->global.subnet_prefix, gid->global.interface_id, lid); + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_MC_DETACH; + req.u.mc.qp = qp->qp_idx; + req.u.mc.port = qp->port; /* The EPS uses IB port space */ + /* union ib_gid contains BE gids and we do copy_convert later.. */ + req.u.mc.mgid_0 = be64_to_cpu(gid->global.subnet_prefix); + req.u.mc.mgid_1 = be64_to_cpu(gid->global.interface_id); + return sif_epsc_wr(sdev, &req, &rsp); +} + + +/* Workaround to emulate extra send sg entries from software: + * We use the available inline space and copy the first fitting + * xsg = wr->num_sge - hw_max + 1 entries into this space: + */ +static int prep_sw_sg(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe, + struct psif_wr_local *la, u32 sqe_seq) +{ + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct psif_sq_entry *sqe = get_sq_entry(sq, sqe_seq); + void *sgl_start = sq_sgl_offset(sq, sqe); + struct psif_rq_scatter *sge = sq->tmp_sge; + int i; + int xsg = wr->num_sge - SIF_HW_MAX_SEND_SGE + 1; + int xi = -1; + int pi = 0; + u32 xcnt = 0; + u32 len = 0; + int ret; + u32 xlen = 0; + u64 addr = 0; + int space = qp->max_inline_data; + + la->addr = get_sqe_dma(sq, sqe_seq) + sq->sgl_offset; + la->lkey = sq->sg_mr->index; + + for (i = 0; i < wr->num_sge; i++) { + if (i == xsg) + space -= 256; /* We can no longer use the inline bytes */ + xlen += wr->sg_list[i].length; + sif_log(sdev, SIF_SND, "xsg %d, xlen 0x%x space 0x%x", xsg, xlen, space); + if (xcnt < xsg) { + xcnt++; + if (xcnt < xsg) + continue; + } + if (xlen <= space) { + xi = i - xsg + 1; + break; + } + xlen -= wr->sg_list[i - xsg].length; + } + if (xi < 0) { + /* If our worst case calculations are right, this should not happen.. */ + sif_log(sdev, SIF_INFO, "Failed to find sg entries to collapse into inline space!"); + return -ENOMEM; + } + if (xi == 0) { + ret = prep_inline_part(qp, wr, xsg, wqe, la, sqe_seq, false); + if (ret < 0) + return ret; + } else { + /* TBD: We can consider merging xsg + 1 entries into two + * sg entries, one containing the first entries, but for now + * keep it simple and just not use the first 256 bytes: + */ + u8 *dbuf = ((u8 *)sqe->payload); + int copy = 0; + + for (i = xi; i < xi + xsg; i++) { + u32 lkey = wr->sg_list[i].lkey; + + len = wr->sg_list[i].length; + addr = wr->sg_list[i].addr; + if (len > 0) { + struct psif_key *key = safe_get_key(sdev, lkey); + + if (!key || PSIF_DMA_KEY_INVALID == get_psif_key__lkey_state(key)) { + sif_log(sdev, SIF_INFO, + "Attempt to do inline copying from an invalid MR with lkey %d at addr 0x%llx", + wr->sg_list[i].lkey, addr); + return -EPERM; + } + } + + ret = copy_sg(qp, &dbuf[copy], addr, len); + if (ret < 0) + return ret; + copy += len; + } + } + + la->length = 0; + for (i = 0; i < wr->num_sge; i++) { + u32 lkey; + u32 offset = i ? 
256 : 0; + + if (i == xi) { + sge[pi].lkey = sq->sg_mr->index; + sge[pi].base_addr = + get_sqe_dma(sq, sqe_seq) + + offsetof(struct psif_sq_entry, payload) + offset; + sge[pi].length = xlen; + la->length += xlen; + i += xsg - 1; + sif_log(sdev, SIF_SND, + "sg_list[%d]: sge entry: dma addr 0x%llx, len = %d, lkey %d", + pi, sge[pi].base_addr, sge[pi].length, sge[pi].lkey); + pi++; + continue; + } + lkey = wr->sg_list[i].lkey; + sge[pi].base_addr = wr->sg_list[i].addr + + mr_uv2dma(sdev, lkey); + sge[pi].lkey = wr->sg_list[i].lkey; + sge[pi].length = wr->sg_list[i].length; + la->length += sge[pi].length; + sif_log(sdev, SIF_SND, + "sg_list[%d]: sge entry: dma addr 0x%llx, len = %d, lkey %d", + pi, sge[pi].base_addr, sge[pi].length, sge[pi].lkey); + pi++; + } + sif_log(sdev, SIF_SND, + "ready with sgl_start %p, sg list addr 0x%llx, message len %d, lkey %d, sge %p", + sgl_start, la->addr, la->length, la->lkey, sge); + + copy_conv_to_hw(sgl_start, sge, + sizeof(struct psif_rq_scatter) * SIF_HW_MAX_SEND_SGE); + wqe->wr.num_sgl = SIF_HW_MAX_SEND_SGE - 1; + return la->length; +} + + +static int prep_send(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe, + bool inlined, struct psif_wr_local *la, u32 sqe_seq) +{ + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + int ret = 0; + int num_sge; + int use_inline_first_sge = 0; + + if (inlined) + return prep_inline(qp, wr, wqe, la, sqe_seq, false); + + la->length = 0; + num_sge = wr->num_sge; + if (num_sge == 0) { + sif_log(sdev, SIF_SND, "no sge entries - local_addr left as 0"); + return 0; + } + if (!sif_feature(disable_inline_first_sge) && qp->ulp_type == RDS_ULP && num_sge == 2 + && wr->sg_list[0].length <= qp->max_inline_data) { + use_inline_first_sge = 1; + } + + if (use_inline_first_sge) { + int wr_len; + u32 lkey = wr->sg_list[0].lkey; + struct sif_mr *mr = safe_get_sif_mr(sdev, lkey); + int mem_type = mr ? 
mr->mem->mem_type : 0; + bool is_phys_addr = mem_type != SIFMT_UMEM; + + sif_log(sdev, SIF_SND, "qp_%d handle special case; " + "#sge == 2 && sg[0].len == 48 max_inline_data %d, mem_type %d", + qp->qp_idx, qp->max_inline_data, mem_type); + /* Copy first sge inline */ + if ((wr->sg_list[0].length + wr->sg_list[1].length) <= qp->max_inline_data) { + sif_log(sdev, SIF_SND, "qp_%d Inlining both %d + %d = %d", + qp->qp_idx, + wr->sg_list[0].length, + wr->sg_list[1].length, + (wr->sg_list[0].length + wr->sg_list[1].length)); + return prep_inline(qp, wr, wqe, la, sqe_seq, is_phys_addr); + } + wr_len = prep_inline_part(qp, wr, 1, wqe, la, sqe_seq, is_phys_addr); + if (wr_len < 0) + return wr_len; + lkey = wr->sg_list[1].lkey; + /* Subtract to get address "correct" for hw-usage */ + la->addr = wr->sg_list[1].addr + mr_uv2dma(sdev, lkey) - wr_len; + la->lkey = lkey; + la->length = wr_len + wr->sg_list[1].length; + num_sge = 1; + sif_log(sdev, SIF_SND, + "Changed to single sge user addr 0x%llx dma addr 0x%llx, message len %d, key %d collect_len %d wr_len %d", + wr->sg_list[1].addr, la->addr, la->length, lkey, wqe->wr.collect_length, wr_len); + } else if (num_sge == 1) { + /* Single entry S/G list result after inlining */ + u32 lkey = wr->sg_list[0].lkey; + + la->addr = wr->sg_list[0].addr + mr_uv2dma(sdev, lkey); + la->lkey = lkey; + la->length += wr->sg_list[0].length; + sif_log(sdev, SIF_SND, + "single sge user addr 0x%llx dma addr 0x%llx, message len %d, key %d", + wr->sg_list[0].addr, la->addr, la->length, lkey); + } else if (unlikely(wr->num_sge > SIF_HW_MAX_SEND_SGE)) { + return prep_sw_sg(qp, wr, wqe, la, sqe_seq); + } else { + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + struct psif_sq_entry *sqe = get_sq_entry(sq, sqe_seq); + void *sgl_start = sq_sgl_offset(sq, sqe); + struct psif_rq_scatter *sge = sq->tmp_sge; + int i; + + la->addr = get_sqe_dma(sq, sqe_seq) + sq->sgl_offset; + la->lkey = sq->sg_mr->index; + + for (i = 0; i < num_sge; i++) { + u32 lkey = wr->sg_list[i].lkey; + + sge[i].base_addr = wr->sg_list[i].addr + + mr_uv2dma(sdev, lkey); + sge[i].lkey = wr->sg_list[i].lkey; + sge[i].length = wr->sg_list[i].length; + la->length += sge[i].length; + sif_log(sdev, SIF_SND, + "sg_list[%d]: sge entry: dma addr 0x%llx, len = %d, lkey %d", + i, sge[i].base_addr, sge[i].length, sge[i].lkey); + } + sif_log(sdev, SIF_SND, + "ready with sgl_start %p, sg list addr 0x%llx, message len %d, lkey %d, sge %p", + sgl_start, la->addr, la->length, la->lkey, sge); + + copy_conv_to_hw(sgl_start, sge, + sizeof(struct psif_rq_scatter) * wr->num_sge); + ret = la->length; + } + /* 0 here means a single entry, but input 0 must also be 0 */ + wqe->wr.num_sgl = num_sge ? 
num_sge - 1 : 0; + return ret; +} +static int prep_send_lso(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe, + bool inlined, struct psif_wr_local *la, u32 sqe_seq) +{ + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + void *sgl_start; + int ret = 0; + int i; + u8 *p8; + struct sif_sq *sq; + struct psif_sq_entry *sqe; + struct psif_rq_scatter *sge; + const int stencil_sge = 1; + + sq = get_sif_sq(sdev, qp->qp_idx); + sqe = get_sq_entry(sq, sqe_seq); + sge = sq->tmp_sge; + sgl_start = sq_sgl_offset(sq, sqe); + + if (unlikely(wr->num_sge >= SIF_HW_MAX_SEND_SGE || wr->num_sge < 1)) { + sif_log(sdev, SIF_INFO, "attempt to post lso wr with %d/%d sg entries", + wr->num_sge, sq->sg_entries); + return -EINVAL; + } + + wqe->wr.details.send.ud.mss = wr->wr.ud.mss; + + la->addr = get_sqe_dma(sq, sqe_seq) + sq->sgl_offset; + la->lkey = sq->sg_mr->index; + la->length = 0; + + /* copy stencil to payload-area in send_queue */ + p8 = (u8 *)wr->wr.ud.header; + memcpy((u8 *)sqe->payload, p8, wr->wr.ud.hlen); + + sge[0].base_addr = get_sqe_dma(sq, sqe_seq) + + offsetof(struct psif_sq_entry, payload) + mr_uv2dma(sdev, la->lkey); + sge[0].lkey = sq->sg_mr->index; + sge[0].length = wr->wr.ud.hlen; + la->length += sge[0].length; + + sif_log(sdev, SIF_SND, + "sg_list[%d]: sge entry: dma addr 0x%llx, len = %d, lkey %d", + 0, sge[0].base_addr, sge[0].length, sge[0].lkey); + + for (i = 0; i < wr->num_sge; i++) { + u32 lkey = wr->sg_list[i].lkey; + + sge[i+1].base_addr = wr->sg_list[i].addr + mr_uv2dma(sdev, lkey); + sge[i+1].lkey = wr->sg_list[i].lkey; + sge[i+1].length = wr->sg_list[i].length; + la->length += sge[i+1].length; + sif_log(sdev, SIF_SND, + "sg_list[%d]: sge entry: dma addr 0x%llx, len = %d, lkey %d", + i+1, sge[i+1].base_addr, sge[i+1].length, sge[i+1].lkey); + } + copy_conv_to_hw(sgl_start, sge, + sizeof(struct psif_rq_scatter) * (wr->num_sge+1)); + + wmb(); + wqe->wr.num_sgl = wr->num_sge - 1 + stencil_sge; + sif_log(sdev, SIF_SND, + "num_sgl %d, sqe at %p la ->addr 0x%llx ->lkey %d ->length %d %d", wqe->wr.num_sgl, sqe, + la->addr, la->lkey, la->length, la->length-sge[0].length); + qp->ipoib_tx_lso_pkt++; + qp->ipoib_tx_lso_bytes += (la->length - sge[0].length); + return ret; +} + + +static int prep_remote_addr(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe) +{ + struct sif_ah *ah = NULL; + struct psif_ah *ah_p; + bool is_dr = false; + struct sif_dev *sdev = to_sdev(qp->ibqp.device); + + sif_log(sdev, SIF_SND, ""); + switch (qp->type) { + case PSIF_QP_TRANSPORT_UD: + if (!wr->wr.ud.ah) { + sif_log(sdev, SIF_INFO, "No ah supplied for ud packet"); + return -EINVAL; + } + ah = to_sah(wr->wr.ud.ah); + ah_p = get_ah(sdev, ah->index); + is_dr = get_psif_ah__remote_lid(ah_p) == 0xffff; + + /* Direct routed packets are destined for the SMA at uf 33. 
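+		 * A packet is treated as directed route when the AH's remote
+		 * LID is the permissive LID 0xffff, cf. the is_dr check above.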
+ * For all other packets this field is ignored by the hw: + */ + if (is_dr) + wqe->wr.destuf = 33; + wqe->wr.details.send.ud.remote_addr.ah_indx + = ah->index; + wqe->wr.details.send.ud.qp.qkey = wr->wr.ud.remote_qkey; + wqe->wr.details.send.ud.qp.remote_qp = wr->wr.ud.remote_qpn; + wqe->wr.ud_pkt = 1; + break; + case PSIF_QP_TRANSPORT_UC: + case PSIF_QP_TRANSPORT_RC: + break; + case PSIF_QP_TRANSPORT_XRC: + wqe->wr.xrc_hdr.xrqd_id = wr->xrc_remote_srq_num; + break; + default: + sif_log(sdev, SIF_INFO, + "unhandled transport type %s", string_enum_psif_qp_trans(qp->type)); + return -EINVAL; + } + wqe->wr.op = ib2sif_wr_op(wr->opcode, is_dr); + return 0; +} + + + +static void prep_atomic(struct sif_qp *qp, struct ib_send_wr *wr, struct psif_cb *wqe) +{ + struct psif_wr_local *la = &wqe->wr.details.atomic.local_addr; + struct psif_wr_remote *ra = &wqe->wr.details.atomic.remote_addr; + + la->addr = wr->sg_list[0].addr; + la->lkey = wr->sg_list[0].lkey; + la->length = sizeof(long); + + ra->addr = wr->wr.atomic.remote_addr; + ra->rkey = wr->wr.atomic.rkey; + ra->length = sizeof(long); + + /* Payload order as in IB header */ + if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + wqe->payload[0] = cpu_to_be64(wr->wr.atomic.swap); + wqe->payload[1] = cpu_to_be64(wr->wr.atomic.compare_add); + wqe->wr.collect_length = 16; + } else { + wqe->payload[0] = cpu_to_be64(wr->wr.atomic.compare_add); + wqe->wr.collect_length = 8; + } +} diff --git a/drivers/infiniband/hw/sif/sif_sndrcv.h b/drivers/infiniband/hw/sif/sif_sndrcv.h new file mode 100644 index 000000000000..af1a535b7871 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_sndrcv.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_sndrcv.h: Interface to IB send/receive, MAD packet recv and + * multicast send/recv + */ + +#ifndef __SIF_SNDRCV_H +#define __SIF_SNDRCV_H + +struct sif_rq; +struct sif_dev; + +int sif_post_send(struct ib_qp *ibqp, + struct ib_send_wr *wr, struct ib_send_wr **bad_wr); +int sif_post_recv(struct ib_qp *ibqp, + struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); + +int sif_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); +int sif_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int post_recv(struct sif_dev *sdev, struct sif_qp *qp, struct sif_rq *rq, + struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); + +/* Send a single wr */ +int sif_post_send_single(struct ib_qp *ibqp, struct ib_send_wr *wr, bool *use_db, bool last, u16 *first_seq); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_spt.c b/drivers/infiniband/hw/sif/sif_spt.c new file mode 100644 index 000000000000..a2faa4b0fca5 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_spt.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Vinay Shaw + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_spt.c: Experimental implementation of shared use of the OS's page tables. 
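+ * In this mode the hw MMU context is pointed directly at the process' own
+ * pgd/pud/pmd/pte entries (see sif_set_mmu_ctx below) rather than at page
+ * tables built by the driver.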
+ * Default is to use private page tables - shared page tables can be enabled using + * a vendor flag. This implementation assumes that physical addresses and DMA addresses + * are 1-1, which might not in general be the case if going through an IOMMU. + */ + +#include "sif_mmu.h" +#include "sif_dev.h" +#include "sif_base.h" +#include "sif_dma.h" +#include "sif_hwi.h" +#include "sif_spt.h" + +#include +#include +#include +#include + + +#define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE) +#define PUD_ALIGN(addr) ALIGN(addr, PUD_SIZE) +#define PGDIR_ALIGN(addr) ALIGN(addr, PGDIR_SIZE) + + +static void set_ctx_w_page(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + enum psif_table_level level, + enum psif_page_size pg_sz, u64 val) +{ + struct psif_mmu_cntx *hw_ctx = &ctx->mctx; + + hw_ctx->page_size = pg_sz; + hw_ctx->table_ptr = ((val) >> PAGE_SHIFT) & ~PSIF_TABLE_PTR_MASK; + hw_ctx->table_level = level; + sif_log(sdev, SIF_MMU, "pte 0x%08llx level %d", val, level); +} + + +static int sif_set_mmu_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *sctx, + struct sif_mem *mem, bool write); + +int sif_spt_map_gva_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + bool write) +{ + int ret; + + if (!(mem->mem_type == SIFMT_UMEM) || !mem->m.u.umem) { + sif_log(sdev, SIF_MMU, "Only implemented for user space mappings!"); + return -EINVAL; + } + + ret = sif_set_mmu_ctx(sdev, ctx, mem, write); + if (ret) + goto mmctx_failed; + return 0; + +mmctx_failed: + return ret; +} + + +static int sif_set_mmu_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, + struct sif_mem *mem, bool write) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + u64 start = ctx->base; + u64 len = ctx->size; + struct psif_mmu_cntx *pctx = &ctx->mctx; + int npgds, npuds, npmds, nptes; + int ret = 0; + + sif_log(sdev, SIF_MMU, "start 0x%llx len 0x%llx", start, len); + + if (len == 0) + goto err; + + pgd = pgd_offset(mem->m.u.umem->mm, start); + if (pgd_none(*pgd)) + goto err; + + ctx->pt = (void *)pgd; /* Misuse pt to save the pointer to avoid going via mm at dealloc time */ + ctx->mt = SIFMT_ZERO; + pud = pud_offset(pgd, start); + if (pud_none(*pud)) + goto err; + + pctx->wr_access = write; + pctx->translation_type = MMU_GVA2GPA_MODE; + + npgds = PGDIR_ALIGN(len + (start & ~PGDIR_MASK)) >> PGDIR_SHIFT; + npuds = PUD_ALIGN(len + (start & ~PUD_MASK)) >> PUD_SHIFT; + +#ifndef __aarch64__ + if (pud_large(*pud)) { + ptep = (pte_t *) pud; + pte = *ptep; + + if (!pte_present(pte)) { + sif_log(sdev, SIF_MMU, + "Page not present, bugging out.."); + BUG(); + goto err; + } + + if (npuds == 1) { + set_ctx_w_page(sdev, ctx, PAGE_LEVEL2, PAGE_SIZE_IA32E_1GB, + pte_val(pte)); + } else if (npgds == 1) + set_ctx_w_page(sdev, ctx, PAGE_LEVEL3, PAGE_SIZE_IA32E_1GB, + pgd_val(*pgd)); +#ifdef CONFIG_X86 + else + set_ctx_w_page(sdev, ctx, PAGE_LEVEL4, PAGE_SIZE_IA32E_1GB, + read_cr3()); +#endif + goto out; + } +#endif /* !__aarch64__ */ + + pmd = pmd_offset(pud, start); + if (pmd_none(*pmd)) + goto err; + + npmds = PMD_ALIGN(len + (start & ~PMD_MASK)) >> PMD_SHIFT; + +#ifndef __aarch64__ + if (pmd_large(*pmd)) { + ptep = (pte_t *) pmd; + pte = *ptep; + + if (!pte_present(pte)) { + sif_log(sdev, SIF_MMU, + "Page not present, bugging out.."); + BUG(); + goto err; + } + + if (npmds == 1) { + set_ctx_w_page(sdev, ctx, PAGE_LEVEL1, PAGE_SIZE_IA32E_2MB, + pte_val(pte)); + } else if (npuds == 1) + set_ctx_w_page(sdev, ctx, PAGE_LEVEL2, PAGE_SIZE_IA32E_2MB, + pud_val(*pud)); + else if (npgds == 1) + set_ctx_w_page(sdev, ctx, 
PAGE_LEVEL3, PAGE_SIZE_IA32E_2MB, + pgd_val(*pgd)); +#ifdef CONFIG_X86 + else + set_ctx_w_page(sdev, ctx, PAGE_LEVEL4, PAGE_SIZE_IA32E_2MB, + read_cr3()); +#endif + goto out; + } +#endif /* !__aarch64__ */ + + ptep = pte_offset_map(pmd, start); + pte = *ptep; + if (!pte_present(pte)) { + sif_log(sdev, SIF_MMU, "Page not present, bugging out.."); + BUG(); + goto err; + } + + nptes = PAGE_ALIGN(len + (start & ~PAGE_MASK)) >> PAGE_SHIFT; + if (nptes == 1) { + set_ctx_w_page(sdev, ctx, PAGE_LEVEL0, PAGE_SIZE_IA32E_4KB, pte_val(pte)); + } else if (npmds == 1) { + set_ctx_w_page(sdev, ctx, PAGE_LEVEL1, PAGE_SIZE_IA32E_4KB, pmd_val(*pmd)); + } else if (npuds == 1) { + set_ctx_w_page(sdev, ctx, PAGE_LEVEL2, PAGE_SIZE_IA32E_4KB, pud_val(*pud)); + } else if (npgds == 1) { + set_ctx_w_page(sdev, ctx, PAGE_LEVEL3, PAGE_SIZE_IA32E_4KB, pgd_val(*pgd)); +#ifdef CONFIG_X86 + } else { + set_ctx_w_page(sdev, ctx, PAGE_LEVEL4, PAGE_SIZE_IA32E_4KB, read_cr3()); +#endif + } + goto out; +err: + sif_log(sdev, SIF_MMU, "Error in setting mmu context"); + ret = -1; +out: + return ret; +} + +void sif_spt_unmap_gva_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *sctx) +{ + u64 start = sctx->base; + u64 len = sctx->size; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + + int npgds, npuds, npmds, nptes; + + sif_log(sdev, SIF_MMU, "start 0x%llx len 0x%llx", start, len); + + if (len == 0) + goto err; + + pgd = (pgd_t *)sctx->pt; + if (pgd_none(*pgd)) + goto err; + + if (pgd_none(*pgd)) { + sif_log(sdev, SIF_MMU, "Table entry(pgd) already freed"); + goto out; + } + + pud = pud_offset(pgd, start); + if (pud_none(*pud)) { + sif_log(sdev, SIF_MMU, "Table entry(pud) already freed"); + goto out; + } + + npgds = PGDIR_ALIGN(len + (start & ~PGDIR_MASK)) >> PGDIR_SHIFT; + npuds = PUD_ALIGN(len + (start & ~PUD_MASK)) >> PUD_SHIFT; + +#ifndef __aarch64__ + if (pud_large(*pud)) { + ptep = (pte_t *) pud; + pte = *ptep; + + if (!pte_present(pte)) { + sif_log(sdev, SIF_MMU, + "Page not present, bugging out.."); + BUG(); + goto err; + } + goto out; + } +#endif /* !__aarch64__ */ + + pmd = pmd_offset(pud, start); + if (pmd_none(*pmd)) { + sif_log(sdev, SIF_MMU, "Table entry(pmd) already freed"); + goto out; + } + + npmds = PMD_ALIGN(len + (start & ~PMD_MASK)) >> PMD_SHIFT; + +#ifndef __aarch64__ + if (pmd_large(*pmd)) { + ptep = (pte_t *) pmd; + pte = *ptep; + + if (!pte_present(pte)) { + sif_log(sdev, SIF_MMU, + "Page not present, bugging out.."); + BUG(); + goto err; + } + goto out; + } +#endif /* !__aarch64__ */ + + ptep = pte_offset_map(pmd, start); + pte = *ptep; + if (!pte_present(pte)) { + sif_log(sdev, SIF_MMU, "Page not present, bugging out.."); + BUG(); + goto err; + } + + nptes = PAGE_ALIGN(len + (start & ~PAGE_MASK)) >> PAGE_SHIFT; + + goto out; +err: + sif_log(sdev, SIF_MMU, "Error releasing mmu context"); +out: + return; +} + diff --git a/drivers/infiniband/hw/sif/sif_spt.h b/drivers/infiniband/hw/sif/sif_spt.h new file mode 100644 index 000000000000..b66ef57de32e --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_spt.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_spt.h: Experimental (still unsafe) + * implementation of direct use of the operating system's + * page tables (shared page tables) + */ + +#ifndef _SIF_SPT_H +#define _SIF_SPT_H + +struct sif_dev; +struct sif_mmu_ctx; + + +#define PSIF_TABLE_PTR_SHIFT 52 +#define PSIF_TABLE_PTR_SIZE (_AC(1, UL) << PSIF_TABLE_PTR_SHIFT) +#define PSIF_TABLE_PTR_MASK (~(PSIF_TABLE_PTR_SIZE-1)) + +int sif_spt_map_gva_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + bool write); + +void sif_spt_unmap_gva_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_sq.c b/drivers/infiniband/hw/sif/sif_sq.c new file mode 100644 index 000000000000..2d5bcd26e532 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_sq.c @@ -0,0 +1,518 @@ +/* + * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_sq.c: Implementation of the send queue side of an IB queue pair + */ + +#include +#include "sif_dev.h" +#include "sif_base.h" +#include "sif_defs.h" +#include "sif_dma.h" +#include "sif_mmu.h" +#include "sif_pt.h" +#include "sif_mr.h" +#include "sif_sq.h" +#include "sif_hwi.h" +#include "psif_hw_setget.h" +#include +#include + +/* Figure out the minimal space needed in each send queue element + * given the input sizes. + * + * We also use this space to collapse sg entries if we need to emulate more + * sg entries in software than what hardware supports. + * + * TBD: Note that the SQS sometimes checksums more data + * (up to 256 bytes depending on max_inline??) which we then cannot use + * as sg list data area. + * Note also that no sgl is needed in PSIF for the single sg entry case: + */ + +static u32 compute_sq_extent(u32 sge_entries, u32 max_inline_data, + u32 *sgl_offset, u32 *min_extent_p, + u32 *sgl_size_p, u32 *max_inline_p) +{ + u32 hw_sge_entries = min_t(u32, SIF_HW_MAX_SEND_SGE, sge_entries); + u32 sgl_size = sge_entries > 1 ? hw_sge_entries * sizeof(struct psif_wr_local) : 0; + u32 xsge = sge_entries - hw_sge_entries; + + /* This amount must be reserved for 0-padded inline data due to + * restrictions in the SQS: + */ + u32 sqs_headroom = min(256U, ((max_inline_data + 63U) & ~63U)); + u32 sqs_inline_extra = + max_inline_data > sqs_headroom ? max_inline_data - sqs_headroom : 0; + + /* This applies to UD only, with max 4K message size: + * Set aside room for inlining of @xsge sg entries. + * Average size of an sge entry will be max 256 bytes, add an extra + * 256 to handle the case where we cannot use the initial inline space: + */ + u32 xsge_space = !xsge ? 
0 : (xsge + 2) * 256; + + u32 min_extent = sizeof(struct psif_wr) + + sqs_headroom + + max(max(sqs_inline_extra, sgl_size), xsge_space); + + u32 real_extent = roundup_pow_of_two(min_extent); + + if (sgl_offset) + *sgl_offset = real_extent - sgl_size; + if (sgl_size_p) + *sgl_size_p = sgl_size; + if (min_extent_p) + *min_extent_p = min_extent; + if (max_inline_p) + *max_inline_p = max_t(int, xsge_space - sqs_headroom, sqs_inline_extra); + return real_extent; +} + + +int sif_alloc_sq(struct sif_dev *sdev, struct sif_pd *pd, + struct sif_qp *qp, struct ib_qp_cap *cap, + bool user_mode, int wr_hdl_sz) +{ + /* Send queues always uses same indexes as the corresponding qp */ + int ret = 0; + int extent_log2; + struct sif_sq *sq; + struct sif_sq_sw *sq_sw; + struct psif_sq_hw *sq_hw_p; + struct psif_sq_rspq *sq_rspq_p; + struct psif_sq_sw lsq_sw; + struct psif_sq_hw lsq_hw; + struct psif_sq_entry sqe; + + u32 min_entries = cap->max_send_wr; + u32 max_entries; + u32 entries_log2; + u32 min_extent; + u32 sgl_size; + u32 max_inline; + u64 alloc_sz; + dma_addr_t dma_start; + bool need_page_aligned; + bool need_wa_4049 = PSIF_REVISION(sdev) <= 3; + + + max_entries = roundup_pow_of_two(max(2U, min_entries)); + entries_log2 = order_base_2(max_entries); + + if (entries_log2 > SIF_SW_MAX_SQE_LOG2) { + sif_log(sdev, SIF_INFO, + "requested %d entries -> %d but sif only supports %d", + cap->max_send_wr, max_entries, SIF_SW_MAX_SQE); + return -ENFILE; /* Limited by 4 bit size_log2 field in sq desc */ + } + + sq = get_sif_sq(sdev, qp->qp_idx); + sq_sw = get_sif_sq_sw(sdev, qp->qp_idx); + sq->index = qp->qp_idx; + sq->wr_hdl_sz = wr_hdl_sz; + + /* Due to IB standard requirements for ssn = 1 on the first packet + * on a QP and that psif now uses send queue sequence number == ssn + * we must initialize so the first packet is sent on index 1. + * Also the send queue in psif uses last_seq == last used seq instead of + * next_seq == next seq to use.. + * NB! This applies only to the send queue - we start at index 0 on all the others! + */ + sq_sw->last_seq = sq_sw->head_seq = 0; + + sq_hw_p = get_sq_hw(sdev, qp->qp_idx); + + sq->entries = max_entries; + sq->mask = max_entries - 1; + sq->sg_entries = need_wa_4049 ? roundup_pow_of_two(cap->max_send_sge) : cap->max_send_sge; + + sq->extent = compute_sq_extent(sq->sg_entries, cap->max_inline_data, + &sq->sgl_offset, &min_extent, &sgl_size, &max_inline); + + qp->max_inline_data = cap->max_inline_data; + if (sq->extent > min_extent) { + int extra_extent = sq->extent - min_extent; + + if (sq->sg_entries > SIF_HW_MAX_SEND_SGE) { + qp->max_inline_data = max_inline + extra_extent; + } else if (cap->max_inline_data >= 256) { + sif_log(sdev, SIF_QP, "QP %d has room for %d bytes of extra inline space", + qp->qp_idx, extra_extent); + qp->max_inline_data += extra_extent; + } + } + + extent_log2 = order_base_2(sq->extent); + alloc_sz = max_entries * sq->extent; + + /* Only whole pages must be exposed to user space. 
+ * For simplicity we impose the same for reliable QPs as their SQs + * have to be page aligned to ensure proper access from SQ_CMPL: + */ + need_page_aligned = user_mode || reliable_qp(qp->type); + + if (need_page_aligned && (alloc_sz & ~PAGE_MASK)) + alloc_sz = (alloc_sz + ~PAGE_MASK) & PAGE_MASK; + sq->user_mode = user_mode; + + if (alloc_sz <= SIF_MAX_CONT) + sq->mem = sif_mem_create_dmacont(sdev, alloc_sz, GFP_KERNEL, DMA_BIDIRECTIONAL); + else { + alloc_sz = (alloc_sz + ~PMD_MASK) & PMD_MASK; + sq->mem = sif_mem_create(sdev, alloc_sz >> PMD_SHIFT, + alloc_sz, SIFMT_2M, GFP_KERNEL | __GFP_ZERO, + DMA_BIDIRECTIONAL); + } + if (!sq->mem) { + sif_log(sdev, SIF_INFO, "Failed to allocate %llu bytes of SQ buffer pool", + alloc_sz); + ret = -ENOMEM; + goto err_alloc_dma; + } + + dma_start = sif_mem_dma(sq->mem, 0); + + sif_log(sdev, SIF_QP, "SQ dma %pad va 0x%p, sz %d, min_extent %d -> extent %d", + &dma_start, sif_mem_kaddr(sq->mem, 0), sq->entries, min_extent, sq->extent); + sif_log(sdev, SIF_SQ, "SQ wr sz %ld, sgl_offset/sz %d/%d, max_inline %d, max sge %d", + sizeof(sqe.wr), sq->sgl_offset, sgl_size, + qp->max_inline_data, sq->sg_entries); + + sq->wr_hdl = kzalloc(max_entries * sq->wr_hdl_sz, GFP_KERNEL); + if (!sq->wr_hdl) { + sif_log(sdev, SIF_INFO, "Failed to allocate wr_hdl table!"); + ret = -ENOMEM; + goto err_alloc_wrid; + } + + if (qp->type != PSIF_QP_TRANSPORT_MANSP1 && (qp->max_inline_data || sgl_size)) { + /* Allocate a DMA validation entry to be used for sif to access + * s/g lists, which we put in the spare space between entries + * in the send queue. This MR is also used by the SQS to access + * inline data. + */ + sq->sg_mr = alloc_mr(sdev, pd, sq->mem, dma_start, 0); + if (IS_ERR(sq->sg_mr)) { + ret = PTR_ERR(sq->sg_mr); + sif_log(sdev, SIF_INFO, "Failed to allocate lkey for s/g list (%d)", + ret); + goto err_alloc_sg_mr; + } + } + + /* Initialize hw part of descriptor */ + memset(&lsq_hw, 0, sizeof(lsq_hw)); + + lsq_hw.size_log2 = entries_log2; + lsq_hw.extent_log2 = extent_log2; + /* TBD: mmu_context */ + + /* See comment above */ + lsq_hw.last_seq = 0; + lsq_hw.base_addr = dma_start; + lsq_hw.sq_max_inline = min(256U, qp->max_inline_data); + lsq_hw.sq_max_sge = sq->sg_entries - 1; + + /* These are needed for sq mode to work */ + lsq_hw.sq_next.next_qp_num = 0xffffff; + lsq_hw.sq_next.next_null = 0xff; + + /* Allocate mmu context for the send queue - only read access needed + * for the queue itself: + */ + ret = sif_map_ctx(sdev, &sq->mmu_ctx, sq->mem, lsq_hw.base_addr, + alloc_sz, false); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to set mmu context for sq %d", + sq->index); + goto err_map_ctx; + } + + + lsq_hw.mmu_cntx = sq->mmu_ctx.mctx; + + /* Write network byte order copy */ + copy_conv_to_hw(sq_hw_p, &lsq_hw, sizeof(lsq_hw)); + + /* Initialize sw part of descriptor */ + memset(&lsq_sw, 0, sizeof(lsq_sw)); + + copy_conv_to_hw(&sq_sw->d, &lsq_sw, sizeof(lsq_sw)); + + spin_lock_init(&sq->lock); + + sq_rspq_p = get_sq_rspq(sdev, qp->qp_idx); + + /* We need to set the (network byte order) + * fields next_qp_num and rspq_next to all 1's (see bug 3479) + * TBD: This needs to be properly set up in psifapi + */ + sq_rspq_p->something_tbd[0] = (u64)-1; + return 0; + + sif_unmap_ctx(sdev, &sq->mmu_ctx); +err_map_ctx: + if (sq->sg_mr) + dealloc_mr(sdev, sq->sg_mr); +err_alloc_sg_mr: + kfree(sq->wr_hdl); +err_alloc_wrid: + sif_mem_free(sq->mem); +err_alloc_dma: + return ret; +} + + +int sif_flush_sqs(struct sif_dev *sdev, struct sif_sq *sq) +{ + ulong start_time = jiffies; 
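+	/* The flush below stops the SQS with a doorbell write and then polls
+	 * sq_next until hw sets the destroyed bit and sq_next reads 0xffffffff.
+	 * The deadline computed below is re-armed whenever sq_next makes
+	 * progress, so only a genuinely stuck queue ends up in -ETIMEDOUT.
+	 */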
+ ulong timeout = start_time + sdev->min_resp_ticks * 2; + struct sif_qp *qp = get_sif_qp(sdev, sq->index); + bool sqs_idle = false; + u32 sq_next; + u32 prev_sq_next; + struct psif_wr wr; + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, sq->index); + + if (qp->ibqp.xrcd) /* XRC target QPs dont have any valid sqs setup */ + return 0; + + memset(&wr, 0, sizeof(struct psif_wr)); + wr.local_qp = sq->index; + + /* Trigger a stop of SQS (rev2 feature) */ + sif_doorbell_write(qp, &wr, false); + + prev_sq_next = sq_next = get_psif_sq_hw__sq_next(&sq->d); + + sif_log(sdev, SIF_SQ, "Entering sq_hw poll for sq %d: last_seq %d head_seq %d sq_next %x", + sq->index, sq_sw->last_seq, sq_sw->head_seq, sq_next); + for (;;) { + if (!sqs_idle) { + sqs_idle = get_psif_sq_hw__destroyed(&sq->d); + if (sqs_idle) { + rmb(); /* Make sure we observe sq_next after the + * destroyed bit has been set + */ + sq_next = get_psif_sq_hw__sq_next(&sq->d); + } + } + if (sqs_idle && sq_next == 0xffffffff) + break; + if (sq_next != prev_sq_next) { + /* Reset timeout */ + timeout = jiffies + sdev->min_resp_ticks * 2; + sif_log(sdev, SIF_INFO_V, "sq %d: sq_next moved from %d -> %d", + sq->index, prev_sq_next, sq_next); + } else if (time_is_before_jiffies(timeout)) { + if (sif_feature(pcie_trigger)) + force_pcie_link_retrain(sdev); + sif_log(sdev, SIF_INFO, + "Error: sq %d timed out - waited %d ms for SQ flush. Idle:%d sq_next:%x", + sq->index, jiffies_to_msecs(jiffies - start_time), sqs_idle, sq_next); + return -ETIMEDOUT; + } + /* TBD: No sleep necessary as this should be really quick (?) */ + cpu_relax(); + prev_sq_next = sq_next; + sq_next = get_psif_sq_hw__sq_next(&sq->d); + } + + sif_log(sdev, SIF_SQ, " sq %d: done waiting for SQS to finish", sq->index); + return 0; +} + + +void sif_free_sq(struct sif_dev *sdev, struct sif_qp *qp) +{ + struct sif_sq *sq; + volatile struct psif_sq_hw *sq_hw_p; + volatile struct psif_sq_sw *sq_sw_p; + + int index = qp->qp_idx; + + sq = get_sif_sq(sdev, index); + sif_log(sdev, SIF_SQ, "idx %d", sq->index); + + sq_sw_p = get_sq_sw(sdev, index); + sq_hw_p = &sq->d; + + if (reliable_qp(qp->type) && qp->sq_cmpl_map_valid) + sif_sq_cmpl_unmap_sq(sdev, sq); + + sif_unmap_ctx(sdev, &sq->mmu_ctx); + + /* We clear the whole sq field including sq_hw below */ + sif_clear_sq_sw(sdev, index); + + if (sq->sg_mr) + dealloc_mr(sdev, sq->sg_mr); + + sif_mem_free(sq->mem); + kfree(sq->wr_hdl); + memset(sq, 0, sizeof(struct sif_sq)); +} + + +/* Setup of the root node(s) of a page table mapping all + * active send queues: + */ +int sif_sq_cmpl_setup(struct sif_table *tp) +{ + u32 max_sq_extent = compute_sq_extent(16, sif_max_inline, + NULL, NULL, NULL, NULL); + struct sif_dev *sdev = tp->sdev; + + tp->ext_sz = SIF_SW_MAX_SQE * max_sq_extent; /* Largest possible send queue */ + tp->table_sz = (size_t)tp->ext_sz * tp->entry_cnt; + tp->sif_base = SIF_SQ_CMPL_START; + tp->mem = sif_mem_create_ref(sdev, SIFMT_CS, tp->sif_base, tp->table_sz, + GFP_KERNEL); + + sif_log(sdev, SIF_SQ, "ext.sz %d entry cnt %d max sq extent 0x%x tbl.sz 0x%lx", + tp->ext_sz, tp->entry_cnt, max_sq_extent, tp->table_sz); + return 0; +} + + +/* Map/unmap the page table of a send queue in the sq_cmpl mapping + * The way to map it depends on the map type of the send queue itself: + */ +int sif_sq_cmpl_map_sq(struct sif_dev *sdev, struct sif_sq *sq) +{ + struct sif_table *sctp = &sdev->ba[sq_cmpl]; + + /* Start offset of this send queue in the large virtual sq_cmpl mapping: */ + u64 virt_base = sctp->mmu_ctx.base + (u64)sq->index * sctp->ext_sz; + 
u64 size = sq->mem->size; + + return sif_map_ctx_part(sdev, &sctp->mmu_ctx, sq->mem, virt_base, size); +} + + +int sif_sq_cmpl_unmap_sq(struct sif_dev *sdev, struct sif_sq *sq) +{ + struct sif_table *sctp = &sdev->ba[sq_cmpl]; + + /* Start offset of this send queue in the large virtual sq_cmpl mapping: */ + u64 virt_base = sctp->mmu_ctx.base + (u64)sq->index * sctp->ext_sz; + u64 size = sq->mem->size; + + sif_log(sdev, SIF_SQ, "sq %d, virt_base 0x%llx size 0x%llx", sq->index, virt_base, size); + return sif_unmap_gva_ctx_part(sdev, &sctp->mmu_ctx, virt_base, size); +} + + +void sif_dfs_print_sq_hw(struct seq_file *s, struct sif_dev *sdev, loff_t pos) +{ + struct sif_sq *sq; + int qlen; + u32 head, tail; + struct psif_sq_hw lhw; + struct sif_sq_sw *sq_sw; + struct sif_qp *qp; + int tsv; + + if (unlikely(pos < 0)) { + seq_puts(s, "# N = next_null, T = sq_timestamp_valid, D = sq_done, X = destroyed\n"); + seq_puts(s, "# [----------------------- sw view ----------------------] [----------- hw view ------------]\n"); + seq_puts(s, "# Index cq_idx head tail q_sz q_len q_high max_sge inline head tail n.qp N T D X\n"); + return; + } + sq = get_sif_sq(sdev, pos); + sq_sw = get_sif_sq_sw(sdev, pos); + qp = get_sif_qp(sdev, pos); + + /* Check for QP0/1 which is reserved but not initialized */ + if (sq->entries == 0) + return; + + head = sq_sw->head_seq; + tail = sq_sw->last_seq; + qlen = sq_length(sq, head, tail); + + copy_conv_to_sw(&lhw, &sq->d, sizeof(lhw)); + tsv = lhw.sq_timestamp_valid; + + seq_printf(s, "%7lld %7d %8d %8d %8d %9d %9d %7d %6d %8d%8d %06x %2x %d %d %d\n", + pos, + sq->cq_idx, head, tail, sq->entries, qlen, sq->max_outstanding, + sq->sg_entries, qp->max_inline_data, + get_psif_sq_sw__tail_indx(&sq_sw->d), lhw.last_seq, + lhw.sq_next.next_qp_num, lhw.sq_next.next_null, + tsv, lhw.sq_done, lhw.destroyed); +} + + +void sif_dfs_print_sq_cmpl(struct seq_file *s, struct sif_dev *sdev, loff_t pos) +{ + struct sif_sq *sq; + struct sif_qp *qp; + struct sif_table *sctp = &sdev->ba[sq_cmpl]; + u64 virt_base; + dma_addr_t val; + u64 pte_cnt, i; + dma_addr_t dma_start; + struct sif_mmu_ctx *ctx = &sctp->mmu_ctx; + + if (unlikely(pos < 0)) { + u64 table_ptr = sif_pt_dma_root(ctx->pt); + + seq_printf(s, "# - mmu_cntx: root %016llx, level %d\n", + table_ptr, sctp->mmu_ctx.mctx.table_level); + seq_puts(s, "# Index psif vaddr #pages @pte[0] pte[0..]\n"); + return; + } + sq = get_sif_sq(sdev, pos); + qp = get_sif_qp(sdev, pos); + virt_base = sctp->mmu_ctx.base + (u64)sq->index * sctp->ext_sz; + + /* Check for QP0/1 which is reserved but not initialized */ + if (sq->entries == 0) + return; + + /* Only QPs with multipacket support is mapped here; */ + if (!reliable_qp(qp->type)) + return; + + if (sif_pt_entry(ctx->pt, virt_base, &dma_start, &val)) + return; + + pte_cnt = 1; /* TBD: read the correct value to report all pages the pt refers to */ + seq_printf(s, " %6lld %016llx %6lld @%pad: [", pos, virt_base, pte_cnt, &dma_start); + for (i = 0; i < pte_cnt; i++) { + if (i > 0) + seq_puts(s, ","); + seq_printf(s, "%pad", &val); + } + seq_puts(s, "]\n"); +} + + +bool multipacket_qp(enum psif_qp_trans type) +{ + switch (type) { + case PSIF_QP_TRANSPORT_RC: + case PSIF_QP_TRANSPORT_UC: + case PSIF_QP_TRANSPORT_XRC: + return true; + default: + return false; + } +} + + +bool reliable_qp(enum psif_qp_trans type) +{ + return + type == PSIF_QP_TRANSPORT_RC || + type == PSIF_QP_TRANSPORT_XRC; +} diff --git a/drivers/infiniband/hw/sif/sif_sq.h b/drivers/infiniband/hw/sif/sif_sq.h new file mode 100644 index 
000000000000..6d0cc306f4c3 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_sq.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_sq.h: Implementation of the send queue side of an IB queue pair + */ + +#ifndef __SIF_SQ_H +#define __SIF_SQ_H + +struct sif_sq_hdl { + u64 wr_id; /* Stored work id */ + u32 sq_seq; /* Extra sanity checks */ + bool used; +}; + + +struct sif_sq { + volatile struct psif_sq_hw d; /* Hardware descriptor */ + /* Serializes access to sq_sw->last_seq (alloc of new sqes): */ + spinlock_t lock ____cacheline_internodealigned_in_smp; + struct sif_mmu_ctx mmu_ctx; + int index; /* Send queue index (same as the qp index) */ + int cq_idx; /* Default send compl.queue index to use */ + u32 sg_entries; /* Max send scatter/gather configured for this sq */ + u16 entries; + u16 mask; /* entries - 1 for modulo using & */ + u16 max_outstanding; /* Longest observed send queue len */ + u8 complete_all; /* Gets or'ed into completion bit in WRs */ + u32 extent; + u32 sgl_offset; /* Offset from start of the sqe where the sgl starts */ + bool user_mode; /* Set if this is an SQ to be mapped to user space */ + struct sif_mem *mem; /* Allocated queue memory */ + void *wr_hdl; /* map from sq entry index to wr_id + optional bookkeeping */ + int wr_hdl_sz; /* Sz of each elem. in wr_hdl - PQP and std send path uses different sizes */ + struct sif_mr *sg_mr; /* DMA val.entry for the sge list when in the send queue */ + struct psif_rq_scatter tmp_sge[16]; /* Temp.storage for buildup of LE sge list */ +}; + + +/* Lookup function for the handle for a particular request: */ +static inline struct sif_sq_hdl *get_sq_hdl(struct sif_sq *sq, u32 seq) +{ + return (struct sif_sq_hdl *)(sq->wr_hdl + sq->wr_hdl_sz * (seq & sq->mask)); +} + +int sif_sq_cmpl_setup(struct sif_table *tp); + +int sif_alloc_sq(struct sif_dev *sdev, struct sif_pd *pd, + struct sif_qp *qp, struct ib_qp_cap *cap, + bool user_mode, int sq_hdl_sz); + +void sif_free_sq(struct sif_dev *sdev, struct sif_qp *qp); + +int sif_flush_sqs(struct sif_dev *sdev, struct sif_sq *sq); + +int sif_sq_cmpl_map_sq(struct sif_dev *sdev, struct sif_sq *sq); +int sif_sq_cmpl_unmap_sq(struct sif_dev *sdev, struct sif_sq *sq); + +/* Line printers for debugfs files */ +void sif_dfs_print_sq_hw(struct seq_file *s, struct sif_dev *sdev, loff_t pos); +void sif_dfs_print_sq_cmpl(struct seq_file *s, struct sif_dev *sdev, loff_t pos); + +bool multipacket_qp(enum psif_qp_trans type); +bool reliable_qp(enum psif_qp_trans type); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_srq.c b/drivers/infiniband/hw/sif/sif_srq.c new file mode 100644 index 000000000000..db40c54829d7 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_srq.c @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_srq.c: Interface to shared receive queues for SIF + */ + +#include +#include "sif_dev.h" +#include "sif_qp.h" +#include "sif_srq.h" +#include "sif_base.h" +#include "sif_defs.h" +#include "sif_sndrcv.h" + +struct ib_srq *sif_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata) +{ + int rq_idx; + struct sif_dev *sdev = to_sdev(ibpd->device); + struct sif_rq *rq; + ulong user_flags = 0; + int ret = 0; + bool user_mode = udata != NULL; + + if (sif_feature(disable_srq)) + return ERR_PTR(-EOPNOTSUPP); + + if (udata) { + struct sif_create_srq_ext cmd; + + ret = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + if (ret) + goto err_create_srq; + user_flags = cmd.flags; + + if (sif_vendor_enable(SVF_kernel_mode, user_flags)) + user_mode = false; + } + + sif_log(sdev, SIF_SRQ, "%s", (user_mode ? "(user)" : "(kernel)")); + + rq_idx = alloc_rq(sdev, to_spd(ibpd), srq_init_attr->attr.max_wr, + srq_init_attr->attr.max_sge, srq_init_attr, user_mode); + if (rq_idx < 0) { + ret = rq_idx; + goto err_create_srq; + } + + rq = get_sif_rq(sdev, rq_idx); + + if (udata) { + struct sif_create_srq_resp_ext resp; + + memset(&resp, 0, sizeof(resp)); + resp.index = rq_idx; + resp.extent = rq->extent; + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) + goto err_udata; + } + + srq_init_attr->attr.max_wr = rq->entries_user; + + return &rq->ibsrq; +err_udata: + free_rq(sdev, rq->index); +err_create_srq: + return ERR_PTR(ret); +} + +#define ARM_SRQ_HOLDOFF (10 + jiffies) + +static int sif_arm_srq(struct sif_dev *sdev, struct sif_rq *srq, u32 srq_limit) +{ + int ret; + struct psif_wr wr; + struct psif_cq_entry *cqe; + DECLARE_SIF_CQE_POLL_WITH_RR_PQP(sdev, lcqe); + struct sif_pqp *pqp = lcqe.pqp; + + if (unlikely(!pqp)) + return -EAGAIN; + + memset(&wr, 0, sizeof(struct psif_wr)); + + wr.completion = 1; + wr.op = PSIF_WR_SET_SRQ_LIM; + wr.details.su.srq_lim = srq_limit; + wr.details.su.u2.rq_id = srq->index; + +try_again: + if (time_is_after_jiffies((unsigned long)atomic64_read(&pqp->qp->arm_srq_holdoff_time))) { + cpu_relax(); + goto try_again; + } + + atomic64_set(&pqp->qp->arm_srq_holdoff_time, ARM_SRQ_HOLDOFF); + pqp->qp->srq_idx = srq->index; + + ret = sif_pqp_poll_wr(sdev, &wr, &lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "pqp request failed with errno %d", ret); + return ret; + } + + cqe = &lcqe.cqe; + if (cqe->status != PSIF_WC_STATUS_SUCCESS) { + sif_log(sdev, SIF_INFO, "failed with status %s(%d) for cq_seq %d", + string_enum_psif_wc_status(cqe->status), cqe->status, cqe->seq_num); + return -EIO; + } + + srq->srq_limit = srq_limit; + + return 0; +} + +int sif_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct sif_dev *sdev = to_sdev(ibsrq->device); + struct sif_rq *srq = to_srq(ibsrq); + u16 srq_limit; + int ret; + + if (attr_mask & IB_SRQ_MAX_WR) { + sif_log(sdev, SIF_SRQ, "SRQ_MAX_WR not supported"); + return -EINVAL; + } + + if (attr_mask & IB_SRQ_LIMIT) { + srq_limit = attr->srq_limit & 0x3fff; + if (srq_limit >= srq->entries) + return -EINVAL; + + ret = sif_arm_srq(sdev, srq, srq_limit); + if (ret) + return ret; + } + return 0; +} + +int sif_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct sif_rq *srq = to_srq(ibsrq); + + attr->max_wr = srq->entries; + attr->max_sge = srq->sg_entries; + attr->srq_limit = srq->srq_limit; + + return 0; +} + +int 
sif_destroy_srq(struct ib_srq *ibsrq) +{ + int sts; + struct sif_dev *sdev = to_sdev(ibsrq->device); + struct sif_rq *rq = to_srq(ibsrq); + + sif_log(sdev, SIF_SRQ, "rq %d", rq->index); + + if (atomic_read(&rq->refcnt) > 1) + return -EBUSY; + + /* An SRQ cannot be flushed with flushed-in-error completions + * as we don't know which completion queue to generate + * the flushed-in-error completions for, and this should be fine + * from a standards perspective: + * IB spec refs: 10.2.9.4, 11.2.3.4. + */ + sts = sif_invalidate_rq_hw(sdev, rq->index, PCM_WAIT); + if (sts) { + sif_log(sdev, SIF_INFO, + "Invalidate rq_hw failed"); + } + + return free_rq(sdev, rq->index); +} + +int sif_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr) +{ + struct sif_dev *sdev = to_sdev(ibsrq->device); + struct sif_rq *rq = to_srq(ibsrq); + + sif_logi(ibsrq->device, SIF_SRQ, "rq %d (SRQ)", rq->index); + + return post_recv(sdev, NULL, rq, recv_wr, bad_recv_wr); +} diff --git a/drivers/infiniband/hw/sif/sif_srq.h b/drivers/infiniband/hw/sif/sif_srq.h new file mode 100644 index 000000000000..8ea4b32b70bd --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_srq.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_srq.h: Interface to internal Shared receive queue logic for SIF + */ + +#ifndef __SIF_SRQ_H +#define __SIF_SRQ_H + +struct ib_srq *sif_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); +int sif_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask, struct ib_udata *udata); +int sif_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr); +int sif_destroy_srq(struct ib_srq *ibsrq); + +int sif_post_srq_recv(struct ib_srq *ibsrq, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_tqp.c b/drivers/infiniband/hw/sif/sif_tqp.c new file mode 100644 index 000000000000..2c883481eaed --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_tqp.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Wei Lin Guay + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_tqp.c: Implementation of EPSA tunneling QP for SIF + */ +#include +#include +#include "sif_tqp.h" +#include "psif_hw_setget.h" +#include "sif_defs.h" + +/* + * This is a host-EPSA mailbox function that is called via ib_post_send() + * The conditions and assumptions are:- + * 1. qp_type == IB_QPT_EPSA_TUNNELING. + * 2. opcode == IB_WR_SEND_WITH_IMM + * 3. Only receive completion - no send completion will be generated. + * 4. Only the first wr.sge will be handled. + * 5. 
wr.ex.imm_data is the EPSA number (EPSA_N), which must be in the range 0-3.
+ */
+int sif_epsa_tunneling_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+				 struct ib_send_wr **bad_wr)
+{
+	struct psif_epsc_csr_req req;
+	struct psif_epsc_csr_rsp rsp;
+	struct sif_dev *sdev = to_sdev(ibqp->device);
+
+	/* The status of the epsa mailbox communication is logged in the received cq: */
+	struct sif_cq *cq = to_scq(ibqp->recv_cq);
+	struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index);
+	volatile struct psif_cq_entry *cqe;
+	struct psif_cq_entry lcqe;
+	u32 seqno;
+	int ret;
+
+	memset(&req, 0, sizeof(req));
+	memset(&rsp, 0, sizeof(rsp));
+
+	req.uf = 0;
+	req.opcode = EPSC_A_COMMAND;
+	req.u.epsa_cmd.cmd = EPSA_GENERIC_CMD;
+	req.u.epsa_cmd.length = wr->sg_list[0].length;
+	req.u.epsa_cmd.host_addr = wr->sg_list[0].addr;
+	req.u.epsa_cmd.key = wr->sg_list[0].lkey;
+
+	if (wr->ex.imm_data > 3) {
+		sif_log(sdev, SIF_INFO, "Failed to post WR: invalid EPSA number in imm_data");
+		return -EINVAL;
+	}
+
+	sif_log(sdev, SIF_SND, "len %d host addr 0x%llx key 0x%x",
+		req.u.epsa_cmd.length, req.u.epsa_cmd.host_addr, req.u.epsa_cmd.key);
+
+	ret = sif_eps_wr(sdev, u32_to_mbox(wr->ex.imm_data), &req, &rsp);
+
+	seqno = cq_sw->next_seq;
+	cqe = get_cq_entry(cq, seqno);
+
+	memset(&lcqe, 0, sizeof(lcqe));
+	/* construct the required info for WC during poll_cq.
+	 * As for now include the wr_id, mailbox status, qp_num, and status:
+	 */
+	lcqe.seq_num = seqno;
+	lcqe.wc_id.rq_id = wr->wr_id;
+	lcqe.vendor_err = rsp.status;
+	lcqe.qp = ibqp->qp_num;
+	lcqe.status = ret == 0 ? PSIF_WC_STATUS_SUCCESS : PSIF_WC_STATUS_GENERAL_ERR;
+
+	copy_conv_to_hw(cqe, &lcqe, sizeof(*cqe));
+
+	return ret;
+}
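
For reference, a minimal caller-side sketch of how the tunneling QP above might be used; it is illustrative only and not part of the patch. It assumes an already created EPSA tunneling QP (tqp), a DMA-mapped buffer described by dma_addr/len/lkey, and an epsa_num in the 0-3 range accepted by sif_epsa_tunneling_post_send(); post_epsa_buffer() is a hypothetical helper name.

#include <rdma/ib_verbs.h>

/* Hypothetical helper, for illustration only */
static int post_epsa_buffer(struct ib_qp *tqp, u64 dma_addr, u32 len,
			    u32 lkey, u32 epsa_num)
{
	struct ib_sge sge = {
		.addr   = dma_addr,	/* becomes req.u.epsa_cmd.host_addr */
		.length = len,		/* becomes req.u.epsa_cmd.length */
		.lkey   = lkey,		/* becomes req.u.epsa_cmd.key */
	};
	struct ib_send_wr wr = {
		.wr_id       = 1,
		.sg_list     = &sge,
		.num_sge     = 1,		/* only the first sge is used */
		.opcode      = IB_WR_SEND_WITH_IMM,
		.ex.imm_data = epsa_num,	/* selects the EPSA mailbox (0..3) */
	};
	struct ib_send_wr *bad_wr;

	/* No send completion is generated; the outcome is reported as a
	 * receive completion on tqp->recv_cq with vendor_err holding the
	 * mailbox status.
	 */
	return ib_post_send(tqp, &wr, &bad_wr);
}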