From: Knut Omang Date: Wed, 25 May 2016 09:01:11 +0000 (+0200) Subject: sif driver initial commit part 2 X-Git-Tag: v4.1.12-92~148^2~19 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=c9f42a310e20c1d7491b014457fed14ee3559f1e;p=users%2Fjedix%2Flinux-maple.git sif driver initial commit part 2 sif_fwa.c: Firmware access API (netlink based out-of-band comm) sif_fwa.h: Low level access to a SIF device sif_hwi.c: Hardware init for SIF - combines the various init steps for psif sif_hwi.h: Hardware init for SIF sif_ibcq.h: External interface to IB completion queue logic for SIF sif_ibpd.h: External interface to (IB) protection domains for SIF sif_ibqp.h: External interface to IB queue pair logic for sif sif_idr.c: Synchronized ID ref allocation sif_idr.h: simple id allocation and deallocation for SIF sif_int_user.h: This file defines special internal data structures used sif_ireg.c: Utilities and entry points needed for Infiniband registration sif_ireg.h: support functions used in setup of sif as an IB HCA sif_main.c: main entry points and initialization sif_mem.c: SIF table memory and page table management sif_mem.h: A common interface for all memory used by sif_mmu.c: main entry points and initialization sif_mmu.h: API for management of sif's on-chip mmu. sif_mr.c: Implementation of memory regions support for SIF sif_mr.h: Interface to internal IB memory registration logic for SIF sif_mw.c: Implementation of memory windows for SIF sif_mw.h: Interface to internal IB memory window logic for SIF sif_pd.c: Implementation of IB protection domains for SIF sif_pd.h: Internal interface to protection domains sif_pqp.c: Privileged QP handling sif_pqp.h: Privileged QP handling Signed-off-by: Knut Omang --- diff --git a/drivers/infiniband/hw/sif/sif_fwa.c b/drivers/infiniband/hw/sif/sif_fwa.c new file mode 100644 index 0000000000000..c6501db7d6521 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_fwa.c @@ -0,0 +1,589 @@ +/* + * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_fwa.c: Firmware access API (netlink based out-of-band comm) + * + */ +#include "sif_dev.h" +#include "sif_fwa.h" + +#include +#include +#include "sif_enl.h" +#include "sif_defs.h" +#include "sif_query.h" +#include "sif_base.h" +#include "sif_qp.h" +#include "psif_hw_csr.h" +#include "sif_drvapi.h" + +/* Generic netlink protocol family definition */ +static struct genl_family sif_enl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "sif_enl", + .version = 1, + .maxattr = 16 +}; + +/* Netlink request handlers */ +static int sif_fwa_req(struct sk_buff *skb, struct genl_info *info); +static int sif_fwa_drv_req(struct sk_buff *skb, struct genl_info *info); + +/* Netlink req operation definition */ +static struct genl_ops sif_enl_ops[] = { + { + .cmd = SIF_ENL_CMD_REQ, + .flags = 0, + .policy = sif_enl_policy, + .doit = sif_fwa_req, + .dumpit = NULL, + }, + + { + .cmd = SIF_ENL_CMD_REQ_DRV, + .flags = 0, + .policy = sif_enl_policy, + .doit = sif_fwa_drv_req, + .dumpit = NULL, + } +}; + + +/* Global datastructure to keep track of instances and number of active + * processes: + */ + +struct fwa_data { + struct list_head sdev_list; /* Access to devices */ + spinlock_t lock; /* Protects device list */ +}; + +static struct fwa_data fwa; + + +/* Called from sif_init/exit to set up/clean up global data structures + * such as netlink communication and device registry: + */ +int sif_fwa_init(void) +{ + int stat; + + INIT_LIST_HEAD(&fwa.sdev_list); + spin_lock_init(&fwa.lock); + + stat = genl_register_family_with_ops(&sif_enl_family, sif_enl_ops); + if (stat) + goto fail; + + sif_log0(SIF_INIT, "Enabled firmware access API"); + return 0; +fail: + sif_log0(SIF_INIT, "ERROR: Failed to enable firmware access API - error %d", stat); + return stat; +} + +void sif_fwa_exit(void) +{ + sif_log0(SIF_INIT, "Disabling firmware access API"); + genl_unregister_family(&sif_enl_family); +} + + +/* Called from probe to register a new device */ +int sif_fwa_register(struct sif_dev *sdev) +{ + struct pci_dev *pdev = sdev->pdev; + + sif_log(sdev, SIF_INIT, "register device %02x:%02x.%d", + pdev->bus->number, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); + spin_lock(&fwa.lock); + list_add_tail(&sdev->fwa.list, &fwa.sdev_list); + spin_unlock(&fwa.lock); + return 0; +} + +/* Called from remove to unregister a device */ +void sif_fwa_unregister(struct sif_dev *sdev) +{ + spin_lock(&fwa.lock); + list_del(&sdev->fwa.list); + spin_unlock(&fwa.lock); +} + + +static struct sif_dev *fwa_find_dev(struct genl_info *info) +{ + struct sif_dev *sdev = NULL; + struct sif_dev *s; + + u16 domain = nla_get_u16(info->attrs[SIF_ENL_A_COMPLEX]); + u16 bus = nla_get_u16(info->attrs[SIF_ENL_A_BUS]); + u16 devfn = nla_get_u16(info->attrs[SIF_ENL_A_DEVFN]); + + /* TBD: Ref.count access to sdev */ + sif_log0(SIF_FWA, "bus %x devfn %x", + bus, devfn); + + spin_lock(&fwa.lock); + list_for_each_entry(s, &fwa.sdev_list, fwa.list) { + if (domain == pci_domain_nr(s->pdev->bus) && + bus == s->pdev->bus->number && + devfn == s->pdev->devfn) { + sdev = s; + break; + } + sif_log(s, SIF_FWA, "bus %x devfn %x", s->pdev->bus->number, s->pdev->devfn); + } + spin_unlock(&fwa.lock); + return sdev; +} + + +static int fwa_valid_opcode(struct sif_dev *sdev, struct psif_epsc_csr_req *req, + enum psif_mbox_type eps_num) +{ + switch (req->opcode) { + case EPSC_SETUP: + case EPSC_SETUP_BASEADDR: + case EPSC_SET_BASEADDR: + case EPSC_SET_BASEADDR_EQ: + case 
EPSC_SET_ONE_CSR: + /* These are kernel only */ + return -EPERM; + case EPSC_HOST_INT_CHANNEL_CTRL: + case EPSC_HOST_INT_COMMON_CTRL: + case EPSC_SET_LID: + case EPSC_SET_EOIB_MAC: + case EPSC_UF_RESET: + case EPSC_MODIFY_QP: + case EPSC_GET_SINGLE: + case EPSC_GET_ONE_CSR: + case EPSC_QUERY: + case EPSC_SET: + case EPSC_QUERY_QP: + case EPSC_QUERY_DEVICE: + case EPSC_QUERY_PORT_1: + case EPSC_QUERY_PORT_2: + case EPSC_QUERY_PKEY: + case EPSC_QUERY_GID: + case EPSC_MODIFY_DEVICE: + case EPSC_MODIFY_PORT_1: + case EPSC_MODIFY_PORT_2: + case EPSC_MC_ATTACH: + case EPSC_MC_DETACH: + case EPSC_MC_QUERY: + case EPSC_FLASH_START: + case EPSC_FLASH_ERASE_SECTOR: + case EPSC_FLASH_RD: + case EPSC_FLASH_WR: + case EPSC_FLASH_STOP: + case EPSC_A_CONTROL: + case EPSC_LINK_CNTRL: + case EPSC_UPDATE: + case EPSC_UF_CTRL: + case EPSC_VIMMA_CTRL: + /* These are not meaningful for the EPSAs for now */ + if (eps_num == sdev->mbox_epsc) + return 0; + else + return -EPERM; + case EPSC_NOOP: + case EPSC_MAILBOX_PING: + case EPSC_KEEP_ALIVE: + case EPSC_EVENT_ACK: + case EPSC_EVENT_INDEX: + case EPSC_TEST_HOST_RD: + case EPSC_TEST_HOST_WR: + case EPSC_FW_VERSION: + case EPSC_LOG_CTRL: + case EPSC_LOG_REQ_NOTIFY: + case EPSC_A_COMMAND: + case EPSC_EXERCISE_MMU: + case EPSC_CLI_ACCESS: + break; + case EPSC_LAST_OP: + default: + /* Fail on all unknown operations: */ + sif_log(sdev, SIF_FWA, "Unknown operation %d", req->opcode); + return -EINVAL; + } + return 0; +} + + +static int sif_fwa_verify_find_dev(struct genl_info *info, struct sif_dev **sdev_p, int payload_len) +{ + struct sif_dev *sdev; + int len; + + if (!info->attrs[SIF_ENL_A_COMPLEX]) { + sif_log0(SIF_FWA, "PCI complex no. not set!"); + return -EINVAL; + } + + if (!info->attrs[SIF_ENL_A_BUS]) { + sif_log0(SIF_FWA, "PCI bus no. not set!"); + return -EINVAL; + } + + if (!info->attrs[SIF_ENL_A_DEVFN]) { + sif_log0(SIF_FWA, "PCI device/function not set!"); + return -EINVAL; + } + + if (!info->attrs[SIF_ENL_A_PAYLOAD]) { + sif_log0(SIF_FWA, "Received empty request!"); + return -EINVAL; + } + len = nla_len(info->attrs[SIF_ENL_A_PAYLOAD]); + if (len < payload_len) { + sif_log0(SIF_FWA, "Request too short!"); + return -EFAULT; + } + + /* TBD: Better input checking... 
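+ * (currently only attribute presence and a minimum payload length are verified)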
*/ + + sdev = fwa_find_dev(info); + if (!sdev) { + sif_log0(SIF_FWA, "No such device found!"); + return -ENODEV; + } + *sdev_p = sdev; + return 0; +} + + +static int sif_fwa_drv_req(struct sk_buff *skb, struct genl_info *info) +{ + int msg_sz; + int stat; + size_t data_sz = 0; + struct sif_dev *sdev; + struct sif_drv_req *req = NULL; + struct sif_drv_rsp rsp; + enum psif_mbox_type eps_num; + struct sk_buff *resp_skb; + void *data; + int ret; + + if (!capable(CAP_NET_ADMIN)) { + sif_log0(SIF_FWA, "Request from client without the CAP_NET_ADMIN privilege"); + return -EPERM; + } + + ret = sif_fwa_verify_find_dev(info, &sdev, sizeof(struct sif_drv_req)); + if (ret) + return ret; + + req = nla_data(info->attrs[SIF_ENL_A_PAYLOAD]); + + sif_log(sdev, SIF_FWA, "op %d", req->opcode); + + if (IS_SIBS(sdev)) { + sif_log(sdev, SIF_FWA, "Device does not have any EPS-A modules"); + return -EINVAL; + } + + eps_num = epsa_to_mbox(req->u.epsa.epsa); + if (eps_num == (enum psif_mbox_type)-1) { + sif_log(sdev, SIF_FWA, "Unknown EPS-A %d", req->u.epsa.epsa); + return -EINVAL; + } + + switch (req->opcode) { + case SIF_DRV_CMD_EPSA_SETUP: + ret = sif_activate_epsa(sdev, eps_num); + rsp.opcode = SIF_DRV_CMD_EPSA_SETUP; + rsp.eps_rsp.status = ret; + break; + case SIF_DRV_CMD_EPSA_TEARDOWN: + break; + } + + if (ret) + return ret; + + /* Start building a response */ + msg_sz = NLMSG_DEFAULT_SIZE + data_sz; + resp_skb = nlmsg_new(msg_sz, GFP_KERNEL); + if (!resp_skb) + return -ENOMEM; + + data = genlmsg_put_reply(resp_skb, info, &sif_enl_family, + 0, SIF_ENL_CMD_RSP_DRV); + if (data == NULL) { + stat = -ENOMEM; + goto put_fail; + } + + stat = nla_put(resp_skb, SIF_ENL_A_PAYLOAD, sizeof(struct sif_drv_rsp), &rsp); + if (stat) { + sif_log(sdev, SIF_FWA, "failed to append response to netlink packet"); + goto put_fail; + } + + /* Recompute message header */ + genlmsg_end(resp_skb, data); + + stat = genlmsg_reply(resp_skb, info); + if (stat) { + sif_log(sdev, SIF_FWA, "failed to send reply - status %d", stat); + goto put_fail; + } + + sif_log(sdev, SIF_FWA, "Sent response with drv opcode %d msg sz %d", + rsp.opcode, msg_sz); + return 0; +put_fail: + nlmsg_free(resp_skb); + return stat; +} + +static int sif_fwa_req(struct sk_buff *skb, struct genl_info *info) +{ + int len; + int stat; + int msg_sz; + struct sif_dev *sdev; + enum psif_mbox_type eps_num; + struct sif_eps *es; + void *data; + size_t data_sz = 0; + struct psif_epsc_csr_req *req = NULL; + struct psif_epsc_csr_rsp rsp; + struct psif_query_qp *qqp; + struct sk_buff *resp_skb; + void *kaddr = NULL; + + if (!capable(CAP_NET_ADMIN)) { + sif_log0(SIF_FWA, "Request from client without the CAP_NET_ADMIN privilege"); + return -EPERM; + } + + stat = sif_fwa_verify_find_dev(info, &sdev, sizeof(struct psif_epsc_csr_req)); + if (stat) + return stat; + + req = nla_data(info->attrs[SIF_ENL_A_PAYLOAD]); + + if (info->attrs[SIF_ENL_A_INDEX]) { + eps_num = nla_get_u32(info->attrs[SIF_ENL_A_INDEX]); + if (IS_SIBS(sdev)) { + if (eps_num == MBOX_EPSC) + eps_num = SIBS_MBOX_EPSC; + else { + sif_log(sdev, SIF_FWA, "Invalid EPS selection (%d)", eps_num); + return -EINVAL; + } + } + if (eps_num >= sdev->eps_cnt) { + sif_log(sdev, SIF_FWA, "Invalid EPS selection (%d)", eps_num); + return -EINVAL; + } + } else { + /* Default to use the EPSC (bw.comp) */ + eps_num = sdev->mbox_epsc; + } + + sif_log(sdev, SIF_FWA, "%s to %s", + string_enum_psif_epsc_csr_opcode(req->opcode), + string_enum_psif_mbox_type(eps_num)); + + es = &sdev->es[eps_num]; + if (es->state != ES_ACTIVE) { + 
sif_log0(SIF_FWA, "Communication with EPS%s has not been set up (state = %d)!", + eps_name(sdev, eps_num), es->state); + return -ENODEV; + } + + /* Check that this opcode is valid in this context */ + stat = fwa_valid_opcode(sdev, req, eps_num); + if (stat) { + if (stat == -EPERM) + sif_log(sdev, SIF_FWA, + "Operation %s not permitted for EPS%s from user space", + string_enum_psif_epsc_csr_opcode(req->opcode), + eps_name(sdev, eps_num)); + return stat; + } + + + /* The below opcodes picks up additional data from (fixed) buffers */ + switch (req->opcode) { + case EPSC_QUERY_DEVICE: + req->u.query_hw.address = + (u64)es->data_dma_hdl + + offsetof(struct sif_epsc_data, dev); + kaddr = &es->data->dev; + data_sz = sizeof(struct psif_epsc_device_attr); + break; + case EPSC_QUERY_PORT_1: + req->u.query_hw.address = + (u64)es->data_dma_hdl + + offsetof(struct sif_epsc_data, port[0]); + kaddr = &es->data->port[0]; + data_sz = sizeof(struct psif_epsc_port_attr); + break; + case EPSC_QUERY_PORT_2: + req->u.query_hw.address = + (u64)es->data_dma_hdl + + offsetof(struct sif_epsc_data, port[1]); + kaddr = &es->data->port[1]; + data_sz = sizeof(struct psif_epsc_port_attr); + break; + case EPSC_QUERY_QP: + { + struct sif_qp *qps; + u32 qp_idx = req->u.query_qp.ctrl.qp_num; + + if (qp_idx >= sdev->ba[qp].entry_cnt) + return -ENOENT; + qps = get_sif_qp(sdev, qp_idx); + kaddr = qqp = &qps->qqp; + req->u.query_qp.address = sif_qqp_dma_addr(sdev, qps); + data_sz = sizeof(struct psif_query_qp); + break; + } + case EPSC_FLASH_RD: + case EPSC_FLASH_WR: + data_sz = req->u.flash.length; + if (data_sz) + kaddr = &es->data->flash; + + /* Use the reserved 'flash' buffer allocated with the EPSC's resp.queue: */ + req->u.flash.host_addr = es->data_dma_hdl + + offsetof(struct sif_epsc_data, flash); + req->u.flash.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx; + break; + case EPSC_CLI_ACCESS: + data_sz = MAX_FWA_NL_PAYLOAD; + kaddr = &es->data->epsc_cli; + + /* Use the reserved 'epsc_cli' buffer allocated with the EPSC's resp. queue: */ + req->u.cli.host_addr = es->data_dma_hdl + + offsetof(struct sif_epsc_data, epsc_cli); + req->u.cli.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx; + break; + case EPSC_VIMMA_CTRL: + data_sz = MAX_FWA_NL_PAYLOAD; + kaddr = &es->data->vimm_agent; + + /* Use the reserved 'vimm_agent' buffer allocated with the EPSC's resp. 
queue: */ + req->u.vimma_ctrl.host_addr = es->data_dma_hdl + + offsetof(struct sif_epsc_data, vimm_agent); + req->u.vimma_ctrl.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx; + break; + case EPSC_UPDATE: + switch (req->u.update.opcode) { + case EPSC_UPDATE_OP_READ: + case EPSC_UPDATE_OP_WRITE: + /* Use the reserved 'flash' buffer allocated with the EPSC's resp.queue: */ + req->u.update.host_addr = es->data_dma_hdl + + offsetof(struct sif_epsc_data, flash); + req->u.update.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx; + /* fall through */ + case EPSC_UPDATE_OP_POLL: + data_sz = req->u.update.length; + kaddr = &es->data->flash; + break; + default: + break; + } + break; + default: + break; + } + + /* Copy any extra input data to the kernel buffer: */ + if (info->attrs[SIF_ENL_A_DATA]) { + len = nla_len(info->attrs[SIF_ENL_A_DATA]); + data = nla_data(info->attrs[SIF_ENL_A_DATA]); + switch (req->opcode) { + case EPSC_UPDATE: + if (req->u.update.opcode != EPSC_UPDATE_OP_WRITE) + break; + /* fall through */ + case EPSC_FLASH_WR: + case EPSC_CLI_ACCESS: + case EPSC_VIMMA_CTRL: + if (kaddr) { + memcpy(kaddr, data, len); + sif_log(sdev, SIF_FWA, "dma kaddr %p data %p len %x", + kaddr, data, len); + mb(); + } else + sif_log(sdev, SIF_FWA, "Found aux.data input but no data area"); + break; + default: + sif_log(sdev, SIF_FWA, "Found aux.data input in unexpected op %s", + string_enum_psif_epsc_csr_opcode(req->opcode)); + break; + } + } + + stat = sif_eps_wr(sdev, eps_num, req, &rsp); + switch (stat) { + case -ETIMEDOUT: + return stat; + default: + break; + } + + if (data_sz > MAX_FWA_NL_PAYLOAD) + return -EMSGSIZE; + + /* Start building a response */ + msg_sz = NLMSG_DEFAULT_SIZE + data_sz; + resp_skb = nlmsg_new(msg_sz, GFP_KERNEL); + if (!resp_skb) { + sif_log(sdev, SIF_FWA, "failed to allocate netlink packet"); + return -ENOMEM; + } + + data = genlmsg_put_reply(resp_skb, info, &sif_enl_family, + 0, SIF_ENL_CMD_RSP); + if (data == NULL) { + sif_log(sdev, SIF_FWA, "failed to add generic netlink header"); + stat = -ENOMEM; + goto put_fail; + } + + stat = nla_put(resp_skb, SIF_ENL_A_PAYLOAD, sizeof(struct psif_epsc_csr_rsp), &rsp); + if (stat) { + sif_log(sdev, SIF_FWA, "failed to append response to netlink packet"); + goto put_fail; + } + + if (kaddr && req->opcode != EPSC_FLASH_WR && + !(req->opcode == EPSC_UPDATE && req->u.update.opcode == EPSC_UPDATE_OP_WRITE)) { + stat = nla_put(resp_skb, SIF_ENL_A_DATA, data_sz, kaddr); + if (stat) { + sif_log(sdev, SIF_FWA, "failed to append %ld bytes of data", data_sz); + goto put_fail; + } + } + + /* Recompute message header */ + genlmsg_end(resp_skb, data); + + stat = genlmsg_reply(resp_skb, info); + if (stat) { + sif_log(sdev, SIF_FWA, "failed to send reply - status %d", stat); + goto put_fail; + } + + sif_log(sdev, SIF_FWA, "Sent response with opcode %s msg sz %d", + string_enum_psif_epsc_csr_opcode(rsp.opcode), msg_sz); + return 0; +put_fail: + nlmsg_free(resp_skb); + return stat; +} diff --git a/drivers/infiniband/hw/sif/sif_fwa.h b/drivers/infiniband/hw/sif/sif_fwa.h new file mode 100644 index 0000000000000..dd806c3bacc37 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_fwa.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_fwa.h: Low level access to a SIF device + * + * Allows access to low level functions such as (re)programming the EPSC flash + * via direct access to the EPSC protocol proxied via Netlink. + * Requires CAP_NETADMIN privileges. + */ + +#ifndef __SIF_FWA_H +#define __SIF_FWA_H +#include + +struct sif_dev; + +/* The max size we support sending/receiving from user space + * in a single netlink message. + * Limited by a 4k max netlink message size: + */ +#define MAX_FWA_NL_PAYLOAD 0x800 + +/* Per instance data structure */ +struct sif_fwa { + struct list_head list; /* Linkage for the global list */ +}; + +/* Called from sif_init/exit to set up/clean up global data structures + * such as netlink communication and device registry: + */ +int sif_fwa_init(void); +void sif_fwa_exit(void); + +/* Called from probe to register a new device */ +int sif_fwa_register(struct sif_dev *sdev); + +/* Called from remove to unregister a device */ +void sif_fwa_unregister(struct sif_dev *sdev); + +/* Value definition for the fwa module parameter: */ +#define SIF_FWA_MR_ENABLE 0x1 /* Enable FWA mode */ + +#endif diff --git a/drivers/infiniband/hw/sif/sif_hwi.c b/drivers/infiniband/hw/sif/sif_hwi.c new file mode 100644 index 0000000000000..0c07b45e9ce10 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_hwi.c @@ -0,0 +1,602 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_hwi.c: Hardware init for SIF - combines the various init steps for psif + */ + +#include "sif_dev.h" +#include "sif_hwi.h" +#include "sif_base.h" +#include "sif_cq.h" +#include "sif_pqp.h" +#include "sif_qp.h" +#include "sif_ibqp.h" +#include "sif_pd.h" +#include "sif_eq.h" +#include "sif_xrc.h" +#include "sif_defs.h" +#include "sif_query.h" +#include "psif_hw_data.h" +#include "psif_hw_setget.h" +#include +#include +#include + +/* Create the special SIF privileged QP which is used + * for special sif specific work requests such as for instance + * requesting completion event notification on a cq. + */ + +static void sif_pqp_fini(struct sif_dev *sdev); + + +static int sif_chip_init(struct sif_dev *sdev); +static void sif_chip_deinit(struct sif_dev *sdev); + + +static int sif_pqp_init(struct sif_dev *sdev) +{ + struct sif_pqp *pqp; + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + int i; + int ret = 0; + uint n_pqps = es->eqs.cnt - 2; + + sdev->pqp = sif_kmalloc(sdev, sizeof(struct sif_pqp *) * n_pqps, GFP_KERNEL | __GFP_ZERO); + if (!sdev->pqp) + return -ENOMEM; + + for (i = 0; i < n_pqps; i++) { + pqp = sif_create_pqp(sdev, i); + if (IS_ERR(pqp)) { + if ((i > 0) && + !(eps_version_ge(es, 0, 42))) { + sif_log(sdev, SIF_INFO, + "SIF device has an old FW version that only supports one pqp"); + break; + } + ret = PTR_ERR(pqp); + goto failed; + } + sdev->pqp[i] = pqp; + } + sdev->pqp_cnt = i; + atomic_set(&sdev->next_pqp, 0); + return 0; + +failed: + sdev->pqp_cnt = i; + sif_pqp_fini(sdev); + return ret; +} + + +static void sif_pqp_fini(struct sif_dev *sdev) +{ + /* we must maintain a consistent state of the PQP array + * during takedown as these operations themselves + * generate PQP requests.. 
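+ * (each PQP except the last is unlinked from the array before it is destroyed, so the remaining entries stay usable for the requests that takedown itself generates)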
+ */ + while (sdev->pqp_cnt > 0) { + int i = sdev->pqp_cnt - 1; + struct sif_pqp *pqp = sdev->pqp[i]; + + if (i > 0) { + /* Remove ourselves first, except the final PQP */ + sdev->pqp[i] = NULL; + sdev->pqp_cnt--; + } + sif_destroy_pqp(sdev, pqp); + if (i == 0) + sdev->pqp_cnt--; + } + kfree(sdev->pqp); + sdev->pqp = NULL; +} + + +static void sif_ki_spqp_fini(struct sif_dev *sdev); + +static int sif_ki_spqp_init(struct sif_dev *sdev) +{ + int i; + int ret = 0; + int n = max(sif_ki_spqp_size, 0U); + int bm_len = max(1, n/8); + + mutex_init(&sdev->ki_spqp.lock); + sdev->ki_spqp.spqp = +#ifdef CONFIG_NUMA + kmalloc_node(sizeof(struct sif_st_pqp *) * n, GFP_KERNEL | __GFP_ZERO, + sdev->pdev->dev.numa_node); +#else + kmalloc(sizeof(struct sif_st_pqp *) * n, GFP_KERNEL | __GFP_ZERO); +#endif + if (!sdev->ki_spqp.spqp) + return -ENOMEM; + + sdev->ki_spqp.bitmap = +#ifdef CONFIG_NUMA + kmalloc_node(sizeof(ulong) * bm_len, GFP_KERNEL | __GFP_ZERO, + sdev->pdev->dev.numa_node); +#else + kmalloc(sizeof(ulong) * bm_len, GFP_KERNEL | __GFP_ZERO); +#endif + if (!sdev->ki_spqp.bitmap) { + ret = -ENOMEM; + goto bm_failed; + } + + for (i = 0; i < n; i++) { + struct sif_st_pqp *spqp = sif_create_inv_key_st_pqp(sdev); + + if (IS_ERR(spqp)) { + ret = PTR_ERR(spqp); + break; + } + sdev->ki_spqp.spqp[i] = spqp; + spqp->index = i; + } + sdev->ki_spqp.pool_sz = i; + if (ret && i) { + sif_log(sdev, SIF_INFO, "Failed to create %d INVALIDATE_KEY stencil QPs", i); + sif_ki_spqp_fini(sdev); + } + + if (i) + sif_log(sdev, SIF_INFO, "Created %d INVALIDATE_KEY stencil QPs", i); +bm_failed: + if (ret) + kfree(sdev->ki_spqp.spqp); + return 0; /* Never fail on stencil PQP allocation */ +} + + +static void sif_ki_spqp_fini(struct sif_dev *sdev) +{ + int i; + + if (!sdev->ki_spqp.spqp) + return; + for (i = sdev->ki_spqp.pool_sz - 1; i >= 0; i--) + sif_destroy_st_pqp(sdev, sdev->ki_spqp.spqp[i]); + kfree(sdev->ki_spqp.bitmap); + kfree(sdev->ki_spqp.spqp); + sdev->ki_spqp.spqp = NULL; +} + + +static void sif_hw_kernel_cb_fini(struct sif_dev *sdev) +{ + int i; + + while (sdev->kernel_cb_cnt > 0) { + int j = sdev->kernel_cb_cnt - 1; + + for (i = 0; i < 2; i++) + if (sdev->kernel_cb[i][j]) + release_cb(sdev, sdev->kernel_cb[i][j]); + sdev->kernel_cb_cnt--; + } + for (i = 0; i < 2; i++) + kfree(sdev->kernel_cb[i]); +} + + + +static int sif_hw_kernel_cb_init(struct sif_dev *sdev) +{ + int i; + uint n_cbs = min(sif_cb_max, num_present_cpus()); + + if (!n_cbs) + n_cbs = 1; + + for (i = 0; i < 2; i++) { + sdev->kernel_cb[i] = kcalloc(n_cbs, sizeof(struct sif_cb *), GFP_KERNEL); + if (!sdev->kernel_cb[i]) + goto alloc_failed; + } + + for (i = 0; i < n_cbs; i++) { + sdev->kernel_cb[0][i] = alloc_cb(sdev, false); + if (!sdev->kernel_cb[0][i]) + goto alloc_failed; + sdev->kernel_cb[1][i] = alloc_cb(sdev, true); + if (!sdev->kernel_cb[1][i]) + goto alloc_failed; + } + sdev->kernel_cb_cnt = i; + return 0; + +alloc_failed: + sdev->kernel_cb_cnt = i; + sif_hw_kernel_cb_fini(sdev); + return -ENOMEM; +} + + +static int get_tsl_map(struct sif_dev *sdev, + int opcode, + int port, + struct psif_tsl_map *map) +{ + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + + /* EPSC supports the new requests starting from v.0.56 */ + if (eps_fw_version_ge(&sdev->es[sdev->mbox_epsc], 0, 56)) { + int ret = 0; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_QUERY; + req.u.query.data.op = opcode; + req.u.query.data.index = port; + + ret = sif_epsc_wr(sdev, &req, &rsp); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to query sl to tsl map, 
opcode %s, port %d", + string_enum_psif_epsc_query_op(opcode) + strlen("EPSC_QUERY_"), + port); + return ret; + } + *map = *((struct psif_tsl_map *)&rsp.data); + return 0; + } + + sif_log(sdev, SIF_INFO, "PSIF API %s has fw version less than %s. Cannot retrieve SL2TSL map", + "0.98", "0.56"); + return -EOPNOTSUPP; +} + + +static void setup_sl2tsl_map(struct sif_dev *sdev) +{ + int port; + int sl; + int qosl; + + + /* TBD: separate bulk and rcv pqp vcb/tsl */ + for (port = 0; port < 2; ++port) { + sdev->pqp_rcn_tsl[port] = TSL_PRIV; + sdev->pqp_bulk_tsl[port] = TSL_PRIV; + sdev->pqp_qosl_rcn_hint[port] = QOSL_LOW_LATENCY; + sdev->pqp_qosl_bulk_hint[port] = QOSL_LOW_LATENCY; + } + + /* Default or least aggressive common denominator */ + memset(sdev->sl2tsl + 0, TSL_DATA, sizeof(sdev->sl2tsl)); + memset(sdev->qp0_tsl + 0, TSL_DATA, sizeof(sdev->qp0_tsl)); + + if (eps_fw_version_lt(&sdev->es[sdev->mbox_epsc], 0, 50)) { + sif_log(sdev, SIF_INFO, "Using a single TSL for regular QPs (fw < 0.50)"); + return; + } + + /* See BZ 3883 and https://cod.no.oracle.com/gerrit/r/#/c/6587/ */ + for (sl = 0; sl < 16; ++sl) + for (port = 0; port < 2; ++port) + for (qosl = QOSL_HIGH_BANDWIDTH; qosl <= QOSL_LOW_LATENCY; ++qosl) + sdev->sl2tsl[sl][port][qosl] = port ? TSL_DATA_1 : TSL_DATA; + + if (eps_fw_version_lt(&sdev->es[sdev->mbox_epsc], 0, 56)) { + sif_log(sdev, SIF_INFO, "Setting up TSL per port (0.50 <= fw <= 0.56)"); + return; + } + +#define GET_TSL(i) map.m ## i ## _tsl +#define GET_QOS(i) map.m ## i ## _tqos + + { + struct psif_tsl_map map; + int opc; + + sif_log(sdev, SIF_TSL, "Retrieving SL to TSL map from epsc (fw >= 0.56)"); + + for (port = 0; port < 2; ++port) { + if (get_tsl_map(sdev, EPSC_QUERY_MAP_PQP_TO_TSL, port + 1, &map)) + return; + /* RCN pqp info in first entry, bulk in second */ + sdev->pqp_rcn_tsl[port] = GET_TSL(0); + sdev->pqp_bulk_tsl[port] = GET_TSL(1); + sdev->pqp_qosl_rcn_hint[port] = GET_QOS(0); + sdev->pqp_qosl_bulk_hint[port] = GET_QOS(1); + } + + for (opc = EPSC_QUERY_MAP_SL_TO_TSL_LO; opc <= EPSC_QUERY_MAP_SL_TO_TSL_HI; ++opc) { + bool last8 = opc == EPSC_QUERY_MAP_SL_TO_TSL_HI; + + for (port = 0; port < 2; ++port) { + if (get_tsl_map(sdev, opc, port + 1, &map)) + return; + for (qosl = QOSL_HIGH_BANDWIDTH; qosl <= QOSL_LOW_LATENCY; ++qosl) { + sdev->sl2tsl[8*last8 + 0][port][qosl] = GET_TSL(0); + sdev->sl2tsl[8*last8 + 1][port][qosl] = GET_TSL(1); + sdev->sl2tsl[8*last8 + 2][port][qosl] = GET_TSL(2); + sdev->sl2tsl[8*last8 + 3][port][qosl] = GET_TSL(3); + sdev->sl2tsl[8*last8 + 4][port][qosl] = GET_TSL(4); + sdev->sl2tsl[8*last8 + 5][port][qosl] = GET_TSL(5); + sdev->sl2tsl[8*last8 + 6][port][qosl] = GET_TSL(6); + sdev->sl2tsl[8*last8 + 7][port][qosl] = GET_TSL(7); + + sdev->qp_qosl_hint[8*last8 + 0][port] = GET_QOS(0); + sdev->qp_qosl_hint[8*last8 + 1][port] = GET_QOS(1); + sdev->qp_qosl_hint[8*last8 + 2][port] = GET_QOS(2); + sdev->qp_qosl_hint[8*last8 + 3][port] = GET_QOS(3); + sdev->qp_qosl_hint[8*last8 + 4][port] = GET_QOS(4); + sdev->qp_qosl_hint[8*last8 + 5][port] = GET_QOS(5); + sdev->qp_qosl_hint[8*last8 + 6][port] = GET_QOS(6); + sdev->qp_qosl_hint[8*last8 + 7][port] = GET_QOS(7); + } + } + } + + if (!eps_version_ge(&sdev->es[sdev->mbox_epsc], 1, 6)) { + sif_log(sdev, SIF_INFO, "FW version does not not support special QP0 TSL"); + return; + } + for (port = 0; port < 2; ++port) { + if (get_tsl_map(sdev, EPSC_QUERY_MAP_QP0_TO_TSL, port + 1, &map)) + return; + sdev->qp0_tsl[port] = GET_TSL(0); + sdev->qp0_qosl_hint[port] = GET_QOS(0); + } + } +#undef GET_TSL +#undef 
GET_QOS +} + + +static void dump_sl2tsl_map(struct sif_dev *sdev) +{ + int sl; + int port; + int qosl; + + for (port = 0; port < 2; ++port) { + sif_log(sdev, SIF_TSL, "rcn pqp port:%d tsl:%2d fw_hint:%s", + port + 1, sdev->pqp_rcn_tsl[port], + string_enum_psif_tsu_qos(sdev->pqp_qosl_rcn_hint[port]) + strlen("QOSL_")); + sif_log(sdev, SIF_TSL, "bulk pqp port:%d tsl:%2d fw_hint:%s", + port + 1, sdev->pqp_bulk_tsl[port], + string_enum_psif_tsu_qos(sdev->pqp_qosl_bulk_hint[port]) + strlen("QOSL_")); + } + + for (port = 0; port < 2; ++port) + for (sl = 0; sl < 16; ++sl) + for (qosl = QOSL_HIGH_BANDWIDTH; qosl <= QOSL_LOW_LATENCY; ++qosl) + sif_log(sdev, SIF_TSL, + "plain qp port:%d sl:%2d qosl:%-14s tsl:%2d fw_hint:%s", + port + 1, sl, string_enum_psif_tsu_qos(qosl) + strlen("QOSL_"), + sdev->sl2tsl[sl][port][qosl], + string_enum_psif_tsu_qos(sdev->qp_qosl_hint[sl][port]) + + strlen("QOSL_")); + + for (port = 0; port < 2; ++port) { + sif_log(sdev, SIF_TSL, "qp0 port:%d tsl:%2d fw_hint:%s", + port + 1, sdev->qp0_tsl[port], + string_enum_psif_tsu_qos(sdev->qp0_qosl_hint[port]) + strlen("QOSL_")); + } +} + +/* Device is degraded; set limited mode and report cause */ +static int sif_handle_degraded(struct sif_dev *sdev) +{ + int ret = 0; + + sdev->limited_mode = true; + if (eps_version_ge(&sdev->es[sdev->mbox_epsc], 1, 0)) { + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + + /* Ask the EPSC if it's running in degraded mode */ + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_QUERY; + req.u.query.data.op = EPSC_QUERY_DEGRADED_CAUSE; + ret = sif_epsc_wr(sdev, &req, &rsp); + if (ret) { + sif_log(sdev, SIF_INFO, + "Request to the EPSC for degraded cause failed with %d", ret); + return ret; + } + if (rsp.data != 0) + epsc_report_degraded(sdev, rsp.data); + sif_log(sdev, SIF_EPS, "Device reports degraded mode, mask 0x%llx", rsp.data); + } + return ret; +} + + +int sif_hw_init(struct sif_dev *sdev) +{ + int i; + int ret = -ENOMEM; + struct sif_pd *pd = NULL; + + /* PSIF 2.x requires MRRS to be at least 512, ref BZ #3301 */ + if (pcie_get_readrq(sdev->pdev) < 512) { + sif_log(sdev, SIF_INFO, "PSIF 2.x requires MRRS to be at least 512 bytes"); + ret = -EINVAL; + goto chip_init_failed; + } + + sif_mem_init(sdev); + + /* Misc. 
PSIF chip version specific + * configuration (must be before base_init): + */ + ret = sif_chip_init(sdev); + if (ret) + goto chip_init_failed; + + ret = sif_base_init(sdev); + if (ret) + goto base_failed; + + /* Allocate collect buffers for kernel usage */ + ret = sif_hw_kernel_cb_init(sdev); + if (ret) + goto cb_alloc_failed; + + ret = sif_init_pd(sdev); + if (ret) + goto pd_init_failed; + + /* We need a kernel protection domain for resource allocation */ + pd = alloc_pd(sdev); + if (!pd) + goto pd_alloc_failed; + pd->ibpd.device = &sdev->ib_dev; + sdev->pd = pd; + if (sdev->degraded) + sif_handle_degraded(sdev); + if (sdev->limited_mode) { + sif_log(sdev, SIF_INFO, "Running in limited mode\n"); + return 0; + } + + /* Initialize the SL to TSL map, before any QPs are created */ + setup_sl2tsl_map(sdev); + dump_sl2tsl_map(sdev); + + /* Reserve indices for qp 0 and 1, ports 1 and 2 */ + for (i = 0; i <= 3; i++) + sif_alloc_qp_idx(pd); + + ret = sif_pqp_init(sdev); + if (ret) + goto pqp_failed; + + ret = sif_ki_spqp_init(sdev); + if (ret) + goto ki_spqp_failed; + + ret = sif_init_xrcd(sdev); + if (ret) + goto xrcd_failed; + + return 0; + +xrcd_failed: + sif_ki_spqp_fini(sdev); +ki_spqp_failed: + sif_pqp_fini(sdev); +pqp_failed: + /* Release indices for qp 0 and 1 */ + for (i = 3; i >= 0; i--) + sif_free_qp_idx(pd, i); + dealloc_pd(pd); + +pd_alloc_failed: + sif_deinit_pd(sdev); +pd_init_failed: + sif_hw_kernel_cb_fini(sdev); +cb_alloc_failed: + sif_base_deinit(sdev); +base_failed: + sif_chip_deinit(sdev); +chip_init_failed: + return ret; +} + +void sif_hw_deinit(struct sif_dev *sdev) +{ + int i; + + if (!sdev->limited_mode) { + sif_log(sdev, SIF_PQP, "enter"); + sif_ki_spqp_fini(sdev); + sif_pqp_fini(sdev); + + /* Release indices for qp 0 and 1 */ + for (i = 3; i >= 0; i--) + sif_free_qp_idx(sdev->pd, i); + } + + dealloc_pd(sdev->pd); + sif_deinit_pd(sdev); + sif_hw_kernel_cb_fini(sdev); + sif_base_deinit(sdev); + sif_chip_deinit(sdev); +} + + +int force_pcie_link_retrain(struct sif_dev *sdev) +{ + int err, parent_pcie_cap; + u16 parent_lnkctl; + + parent_pcie_cap = pci_find_capability(sdev->pdev->bus->self, PCI_CAP_ID_EXP); + err = pci_read_config_word(sdev->pdev, parent_pcie_cap + PCI_EXP_LNKCTL, &parent_lnkctl); + parent_lnkctl |= PCI_EXP_LNKCTL_RL; + err = pci_write_config_word(sdev->pdev->bus->self, parent_pcie_cap + PCI_EXP_LNKCTL, + parent_lnkctl); + return err; +} + + +static int sif_chip_init(struct sif_dev *sdev) +{ + u16 devid; + + /* Chip version specific config */ + devid = sdev->pdev->device; + switch (devid) { + case PCI_DEVICE_ID_PSIF_VF: + sdev->is_vf = true; + sdev->num_vfs = 0; + sdev->mbox_epsc = MBOX_EPSC; + sdev->eps_cnt = MBOX_EPSC + 1; + break; + + case PCI_DEVICE_ID_PSIF_PF: + sdev->is_vf = false; + sdev->mbox_epsc = MBOX_EPSC; + sdev->eps_cnt = MBOX_EPSC + 1; + break; + + case PCI_DEVICE_ID_SN1_VF: + sdev->is_vf = true; + sdev->num_vfs = 0; + sdev->mbox_epsc = SIBS_MBOX_EPSC; + sdev->eps_cnt = SIBS_MBOX_EPSC + 1; + break; + + case PCI_DEVICE_ID_SN1_PF: + sdev->is_vf = false; + sdev->mbox_epsc = SIBS_MBOX_EPSC; + sdev->eps_cnt = SIBS_MBOX_EPSC + 1; + break; + + default: + sif_log(sdev, SIF_INFO, "Unknown device id %x", devid); + return -ENODEV; + } + + if (!sif_vf_en && sdev->is_vf) { + sif_log(sdev, SIF_INFO, "Parameter vf_en=0: VF driver load disabled"); + return -EINVAL; + } + + + sdev->es = kcalloc(sdev->eps_cnt, sizeof(struct sif_eps), GFP_KERNEL); + if (!sdev->es) + return -ENOMEM; + + return 0; +} + + +static void sif_chip_deinit(struct sif_dev *sdev) +{ + 
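/* Release the per-EPS state array allocated in sif_chip_init() */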
kfree(sdev->es); + sdev->es = NULL; +} diff --git a/drivers/infiniband/hw/sif/sif_hwi.h b/drivers/infiniband/hw/sif/sif_hwi.h new file mode 100644 index 0000000000000..a67141229098a --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_hwi.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_hwi.h: Hardware init for SIF + */ + +#ifndef _SIF_HWI_H +#define _SIF_HWI_H +#include +#include "sif_cq.h" +#include "sif_r3.h" + +struct sif_dev; +struct sif_pqp; +struct sif_qp; +struct sif_compl; +struct sif_cqe; +struct psif_wr; +struct psif_cq_entry; +enum psif_wr_type; + +/* Main calls for hardware specific initialization/deinitialization */ + +int force_pcie_link_retrain(struct sif_dev *sdev); +int sif_hw_init(struct sif_dev *sdev); +void sif_hw_deinit(struct sif_dev *sdev); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_ibcq.h b/drivers/infiniband/hw/sif/sif_ibcq.h new file mode 100644 index 0000000000000..1c5476e582438 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_ibcq.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_ibcq.h: External interface to IB completion queue logic for SIF + */ + +#ifndef __SIF_IBCQ_H +#define __SIF_IBCQ_H + +struct ib_cq *sif_create_cq(struct ib_device *ibdev, int cqe, + int comp_vector, struct ib_ucontext *context, + struct ib_udata *udata, + enum sif_proxy_type proxy); + +int sif_modify_cq(struct ib_cq *ibcq, u16 cq_count, u16 cq_period); +int sif_destroy_cq(struct ib_cq *ibcq); +int sif_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); +int sif_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int sif_peek_cq(struct ib_cq *ibcq, int wc_cnt); + +int sif_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +int sif_req_ncomp_notif(struct ib_cq *ibcq, int wc_cnt); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_ibpd.h b/drivers/infiniband/hw/sif/sif_ibpd.h new file mode 100644 index 0000000000000..41773fe124bf0 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_ibpd.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_ibpd.h: External interface to (IB) protection domains for SIF + */ + +#ifndef __SIF_IBPD_H +#define __SIF_IBPD_H + +struct ib_pd *sif_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *ibucontext, + struct ib_udata *udata); + +int sif_dealloc_pd(struct ib_pd *ibpd); + +struct ib_shpd *sif_alloc_shpd(struct ib_device *ibdev, + struct ib_pd *ibpd, + struct ib_udata *udata); + +struct ib_pd *sif_share_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata, + struct ib_shpd *shpd); + +int sif_remove_shpd(struct ib_device *ibdev, + struct ib_shpd *shpd, + int atinit); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_ibqp.h b/drivers/infiniband/hw/sif/sif_ibqp.h new file mode 100644 index 0000000000000..bde0740570bed --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_ibqp.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_ibqp.h: External interface to IB queue pair logic for sif + */ + +#ifndef __SIF_IBQP_H +#define __SIF_IBQP_H + +struct ib_qp *sif_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata); +int sif_modify_qp(struct ib_qp *ibqp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_udata *udata); + +int sif_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); +int sif_destroy_qp(struct ib_qp *ibqp); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_idr.c b/drivers/infiniband/hw/sif/sif_idr.c new file mode 100644 index 0000000000000..ff726bd5d8371 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_idr.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_idr.c: Synchronized ID ref allocation + */ + +#include "sif_idr.h" + +int sif_idr_init(struct sif_idr *sidr, int id_min, int id_max) +{ + int ret = 0; + idr_init(&sidr->idr); + mutex_init(&sidr->lock); + sidr->id_min = id_min; + sidr->id_max = id_max; + return ret; +} + + +void sif_idr_deinit(struct sif_idr *sidr) +{ + idr_destroy(&sidr->idr); +} + + +int sif_idr_alloc(struct sif_idr *sidr, void *ref, gfp_t gfp_mask) +{ + int index; + + mutex_lock(&sidr->lock); + index = idr_alloc(&sidr->idr, ref, sidr->id_min, sidr->id_max, gfp_mask); + mutex_unlock(&sidr->lock); + return index; +} + +void sif_idr_remove(struct sif_idr *sidr, int index) +{ + mutex_lock(&sidr->lock); + idr_remove(&sidr->idr, index); + mutex_unlock(&sidr->lock); +} diff --git a/drivers/infiniband/hw/sif/sif_idr.h b/drivers/infiniband/hw/sif/sif_idr.h new file mode 100644 index 0000000000000..4bdfcfd575d51 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_idr.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. 
+ * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_idr.h: simple id allocation and deallocation for SIF + */ + +#ifndef _SIF_IDR_H +#define _SIF_IDR_H +#include +#include +#include + +/* Synchronized ID ref allocation */ + +struct sif_idr { + struct idr idr; + struct mutex lock; + int id_min; + int id_max; +}; + +int sif_idr_init(struct sif_idr *sidr, int id_min, int id_max); +void sif_idr_deinit(struct sif_idr *sidr); + +int sif_idr_alloc(struct sif_idr *sidr, void *ref, gfp_t gfp_mask); +void sif_idr_remove(struct sif_idr *sidr, int index); + + +#endif diff --git a/drivers/infiniband/hw/sif/sif_int_user.h b/drivers/infiniband/hw/sif/sif_int_user.h new file mode 100644 index 0000000000000..bed597d1b6946 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_int_user.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_int_user.h: This file defines special internal data structures used + * to communicate between libsif and the sif driver. + * This file is included both from user space and kernel space so + * it must not contain any kernel/user specific header file includes. + * This file is internal to libsif/sif driver since it relies on HW specific + * include files. + */ + +#ifndef _SIF_INT_USER_H +#define _SIF_INT_USER_H + + +#include "psif_hw_data.h" + +/* Do this the brute force way, since structs are used in user-space */ +#if defined(__x86_64__) || defined(__sparc__) +#define SIF_CACHE_BYTES 64 +#else +#define SIF_CACHE_BYTES 64 +#endif + +/* We use the extension here to communicate with the driver + * (for correct debugfs reporting) + */ + +/* sif_sq_sw flags definition + */ +enum sq_sw_state { + FLUSH_SQ_IN_PROGRESS = 0, + FLUSH_SQ_IN_FLIGHT = 1, +}; + +struct sif_sq_sw { + struct psif_sq_sw d; /* Hardware visible descriptor */ + __u8 fill[SIF_CACHE_BYTES - sizeof(struct psif_sq_sw)]; /* separate the cache lines */ + __u16 last_seq; /* Last used sq seq.num (req. sq->lock) */ + __u16 head_seq; /* Last sq seq.number seen in a compl (req. 
cq->lock) */ + __u16 trusted_seq; /* Last next_seq that was either generate or exist in the cq */ + __u8 tsl; /* Valid after transition to RTR */ + unsigned long flags; /* Flags, using unsigned long due to test_set/test_and_set_bit */ +}; + +/* sif_rq_sw flags definition + */ +enum rq_sw_state { + FLUSH_RQ_IN_PROGRESS = 0, + FLUSH_RQ_IN_FLIGHT = 1, + FLUSH_RQ_FIRST_TIME = 2, + RQ_IS_INVALIDATED = 3, +}; + +struct sif_rq_sw { + struct psif_rq_sw d; /* Hardware visible descriptor */ + __u8 fill[SIF_CACHE_BYTES - sizeof(struct psif_rq_sw)]; /* separate the cache lines */ + atomic_t length; /* current length of queue as #posted - #completed */ + __u32 next_seq; /* First unused sequence number */ + unsigned long flags; /* Flags, using unsigned long due to test_set/test_and_set_bit */ +}; + +enum cq_sw_state { + CQ_POLLING_NOT_ALLOWED = 0, + CQ_POLLING_IGNORED_SEQ = 1, + FLUSH_SQ_FIRST_TIME = 2, +}; + +struct sif_cq_sw { + struct psif_cq_sw d; /* Hardware visible descriptor */ + __u8 fill[SIF_CACHE_BYTES - sizeof(struct psif_cq_sw)]; /* separate the cache lines */ + __u32 next_seq; /* First unused sequence number */ + __u32 cached_head; /* Local copy kept in sync w/hw visible head_indx */ + __u32 last_hw_seq; /* Last next_seq reported in completion for req_notify_cq */ + __u32 armed; /* Set if req_notify_cq has been called but event not processed */ + __u32 miss_cnt; /* Number of in-flight completions observed by poll_cq */ + __u32 miss_occ; /* Number of times 1 or more in-flight completions was seen */ + unsigned long flags; /* Flags, using unsigned long due to test_set/test_and_set_bit */ +}; + +#endif diff --git a/drivers/infiniband/hw/sif/sif_ireg.c b/drivers/infiniband/hw/sif/sif_ireg.c new file mode 100644 index 0000000000000..2f19ce2b3aae4 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_ireg.c @@ -0,0 +1,930 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_ireg.c: Utilities and entry points needed for Infiniband registration + */ + +#include +#include +#include +#include +#include +#include +#include "sif_dev.h" +#include "sif_ireg.h" +#include "sif_user.h" +#include "sif_dma.h" +#include "sif_ibpd.h" +#include "sif_ibcq.h" +#include "sif_ibqp.h" +#include "sif_mr.h" +#include "sif_mw.h" +#include "sif_fmr.h" +#include "sif_ah.h" +#include "sif_srq.h" +#include "sif_xrc.h" +#include "sif_sndrcv.h" +#include "sif_hwi.h" +#include "sif_query.h" +#include "sif_pd.h" +#include "sif_base.h" +#include "version.h" + + +static ssize_t show_rev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(dev); + + return sprintf(buf, "%x\n", PSIF_REVISION(sdev)); +} + +static ssize_t show_fw_ver(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + + return sprintf(buf, "%hu.%hu.0\n", es->ver.fw_major, es->ver.fw_minor); +} + +static ssize_t show_eps_api_ver(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + + return sprintf(buf, "%hu.%hu\n", es->ver.epsc_major, es->ver.epsc_minor); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + + return sprintf(buf, "ORCL%d\n", PSIF_DEVICE(sdev)); +} + +static ssize_t show_board(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + const char *prod_str = get_product_str(sdev); + /* + * Paranoia level: avoid dumping the whole kernel to + * user-space if the zero termination character in the product + * string has been compromised + */ + const int n = min_t(int, 64, (int)strlen(prod_str)); + + return sprintf(buf, "%.*s\n", n, prod_str); +} + +static ssize_t show_stats(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + /* TBD: device specific counters, stats registers */ + sif_log(sdev, SIF_VERBS, "Not implemented"); + return -EOPNOTSUPP; +} + + +/* PSIF specific extensions */ + +/* Version information details (git revision of driver and firmware etc) */ +static ssize_t show_versioninfo(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + char **fwv = sdev->es[sdev->mbox_epsc].ver.fw_version; + + return snprintf(buf, PAGE_SIZE, "%s - build user %s at %s\n" + "sifdrv git tag:\n%s\n%s\n" + "EPSC firmware: build user %s at %s\nimage revision string %s\n" + "version tag:\n%s\n%s", + sif_version.git_repo, + sif_version.build_user, sif_version.build_git_time, + sif_version.last_commit, + (sif_version.git_status[0] != '\0' ? sif_version.git_psifapi_status : ""), + fwv[FWV_EPS_BUILD_USER], fwv[FWV_EPS_BUILD_GIT_TIME], + fwv[FWV_EPS_REV_STRING], fwv[FWV_EPS_GIT_LAST_COMMIT], + (fwv[FWV_EPS_GIT_STATUS][0] != '\0' ? 
fwv[FWV_EPS_GIT_STATUS] : "")); +} + + +static ssize_t show_resp_ms(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + + return sprintf(buf, "%d\n", jiffies_to_msecs(sdev->min_resp_ticks)); +} + + +static ssize_t set_resp_ms(struct device *device, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + size_t old_val = jiffies_to_msecs(sdev->min_resp_ticks); + size_t new_val; + int ret = kstrtoul(buf, 0, &new_val); + + if (ret || !new_val) + new_val = 1; + sif_log(sdev, SIF_INFO, "%ld ms -> %ld ms", old_val, new_val); + sdev->min_resp_ticks = msecs_to_jiffies(new_val); + return strlen(buf); +} + +static ssize_t show_irq_moderation(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + + return sprintf(buf, "%hu\n", sdev->es[sdev->mbox_epsc].eqs.irq_moderation); +} + +static ssize_t set_irq_moderation(struct device *device, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + u16 old_val = es->eqs.irq_moderation; + u16 new_val; + + int ret = kstrtou16(buf, 0, &new_val); + struct psif_epsc_csr_req req; /* local epsc wr copy */ + struct psif_epsc_csr_rsp resp; + + if (ret || !new_val) + new_val = 0; + + if (eps_version_ge(es, 0, 36)) { + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_HOST_INT_COMMON_CTRL; + req.uf = 0; + req.u.int_common.total_usec = (uintptr_t)new_val; + ret = sif_epsc_wr_poll(sdev, &req, &resp); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to configure device interrupt total moderation\n"); + return ret; + } + es->eqs.irq_moderation = new_val; + sif_log(sdev, SIF_INFO, "Interrupt total moderation: %d usecs -> %d usecs", + old_val, new_val); + return strlen(buf); + } else + return -1; +} + +static ssize_t show_mt_override(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + + switch (sdev->mt_override) { + case SIFMT_BYPASS: + sprintf(buf, "bypass\n"); + break; + case SIFMT_UMEM: + sprintf(buf, "umem (no override)\n"); + break; + case SIFMT_UMEM_SPT: + sprintf(buf, "spt\n"); + break; + case SIFMT_ZERO: + sprintf(buf, "zero\n"); + break; + default: + /* Sanity check for debugging the driver only */ + sprintf(buf, "***undefined***\n"); + break; + } + return strlen(buf); +} + + +static ssize_t set_mt_override(struct device *device, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + + if (strcmp(buf, "bypass\n") == 0) + sdev->mt_override = SIFMT_BYPASS; + else if (strcmp(buf, "umem\n") == 0 || strcmp(buf, "none\n") == 0) + sdev->mt_override = SIFMT_UMEM; + else if (strcmp(buf, "spt\n") == 0) + sdev->mt_override = SIFMT_UMEM_SPT; + else if (strcmp(buf, "zero\n") == 0) + sdev->mt_override = SIFMT_ZERO; + else + return -EINVAL; + return strlen(buf); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(eps_api_ver, S_IRUGO, show_eps_api_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL); +static DEVICE_ATTR(versioninfo, S_IRUGO, show_versioninfo, NULL); +static DEVICE_ATTR(min_resp_ms, S_IWUSR | S_IRUGO, 
show_resp_ms, set_resp_ms); +static DEVICE_ATTR(mt_override, S_IWUSR | S_IRUGO, show_mt_override, set_mt_override); +static DEVICE_ATTR(irq_moderation, S_IWUSR | S_IRUGO, show_irq_moderation, set_irq_moderation); + +static struct device_attribute *sif_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_eps_api_ver, + &dev_attr_hca_type, + &dev_attr_board_id, + &dev_attr_stats, + &dev_attr_versioninfo, + &dev_attr_min_resp_ms, + &dev_attr_mt_override, + &dev_attr_irq_moderation, +}; + +static u64 dev_show(const struct device *device, + struct device_attribute *attr, + char *buf, + int opcode) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + + /* EPSC supports the new requests starting from v.0.43 */ + if (eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 43)) { + int ret = 0; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_QUERY; + req.u.query.data.op = opcode; + ret = sif_epsc_wr(sdev, &req, &rsp); + if (ret) + sif_log(sdev, SIF_INFO, "Failed to query tsu error counter\n"); + else + sprintf(buf, "%llu\n", rsp.data); + } + return strlen(buf); +} + +#define DEVICE_SHOW(field) \ +static ssize_t show_##field(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + return dev_show(dev, attr, buf, EPSC_QUERY_##field); \ +} + +DEVICE_SHOW(SQ_NUM_BRE); +DEVICE_SHOW(NUM_CQOVF); +DEVICE_SHOW(SQ_NUM_WRFE); +DEVICE_SHOW(RQ_NUM_WRFE); +DEVICE_SHOW(RQ_NUM_LAE); +DEVICE_SHOW(RQ_NUM_LPE); +DEVICE_SHOW(SQ_NUM_LLE); +DEVICE_SHOW(RQ_NUM_LLE); +DEVICE_SHOW(SQ_NUM_LQPOE); +DEVICE_SHOW(RQ_NUM_LQPOE); +DEVICE_SHOW(SQ_NUM_OOS); +DEVICE_SHOW(RQ_NUM_OOS); +DEVICE_SHOW(SQ_NUM_RREE); +DEVICE_SHOW(SQ_NUM_TREE); +DEVICE_SHOW(SQ_NUM_ROE); +DEVICE_SHOW(RQ_NUM_ROE); +DEVICE_SHOW(SQ_NUM_RAE); +DEVICE_SHOW(RQ_NUM_RAE); +DEVICE_SHOW(RQ_NUM_UDSDPRD); +DEVICE_SHOW(RQ_NUM_UCSDPRD); +DEVICE_SHOW(SQ_NUM_RIRE); +DEVICE_SHOW(RQ_NUM_RIRE); +DEVICE_SHOW(SQ_NUM_RNR); +DEVICE_SHOW(RQ_NUM_RNR); + +static ssize_t clear_diag(struct device *device, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + + struct sif_dev *sdev = dev_get_drvdata(device); + int ret; + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp resp; + + if (strcmp(buf, "1\n") == 0) { + + memset(&req, 0, sizeof(req)); + memset(&resp, 0, sizeof(resp)); + + req.opcode = EPSC_SET; + req.u.set.data.op = EPSC_QUERY_RESET_CBLD_DIAG_COUNTERS; + req.u.set.data.value = 0xffffff; + ret = sif_epsc_wr_poll(sdev, &req, &resp); + if (ret) + sif_log(sdev, SIF_INFO, "Failed to clear psif diag counters\n"); + } else + return -EINVAL; + + return strlen(buf); +} + +static DEVICE_ATTR(clear_diag, S_IWUSR, NULL, clear_diag); +static DEVICE_ATTR(sq_num_bre, S_IRUGO, show_SQ_NUM_BRE, NULL); +static DEVICE_ATTR(num_cqovf, S_IRUGO, show_NUM_CQOVF, NULL); +static DEVICE_ATTR(sq_num_wrfe, S_IRUGO, show_SQ_NUM_WRFE, NULL); +static DEVICE_ATTR(rq_num_wrfe, S_IRUGO, show_RQ_NUM_WRFE, NULL); +static DEVICE_ATTR(rq_num_lae, S_IRUGO, show_RQ_NUM_LAE, NULL); +static DEVICE_ATTR(rq_num_lpe, S_IRUGO, show_RQ_NUM_LPE, NULL); +static DEVICE_ATTR(sq_num_lle, S_IRUGO, show_SQ_NUM_LLE, NULL); +static DEVICE_ATTR(rq_num_lle, S_IRUGO, show_RQ_NUM_LLE, NULL); +static DEVICE_ATTR(sq_num_lqpoe, S_IRUGO, show_SQ_NUM_LQPOE, NULL); +static DEVICE_ATTR(rq_num_lqpoe, S_IRUGO, show_RQ_NUM_LQPOE, NULL); +static DEVICE_ATTR(sq_num_oos, S_IRUGO, show_SQ_NUM_OOS, NULL); +static DEVICE_ATTR(rq_num_oos, S_IRUGO, show_RQ_NUM_OOS, NULL); +static DEVICE_ATTR(sq_num_rree, S_IRUGO, 
show_SQ_NUM_RREE, NULL); +static DEVICE_ATTR(sq_num_tree, S_IRUGO, show_SQ_NUM_TREE, NULL); +static DEVICE_ATTR(sq_num_roe, S_IRUGO, show_SQ_NUM_ROE, NULL); +static DEVICE_ATTR(rq_num_roe, S_IRUGO, show_RQ_NUM_ROE, NULL); +static DEVICE_ATTR(sq_num_rae, S_IRUGO, show_SQ_NUM_RAE, NULL); +static DEVICE_ATTR(rq_num_rae, S_IRUGO, show_RQ_NUM_RAE, NULL); +static DEVICE_ATTR(rq_num_udsdprd, S_IRUGO, show_RQ_NUM_UDSDPRD, NULL); +static DEVICE_ATTR(rq_num_ucsdprd, S_IRUGO, show_RQ_NUM_UCSDPRD, NULL); +static DEVICE_ATTR(sq_num_rire, S_IRUGO, show_SQ_NUM_RIRE, NULL); +static DEVICE_ATTR(rq_num_rire, S_IRUGO, show_RQ_NUM_RIRE, NULL); +static DEVICE_ATTR(sq_num_rnr, S_IRUGO, show_SQ_NUM_RNR, NULL); +static DEVICE_ATTR(rq_num_rnr, S_IRUGO, show_RQ_NUM_RNR, NULL); + +static struct attribute *sif_diag_counters_class_attributes[] = { + &dev_attr_clear_diag.attr, + &dev_attr_sq_num_bre.attr, + &dev_attr_num_cqovf.attr, + &dev_attr_sq_num_wrfe.attr, + &dev_attr_rq_num_wrfe.attr, + &dev_attr_rq_num_lae.attr, + &dev_attr_rq_num_lpe.attr, + &dev_attr_sq_num_lle.attr, + &dev_attr_rq_num_lle.attr, + &dev_attr_sq_num_lqpoe.attr, + &dev_attr_rq_num_lqpoe.attr, + &dev_attr_sq_num_oos.attr, + &dev_attr_rq_num_oos.attr, + &dev_attr_sq_num_rree.attr, + &dev_attr_sq_num_tree.attr, + &dev_attr_sq_num_roe.attr, + &dev_attr_rq_num_roe.attr, + &dev_attr_sq_num_rae.attr, + &dev_attr_rq_num_rae.attr, + &dev_attr_rq_num_udsdprd.attr, + &dev_attr_rq_num_ucsdprd.attr, + &dev_attr_sq_num_rire.attr, + &dev_attr_rq_num_rire.attr, + &dev_attr_sq_num_rnr.attr, + &dev_attr_rq_num_rnr.attr, + NULL, +}; + +static struct attribute_group diag_counters_attr_group = { + .attrs = sif_diag_counters_class_attributes, + .name = "diag_counters", +}; + +static struct ib_ucontext *sif_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + int ret; + struct sif_dev *sdev = to_sdev(ibdev); + struct sif_ucontext *s_uc; + + s_uc = kzalloc(sizeof(*s_uc), GFP_KERNEL); + if (!s_uc) + return NULL; + + s_uc->pd = alloc_pd(sdev); + if (!s_uc->pd) { + ret = -ENOMEM; + goto alloc_pd_failed; + } + s_uc->pd->ibpd.device = ibdev; + + s_uc->cb = alloc_cb(sdev, false); + if (!s_uc->cb) { + ret = -ENOMEM; + goto alloc_cb_failed; + } + + if (udata) { + struct sif_get_context_ext cmd; + struct sif_get_context_resp_ext resp; + u16 major_ver, minor_ver; + + memset(&cmd, 0, sizeof(cmd)); + ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + + s_uc->abi_version = cmd.abi_version; + major_ver = s_uc->abi_version >> 8; + minor_ver = s_uc->abi_version & 0xff; + if (major_ver != SIF_UVERBS_ABI_MAJOR_VERSION) { + if (major_ver < 10 && major_ver > 0) { + sif_log(sdev, SIF_INFO, + "User verbs abi version mismatch - driver has v.%d.%d - libsif has v.%d.%d", + SIF_UVERBS_ABI_MAJOR_VERSION, SIF_UVERBS_ABI_MINOR_VERSION, + major_ver, minor_ver); + ret = -EINVAL; + goto udata_copy_failed; + } else { + static bool printed; + /* TBD: remove - bw comp - in this case probably not set */ + /* Set to final version that does not report to us */ + if (!printed) { + sif_log(sdev, SIF_INFO, + "Invalid version info - upgrade libsif!"); + printed = true; + } + s_uc->abi_version = SIF_UVERBS_VERSION(3, 1); + } + } + memset(&resp, 0, sizeof(resp)); + resp.sq_sw_ext_sz = sdev->ba[sq_sw].ext_sz; + resp.sq_hw_ext_sz = sdev->ba[sq_hw].ext_sz; + resp.rq_ext_sz = sdev->ba[rq_sw].ext_sz; + resp.cq_ext_sz = sdev->ba[cq_sw].ext_sz; + resp.sq_entry_per_block = sdev->ba[sq_sw].entry_per_block; + resp.rq_entry_per_block = sdev->ba[rq_sw].entry_per_block; + resp.cq_entry_per_block = 
sdev->ba[cq_sw].entry_per_block; + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) + goto udata_copy_failed; + } + + sif_log(sdev, SIF_VERBS_V, " at %p with pd %d used for CQs libsif abi v.%d.%d", + s_uc, s_uc->pd->idx, s_uc->abi_version >> 8, s_uc->abi_version & 0xff); + return &s_uc->ib_uc; + +udata_copy_failed: + release_cb(sdev, s_uc->cb); +alloc_cb_failed: + dealloc_pd(s_uc->pd); +alloc_pd_failed: + kfree(s_uc); + return ERR_PTR(ret); +} + +static int sif_dealloc_ucontext(struct ib_ucontext *ib_uc) +{ + int ret; + u32 pd_idx = 0; + struct sif_dev *sdev = to_sdev(ib_uc->device); + struct sif_ucontext *s_uc = + container_of(ib_uc, struct sif_ucontext, ib_uc); + + sif_logs(SIF_VERBS_V, pd_idx = s_uc->pd->idx); + + ret = dealloc_pd(s_uc->pd); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed (status %d) to deallocate pd %d", ret, s_uc->pd->idx); + return ret; + } + + release_cb(sdev, s_uc->cb); + kfree(s_uc); + sif_log(sdev, SIF_VERBS_V, "at %p done (cq pd index %d)", s_uc, pd_idx); + return 0; +} + + +static int sif_mmap_block(struct sif_ucontext *uc, struct vm_area_struct *vma, + enum sif_tab_type type, u32 index, int vm_flags) +{ + struct sif_dev *sdev = to_sdev(uc->ib_uc.device); + struct sif_table *tp = &sdev->ba[type]; + struct sif_table_block *b; + struct sif_pd *pd; + u64 start, block_sz; + off_t len; + off_t offset; + int ret; + + if (tp->entry_per_block <= 1) { + sif_log(sdev, SIF_INFO, + "Failed to map %s block index %d: direct user access not available with flat_alloc scheme", + sif_table_name(type), index); + return -EPERM; + } + if (tp->block_cnt <= index) { + sif_log(sdev, SIF_INFO, "Failed to map %s block index %d: out of range - block_cnt %d", + sif_table_name(type), index, tp->block_cnt); + return -EINVAL; + } + + b = sif_get_block(tp, index); + pd = b->pd; + if (!pd) { + sif_log(sdev, SIF_INFO, "Failed to map %s block index %d: not allocated", + sif_table_name(type), index); + return -ENODEV; + } + if (pd == uc->pd) + goto pd_ok; /* CQ case */ + + if (!sif_is_user_pd(pd)) { + sif_log(sdev, SIF_INFO, "Failed to map %s block index %d, pd %d - owned by kernel space", + sif_table_name(type), index, pd->idx); + return -EACCES; + } + + /* TBD: Security aspects of XRC domain access + * (in the xrc case, we don't have a user context at the moment) + */ + if (pd->ibpd.uobject && pd->ibpd.uobject->context != &uc->ib_uc) { + sif_log(sdev, SIF_INFO, "Failed to map %s block index %d: belongs to another user context", + sif_table_name(type), index); + return -EACCES; + } +pd_ok: + block_sz = tp->ext_sz * tp->entry_per_block; + len = vma->vm_end - vma->vm_start; + if (block_sz != len) { + sif_log(sdev, SIF_INFO, "Failed to map %s block index %d: Expected map len %lld, got %ld", + sif_table_name(type), index, + block_sz, len); + return -EINVAL; + } + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_flags |= vm_flags; + start = vma->vm_start; + + offset = block_sz * index; + + ret = sif_mem_vma_map_part(tp->mem, vma, offset, len); + if (ret) + return ret; + + /* TBD: ehca uses a vm_operations_struct and vma->private_data to ref.count + * but MLX does not - is it necessary? + * Also remap_pfn_range requires the mm sema to be held, but other drivers dont take it + * - is it already held by the caller here? 
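/*
 * A minimal sketch of the length contract enforced by sif_mmap_block()
 * above: user space is expected to map exactly one block, whose size and
 * offset follow from the ext_sz and entry_per_block values returned in
 * sif_get_context_resp_ext. The helper names are illustrative only.
 */
static inline u64 example_block_map_len(u32 ext_sz, u32 entry_per_block)
{
	/* must equal vma->vm_end - vma->vm_start, or the mmap is rejected */
	return (u64)ext_sz * entry_per_block;
}

static inline u64 example_block_map_offset(u64 block_sz, u32 index)
{
	/* byte offset of block @index within the underlying table memory */
	return block_sz * index;
}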
+ */ + return 0; +} + + +static int sif_mmap_cb(struct sif_ucontext *uc, struct vm_area_struct *vma, u32 index) +{ + struct sif_dev *sdev = to_sdev(uc->ib_uc.device); + struct sif_cb *cb = sif_cb_from_uc(uc, index); + off_t len; + dma_addr_t cb_start; + int ret; + + if (!cb) { + sif_log(sdev, SIF_INFO, "Failed to associate cb %d with context", index); + return -EINVAL; + } + + len = vma->vm_end - vma->vm_start; + if (len != PAGE_SIZE) { + sif_log(sdev, SIF_INFO, "Failed to map cb index %d: Expected map len %ld, got %ld", + index, PAGE_SIZE, len); + return -EINVAL; + } + cb_start = pci_resource_start(sdev->pdev, SIF_CBU_BAR) + index * PAGE_SIZE; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_flags |= VM_WRITE; + ret = io_remap_pfn_range(vma, vma->vm_start, cb_start >> PAGE_SHIFT, + PAGE_SIZE, vma->vm_page_prot); + if (ret) + sif_log(sdev, SIF_INFO, "io_remap_pfn_range failed with %d", ret); + return ret; +} + + +#define def_map_queue(type) \ +static int sif_mmap_##type(struct sif_ucontext *uc, struct vm_area_struct *vma, u32 index)\ +{\ + struct sif_dev *sdev = to_sdev(uc->ib_uc.device);\ + struct sif_##type *type;\ + u64 q_sz;\ + off_t len;\ + \ + type = safe_get_sif_##type(sdev, index);\ + if (!type) {\ + sif_log(sdev, SIF_INFO, "Failed to map " #type \ + " index %d out of range", index);\ + sif_log(sdev, SIF_INFO, "%p : %p", sdev->ba[type##_hw].bitmap, sdev->ba[qp].bitmap);\ + return -EINVAL;\ + } \ + \ + q_sz = type->mem->size;\ + len = vma->vm_end - vma->vm_start;\ + if (q_sz < len) {\ + sif_log(sdev, SIF_INFO, "Failed to map " #type " index %d: "\ + "Expected map req for <= %lld bytes, got %ld", index, q_sz, len);\ + return -EINVAL;\ + } \ + \ + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;\ + vma->vm_flags |= VM_READ|VM_WRITE;\ + \ + return sif_mem_vma_map_part(type->mem, vma, 0, len);\ +} + +def_map_queue(sq) +def_map_queue(rq) +def_map_queue(cq) + +static int sif_mmap(struct ib_ucontext *ib_uc, struct vm_area_struct *vma) +{ + enum sif_mmap_cmd cmd; + u32 index; + struct sif_dev *sdev = to_sdev(ib_uc->device); + struct sif_ucontext *s_uc = to_sctx(ib_uc); + + mmap_get_cmd(vma->vm_pgoff << PAGE_SHIFT, &cmd, &index); + + sif_log(sdev, SIF_MMAP, + "pg offset 0x%lx start 0x%lx, end 0x%lx len 0x%lx, flags 0x%lx index %d", + vma->vm_pgoff, vma->vm_start, vma->vm_end, vma->vm_end - vma->vm_start, + vma->vm_flags, index); + + switch (cmd) { + case SIF_MAP_SQ_SW: + return sif_mmap_block(s_uc, vma, sq_sw, index, VM_READ|VM_WRITE); + case SIF_MAP_RQ_SW: + return sif_mmap_block(s_uc, vma, rq_sw, index, VM_READ|VM_WRITE); + case SIF_MAP_CQ_SW: + return sif_mmap_block(s_uc, vma, cq_sw, index, VM_READ|VM_WRITE); + case SIF_MAP_SQ_HW: + return sif_mmap_block(s_uc, vma, sq_hw, index, VM_READ); + case SIF_MAP_RQ_HW: + return sif_mmap_block(s_uc, vma, rq_hw, index, VM_READ); + case SIF_MAP_CQ_HW: + return sif_mmap_block(s_uc, vma, cq_hw, index, VM_READ); + case SIF_MAP_CB: + return sif_mmap_cb(s_uc, vma, index); + case SIF_MAP_SQ: + return sif_mmap_sq(s_uc, vma, index); + case SIF_MAP_RQ: + return sif_mmap_rq(s_uc, vma, index); + case SIF_MAP_CQ: + return sif_mmap_cq(s_uc, vma, index); + default: + break; + } + sif_log(sdev, SIF_MMAP, "cmd %d not implemented", cmd); + return -EOPNOTSUPP; +} + +static int sif_get_protocol_stats(struct ib_device *ibdev, + union rdma_protocol_stats *stats) +{ + struct sif_dev *sdev = to_sdev(ibdev); + + sif_log(sdev, SIF_VERBS, "Not implemented"); + return -EOPNOTSUPP; +} + + +static enum rdma_link_layer sif_get_link_layer(struct ib_device *ibdev, 
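/*
 * For readability, this is approximately what def_map_queue(sq) above
 * expands to after preprocessing (formatting and comments added here;
 * the logic comes verbatim from the macro body):
 */
static int example_expanded_sif_mmap_sq(struct sif_ucontext *uc,
					struct vm_area_struct *vma, u32 index)
{
	struct sif_dev *sdev = to_sdev(uc->ib_uc.device);
	struct sif_sq *sq;
	u64 q_sz;
	off_t len;

	/* range-checked lookup of the send queue by index */
	sq = safe_get_sif_sq(sdev, index);
	if (!sq) {
		sif_log(sdev, SIF_INFO, "Failed to map sq index %d out of range", index);
		sif_log(sdev, SIF_INFO, "%p : %p", sdev->ba[sq_hw].bitmap, sdev->ba[qp].bitmap);
		return -EINVAL;
	}

	/* the queue memory must be large enough to back the requested mapping */
	q_sz = sq->mem->size;
	len = vma->vm_end - vma->vm_start;
	if (q_sz < len) {
		sif_log(sdev, SIF_INFO, "Failed to map sq index %d: "
			"Expected map req for <= %lld bytes, got %ld", index, q_sz, len);
		return -EINVAL;
	}

	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
	vma->vm_flags |= VM_READ|VM_WRITE;

	return sif_mem_vma_map_part(sq->mem, vma, 0, len);
}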
u8 port_num) +{ + struct sif_dev *sdev = to_sdev(ibdev); + + sif_log(sdev, SIF_VERBS, "returns IB_LINK_LAYER_INFINIBAND for port %d", port_num); + return IB_LINK_LAYER_INFINIBAND; +} + +static int sif_port_callback(struct ib_device *ibdev, u8 portno, struct kobject *obj) +{ + struct sif_dev *sdev = to_sdev(ibdev); + + sif_log(sdev, SIF_VERBS, "port %d", portno); + return 0; +} + +static inline struct ib_cq *sif_ib_create_cq(struct ib_device *ibdev, int cqe, + int comp_vector, struct ib_ucontext *context, + struct ib_udata *udata) +{ + return sif_create_cq(ibdev, cqe, comp_vector, context, udata, SIFPX_OFF); +} + +/* putting this function here to avoid sif_epsc.h from being rdma/ib_verbs.h dependent */ +static int sif_eps_wr_ex(struct ib_device *ibdev, enum psif_mbox_type eps_num, + struct psif_epsc_csr_req *req, struct psif_epsc_csr_rsp *cqe) +{ + struct sif_dev *sdev = to_sdev(ibdev); + + return sif_eps_wr(sdev, eps_num, req, cqe); + +} + +int sif_register_ib_device(struct sif_dev *sdev) +{ + int ret = 0; + int i; + struct ib_device *dev = &sdev->ib_dev; + struct psif_epsc_device_attr epsdev; + + /* We need to do a query_device to get the node_guid */ + ret = epsc_query_device(sdev, &epsdev); + if (ret) + return ret; + + strlcpy(dev->name, "sif%d", IB_DEVICE_NAME_MAX); + + dev->owner = THIS_MODULE; + dev->uverbs_abi_ver = SIF_UVERBS_ABI_VERSION; + + /* SIF supported user verbs */ + dev->uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_MODIFY_AH) | + (1ull << IB_USER_VERBS_CMD_QUERY_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_REG_SMR) | + (1ull << IB_USER_VERBS_CMD_REREG_MR) | + (1ull << IB_USER_VERBS_CMD_QUERY_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_BIND_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_PEEK_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV) + | (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | + (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD) | + (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP) + | (1ull << IB_USER_VERBS_CMD_ALLOC_SHPD) | + (1ull << IB_USER_VERBS_CMD_SHARE_PD) + ; + + dev->get_protocol_stats = sif_get_protocol_stats; + + dev->query_device = sif_query_device; + dev->modify_device = sif_modify_device; + + dev->query_port = sif_query_port; + dev->modify_port = sif_modify_port; + + dev->get_link_layer = sif_get_link_layer; + dev->query_gid = sif_query_gid; + 
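/*
 * A tiny sketch of how a command mask like the one built above is
 * typically consumed: the uverbs core only dispatches a user command
 * whose bit is set in the device's uverbs_cmd_mask.
 */
static inline bool example_uverbs_cmd_enabled(u64 cmd_mask, unsigned int cmd)
{
	return (cmd_mask & (1ull << cmd)) != 0;
}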
dev->query_pkey = sif_query_pkey; + + dev->alloc_ucontext = sif_alloc_ucontext; + dev->dealloc_ucontext = sif_dealloc_ucontext; + dev->mmap = sif_mmap; + + dev->alloc_pd = sif_alloc_pd; + dev->dealloc_pd = sif_dealloc_pd; + dev->create_ah = sif_create_ah; + dev->destroy_ah = sif_destroy_ah; + dev->query_ah = sif_query_ah; + + dev->create_srq = sif_create_srq; + dev->modify_srq = sif_modify_srq; + dev->query_srq = sif_query_srq; + dev->destroy_srq = sif_destroy_srq; + + dev->create_qp = sif_create_qp; + dev->modify_qp = sif_modify_qp; + dev->query_qp = sif_query_qp; + dev->destroy_qp = sif_destroy_qp; + + dev->post_send = sif_post_send; + dev->post_recv = sif_post_recv; + dev->post_srq_recv = sif_post_srq_recv; + + dev->create_cq = sif_ib_create_cq; + dev->destroy_cq = sif_destroy_cq; + dev->resize_cq = sif_resize_cq; + dev->poll_cq = sif_poll_cq; + dev->peek_cq = sif_peek_cq; + dev->req_notify_cq = sif_req_notify_cq; + dev->req_ncomp_notif = sif_req_ncomp_notif; + + dev->get_dma_mr = sif_get_dma_mr; + dev->reg_phys_mr = sif_reg_phys_mr; + dev->rereg_phys_mr = sif_rereg_phys_mr; + dev->reg_user_mr = sif_reg_user_mr; + dev->dereg_mr = sif_dereg_mr; + dev->query_mr = sif_query_mr; + + dev->alloc_fmr = sif_alloc_fmr; + dev->map_phys_fmr = sif_map_phys_fmr; + dev->unmap_fmr = sif_unmap_phys_fmr_list; + dev->dealloc_fmr = sif_dealloc_fmr; + + dev->attach_mcast = sif_multicast_attach; + dev->detach_mcast = sif_multicast_detach; + + /* All our mad handling happens via the normal QP0 paths + * this function is for devices which implements the SMA + * in software: + */ + dev->process_mad = NULL; + + dev->alloc_xrcd = sif_alloc_xrcd; + dev->dealloc_xrcd = sif_dealloc_xrcd; + dev->alloc_shpd = sif_alloc_shpd; + dev->share_pd = sif_share_pd; + dev->remove_shpd = sif_remove_shpd; + + dev->node_guid = cpu_to_be64(epsdev.node_guid); + + snprintf(dev->node_desc, sizeof(dev->node_desc), "sif_%s", + init_utsname()->nodename); + + dev->node_type = RDMA_NODE_IB_CA; + dev->phys_port_cnt = sdev->limited_mode ? 
0 : epsdev.phys_port_cnt; + dev->num_comp_vectors = sdev->es[sdev->mbox_epsc].eqs.cnt - 2; + + ret = ib_register_device(dev, sif_port_callback); + if (ret) { + sif_log(sdev, SIF_VERBS, "Fail to register IB device: error %d", + -ret); + goto err_ibreg; + } + + for (i = 0; i < ARRAY_SIZE(sif_class_attributes); ++i) { + ret = device_create_file(&dev->dev, sif_class_attributes[i]); + if (ret) { + sif_log(sdev, SIF_VERBS, + "Fail to register with sysfs: error %d!", -ret); + goto err_sysfsreg; + } + } + + /* Diag_counters */ + ret = sysfs_create_group(&dev->dev.kobj, &diag_counters_attr_group); + if (ret) { + sif_log(sdev, SIF_VERBS, + "Fail to register diag_counters with sysfs: error %d!", -ret); + goto err_sysfsreg; + } + + /* Populate the external kernel API (see sif_verbs.h): */ + sdev->sv.eps_wr = sif_eps_wr_ex; + sdev->sv.create_cq = sif_create_cq; + sdev->ib_dev.local_dma_lkey = sdev->dma_mr->index; + + sdev->registered = true; + sif_log(sdev, SIF_VERBS_V, "%s registered with IB", sdev->ib_dev.name); + return 0; + +err_sysfsreg: + ib_unregister_device(dev); +err_ibreg: + sif_log(sdev, SIF_INFO, "Exit - error %d", -ret); + return ret; +} + +void sif_unregister_ib_device(struct sif_dev *sdev) +{ + struct ib_device *ibdev = &sdev->ib_dev; + + sdev->registered = false; + ib_unregister_device(ibdev); + sif_logi(ibdev, SIF_VERBS, "done unregistering device"); +} diff --git a/drivers/infiniband/hw/sif/sif_ireg.h b/drivers/infiniband/hw/sif/sif_ireg.h new file mode 100644 index 0000000000000..724b6df9c6e19 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_ireg.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_ireg.h: support functions used in setup of sif as an IB HCA + */ + +#ifndef __SIF_IREG_H +#define __SIF_IREG_H + +/* User context of a user level ib call */ +struct sif_ucontext { + struct ib_ucontext ib_uc; + struct sif_pd *pd; /* A protection domain for completion queues */ + struct sif_cb *cb; /* The collect buffer for the user process */ + u32 abi_version; /* User level library's abi version */ +}; + +static inline struct sif_ucontext *to_sctx(struct ib_ucontext *context) +{ + return container_of(context, struct sif_ucontext, ib_uc); +} + +int sif_register_ib_device(struct sif_dev *sdev); +void sif_unregister_ib_device(struct sif_dev *sdev); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_main.c b/drivers/infiniband/hw/sif/sif_main.c new file mode 100644 index 0000000000000..1890a1a6cb651 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_main.c @@ -0,0 +1,635 @@ +/* + * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
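/*
 * A minimal sketch of the container_of pattern used by to_sctx() in
 * sif_ireg.h above: the IB core hands back a pointer to the embedded
 * struct ib_ucontext, and the driver recovers its private wrapper by
 * subtracting the member offset. The struct and helper names here are
 * illustrative only.
 */
struct example_wrapper {
	int private_state;
	struct ib_ucontext ib_uc;	/* embedded core object */
};

static inline struct example_wrapper *example_to_wrapper(struct ib_ucontext *uc)
{
	return container_of(uc, struct example_wrapper, ib_uc);
}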
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_main.c: main entry points and initialization + */ + +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include +#include +#include "sif_dev.h" +#include "sif_fwa.h" +#include "sif_mmu.h" +#include "sif_mr.h" +#include "sif_hwi.h" +#include "sif_r3.h" +#include "sif_vf.h" +#include "sif_pt.h" +#include "sif_ireg.h" +#include "sif_debug.h" +#include "psif_hw_csr.h" +#include "version.h" +#include + + +#define PSIF_VERSION_STR "0.1.0.6+" + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Oracle SIF Infiniband HCA driver"); +MODULE_VERSION(PSIF_VERSION_STR); +MODULE_AUTHOR("Knut Omang"); + +/* The device(s) we support */ + +static const struct pci_device_id pci_table[] = { + {PCI_DEVICE(PCI_VENDOR_ID_SUN, PCI_DEVICE_ID_PSIF_PF)}, + {PCI_DEVICE(PCI_VENDOR_ID_SUN, PCI_DEVICE_ID_PSIF_VF)}, + {PCI_DEVICE(PCI_VENDOR_ID_SUN, PCI_DEVICE_ID_SN1_PF)}, + {PCI_DEVICE(PCI_VENDOR_ID_SUN, PCI_DEVICE_ID_SN1_VF)}, + {0,} +}; + +MODULE_DEVICE_TABLE(pci, pci_table); + +/* module entry points */ +static int __init sif_init(void); +static void __exit sif_exit(void); + +/* device entry points */ +static int sif_probe(struct pci_dev *pdev, + const struct pci_device_id *id); +static void sif_remove(struct pci_dev *dev); + +static int sif_suspend(struct pci_dev *dev, pm_message_t state) +{ + struct sif_dev *sdev = pci_get_drvdata(dev); + + sif_log(sdev, SIF_INFO, " "); + return 0; +} + +static int sif_resume(struct pci_dev *dev) +{ + struct sif_dev *sdev = pci_get_drvdata(dev); + + sif_log(sdev, SIF_INFO, " "); + return 0; +} + +static void sif_shutdown(struct pci_dev *dev) +{ + struct sif_dev *sdev = pci_get_drvdata(dev); + + sif_log(sdev, SIF_INFO, " "); +} + +static struct pci_driver sif_driver = { + .name = "sif", + .id_table = pci_table, + .probe = sif_probe, + .remove = sif_remove, + .suspend = sif_suspend, + .resume = sif_resume, + .shutdown = sif_shutdown, + .sriov_configure = sif_vf_enable, +}; + +/* Driver parameters: */ + +ulong sif_debug_mask = 0x3; +module_param_named(debug_mask, sif_debug_mask, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(debug_mask, "Selective enabling of debugging output to the system log"); + +#ifdef SIF_TRACE_MASK +ulong sif_trace_mask = 0x0; +module_param_named(trace_mask, sif_trace_mask, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(trace_mask, "Selective enabling of debugging output to the ftrace buffer"); +#endif + +ulong sif_feature_mask = 0; +module_param_named(feature_mask, sif_feature_mask, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(feature_mask, "Selective enabling of sif driver features"); + +ulong sif_vendor_flags = 0; +module_param_named(vendor_flags, sif_vendor_flags, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(vendor_flags, "Selective enabling of sif driver vendor specific mode flags"); + +uint sif_max_pqp_wr = SIF_SW_MAX_SQE; +module_param_named(max_pqp_wr, sif_max_pqp_wr, uint, S_IRUGO); +MODULE_PARM_DESC(max_pqp_wr, "Maximum number of outstanding privileged QP requests supported"); + +uint sif_ki_spqp_size = 1; +module_param_named(ki_spqp_size, sif_ki_spqp_size, uint, S_IRUGO); +MODULE_PARM_DESC(ki_spqp_size, "Number of privileged QPs for key invalidate stencils to set up"); + +/* pqp_size == cq_eq_max */ +uint sif_cq_eq_max = 12; +module_param_named(cq_eq_max, sif_cq_eq_max, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(cq_eq_max, "Upper limit on no. 
of EQs to distribute completion events among"); + +uint sif_cb_max = 100; +module_param_named(cb_max, sif_cb_max, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(cb_max, "Upper limit on no. of CBs."); + +/* TBD - This is a debug feature to evaluate performance. */ +ushort sif_perf_sampling_threshold = 100; +module_param_named(perf_sampling_threshold, sif_perf_sampling_threshold, ushort, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(perf_sampling_threshold, "The performance measurement based on every N samples"); + +uint sif_fmr_cache_flush_threshold = 512; +module_param_named(fmr_cache_flush_threshold, sif_fmr_cache_flush_threshold, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(fmr_cache_flush_threshold, "PF limit for when to use fast-path full MMU flush for FMR unmap"); + + +/* In principle, SIF can allow any max inline size but at the cost of more memory + * allocated per QP. This variable sets the upper limit for any QP by defining + * the max extent of the sq entries, which means that the real max size is slightly + * less, depending on the max number of sges requested: + */ +uint sif_max_inline = 0x400; +module_param_named(max_inline, sif_max_inline, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(max_inline, "Max configurable inline data per QP"); + +uint sif_vf_en = 1; +module_param_named(vf_en, sif_vf_en, uint, S_IRUGO); +MODULE_PARM_DESC(vf_en, "If set to 0, refuse to load VF drivers"); + +ulong sif_eps_log_size = 0; +module_param_named(eps_log_size, sif_eps_log_size, ulong, S_IRUGO); +MODULE_PARM_DESC(eps_log_size, "Enable log redirection - value is size of log buffer to allocate"); + +ushort sif_eps_log_level = EPS_LOG_INFO; +module_param_named(eps_log_level, sif_eps_log_level, ushort, S_IRUGO); +MODULE_PARM_DESC(eps_log_level, "Level of logging to set for EPS redirect at load"); + +static int sif_bar_init(struct pci_dev *pdev); +static void sif_bar_deinit(struct pci_dev *pdev); + + +static int sif_set_check_max_payload(struct sif_dev *sdev) +{ + struct pci_dev *parent; + u16 devctl, devcap, pdevctl, pdevcap; + int pcie_cap, pcie_parent_cap, min_cap_mps, err; + + u8 payload_sz, payload_sz_cap; + u8 parent_payload_sz, parent_payload_sz_cap; + + pcie_cap = pci_find_capability(sdev->pdev, PCI_CAP_ID_EXP); + + /* read PSIF max payload size capability and setting */ + err = pci_read_config_word(sdev->pdev, pcie_cap + PCI_EXP_DEVCTL, &devctl); + if (err) + return err; + + payload_sz = (devctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5; + + err = pci_read_config_word(sdev->pdev, pcie_cap + PCI_EXP_DEVCAP, &devcap); + if (err) + return err; + + payload_sz_cap = (devcap & PCI_EXP_DEVCAP_PAYLOAD); + + if (sif_feature(max_supported_payload)) { + parent = pci_upstream_bridge(sdev->pdev); + if (!parent) { + sif_log(sdev, SIF_INFO, + "No parent bridge device, cannot determine atomic capabilities!"); + return PSIF_PCIE_ATOMIC_OP_NONE; + } + + pcie_parent_cap = pci_find_capability(parent, PCI_CAP_ID_EXP); + if (!pcie_parent_cap) { + sif_log(sdev, SIF_INFO, + "Unable to find any PCIe capability in parent device - assuming payload size is ok"); + return 0; + } + + /* read root complex (port) max payload size */ + err = pci_read_config_word(parent, pcie_parent_cap + PCI_EXP_DEVCTL, &pdevctl); + if (err) + return err; + + err = pci_read_config_word(parent, pcie_parent_cap + PCI_EXP_DEVCAP, &pdevcap); + if (err) + return err; + + parent_payload_sz = (pdevctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5; + parent_payload_sz_cap = (pdevcap & PCI_EXP_DEVCAP_PAYLOAD); + + min_cap_mps = min(parent_payload_sz_cap, payload_sz_cap); + + /* adjusting 
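/*
 * A small helper sketch for the encoding handled above: the 3-bit max
 * payload size field in PCI_EXP_DEVCTL encodes 128 << n bytes, so
 * 0 -> 128, 1 -> 256, ... 5 -> 4096.
 */
static inline unsigned int example_mps_to_bytes(u16 devctl)
{
	u8 encoded = (devctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5;

	return 128U << encoded;
}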
the RC max payload size to the supported max payload size */ + if (parent_payload_sz != min_cap_mps) { + sif_log(sdev, SIF_INFO, + "Adjusting RC max payload sz to %d\n", 128 << parent_payload_sz_cap); + err = pci_write_config_word(parent, + pcie_parent_cap + PCI_EXP_DEVCTL, + (pdevctl & ~PCI_EXP_DEVCTL_PAYLOAD) + (min_cap_mps << 5)); + } + + /* Adjusting the max payload size to the supported max payload size */ + if (payload_sz != min_cap_mps) { + sif_log(sdev, SIF_INFO, + "Adjusting max payload sz to %d\n", 128 << parent_payload_sz_cap); + err = pci_write_config_word(sdev->pdev, + pcie_cap + PCI_EXP_DEVCTL, + (devctl & ~PCI_EXP_DEVCTL_PAYLOAD) + (min_cap_mps << 5)); + } + + if (min_cap_mps == 0) { + sif_log(sdev, SIF_INFO, + "PCI express max payload size is set to 128 which triggers a rev1 bug"); + } + } + return err; +} + +/* Entry of new instance */ +static int sif_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + int err = 0; + + /* TBD: Zeroed memory from ib_alloc_device? */ + struct sif_dev *sdev = + (struct sif_dev *)ib_alloc_device(sizeof(struct sif_dev)); + if (!sdev) { + err = -ENOMEM; + goto pfail_ib_alloc; + } + + sdev->pdev = pdev; + sdev->dfs = NULL; + sdev->fw_vfs = -1; /* #of VFS enabled in firmware not known yet */ + sdev->ib_dev.dma_device = &pdev->dev; + sdev->limited_mode = sif_feature(force_limited_mode) ? true : false; + + strlcpy(sdev->ib_dev.name, "sif%d", IB_DEVICE_NAME_MAX); + + pci_set_drvdata(pdev, sdev); + sif_log(sdev, SIF_INFO, + "%s found, device id 0x%x, subsystem id 0x%x, revision %d, at 0x%p", + get_product_str(sdev), PSIF_DEVICE(sdev), + PSIF_SUBSYSTEM(sdev), PSIF_REVISION(sdev), sdev); + + sdev->wq = create_singlethread_workqueue(sdev->ib_dev.name); + if (!sdev->wq) { + sif_log(sdev, SIF_INFO, "Failed to allocate kernel work queue"); + err = -ENOMEM; + goto wq_fail; + } + + err = sif_set_check_max_payload(sdev); + if (err) + goto wq_fail; + + /* Ask PCI drivers to enable the device and set up BARs etc */ + err = pci_enable_device_mem(pdev); + if (err) + goto pfail_enable; + + /* Check if 64 bits DMA is supported */ + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (!err) { + sif_log(sdev, SIF_INIT, "64 bit DMA supported"); + pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + } else { + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (!err) { + sif_log(sdev, SIF_INIT, "32 bit DMA supported"); + pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + } else { + sif_log(sdev, SIF_INIT, "No DMA support!?"); + goto pfail_dma; + } + } + + pci_enable_pcie_error_reporting(pdev); + + /* Set up BAR access */ + err = sif_bar_init(pdev); + if (err) + goto pfail_bar; + + if (xen_pv_domain()) { + /* The Xen PV domain may return huge pages that are misaligned + * in DMA space, see Orabug: 21690736. + * Also we have to turn off the inline sge optimization, as it assumes + * that (guest) physical and DMA addresses are equal, which is not + * the case for the PV domain - see Orabug: 23012335. 
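/*
 * The 64-bit-then-32-bit DMA mask probing in sif_probe() below can also be
 * written with dma_set_mask_and_coherent(), which sets the streaming and
 * coherent masks in one call. This is only a sketch of that newer idiom,
 * not what this patch uses:
 */
static int example_set_dma_masks(struct pci_dev *pdev)
{
	if (!dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)))
		return 0;	/* full 64-bit DMA available */

	/* fall back to 32-bit addressing */
	return dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
}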
+ */ + sif_log(sdev, SIF_INFO, "xen pv domain: Restricting resource allocation.."); + sif_feature_mask |= SIFF_no_huge_pages | SIFF_disable_inline_first_sge; + sif_qp_size = min(sif_qp_size, 0x1000U); + sif_mr_size = min(sif_mr_size, 0x1000U); + sif_ah_size = min(sif_ah_size, 0x1000U); + sif_cq_size = min(sif_cq_size, 0x1000U); + sif_rq_size = min(sif_rq_size, 0x1000U); + sif_max_pqp_wr = min(sif_max_pqp_wr, 0x1000U); + } + + /* Timeout scaling factor: + * This value is used as a factor to calculate sensible + * timeout values throughout the driver: + */ + sdev->min_resp_ticks = SIF_HW_TIMEOUT; + /* Type UMEM means no override - initialize */ + sdev->mt_override = SIFMT_UMEM; + + err = sif_dfs_register(sdev); + if (err) + goto pfail_dfs; + + /* PSIF initialization */ + err = sif_hw_init(sdev); + if (err) + goto pfail_psif_base; + + err = sif_fwa_register(sdev); + if (err) + goto fwa_reg_failed; + + /* Reserve key 0 as an invalid key for sanity checking + * See #3323 for details + */ + sdev->dma_inv_mr = sif_alloc_invalid_mr(sdev->pd); + if (IS_ERR(sdev->dma_inv_mr)) { + err = PTR_ERR(sdev->dma_inv_mr); + goto pfail_dma_inv_mr; + } + + /* Create a DMA MR (mapping the whole address space) + * for use with the local_dma_lkey + */ + sdev->dma_mr = create_dma_mr(sdev->pd, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + + if (IS_ERR(sdev->dma_mr)) { + err = PTR_ERR(sdev->dma_mr); + goto pfail_dma_mr; + } + + if (PSIF_REVISION(sdev) <= 3) { + err = sif_r3_init(sdev); + if (err) + goto pfail_r3_init; + } + + /* Successful device init */ + + err = sif_register_ib_device(sdev); + if (err) + goto pfail_ibreg; + + /* Now that an IB device name exists, create a symlink in debugfs */ + sif_dfs_link_to_ibdev(sdev); + + + sif_log(sdev, SIF_INFO, "Successfully probed and set up device"); + return 0; +pfail_ibreg: + sif_r3_deinit(sdev); +pfail_r3_init: + sif_dealloc_mr(sdev, sdev->dma_mr); +pfail_dma_mr: + sif_dealloc_mr(sdev, sdev->dma_inv_mr); +pfail_dma_inv_mr: + sif_fwa_unregister(sdev); +fwa_reg_failed: + sif_hw_deinit(sdev); +pfail_psif_base: + sif_dfs_unregister(sdev); +pfail_dfs: + sif_bar_deinit(pdev); +pfail_bar: + pci_disable_pcie_error_reporting(pdev); +pfail_dma: + pci_disable_device(pdev); +pfail_enable: + destroy_workqueue(sdev->wq); +wq_fail: + ib_dealloc_device(&sdev->ib_dev); +pfail_ib_alloc: + sif_log0(SIF_INIT, "sif_probe failed with status %d\n", err); + return err; +} + +/* Exit of instance */ +static void sif_remove(struct pci_dev *dev) +{ + struct sif_dev *sdev = pci_get_drvdata(dev); + + sif_log0(SIF_INIT, "Enter: sif_remove"); + + sif_vf_disable(sdev); + + sif_unregister_ib_device(sdev); + sif_r3_deinit(sdev); + sif_dealloc_mr(sdev, sdev->dma_mr); + sif_dealloc_mr(sdev, sdev->dma_inv_mr); + sif_fwa_unregister(sdev); + sif_hw_deinit(sdev); + sif_dfs_unregister(sdev); + sif_bar_deinit(dev); + pci_clear_master(dev); + pci_disable_device(dev); + flush_workqueue(sdev->wq); + destroy_workqueue(sdev->wq); + ib_dealloc_device(&sdev->ib_dev); + sif_log0(SIF_INIT, "exit sif_remove"); +} + +static int sif_bar_init(struct pci_dev *pdev) +{ + struct sif_dev *sdev = pci_get_drvdata(pdev); + int err; + phys_addr_t start; + size_t length; + + /* Request access to the device space in BAR0 for this driver */ + err = pci_request_region(pdev, SIF_CBU_BAR, "sif_cb"); + if (err) { + sif_log(sdev, SIF_INIT, "Failed to request cb region"); + goto pfail_bar0; + } + + /* Then map all of it to allow access */ + start = pci_resource_start(pdev, SIF_CBU_BAR); + + /* This should 
not happen - kernel or BIOS bug? + * TBD: Check this from the CPU ID? (M bit?) + */ + if (start > (1ULL << 52)) { + sif_log(sdev, SIF_INIT, + "pci_resource_start returned a physical address beyond CPU max phys.addr (%llx)", + start); + err = -ENOMEM; + goto pfail_ioremap0; + } + + length = pci_resource_len(pdev, SIF_CBU_BAR); + + sdev->cbu_mtrr = -1; /* Avoid attempt to free mtrr 0 */ + + /* + * Need iomap_wc() in order to get write-combining to work, + * even when using explicit write-combining instructions. + */ + sdev->cb_base = ioremap_wc(start, length); + if (!sdev->cb_base) { + sif_log(sdev, SIF_INIT, + "ioremap_wc - failed to map cb BAR (start %llx len %lx)", + start, length); + err = -ENOMEM; + goto pfail_ioremap0; + } + sdev->cb_sz = length; + + sif_log(sdev, SIF_INIT, "BAR%d (cb) mapped at kva %p start %llx len %lx", + SIF_CBU_BAR, sdev->cb_base, start, length); + + err = pci_request_region(pdev, SIF_MSIX_BAR, "sif_msix"); + if (err) { + sif_log(sdev, SIF_INIT, "Failed to request msix region"); + goto pfail_bar2; + } + + start = pci_resource_start(pdev, SIF_MSIX_BAR); + length = pci_resource_len(pdev, SIF_MSIX_BAR); + sdev->msi_base = ioremap_nocache(start, length); + if (!sdev->msi_base) { + sif_log(sdev, SIF_INIT, + "ioremap_nocache - failed to map msix BAR%d (start %llx len %lx)", + SIF_MSIX_BAR, start, length); + err = -ENOMEM; + goto pfail_ioremap2; + } + sdev->msi_sz = length; + sif_log(sdev, SIF_INIT, "BAR%d (msix) mapped at kva %p start %llx len %lx", + SIF_MSIX_BAR, sdev->msi_base, start, length); + + err = pci_request_region(pdev, SIF_EPS_BAR, "sif_csr"); + if (err) { + sif_log(sdev, SIF_INIT, "Failed to request eps region"); + goto pfail_bar4; + } + + start = pci_resource_start(pdev, SIF_EPS_BAR); + length = pci_resource_len(pdev, SIF_EPS_BAR); + sdev->eps_base = ioremap_nocache(start, length); + if (!sdev->eps_base) { + sif_log(sdev, SIF_INIT, "Failed to map eps BAR%d (start %llx len %lx)", + SIF_EPS_BAR, start, length); + err = -ENOMEM; + goto pfail_ioremap4; + } + sdev->eps = (struct __iomem psif_pcie_mbox *)sdev->eps_base; + sdev->eps_sz = length; + + sif_log(sdev, SIF_INIT, "BAR%d (eps) mapped at kva %p start %llx len %lx", + SIF_EPS_BAR, sdev->eps, start, length); + return 0; + +pfail_ioremap4: + pci_release_region(pdev, SIF_EPS_BAR); +pfail_bar4: + iounmap(sdev->msi_base); +pfail_ioremap2: + pci_release_region(pdev, SIF_CBU_BAR); +pfail_bar2: + iounmap(sdev->cb_base); +pfail_ioremap0: +#ifdef CONFIG_X86 + if (sdev->cbu_mtrr >= 0) + mtrr_del(sdev->cbu_mtrr, + pci_resource_start(pdev, SIF_CBU_BAR), + pci_resource_len(pdev, SIF_CBU_BAR)); +#endif + pci_release_region(pdev, SIF_MSIX_BAR); +pfail_bar0: + return err; +} + +static void sif_bar_deinit(struct pci_dev *pdev) +{ + struct sif_dev *sdev = pci_get_drvdata(pdev); + + iounmap(sdev->eps); + pci_release_region(pdev, 4); + iounmap(sdev->msi_base); + pci_release_region(pdev, 2); + iounmap(sdev->cb_base); +#ifdef CONFIG_X86 + if (sdev->cbu_mtrr >= 0) + mtrr_del(sdev->cbu_mtrr, + pci_resource_start(pdev, SIF_CBU_BAR), + pci_resource_len(pdev, SIF_CBU_BAR)); +#endif + pci_release_region(pdev, 0); +} + + + +/* Statically register this driver with the kernel */ + +static int __init sif_init(void) +{ + int stat = 0; + + sif_log0(SIF_INFO, "**** Oracle development driver - internal use only! 
****"); + sif_log0(SIF_INFO, "%s - build user %s at %s", sif_version.git_repo, + sif_version.build_user, sif_version.build_git_time); + sif_log0(SIF_INFO, "sifdrv git tag:\n%s", sif_version.last_commit); + if (sif_version.git_status[0] != '\0') + sif_log0(SIF_INFO, " *** sifdrv git status at build time: ***\n%s", sif_version.git_status); + sif_log0(SIF_INFO, "psifapi git tag:\n%s", sif_version.last_psifapi_commit); + if (sif_version.git_psifapi_status[0] != '\0') + sif_log0(SIF_INFO, " *** psifapi git status at build time ***\n%s", + sif_version.git_psifapi_status); + + sif_log0(SIF_INIT, "hw header release \"%s\"", PSIF_RELEASE_STR); + sif_log0(SIF_INIT, "built for PSIF version %d.%d, EPSC API version %d.%d", + PSIF_MAJOR_VERSION, PSIF_MINOR_VERSION, EPSC_MAJOR_VERSION, EPSC_MINOR_VERSION); + sif_log0(SIF_INIT, "sif debug mask 0x%lx", sif_debug_mask); + if (sif_feature_mask) { + u64 undef = sif_feature_mask & ~SIFF_all_features; + + if (undef) { + sif_log0(SIF_INFO, + "***** Invalid feature mask - undefined bits %llx - get rid of legacy bits!", + undef); + return -EINVAL; + } + sif_log0(SIF_INFO, "sif feature mask 0x%lx", sif_feature_mask); + } + + stat = sif_pt_init(); + if (stat) + goto pt_init_failed; + + stat = sif_fwa_init(); + if (stat) + goto fwa_init_failed; + + return pci_register_driver(&sif_driver); + +fwa_init_failed: + sif_pt_exit(); +pt_init_failed: + return stat; +} + +static void __exit sif_exit(void) +{ + sif_fwa_exit(); + pci_unregister_driver(&sif_driver); + sif_pt_exit(); + sif_log0(SIF_INIT, "done unregistering"); +} + +module_init(sif_init); +module_exit(sif_exit); diff --git a/drivers/infiniband/hw/sif/sif_mem.c b/drivers/infiniband/hw/sif/sif_mem.c new file mode 100644 index 0000000000000..2f2629b116d38 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_mem.c @@ -0,0 +1,1109 @@ +/* + * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_mem.c: SIF table memory and page table management + */ + +#include +#include +#include +#include +#include "sif_dev.h" +#include "sif_mem.h" +#include "sif_dma.h" +#include "sif_pt.h" + +/* Defined below */ +static int sif_mem_fixup_dma(struct scatterlist *sg); + +/* Initialization of global per device info */ +void sif_mem_init(struct sif_dev *sdev) +{ + struct sif_mem_info *mi = &sdev->mi; + + if (sif_feature(toggle_page_size)) { + mi->page_shift = PAGE_SHIFT == 12 ? 13 : 12; + mi->page_size = PAGE_SIZE == 0x1000 ? 
0x2000 : 0x1000; + } else { + mi->page_shift = PAGE_SHIFT; + mi->page_size = PAGE_SIZE; + } + mi->level_shift = 9; + mi->max_shift = mi->page_shift + mi->level_shift * PT_LEVELS; + mi->ptes_per_page = 1 << mi->level_shift; + mi->page_mask = ~(mi->page_size - 1); +} + +/* Some utilities */ + +inline size_t mem_type_to_page_shift(struct sif_dev *sdev, enum sif_mem_type mem_type) +{ + switch (mem_type) { + case SIFMT_2M: + return sdev->mi.page_shift + sdev->mi.level_shift; + default: + return sdev->mi.page_shift; + } +} + + +static u32 sif_mem_fmr_max_page_shift(struct sif_mem *mem) +{ + struct sif_dev *sdev = mem->sdev; + u32 max_shift = sdev->mi.max_shift; + u64 end = 0; + u32 bits = sizeof(dma_addr_t) << 3; + int i; + u64 incr = 1 << mem->m.fmr.page_shift; + + BUG_ON(mem->mem_type != SIFMT_FMR); + + for (i = 0; i < mem->m.fmr.page_list_len; i++) { + u64 next_addr = mem->m.fmr.page_list[i]; + + if (end && end != next_addr) { + unsigned long border = end | next_addr; + u32 shift = find_first_bit(&border, bits); + + if (shift < max_shift) { + sif_log(sdev, SIF_MEM_V, + "%4d: start 0x%llx, sz 0x%llx, prev.end 0x%llx shift %d -> %d", + i, next_addr, incr, end, max_shift, shift); + max_shift = shift; + if (max_shift == mem->m.fmr.page_shift) /* No point in continuing */ + break; + } + } + end = next_addr + incr; + } + sif_log(sdev, SIF_MEM_SG, "found max shift %d from inspecting %d sges", max_shift, i); + return max_shift; +} + + +/* Calculate the max.possible page_shift for this memory + * based on alignment of the DMA + */ +static u32 sif_mem_max_page_shift(struct sif_mem *mem) +{ + struct sif_dev *sdev = mem->sdev; + u32 max_shift = sdev->mi.max_shift; + u64 end = 0; + u32 bits = sizeof(dma_addr_t) << 3; + u32 sg_cnt = 0; + + struct scatterlist *sg = sif_mem_get_sgl(mem); + + if (!sg) + return sdev->mi.page_shift; + for (; sg; sg = sg_next(sg)) { + u64 dma_start = sg_dma_address(sg); + + sg_cnt++; +#ifdef __sparc__ + /* TBD: Fix bug in umem: + * SG lists are not always properly terminated + */ + if (!sg_dma_len(sg)) + break; +#endif + if (end && end != dma_start) { + unsigned long border = end | dma_start; + u32 shift = find_first_bit(&border, bits); + + if (shift < max_shift) { + sif_log(sdev, SIF_MEM_V, + "%4d: start 0x%llx, sz %x, prev.end 0x%llx shift %d -> %d", + sg_cnt, dma_start, sg_dma_len(sg), end, max_shift, shift); + max_shift = shift; + if (max_shift == sdev->mi.page_shift) /* No point in continuing */ + break; + /* BUG_ON(max_shift < sdev->mi.page_shift); */ + if (max_shift < sdev->mi.page_shift) { + sif_log(sdev, SIF_INFO, + "Failed to find a valid page shift: max_shift %d sdev->mi.page_shift %d", + max_shift, sdev->mi.page_shift); + return max_shift; + } + } + } + end = sg_dma_address(sg) + sg_dma_len(sg); + } + sif_log(sdev, SIF_MEM_SG, "found max shift %d from inspecting %d sges", max_shift, sg_cnt); + return max_shift; +} + +/* External observer: + * Return the largest page size (represented by page shift bits) usable for this memory + */ +u32 sif_mem_page_shift(struct sif_mem *mem) +{ + /* If a maximum has been calculated, use it: */ + if (mem->max_page_shift) + return mem->max_page_shift; + return mem_type_to_page_shift(mem->sdev, mem->mem_type); +} + +static struct scatterlist *sg_alloc_list(struct sif_dev *sdev, unsigned int nelems, gfp_t flag) +{ + struct scatterlist *sg = sif_kmalloc(sdev, sizeof(struct scatterlist) * nelems, flag); + + if (sg) { + sif_log0(SIF_MMU, "start at %p, %d elems allocated", sg, nelems); + sg_init_table(sg, nelems); + } + return sg; +} + + +/* 
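/*
 * A minimal sketch of the alignment trick used by the max_page_shift
 * helpers above: at each discontinuity, the largest page size that can
 * still cover both sides is bounded by the lowest set bit of
 * (previous_end | next_start), i.e. the coarsest power-of-two boundary
 * the two addresses share. __ffs64()/find_first_bit() give that bit index.
 */
static inline unsigned int example_max_shift_at_border(u64 prev_end, u64 next_start)
{
	u64 border = prev_end | next_start;

	/* a zero border means the segments are contiguous: no constraint */
	return border ? (unsigned int)__ffs64(border) : 64;
}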
API for managing a sif_kmem object */ + +/** Allocate a set of pages of size (1 << page_shift). + * Prepare for scatterlist(s) of fixed length @sg_size (in number of elements) + * and allocate an initial @sz bytes (must be multiple of 1 << page_shift) + * @sz must be less than what fits within the initial scatterlist. + * If sg_size is 0, figure out the optimal sg_size. + */ +int sif_kmem_init(struct sif_dev *sdev, struct sif_kmem *kmem, size_t sg_size, size_t sz, + u32 page_shift, gfp_t flag, enum dma_data_direction dir) +{ + int ret; + + memset(kmem, 0, sizeof(*kmem)); + kmem->page_shift = page_shift; + + if (!sg_size) + sg_size = sz >> page_shift; + kmem->sg_size = sg_size; + kmem->dir = dir; + kmem->sg_max = 0; /* Indicates an empty list with no end mark set yet */ + + if (sz == 0) + return 0; + + ret = sif_kmem_extend(sdev, kmem, sz, flag); + if (ret < 0) + return ret; + + return 0; +} + + +static void sif_kmem_free_pages(struct sif_kmem *kmem, struct scatterlist *sg, u32 nelems) +{ + int i; + int order = kmem->page_shift - PAGE_SHIFT; + + for (i = 0; i < nelems; i++) { + __free_pages(sg_page(sg), order); + sg = sg_next(sg); + } +} + + +static void sif_kmem_free_sgls(struct sif_kmem *kmem, struct scatterlist *sgl, u32 nlists) +{ + for (; nlists > 0; nlists--) { + struct scatterlist *nsgl = sg_chain_ptr(&sgl[kmem->sg_size]); + + kfree(sgl); + sgl = nsgl; + } +} + +/* Find the @n'th scatterlist array within kmem */ +static struct scatterlist *sif_kmem_find_sg_head_idx(struct sif_kmem *kmem, u32 n) +{ + int i = 0; + struct scatterlist *sgl = kmem->sg; + + for (; n > i; i++) + sgl = sg_chain_ptr(&sgl[kmem->sg_size]); + return sgl; +} + + +/* Find the scatterlist element with index idx within kmem */ +struct scatterlist *sif_kmem_find_sg_idx(struct sif_kmem *kmem, u32 idx) +{ + struct scatterlist *sgl; + int n = idx / kmem->sg_size; + + sgl = sif_kmem_find_sg_head_idx(kmem, n); + return &sgl[idx % kmem->sg_size]; +} + + +void sif_kmem_free(struct sif_dev *sdev, struct sif_kmem *kmem) +{ + int npages = kmem->sg_max - kmem->sg_start; + struct scatterlist *sg = sif_kmem_find_sg_idx(kmem, kmem->sg_start); + + ib_dma_unmap_sg(&sdev->ib_dev, sg, npages, kmem->dir); + + sif_kmem_free_pages(kmem, sg, npages); + sif_kmem_free_sgls(kmem, sg, kmem->nlists); + kmem->sg = NULL; +} + + +/* Extend a kmem object by allocating more sg entries if necessary, then + * allocate pages and dma map them. 
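/*
 * A sketch of the index arithmetic behind sif_kmem_find_sg_idx() below:
 * each chained array holds sg_size usable entries plus one chain entry,
 * so a global element index splits into an array number and a slot within
 * that array. The helper name is illustrative only.
 */
static inline void example_split_sg_index(u32 idx, u32 sg_size,
					  u32 *array_no, u32 *slot)
{
	*array_no = idx / sg_size;	/* which chained scatterlist array */
	*slot = idx % sg_size;		/* entry within that array */
}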
The invariant upon exit is that + * all allocated pages are dma mapped, which means that we must + * clean up pages that did not get mapped, if mapping fails midway: + */ + +int sif_kmem_extend(struct sif_dev *sdev, struct sif_kmem *kmem, size_t sz, gfp_t flag) +{ + u32 i; + int ret; + int order; + struct page *page; + struct scatterlist *sg; + struct scatterlist *sg_prev = NULL; + struct scatterlist *sg_start = NULL; + size_t page_size = 1UL << kmem->page_shift; + u64 page_mask = page_size - 1; + u32 sg_size = (sz + page_mask) >> kmem->page_shift; + + u32 nl = kmem->nlists; + long free_sg = nl * kmem->sg_size - kmem->sg_max; + + sif_log(sdev, SIF_MEM, "enter, kmem at %p, sz 0x%lx", kmem, sz); + + /* Make room in sg list */ + for (; free_sg < sg_size; free_sg += kmem->sg_size) { + sg = sg_alloc_list(sdev, kmem->sg_size + 1, flag); + if (!sg) { + ret = -ENOMEM; + goto failed; + } + if (kmem->last_sg) + sg_chain(kmem->last_sg, kmem->sg_size + 1, sg); + else + kmem->sg = sg; + kmem->last_sg = sg; + kmem->nlists++; + } + + /* The end mark is always in the last used element, not the first available one + * which sg_max points to: + */ + if (kmem->sg_max) { + sg_prev = sif_kmem_find_sg_idx(kmem, kmem->sg_max - 1); + sg_unmark_end(sg_prev); + sg = sg_next(sg_prev); + } else + sg = sif_kmem_find_sg_idx(kmem, 0); + + sg_start = sg; + order = kmem->page_shift - PAGE_SHIFT; + + /* Allocate the new memory */ + for (i = 0; i < sg_size; i++) { + sif_log(sdev, SIF_MEM_V, "i = %d, sg %p", i, sg); + page = sif_alloc_pages(sdev, flag | __GFP_ZERO, order); + if (!page) { + ret = -ENOMEM; + sg_size = i; + sg_mark_end(sg); + goto map_failed; + } + BUG_ON(!sg); + sg_set_page(sg, page, page_size, 0); + sg_prev = sg; + sg = sg_next(sg); + } + sg_mark_end(sg_prev); + + ret = ib_dma_map_sg(&sdev->ib_dev, sg_start, sg_size, kmem->dir); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "ib_dma_map_sg failed with %d", ret); + ret = -EFAULT; + goto map_failed; + } + + sif_logs(SIF_PT_VV, sif_dump_sg(sg_start)); + + /* TBD: Remove this when issues with wrong alignments of DMA addresses + * has been resolved (both Sparc and OVM, see Orabug: 21690736 + * For 2M seg_size, check that all DMA addresses are 2M aligned: + */ + if (page_size >= PMD_SIZE) { + for (sg = sg_start, i = 0; sg != NULL; sg = sg_next(sg), i++) { + if (sg_dma_address(sg) & ~PMD_MASK) { + sif_log(sdev, SIF_INFO, + "**** Orabug: 21690736 - aligned PA maps to unaligned IOVA: i = %d, pa %llx dma %pad", + i, + (u64)sg_phys(sg), &sg_dma_address(sg)); + ret = -EIO; + goto map_failed; + } + sif_log(sdev, SIF_MEM_V, "i = %d, pa %llx dma %pad", i, + (u64)sg_phys(sg), &sg_dma_address(sg)); + } + } + + /* To enable direct lookup, we rely on the s/g list not being + * collapsed by dma mapping. This holds on x86 but eg. on sparc we see + * collapsed lists where the IOMMU delivers the whole DMA range in a single entry + * at the start. 
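/*
 * A small sketch of the alignment test applied below for 2M-page mappings:
 * a DMA address can only back a huge-page leaf if it is PMD (2M) aligned.
 */
static inline bool example_iova_is_pmd_aligned(dma_addr_t iova)
{
	return (iova & ~PMD_MASK) == 0;
}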
Handle this case too by rewriting the DMA list + * to comply with our needs, otherwise fail (and dump the sg list to the trace buffer + * for analysis): + */ + if (sg_size != ret) { + if (ret == 1) { + sif_log(sdev, SIF_MEM, "Fixing up collapsed sg list (%d/%d)", + ret, sg_size); + ret = sif_mem_fixup_dma(sg_start); + if (ret) + goto map_failed; + sif_logs(SIF_PT_VV, sif_dump_sg(sg_start)); + } else { + /* This should not happen, but sanity check it anyway */ + sif_log(sdev, SIF_INFO, + "** Detected unhandled layout of s/g list (%d/%d) **", + ret, sg_size); + ret = -EPROTOTYPE; + goto map_failed; + } + } + i = kmem->sg_max; + kmem->sg_max += ret; + kmem->size += sz; + return i; +map_failed: + sif_dump_sg(sg_start); + if (sg_size) + sif_kmem_free_pages(kmem, sg_start, sg_size); +failed: + return ret; +} + + +/* Map a part of the @kmem object given by @offset, @size to the user space + * vm context given in @vma. The part must be page aligned and page sized: + */ + +static int sif_kmem_vma_map_part(struct sif_dev *sdev, struct sif_kmem *kmem, struct vm_area_struct *vma, + off_t start_off, size_t size) +{ + off_t sg_index = start_off >> kmem->page_shift; + u64 page_size = 1 << kmem->page_shift; + u64 page_mask = (page_size - 1); + off_t off = start_off & page_mask; /* start offset within mem page */ + off_t sz = min_t(off_t, size, page_size - off); + struct scatterlist *sg; + dma_addr_t pfn, sg_phy; + u64 start = vma->vm_start; + u64 rem = size; + int ret; + + BUG_ON(off & ~PAGE_MASK); + + sg = sif_kmem_find_sg_idx(kmem, sg_index); + + sif_log(sdev, SIF_MMAP, "size %lx, off %lx start sg idx: %ld", + size, off, sg_index); + + for (; rem > 0; sg = sg_next(sg)) { + sg_phy = sg_phys(sg); + pfn = (sg_phy + off) >> PAGE_SHIFT; + sif_log(sdev, SIF_MMAP, "pfn %pad, sz %lx sg_phys %pad off %lx", + &pfn, sz, &sg_phy, off); + ret = remap_pfn_range(vma, start, pfn, sz, vma->vm_page_prot); + if (ret) + return ret; + rem -= sz; + start += sz; + sz = min(rem, page_size); + off = 0; + } + return 0; +} + + +static int sif_vma_map_sg_part(struct sif_dev *sdev, struct scatterlist *sg, + struct vm_area_struct *vma, off_t start_off, size_t size) +{ + u64 start = vma->vm_start; + off_t off = start_off; + dma_addr_t pfn, sg_phy; + off_t rem = size; + off_t sz; + int ret; + + BUG_ON(off & ~PAGE_MASK); + + sif_log(sdev, SIF_MMAP, "size %lx, off %lx", + size, start_off); + + while (off > sg->length) { + off -= sg->length; + sg = sg_next(sg); + } + sz = min_t(off_t, rem, sg->length - off); + + for (;;) { + sg_phy = sg_phys(sg); + pfn = (sg_phy + off) >> PAGE_SHIFT; + sif_log(sdev, SIF_MMAP, "pfn %pad, sz %lx sg_phys %pad off %lx", + &pfn, sz, &sg_phy, off); + ret = remap_pfn_range(vma, start, pfn, sz, vma->vm_page_prot); + if (ret) + return ret; + rem -= sz; + start += sz; + off = 0; + if (rem <= 0) + break; + sg = sg_next(sg); + sz = min_t(off_t, rem, sg->length); + } + return 0; +} + + +/* Remove a set of sg entries from the list starting at page index sg_idx + * and unlink from the linked list. + * + * We have to make sure we maintain consistency for index lookups, + * so no scatterlist vectors can be deleted from the middle of the list, + * only head and tail removal is allowed, + * and if we remove scatterlists from the head of the list, we must update the offset. + */ + +int sif_kmem_shrink(struct sif_dev *sdev, struct sif_kmem *kmem, int sg_idx, size_t size) +{ + /* TBD: Implement this! 
*/ + return -EOPNOTSUPP; +} + + +/************************************ + * API for managing different higher level (scatter) memory segment abstractions + * used by SIF: + */ + +/* Set up a sif_mem structure for handling a memory + * segment of initial size @size. + */ +struct sif_mem *sif_mem_create(struct sif_dev *sdev, size_t sg_size, + size_t size, enum sif_mem_type mem_type, + gfp_t flag, enum dma_data_direction dir) +{ + int ret; + u32 page_shift = mem_type_to_page_shift(sdev, mem_type); + struct sif_mem *mem = kzalloc(sizeof(*mem), flag); + + if (!mem) + return NULL; + + BUG_ON(mem_type != SIFMT_2M && mem_type != SIFMT_4K); + + + ret = sif_kmem_init(sdev, &mem->m.km, sg_size, + size, page_shift, flag, dir); + if (ret) + goto failed; + + mem->sdev = sdev; + mem->size = size; + mem->mem_type = mem_type; + mem->max_page_shift = 0; + return mem; +failed: + kfree(mem); + return NULL; +} + +/* Create a sif_mem object from an umem object (User level memory) + * The sif_mem object resumes ownership of the umem: + */ +struct sif_mem *sif_mem_create_umem(struct sif_dev *sdev, + struct ib_umem *umem, + enum sif_mem_type mem_type, + gfp_t flag, enum dma_data_direction dir) +{ + struct sif_mem *mem; + u64 dma_addr; + + if (mem_type != SIFMT_BYPASS && !umem) { + sif_log(sdev, SIF_INFO, "Invalid umem setup"); + return NULL; + } + mem = kzalloc(sizeof(*mem), flag); + if (!mem) + return NULL; + + BUG_ON(!umem); + BUG_ON(mem_type != SIFMT_UMEM && + mem_type != SIFMT_UMEM_RO && + mem_type != SIFMT_BYPASS); + + mem->sdev = sdev; + mem->m.u.umem = umem; + mem->size = umem->length; + mem->mem_type = mem_type; + + /* See commit eeb8461e - sg chain safe impl of umem in 3.15 */ + mem->m.u.sg = umem->sg_head.sgl; + mem->m.u.start_offset = umem->address & ~PAGE_MASK; + mem->vmap_base = (void *)umem->address; + mem->max_page_shift = sif_mem_max_page_shift(mem); + dma_addr = sg_dma_address(mem->m.u.sg); + sif_log(sdev, SIF_MEM, "vaddr %p, sg dma start 0x%llx, umem start_offset %llx", + mem->vmap_base, dma_addr, mem->m.u.start_offset); + if (umem->nmap < umem->npages) { + int ret; + + sif_log(sdev, SIF_MEM, "Fixing up collapsed sg list (%d/%d)", + umem->nmap, umem->npages); + sif_logs(SIF_MEM, sif_dump_sg(mem->m.u.sg)); + ret = sif_mem_fixup_dma(mem->m.u.sg); + if (ret) { + sif_log(sdev, SIF_INFO, "sg list fixup failed"); + sif_dump_sg(mem->m.u.sg); + kfree(mem); + return NULL; + } + } + sif_logs(SIF_PT_VV, sif_dump_sg(mem->m.u.sg)); + return mem; +} + +/* Create a sif_mem object from a phys array of length @num_phys + * The phys array is owned by caller: + */ +struct sif_mem *sif_mem_create_phys(struct sif_dev *sdev, void *kvaddr, + struct ib_phys_buf *phys_buf, int num_phys, + gfp_t flag) +{ + int i; + u64 size = 0; + struct sif_mem *mem = kzalloc(sizeof(*mem), flag); + + if (!mem) + return NULL; + + mem->sdev = sdev; + mem->m.phys.phys_buf = phys_buf; + mem->m.phys.phys_buf_len = num_phys; + for (i = 0; i < num_phys; i++) { + sif_log(sdev, SIF_MMU_V, "phys_buf addr 0x%llx size 0x%llx", + phys_buf[i].addr, phys_buf[i].size); + size += phys_buf[i].size; + } + /* TBD: We could calculate this above but phys_mr is scheduled to be removed */ + mem->max_page_shift = 0; + mem->vmap_base = kvaddr; + mem->size = size; + mem->mem_type = SIFMT_PHYS; + return mem; +} + +struct sif_mem *sif_mem_create_fmr(struct sif_dev *sdev, size_t max_pages, u32 page_shift, + gfp_t flag) +{ + size_t size = max_pages << page_shift; + struct sif_mem *mem = sif_mem_create_ref(sdev, SIFMT_PTONLY, 0, size, flag); + + if (mem) + 
mem->m.fmr.page_shift = page_shift; + sif_log(sdev, SIF_FMR, "page_shift %d, size 0x%lx", page_shift, size); + return mem; +} + +/* Create a sif_mem object from a memory pointer array of length @num_pages + * The memory pointer array is owned by caller: + */ +int sif_mem_map_fmr(struct sif_mem *mem, u64 iova, + u64 *page_list, int num_pages) +{ + u64 actual_size = num_pages << mem->m.fmr.page_shift; + + if (iova & (mem->m.fmr.page_shift - 1)) { + sif_log(mem->sdev, SIF_INFO, "Misaligned FMR start - iova 0x%llx", iova); + return -EINVAL; + } + if (actual_size > mem->size) { + /* This is really now an artificial limit for us, except for performance */ + sif_log(mem->sdev, SIF_INFO, "Attempt to map 0x%llx bytes, max for this FMR is 0x%llx", + actual_size, mem->size); + return -ENOMEM; + } + mem->vmap_base = (void *)iova; + mem->m.fmr.page_list = page_list; + mem->m.fmr.page_list_len = num_pages; + mem->mem_type = SIFMT_FMR; + + /* We save the max mem size to be able to restore it later */ + mem->m.fmr.max_size = mem->size; + mem->size = actual_size; + mem->max_page_shift = sif_mem_fmr_max_page_shift(mem); + return 0; +} + +void sif_mem_unmap_fmr(struct sif_mem *mem) +{ + mem->vmap_base = NULL; + mem->size = mem->m.fmr.max_size; + mem->m.fmr.page_list = NULL; + mem->m.fmr.page_list_len = 0; + mem->mem_type = SIFMT_PTONLY; +} + +/* Create a sif_mem object mapped dma contiguous, suitable for + * BYPASS mapping (size constraints..) + */ +struct sif_mem *sif_mem_create_dmacont(struct sif_dev *sdev, size_t size, + gfp_t flag, enum dma_data_direction dir) +{ + struct sif_mem *mem = kzalloc(sizeof(*mem), flag); + dma_addr_t dma_handle; + struct scatterlist *sg; + + if (!mem) + return NULL; + + /* The __GFP_DMA32 bit is not supported by page_alloc in all kernels */ + if (unlikely(flag & __GFP_DMA32)) { + u64 dma_addr; + + mem->vmap_base = ib_dma_alloc_coherent(&sdev->ib_dev, size, + &dma_addr, flag); + dma_handle = dma_addr; + mem->m.u.flags = SMF_DMA32; + } else + mem->vmap_base = sif_dma_alloc_aligned(&sdev->ib_dev, size, &dma_handle, + flag, dir); + if (!mem->vmap_base) + goto dma_alloc_failed; + mem->sdev = sdev; + mem->mem_type = SIFMT_BYPASS; + mem->max_page_shift = sdev->mi.max_shift; + mem->size = size; + mem->m.u.dir = dir; + mem->m.u.umem = NULL; + sg = mem->m.u.sg = &mem->m.u.sg0; + sg_init_one(sg, mem->vmap_base, mem->size); + sg->dma_address = dma_handle; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = mem->size; +#endif + return mem; +dma_alloc_failed: + kfree(mem); + return NULL; +} + + +/* Create a sif_mem object with no own memory backing - to use for CB, SQ_CMPL and + * kernel full passthrough cases to have a "shallow" mem object: + */ +struct sif_mem *sif_mem_create_ref(struct sif_dev *sdev, enum sif_mem_type mem_type, + u64 sif_vaddr, size_t size, gfp_t flag) +{ + struct sif_mem *mem = kzalloc(sizeof(*mem), flag); + + if (!mem) + return NULL; + + BUG_ON(mem_type != SIFMT_PTONLY && mem_type != SIFMT_NOMEM && mem_type != SIFMT_CS); + + mem->sdev = sdev; + mem->mem_type = mem_type; + mem->vmap_base = (void *)sif_vaddr; + mem->size = size; + mem->max_page_shift = 0; + return mem; +} + + +/* Free a sif_mem previously created with sif_mem_create */ +int sif_mem_free(struct sif_mem *mem) +{ + switch (mem->mem_type) { + case SIFMT_2M: + case SIFMT_4K: + sif_kmem_free(mem->sdev, &mem->m.km); + break; + case SIFMT_BYPASS: + /* BYPASS mode can be used from kernel or user space + * If umem is set, it is a user space mapping: + */ + if (!mem->m.u.umem) { + if (mem->m.u.flags & SMF_DMA32) + 
ib_dma_free_coherent(&mem->sdev->ib_dev, mem->size, + mem->vmap_base, sif_mem_dma(mem, 0)); + else + sif_dma_free_aligned(&mem->sdev->ib_dev, mem->size, + mem->vmap_base, sif_mem_dma(mem, 0), mem->m.u.dir); + } + /* Deliberate fall-through */ + case SIFMT_UMEM: + case SIFMT_UMEM_RO: + if (mem->m.u.umem) + ib_umem_release(mem->m.u.umem); + break; + default: + break; /* Nothing extra to do */ + } + kfree(mem); + return 0; +} + + +/* Allocate some (more) memory for this sif_mem + * Return a pointer to the start of that memory and increase ref.cnt for the sif_mem + */ +int sif_mem_extend(struct sif_mem *mem, size_t size, gfp_t flag) +{ + int sg_idx; + + if (mem->mem_type != SIFMT_2M && mem->mem_type != SIFMT_4K) + return -EINVAL; + + sg_idx = sif_kmem_extend(mem->sdev, &mem->m.km, size, flag); + mem->size = mem->m.km.size; + return sg_idx; +} + +/* Free a subrange of this memory object starting at @sg and dereference the + * sif_mem object. Assumes there is no other references to this subrange: + */ +int sif_mem_shrink(struct sif_mem *mem, int sg_idx, size_t size) +{ + int ret; + + if (mem->mem_type != SIFMT_2M && mem->mem_type != SIFMT_4K) + return -EINVAL; + + ret = sif_kmem_shrink(mem->sdev, &mem->m.km, sg_idx, size); + mem->size = mem->m.km.size; + return ret; +} + + +bool sif_mem_has_umem(struct sif_mem *mem) +{ + switch (mem->mem_type) { + case SIFMT_UMEM: + case SIFMT_UMEM_RO: + case SIFMT_BYPASS: + return mem->m.u.umem != NULL; + default: + break; + } + return false; +} + + +/* Find kernel virtual address at @offset within map */ +void *sif_mem_kaddr(struct sif_mem *mem, off_t offset) +{ + switch (mem->mem_type) { + case SIFMT_2M: + case SIFMT_4K: + { + off_t off = offset & ((1 << mem->m.km.page_shift) - 1); + u32 i = offset >> mem->m.km.page_shift; + struct scatterlist *sg = sif_kmem_find_sg_idx(&mem->m.km, i); + + return sg_virt(sg) + off; + } + case SIFMT_BYPASS: + case SIFMT_UMEM: + case SIFMT_UMEM_RO: + case SIFMT_NOMEM: + case SIFMT_PHYS: + case SIFMT_FMR: + return mem->vmap_base + offset; + default: + break; + } + + sif_log(mem->sdev, SIF_INFO, "Not implemented for type %d", + mem->mem_type); + return NULL; +} + +/* Find DMA address at @offset within map */ +dma_addr_t sif_mem_dma(struct sif_mem *mem, off_t offset) +{ + switch (mem->mem_type) { + case SIFMT_PTONLY: + return offset; + case SIFMT_2M: + case SIFMT_4K: + { + off_t off = offset & ((1 << mem->m.km.page_shift) - 1); + u32 i = offset >> mem->m.km.page_shift; + struct scatterlist *sg = sif_kmem_find_sg_idx(&mem->m.km, i); + + return sg_dma_address(sg) + off; + } + case SIFMT_BYPASS: + return sg_dma_address(mem->m.u.sg) + offset; + case SIFMT_UMEM: + case SIFMT_UMEM_RO: + { + struct scatterlist *sg = mem->m.u.sg; + /* umem objects have page aligned sg lists but may start at an offset */ + offset += mem->m.u.start_offset; + while (sg && offset >= sg->length) { + offset -= sg->length; + sg = sg_next(sg); + } + return sg_dma_address(sg) + offset; + } + case SIFMT_PHYS: + { + struct ib_phys_buf *pb = mem->m.phys.phys_buf; + + while (offset >= pb->size) { + offset -= pb->size; + pb++; + } + return pb->addr + offset; + } + case SIFMT_FMR: + { + u32 pageno = offset >> mem->m.fmr.page_shift; + off_t off = offset & ((1 << mem->m.fmr.page_shift) - 1); + + return mem->m.fmr.page_list[pageno] + off; + } + default: + break; + } + + sif_log(mem->sdev, SIF_INFO, "Not implemented for type %d", + mem->mem_type); + BUG(); + return 0ull; +} + + +struct scatterlist *sif_mem_get_sgl(struct sif_mem *mem) +{ + switch (mem->mem_type) { + case 
SIFMT_UMEM: + case SIFMT_UMEM_RO: + case SIFMT_BYPASS: + return mem->m.u.sg; + case SIFMT_2M: + case SIFMT_4K: + return mem->m.km.sg; + default: + sif_log(mem->sdev, SIF_INFO, "unsupported memory type %d", mem->mem_type); + break; + } + return NULL; +} + + +/* If map is continuous, get start of dma mapping + * otherwise return an error pointer: + */ +dma_addr_t sif_mem_dma_if_cont(struct sif_mem *mem) +{ + struct scatterlist *sg; + size_t sz = 1 << sif_mem_max_page_shift(mem); + + if (sz < mem->size) { + sif_log(mem->sdev, SIF_INFO, + "size: %lld - max possible page sz %ld: mmu bypass not possible", + mem->size, sz); + return (u64)ERR_PTR(-EPERM); + } + sg = sif_mem_get_sgl(mem); + if (unlikely(!sg)) + return (u64)ERR_PTR(-EINVAL); + return sg_dma_address(sg); +} + + +int sif_mem_vma_map_part(struct sif_mem *mem, struct vm_area_struct *vma, + off_t start_off, size_t size) +{ + switch (mem->mem_type) { + case SIFMT_2M: + case SIFMT_4K: + return sif_kmem_vma_map_part(mem->sdev, &mem->m.km, vma, start_off, size); + case SIFMT_BYPASS: + case SIFMT_BYPASS_RO: + return sif_vma_map_sg_part(mem->sdev, mem->m.u.sg, vma, start_off, size); + default: + sif_log(mem->sdev, SIF_INFO, "not implemented for mem.type %d", mem->mem_type); + return -EOPNOTSUPP; + } +} + + +/* Map the memory referenced by @mem to the user space vma */ +int sif_mem_vma_map(struct sif_mem *mem, struct vm_area_struct *vma) +{ + return sif_mem_vma_map_part(mem, vma, 0, mem->size); +} + +/* sif_mem iterator support (mainly for the types that do not expose a scatterlist) */ + +int sif_mem_iter_init(struct sif_mem *mem, struct sif_mem_iter *it) +{ + it->mem = mem; + switch (mem->mem_type) { + case SIFMT_PHYS: + case SIFMT_FMR: + case SIFMT_PTONLY: + it->phys.i = 0; + break; + default: + it->sg = sif_mem_get_sgl(mem); + if (!it->sg) + return -EINVAL; + } + it->offset = 0; + return 0; +} + + +int sif_mem_iter_advance(struct sif_mem_iter *it, u64 incr) +{ + switch (it->mem->mem_type) { + case SIFMT_PHYS: + { + long left = it->mem->m.phys.phys_buf[it->phys.i].size - it->offset; + + if (left > incr) + it->offset += incr; + else { + it->offset = incr - left; + it->phys.i++; + } + if (it->phys.i >= it->mem->m.phys.phys_buf_len) + return -ENOMEM; + return 0; + } + case SIFMT_FMR: + { + long page_size = 1 << it->mem->m.fmr.page_shift; + long left = page_size - it->offset; + + if (left > incr) + it->offset += incr; + else { + it->offset = incr - left; + it->phys.i++; + } + if (it->phys.i >= it->mem->m.fmr.page_list_len) + return -ENOMEM; + return 0; + } + case SIFMT_PTONLY: + it->offset += incr; + if (it->offset >= it->mem->size) + return -ENOMEM; + return 0; + default: + it->offset += incr; + while (it->offset >= it->sg->length) { + it->offset = it->offset - it->sg->length; + it->sg = sg_next(it->sg); + } + if (it->sg) + return 0; + else + return -ENOMEM; + } +} + +dma_addr_t sif_mem_iter_dma(struct sif_mem_iter *it) +{ + switch (it->mem->mem_type) { + case SIFMT_PHYS: + return it->mem->m.phys.phys_buf[it->phys.i].addr + it->offset; + case SIFMT_FMR: + return it->mem->m.fmr.page_list[it->phys.i] + it->offset; + case SIFMT_PTONLY: + return 0; /* For future fmr use: populate with empty ptes to be filled later */ + default: + return sg_dma_address(it->sg) + it->offset; + } +} + + +/* DMA is mapped continuously and the map is reflected in a "collapsed" sg list for DMA, + * The rest of the list is still valid for the pa/va part - we need to loop through and + * make it consistent for our usage: + */ +static int sif_mem_fixup_dma(struct scatterlist 
*sg) +{ + struct scatterlist *from_sg = sg; + struct scatterlist *last_sg = sg; + dma_addr_t dma_addr = sg_dma_address(from_sg); + size_t dma_size = sg_dma_len(sg); + size_t sg_len = sg->length; /* Save the "homogeneous" length */ + + while (sg) { + if (dma_size < sg->length) + return -EINVAL; /* should not happen */ + + if (sg->dma_address && sg->dma_address != (dma_addr_t)-1) { + /* This entry is part of the collapsed list + * must keep address and dma_length until we have "consumed" it, + * Since all lengths are homogeneous in the resulting list we + * can temporarily "misuse" the length field in this entry to + * store the new dma_address, and just leave the dma_length + * for later consumption: + */ + sg->length = sg->dma_address; + } else + sg->dma_length = sg_len; + + sg->dma_address = dma_addr; + dma_addr += sg_len; + dma_size -= sg_len; + last_sg = sg; + sg = sg_next(sg); + + if (!dma_size) { + /* Clean up our "temporary store" (see below comment) */ + from_sg->length = from_sg->dma_length = sg_len; + from_sg = sg_next(from_sg); + dma_addr = from_sg->length; /* from temp store */ + dma_size = sg_dma_len(from_sg); + } + } + return 0; +} + +/* A utility for dumping an sg list to the trace buffer */ +void sif_dump_sg(struct scatterlist *sgl) +{ + struct scatterlist *sg = sgl; + int cnt = 0; + + trace_printk(" **** sg dump - start at %p ****\n", sg); + trace_printk("%16s: %16s %8s %16s %16s %8s %8s %4s\n", + "sg", "dma", "dmalen", "pa", "kva", "length", "offset", "end mark"); + while (sg) { + u64 dma_addr = sg_dma_address(sg); + u64 pa = sg_phys(sg); + + trace_printk("%p: %#16llx %#8x %#16llx %p %#8x %#8x %4s\n", + sg, dma_addr, sg_dma_len(sg), pa, + sg_virt(sg), sg->length, sg->offset, + (sg_is_last(sg) ? "[last]" : "")); + sg = sg_next(sg); + cnt++; + } + trace_printk(" **** tot.%d elements ****\n", cnt); +} diff --git a/drivers/infiniband/hw/sif/sif_mem.h b/drivers/infiniband/hw/sif/sif_mem.h new file mode 100644 index 0000000000000..1b91a8fd72854 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_mem.h @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_mem.h: A common interface for all memory used by + * SIF for queue, table and page table management + */ + +#ifndef _SIF_MEM_H +#define _SIF_MEM_H +#include +#include "sif_user.h" + +/* We need to support 4 interfaces to memory; abbreviated umem, fmr, + * phys and kmem below, to be compatible with the different ways we are called. + * This is due to be cleaned up in the core IB stack, + * by allowing the use of scatterlists for all types of s/g memory + * provided to rdma devices. + */ + +/* Allocation of table and queue memory: + * The Linux buddy allocator should guarantee us lots of up to 4M I/O contiguous + * memory segments through alloc_pages provided the system has enough memory. + * Assume that we get at least 4M standalone and any number of (aligned) 2M entries after that + * + * This way we allocate contiguous memory and use bypass/passthrough mapping if + * alloc_sz <= 4M, and revert to GVA2GPA if needs are larger, but allocate in 2M blocks + * and use PSIF 2M pages for this. 
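+ *
+ * Rough illustration only (alloc_sz is a hypothetical caller variable;
+ * the SIFMT_* values are the memory types defined below):
+ *
+ *	if (alloc_sz <= SZ_4M)
+ *		mem_type = SIFMT_BYPASS;	(contiguous block, passthrough)
+ *	else
+ *		mem_type = SIFMT_2M;		(2M blocks, GVA2GPA page table)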
+ */ + +struct ib_umem; +struct sif_dev; + +/* Per device memory configuration info + * embedded in sif_dev: + */ +struct sif_mem_info { + u8 page_shift; /* number of bits within the smallest SIF level 0 page (depends on config) */ + u8 level_shift; /* number of bits to shift to the next level in the page table */ + u8 max_shift; /* Highest number of bits within the highest level page */ + u32 ptes_per_page; /* Page table entries per page table page */ + u64 page_size; /* size of a SIF level 0 page (as configured) */ + u64 page_mask; /* All bits beyond page_shift set */ +}; + +/* Valid for SIFMT_2M, SIFMT_4K and SIFMT_BYPASS_RO: + * Represented as a pool of equally sized pages. + * Allows direct page offset lookup from the kernel side. + * All pages are the same size. + * To maintain offset indexes, interior pages cannot be removed. + * sg_start will be > 0 if there are empty entries at the start, allowing + * indexes to remain valid if entries are deleted from the head + */ +struct sif_kmem { + u64 size; /* Size of the mapped memory of this kmem */ + u32 page_shift; /* Represents page size of each scatter element */ + u32 sg_size; /* Allocated number of (usable!) elements in (each) scatter list */ + u32 sg_start; /* Current start offset into the sg list */ + u32 sg_max; /* Last entry in use + 1 (<= sg_size * nlists) */ + u32 nlists; /* Number of (sg_size+1'd) sg lists linked through sg */ + enum dma_data_direction dir; /* DMA direction used for dma mapping */ + struct scatterlist *sg; /* Pointer to start of scatterlist array */ + struct scatterlist *last_sg; /* The start of the last list array in the sg list linkage */ +}; + +/* Valid for SIFMT_FMR (when called from ib_map_phys_fmr) */ +struct sif_mem_fmr { + u64 *page_list; /* Array of dma addresses of buffers */ + u32 page_list_len; /* length of page_list array */ + u32 page_shift; /* Represents page size of each scatter element */ + u64 max_size; /* Saved maximal size of the FMR as supplied during creation */ +}; + +/* Valid for SIFMT_PHYS (when called from ib_reg_phys_mr) + * It is called "phys" but should have been called "dma" as it is used + * with dma addresses in at least 1 of the 2 use cases in the kernel... + * not important to support this API, but keep for completeness: + */ +struct sif_mem_phys { + struct ib_phys_buf *phys_buf; /* Array of dma address/size pairs of buffers */ + u64 phys_buf_len; /* length of phys_buf array */ +}; + +/* Flag values so far only used by 'flags' in sif_mem_umem: */ +enum sif_mem_flags { + SMF_DMA32 = 0x1 /* Set if this memory is allocated from the DMA32 space */ +}; + +/* Memory types mapped from user space: + * Valid for SIFMT_UMEM, SIFMT_UMEM_RO, SIFMT_BYPASS: + */ +struct sif_mem_umem { + struct ib_umem *umem; /* User memory, NULL if this is a kernel bypass mapping */ + struct scatterlist *sg; /* A pointer to a valid scatterlist (user and kernel) */ + u64 start_offset; /* Stored misalignment according to the scatter element size */ + enum dma_data_direction dir; /* DMA direction used for dma mapping */ + u32 flags; + struct scatterlist sg0; /* Inline storage for bypass mode */ +}; + + +/* The generic sif s/g memory representation + * + */ +struct sif_mem { + struct sif_dev *sdev; + enum sif_mem_type mem_type; /* Logical type of mapping */ + u16 max_page_shift; /* 0: unknown, >= 0: Largest page size that can be mapped cont. 
*/ + u64 size; /* Size of mapping */ + void *vmap_base; /* Kernel address of the start of a vmap cont.mapping, if any */ + union { + struct sif_mem_umem u; /* SIFMT_{UMEM*,BYPASS} */ + struct sif_kmem km; /* SIFMT_{2M,CS,4K} */ + struct sif_mem_fmr fmr; /* SIFMT_FMR */ + struct sif_mem_phys phys; /* SIFMT_PHYS */ + } m; +}; + + +/* Initialization of global per device info - called from sif_hwi.c */ +void sif_mem_init(struct sif_dev *sdev); + +/* API for managing a sif_kmem object */ + +/* Allocate a memory object of size @size and populate an sg list + * with it: + */ +int sif_kmem_init(struct sif_dev *sdev, struct sif_kmem *kmem, size_t sg_size, size_t size, + u32 page_shift, gfp_t flag, enum dma_data_direction dir); + +/* sg unmap and free the memory referenced by mem */ +void sif_kmem_free(struct sif_dev *sdev, struct sif_kmem *mem); + +/* Extend the kmem object with a total size of @size - return sg_index of the first + * allocated element: + */ +int sif_kmem_extend(struct sif_dev *sdev, struct sif_kmem *kmem, + size_t size, gfp_t flag); +int sif_kmem_shrink(struct sif_dev *sdev, struct sif_kmem *mem, int sg_idx, size_t size); + +/* Find the scatterlist element with index idx within kmem */ +struct scatterlist *sif_kmem_find_sg_idx(struct sif_kmem *kmem, u32 idx); + +/************************************ + * API for managing different higher level (scatter) memory segment abstractions + * used by SIF: + */ + +/* Set up a sif_mem structure for handling a memory + * segment of initial size @size. + */ +struct sif_mem *sif_mem_create(struct sif_dev *sdev, size_t sg_size, size_t size, + enum sif_mem_type mem_type, + gfp_t flag, + enum dma_data_direction dir); + +/* Create a sif_mem object from an umem object (User level memory) + * The sif_mem object resumes ownership of the umem: + */ +struct sif_mem *sif_mem_create_umem(struct sif_dev *sdev, + struct ib_umem *umem, + enum sif_mem_type mem_type, + gfp_t flag, enum dma_data_direction dir); + +/* Create a sif_mem object from a phys array of length @num_phys + * The phys array is owned by caller: + */ +struct sif_mem *sif_mem_create_phys(struct sif_dev *sdev, void *iova_start, + struct ib_phys_buf *phys, int num_phys, + gfp_t flag); + +/* Create a sif_mem object from a memory pointer array of length @num_pages + * The memory pointer array is owned by caller: + */ +struct sif_mem *sif_mem_create_fmr(struct sif_dev *sdev, size_t size, u32 page_shift, + gfp_t flag); + +/* Create a sif_mem object with no own memory backing - to use for CB, SQ_CMPL and + * kernel full passthrough cases to have a "shallow" mem object: + */ +struct sif_mem *sif_mem_create_ref(struct sif_dev *sdev, enum sif_mem_type mem_type, + u64 sif_vaddr, size_t size, gfp_t flag); + +/* Create an aligned sif_mem object mapped coherent dma contiguous, suitable for + * BYPASS mapping (size constraints..) + */ +struct sif_mem *sif_mem_create_dmacont(struct sif_dev *sdev, size_t size, gfp_t flag, + enum dma_data_direction dir); + +/* Free a sif_mem previously created with sif_mem_create */ +int sif_mem_free(struct sif_mem *mem); + +/* Map a previously created sif_mem ref object from a memory pointer array of length @num_pages + * The memory pointer array is owned by caller: + * Returns -ENOMEM if the sif_mem ref object does not have a sufficiently large size. 
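+ *
+ * Hedged usage sketch (error handling omitted; iova, page_list and
+ * num_pages are caller-supplied placeholders):
+ *
+ *	mem = sif_mem_create_fmr(sdev, max_size, page_shift, GFP_KERNEL);
+ *	ret = sif_mem_map_fmr(mem, iova, page_list, num_pages);
+ *	... DMA through the FMR mapping ...
+ *	sif_mem_unmap_fmr(mem);
+ *	sif_mem_free(mem);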
+ */ +int sif_mem_map_fmr(struct sif_mem *mem, u64 iova, + u64 *page_list, int num_pages); + +/* Unmap and reset a mem object previously set up with sif_mem_map_fmr */ +void sif_mem_unmap_fmr(struct sif_mem *mem); + +/* Allocate some (more) memory for this sif_mem + * Return an s/g index (page offset to the start of that memory + * or -errval if an error. + */ +int sif_mem_extend(struct sif_mem *mem, size_t size, gfp_t flag); + +/* Free a subrange of this memory object starting at @sg_idx and dereference the + * sif_mem object. Assumes there is no other references to this subrange, and that + * this subrange corresponds exactly to a prior allocation with either create or extend above + * returns 0 upon success or a negative errno if failure: + */ +int sif_mem_shrink(struct sif_mem *mem, int sg_idx, size_t size); + +/* Returns true if this memory is represented internally by an umem object */ +bool sif_mem_has_umem(struct sif_mem *mem); + +/* Return the largest page size (represented by page shift bits) usable for this memory */ +u32 sif_mem_page_shift(struct sif_mem *mem); + +/* Find kernel virtual address at @offset within map */ +void *sif_mem_kaddr(struct sif_mem *mem, off_t offset); + +/* Find dma address at @offset within map */ +dma_addr_t sif_mem_dma(struct sif_mem *mem, off_t offset); + +/* If map is continuous, get start of dma mapping + * otherwise return an error pointer: + */ +dma_addr_t sif_mem_dma_if_cont(struct sif_mem *mem); + +/* Return the start of the s/g list for this mem object */ +struct scatterlist *sif_mem_get_sgl(struct sif_mem *mem); + +/* Map a part of the @mem object given by @offset, @size to the user space + * vm context given in @vma. The part must be page aligned and page sized: + */ + +int sif_mem_vma_map_part(struct sif_mem *mem, struct vm_area_struct *vma, + off_t start_off, size_t size); + +/* Map the memory referenced by @mem to the user space vma */ +int sif_mem_vma_map(struct sif_mem *mem, struct vm_area_struct *vma); + + +/* sif_mem iterator (mainly for the types that do not expose a scatterlist) */ +struct sif_mem_iter { + struct sif_mem *mem; + union { + struct { + int i; /* Index used by SIFMT_PHYS and SIFMT_FMR */ + } phys; + struct scatterlist *sg; /* Used by scatterlist based types */ + }; + size_t offset; /* Current offset within element */ +}; + +int sif_mem_iter_init(struct sif_mem *mem, struct sif_mem_iter *it); +int sif_mem_iter_advance(struct sif_mem_iter *it, u64 incr); +dma_addr_t sif_mem_iter_dma(struct sif_mem_iter *mi); + +/* A utility for dumping an sg list to the trace buffer */ +void sif_dump_sg(struct scatterlist *sgl); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_mmu.c b/drivers/infiniband/hw/sif/sif_mmu.c new file mode 100644 index 0000000000000..ba4f1a0ba88f1 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_mmu.c @@ -0,0 +1,751 @@ +/* + * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_mmu.c: main entry points and initialization + */ + +#include "sif_mmu.h" +#include "sif_dev.h" +#include "sif_base.h" +#include "sif_dma.h" +#include "sif_hwi.h" +#include "sif_mem.h" +#include "sif_spt.h" +#include "sif_xmmu.h" +#include "sif_pt.h" +#include "sif_mr.h" +#include "sif_query.h" + +#include +#include +#include +#include +#include +#include +#include "psif_hw_setget.h" +#include "sif_defs.h" + +static int sif_map_gva_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + bool write); + +static int sif_map_bypass_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + bool write); + +static int sif_map_cs_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + bool write); + +#ifndef __sparc__ +/* Special handling for PHYS memory types which don't have any sg list: */ +static int sif_map_special_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + bool write); +#endif + +static int sif_mmu_invalidate_tlb(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, enum wr_mode mode); + +void set_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + enum psif_table_level level, + u64 val) +{ + struct psif_mmu_cntx *hw_ctx = &ctx->mctx; + + val &= ~PSIF_TABLE_PTR_MASK; + hw_ctx->table_ptr = ((val) >> PT_PAGE_SHIFT); + hw_ctx->table_level = level; + sif_log(sdev, SIF_MMU, "%p ptr 0x%08llx level %d", hw_ctx, val, level); +} + + + +int sif_map_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + u64 virt_base, u64 size, bool write) +{ + /* hw_ctx entry assumed to be set up in pass through + * prior to the call (all null bytes) + */ + ctx->type = MMU_GVA2GPA_MODE; + ctx->base = virt_base; + ctx->size = size; + ctx->mt = mem->mem_type; + + switch (mem->mem_type) { + case SIFMT_BYPASS: + case SIFMT_BYPASS_RO: + case SIFMT_NOMEM: + return sif_map_bypass_ctx(sdev, ctx, mem, write); + case SIFMT_UMEM: + case SIFMT_UMEM_RO: + case SIFMT_2M: + case SIFMT_4K: + return sif_map_gva_ctx(sdev, ctx, mem, write); + case SIFMT_CS: + return sif_map_cs_ctx(sdev, ctx, write); + case SIFMT_ZERO: + return sif_zero_map_gva_ctx(sdev, ctx, mem, write); + case SIFMT_PTONLY: + return 0; /* Nothing to map yet */ +#ifndef __sparc__ + case SIFMT_PHYS: + return sif_map_special_ctx(sdev, ctx, mem, write); + case SIFMT_UMEM_SPT: + return sif_spt_map_gva_ctx(sdev, ctx, mem, write); +#endif + default: + sif_log(sdev, SIF_INFO, "Unimplemented mem_type %d %s", + mem->mem_type, sif_mem_type_str(mem->mem_type)); + return -EOPNOTSUPP; + } + return -EINVAL; +} + +void sif_unmap_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx) +{ + switch (ctx->mt) { + case SIFMT_BYPASS: + case SIFMT_BYPASS_RO: + case SIFMT_NOMEM: + break; + case SIFMT_UMEM: + case SIFMT_UMEM_RO: + case SIFMT_PHYS: + case SIFMT_FMR: + case SIFMT_2M: + case SIFMT_4K: + case SIFMT_CS: + case SIFMT_PTONLY: + sif_unmap_gva_ctx(sdev, ctx); + break; +#ifndef __sparc__ + case SIFMT_ZERO: + sif_zero_unmap_gva_ctx(sdev, ctx); + break; + case SIFMT_UMEM_SPT: + sif_spt_unmap_gva_ctx(sdev, ctx); + break; +#endif + default: + sif_log(sdev, SIF_INFO, "Unimplemented mem type %d, ctx at %p", ctx->mt, ctx); + BUG(); /* Should not happen - throwing the cards */ + } +} + +static size_t num_pages(u64 base, u64 size, u32 page_shift) +{ + size_t pg_sz = 1 << page_shift; + + return aligned_size(base, size, pg_sz) >> page_shift; +} + +/* May return -1 or a valid enum value for psif_page_size */ +static int 
hw_leaf_page_sz(struct sif_dev *sdev, u32 page_shift) +{ + /* Page size not supported by device configuration */ + if (sdev->mi.page_shift > page_shift) { + sif_log(sdev, SIF_INFO, + "Cannot support page shift %d - min.page shift supported in this configuration is %d", + page_shift, sdev->mi.page_shift); + return -1; + } + + switch (sdev->mi.page_shift) { + case 12: /* Device configured for Intel page sizes */ + if (page_shift < 21) + return PAGE_SIZE_IA32E_4KB; + if (page_shift < 30) + return PAGE_SIZE_IA32E_2MB; + return PAGE_SIZE_IA32E_1GB; + case 13: /* Device configured for Sparc page sizes */ + if (page_shift < 16) + return PAGE_SIZE_S64_8KB; + if (page_shift < 19) + return PAGE_SIZE_S64_64KB; + if (page_shift < 22) + return PAGE_SIZE_S64_512KB; + if (page_shift < 25) + return PAGE_SIZE_S64_4MB; + if (page_shift < 28) + return PAGE_SIZE_S64_32MB; + if (page_shift < 34) + return PAGE_SIZE_S64_2GB; + return PAGE_SIZE_S64_16GB; + } + sif_log(sdev, SIF_INFO, "Cannot support page shift %d", page_shift); + return -1; +} + + +static inline enum psif_table_level hw_leaf_level(enum psif_page_size pg_sz) +{ + switch (pg_sz) { + case PAGE_SIZE_IA32E_2MB: + case PAGE_SIZE_S64_4MB: + return PAGE_LEVEL1; + case PAGE_SIZE_IA32E_1GB: + case PAGE_SIZE_S64_2GB: + return PAGE_LEVEL2; + default: + return PAGE_LEVEL0; + } +} + + +static int sif_map_bypass_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + bool write) +{ + u64 addr = 0; + int ret = 0; + + ctx->type = MMU_PASS_THROUGH0; + + if (mem->mem_type == SIFMT_NOMEM) + ctx->mt = SIFMT_BYPASS; + if (write) + ctx->mctx.wr_access = 1; + + if (mem->m.u.umem) { + addr = sif_mem_dma_if_cont(mem); + if (IS_ERR((void *)addr)) + return PTR_ERR((void *)addr); + } else if (mem->mem_type != SIFMT_NOMEM) + addr = sif_mem_dma(mem, 0); + + if (mem->mem_type == SIFMT_BYPASS || mem->mem_type == SIFMT_BYPASS_RO) + ctx->uv2dma = addr - ctx->base; + ctx->base = addr; + return ret; +} + + +static int sif_map_gva_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + bool write) +{ + struct psif_mmu_cntx *hw_ctx = &ctx->mctx; + bool multipage; + u64 page_size; + u64 page_mask; + enum psif_table_level leaf_level; + u64 aligned_base; + u64 aligned_sz; + u32 page_shift = sif_mem_page_shift(mem); + u8 pt_leaf_level = 0; + u8 pt_pte_extent = 1; + u64 dma_addr; + + /* Adjust to a supported page shift */ + int ret = find_optimal_leaf_level(sdev, page_shift, + ctx->base, sif_mem_dma(mem, 0), ctx->size, + &pt_leaf_level, &pt_pte_extent); + if (ret) + return ret; + + page_shift = sdev->mi.page_shift + pt_leaf_level * sdev->mi.level_shift; + page_size = 1ULL << page_shift; + page_mask = ~(page_size - 1); + + hw_ctx->wr_access = write; + hw_ctx->translation_type = MMU_GVA2GPA_MODE; + hw_ctx->page_size = hw_leaf_page_sz(sdev, page_shift); + + aligned_base = ctx->base & page_mask; + aligned_sz = aligned_size(ctx->base, ctx->size, page_size); + multipage = sdev->single_pte_pt || aligned_sz > page_size; + leaf_level = hw_leaf_level(hw_ctx->page_size); + dma_addr = sif_mem_dma(mem, 0); + + sif_log(sdev, SIF_MMU_V, "base 0x%llx dma base 0x%llx size 0x%llx page shift %d size %s", + ctx->base, dma_addr, ctx->size, page_shift, + string_enum_psif_page_size(hw_ctx->page_size)); + + if (multipage) { + ctx->pt = sif_pt_create(sdev, sif_mem_get_sgl(mem), + ctx->base, ctx->size, page_shift, false, false); + if (!ctx->pt) + return -ENOMEM; + set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt)); + } else { + dma_addr_t 
aligned_dma_addr = dma_addr & ~((1 << page_shift) - 1); + + set_ctx(sdev, ctx, leaf_level, aligned_dma_addr); + } + return 0; +} + + +static int sif_map_cs_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + bool write) +{ + struct psif_mmu_cntx *hw_ctx = &ctx->mctx; + + hw_ctx->wr_access = write; + hw_ctx->translation_type = MMU_GVA2GPA_MODE; + hw_ctx->page_size = PAGE_SIZE_IA32E_4KB; + + /* Just create a page table with an empty top level page */ + ctx->pt = sif_pt_create_empty(sdev, ctx->base, SIFMT_CS); + if (!ctx->pt) + return -ENOMEM; + set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt)); + return 0; +} + +#ifndef __sparc__ +static int sif_map_special_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + bool write) +{ + struct psif_mmu_cntx *hw_ctx = &ctx->mctx; + bool multipage = aligned_size(ctx->base, ctx->size, PAGE_SIZE) > PAGE_SIZE; + + sif_log(sdev, SIF_MMU_V, "base 0x%llx size 0x%llx", ctx->base, ctx->size); + + hw_ctx->page_size = PAGE_SIZE_IA32E_4KB; + hw_ctx->wr_access = write; + hw_ctx->translation_type = MMU_GVA2GPA_MODE; + + if (multipage) { + ctx->pt = sif_pt_create_for_mem(mem, ctx->base, 12, true, true); + if (!ctx->pt) + return -ENOMEM; + set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt)); + } else + set_ctx(sdev, ctx, PAGE_LEVEL0, sif_mem_dma(mem, 0)); + return 0; +} +#endif + +/* map an existing context to a new memory object + * Reuse key, page table and mmu context if possible + */ +int sif_map_fmr_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem) +{ + struct psif_mmu_cntx *hw_ctx = &ctx->mctx; + struct psif_key *key = get_key(sdev, ctx->lkey); + bool multipage; + u64 vstart = (u64)mem->vmap_base; + u64 page_size; + u64 page_mask; + enum psif_table_level leaf_level; + u64 aligned_base; + u64 aligned_sz; + u32 page_shift = sif_mem_page_shift(mem); + u8 pt_leaf_level = 0; + u8 pt_pte_extent = 1; + int ret; + + /* Adjust to a supported page shift */ + ret = find_optimal_leaf_level(sdev, page_shift, + vstart, sif_mem_dma(mem, 0), mem->size, + &pt_leaf_level, &pt_pte_extent); + if (ret) + return ret; + + page_shift = sdev->mi.page_shift + pt_leaf_level * sdev->mi.level_shift; + page_size = 1ULL << page_shift; + page_mask = ~(page_size - 1); + + hw_ctx->wr_access = true; + hw_ctx->translation_type = MMU_GVA2GPA_MODE; + hw_ctx->page_size = hw_leaf_page_sz(sdev, page_shift); + + aligned_base = ctx->base & page_mask; + aligned_sz = aligned_size(vstart, mem->size, page_size); + multipage = sdev->single_pte_pt || aligned_sz > page_size; + leaf_level = hw_leaf_level(hw_ctx->page_size); + + /* Now page sizes may have changed too, if so we cannot reuse the page table, delete it: */ + if (ctx->pt && page_shift > ctx->pt->page_shift) { + sif_pt_free(ctx->pt); + ctx->pt = NULL; + } + + /* For FMRs we reuse the mmu context and modify the existing key */ + ctx->base = (u64)mem->vmap_base; + ctx->size = mem->size; + + set_psif_key__base_addr(key, ctx->base); + set_psif_key__lkey_state(key, PSIF_DMA_KEY_VALID); + set_psif_key__rkey_state(key, PSIF_DMA_KEY_VALID); + set_psif_key__length(key, mem->size); + + sif_log(sdev, SIF_FMR, "key %d: base now at %llx (sz %llx - mem sz %llx)", + ctx->lkey, ctx->base, ctx->size, mem->size); + + /* We have two cases: + * 1) a single page pointer: Pointer must be set to new address - keep page size and everything + * 2) a page table of any depth: + * appropriate ptes must be set to refer to new pages + */ + if (!multipage) { + dma_addr_t 
dma_addr = sif_mem_dma(mem, 0); + dma_addr_t aligned_dma_addr = dma_addr & ~((1 << page_shift) - 1); + + set_ctx(sdev, ctx, leaf_level, aligned_dma_addr); + } else if (!ctx->pt) { + ctx->pt = sif_pt_create_for_mem(mem, ctx->base, page_shift, true, false); + if (!ctx->pt) + return -ENOMEM; + set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt)); + } else { + sif_pt_remap_for_mem(ctx->pt, mem, page_shift, ctx->base); + /* Only the level of the top node may have changed, the page is + * guaranteed to be the same, but the previous use could + * have been a single page - just set it every time for now: + */ + set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt)); + } + /* Update the used network endian context */ + set_psif_key__mmu_context(key, *((u64 *)&ctx->mctx)); + return 0; +} + +void sif_unmap_gva_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx) +{ + /* TLB invalidate is not available at teardown, instead we + * invalidate the whole MMU as a final operation before taking down the + * communication with the EPSC. + */ + if (likely(sdev->registered) && ctx->pt && !sif_feature(disable_invalidate_tlb)) + sif_mmu_invalidate_tlb(sdev, ctx, PCM_WAIT); + if (ctx->pt) + sif_pt_free(ctx->pt); +} + + +void sif_unmap_fmr_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, enum wr_mode mode) +{ + sif_log(sdev, SIF_FMR, "key %d", ctx->lkey); + if (!sif_feature(disable_invalidate_tlb)) + sif_mmu_invalidate_tlb(sdev, ctx, mode); +} + + +static int sif_mmu_invalidate_tlb_partial(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, + u64 start, u64 len, enum wr_mode mode) +{ + struct psif_wr wr; + int ncompleted; + int ret = 0; + u32 lkey = ctx->lkey; + u32 npages; + u32 shift; + u32 sq_entry_idx; + int pqp_sq_idx; + struct sif_sq *sq; + struct sif_pqp *pqp; + struct psif_cq_entry *cqe; + DECLARE_SIF_CQE_POLL(sdev, lcqe); + + pqp = lcqe.pqp; + + if (!lkey) { + lkey = allocate_invalidate_key(ctx); + if (!lkey) { + sif_log(sdev, SIF_INFO, + "Failed to allocate a TLB invalidation key!"); + return -ENOMEM; + } + } + + /* Do no invalidate TLB if page table is NULL. + * However, if mode == PCM_WAIT, need to generate + * a completion to itself to ensure that all the + * previous posted invalidate TLB pqp operations + * have completed. + * + * This is mainly to cater for invalidating the TLB of a + * list of fmr ctx. This is done here within the function as + * the generated completion needs to know the selected + * pqp. The caller sif_unmap_phys_fmr_list doesn't + * know the pqp until DECLARE_SIF_CQE_POLL. + * In a scenario for invalidating TLB for a ctx, + * the ctx->pt is checked before calling this function + * so that no additional completion will be generated. + * e.g in sif_unmap_gva_ctx. 
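+ *
+ * Illustration only - the fmr list pattern below is inferred from the
+ * description above, not copied from sif_unmap_phys_fmr_list:
+ *
+ *	sif_unmap_gva_ctx:	calls this only when ctx->pt != NULL (PCM_WAIT)
+ *	fmr list teardown:	posts invalidates per ctx, then one PCM_WAIT
+ *				call that may see ctx->pt == NULL and thus
+ *				only generates and polls a pqp completion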
+ */ + if (unlikely(!ctx->pt)) { + if (mode == PCM_WAIT) { + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, + "cqe %p gen_pqp_cqe returned %d", + &lcqe, ret); + return ret; + } + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, + "cqe %p poll_cq_waitfor returned %d", + &lcqe, ret); + } + } + return ret; + } + + memset(&wr, 0, sizeof(struct psif_wr)); + wr.op = PSIF_WR_INVALIDATE_TLB; + wr.details.su.key = lkey; + + shift = sif_pt_page_shift(ctx->pt); + npages = num_pages(ctx->base, len, shift); + + while (npages) { + /* TLB invalidate only uses the lower 16 bits of the length field */ + u32 n = min_t(u32, npages, 0xffff); + + wr.details.su.addr = start; + wr.details.su.length = n; + npages -= n; + if (npages > 0) { + int sts = sif_pqp_post_send(sdev, &wr, NULL); + + if (sts) { + sif_log(sdev, SIF_INFO, + "Partial invalidate TLB for key %d, base %llx, length %x failed, sts %d", + lkey, start, n << shift, sts); + return sts; + } + } else + break; + /* reset checksum for the next calculation */ + wr.checksum = 0; + start += n << shift; + } + + /* We can allow async post only if we do not depend on deleting the key after + * the request has completed: + */ + if (mode != PCM_WAIT && ctx->lkey) { + wr.completion = (mode == PCM_POST) ? 0 : 1; + return sif_pqp_post_send(sdev, &wr, NULL); + } + + wr.completion = 1; + + sif_log(sdev, SIF_PQP, "Invalidate TLB for key %d, base %llx, length %x", + lkey, start, wr.details.su.length << shift); + + ncompleted = sif_pqp_poll_wr(sdev, &wr, &lcqe); + + if (ncompleted < 0) { + sif_log(sdev, SIF_INFO, "%s completion for pqp request", + (ncompleted ? "Error" : "No")); + ret = ncompleted; + goto out; + } + + /* Note that we operate on 3 different indices here! */ + cqe = &lcqe.cqe; + pqp_sq_idx = pqp->qp->qp_idx; + sq = get_sif_sq(sdev, pqp_sq_idx); + + /* sq_id.sq_seq_num contains the send queue sequence number for this completion + * and by this driver's definition the index into the send queue will + * be this number modulo the length of the send queue: + */ + sq_entry_idx = cqe->wc_id.sq_id.sq_seq_num & sq->mask; + + if (cqe->status != PSIF_WC_STATUS_SUCCESS) { + sif_log(sdev, SIF_INFO, + "base %llx, length %x: failed with status %s(%d) for cq_seq %d", + start, wr.details.su.length << shift, + string_enum_psif_wc_status(cqe->status), cqe->status, cqe->seq_num); + sif_logs(SIF_INFO, write_struct_psif_cq_entry(NULL, 0, cqe)); + ret = -EIO; + atomic_inc(&pqp->cq->error_cnt); + goto out; + } + + sif_log(sdev, SIF_PQP, "cq_seq %d sq_seq %d, sq_entry_idx %d", + cqe->seq_num, cqe->wc_id.sq_id.sq_seq_num, sq_entry_idx); +out: + if (!ctx->lkey) + release_invalidate_key(sdev, lkey); + return ret; +} + + +static int sif_mmu_invalidate_tlb(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, enum wr_mode mode) +{ + return sif_mmu_invalidate_tlb_partial(sdev, ctx, ctx->base, ctx->size, mode); +} + + +/* extend an mmu context with DMA addresses from @mem. 
+ * Only GVA2GPA memory types supports this: + */ +int sif_map_ctx_part(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + u64 virt_base, u64 size) +{ + int ret; + + if (ctx->type != MMU_GVA2GPA_MODE) + return -EINVAL; + + ret = sif_pt_extend(ctx->pt, sif_mem_get_sgl(mem), virt_base, size); + if (ret >= 0 && ctx->mt == SIFMT_CS && ctx->pt->vsize == size) + set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt)); + return ret; +} + + +/* invalidate a pte range in an already existing context's page table + * Only GVA2GPA memory types supports this: + */ + +int sif_unmap_gva_ctx_part(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, + u64 virt_base, u64 size) +{ + int ret = sif_pt_free_part(ctx->pt, virt_base, size); + + if (ret < 0) + return ret; + + if (unlikely(!sdev->registered)) { + /* TLB invalidate is not available at teardown */ + return 0; + } + + /* Invalidate this range of the page table with PSIF - assume async call is ok */ + return sif_mmu_invalidate_tlb_partial(sdev, ctx, virt_base, size, PCM_POST); +} + + + +const char *sif_mem_type_str(enum sif_mem_type mem_type) +{ + switch (mem_type) { + case SIFMT_BYPASS: + return "SIFMT_BYPASS"; + case SIFMT_UMEM: + return "SIFMT_UMEM"; + case SIFMT_UMEM_RO: + return "SIFMT_UMEM_RO"; + case SIFMT_BYPASS_RO: + return "SIFMT_BYPASS_RO"; + case SIFMT_UMEM_SPT: + return "SIFMT_UMEM_SPT"; + case SIFMT_2M: + return "SIFMT_2M"; + case SIFMT_4K: + return "SIFMT_4K"; + case SIFMT_CS: + return "SIFMT_CS"; + case SIFMT_ZERO: + return "SIFMT_ZERO"; + case SIFMT_PHYS: + return "SIFMT_PHYS"; + case SIFMT_FMR: + return "SIFMT_FMR"; + case SIFMT_NOMEM: + return "SIFMT_NOMEM"; + case SIFMT_PTONLY: + return "SIFMT_PTONLY"; + case SIFMT_MAX: + return "SIFMT_MAX"; + default: + break; + } + return "(undefined sif_mem_type)"; +} + + +struct psif_mmu_cntx sif_mmu_ctx_passthrough(bool write) +{ + struct psif_mmu_cntx ctx = { .wr_access = 1 }; + return ctx; +} + + +#define TSU_MMU_FLUSH_CACHES_ADDR 0x00200003L + +/* Post a command to flush the TLBs PTE cache. + * If @ptw_cache is set, also flush the PTW cache. + */ +int sif_post_flush_tlb(struct sif_dev *sdev, bool ptw_cache) +{ + int ret; + + if (eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 100)) { + struct psif_epsc_csr_rsp resp; + struct psif_epsc_csr_req req; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_FLUSH_CACHES; + req.u.flush_caches.flush_mmu_caches.flush_mmu_cache = 1; + if (ptw_cache) + req.u.flush_caches.flush_mmu_caches.flush_ptw_cache = 1; + ret = sif_epsc_wr_poll(sdev, &req, &resp); + } else { + int bits = (ptw_cache ? 0x3 : 0x1); + + ret = sif_write_global_csr(sdev, TSU_MMU_FLUSH_CACHES_ADDR, bits); + } + if (ret) { + sif_log(sdev, SIF_INFO, + "clearing MMU cache failed with error %d ", ret); + } + return ret; +} + + +/* Wait for a previously posted flush_tlb to complete */ +int sif_complete_flush_tlb(struct sif_dev *sdev) +{ + ulong start_time = jiffies; + ulong timeout = sdev->min_resp_ticks * 4; + ulong timeout_time = start_time + timeout; + u64 val; + int cnt = 0; + int ret; + int ms; + + if (eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 100)) { + /* For API ver. 
>= 100, we already wait for completion in mailbox operation */ + return 0; + } + do { + val = sif_read_global_csr(sdev, TSU_MMU_FLUSH_CACHES_ADDR); + cnt++; + } while (val != -1LL && (val & 0x4) != 0x4 && time_is_after_jiffies(timeout_time)); + if (val == -1LL) + sif_log(sdev, SIF_INFO, "CSR error waiting for mmu cache flush to finish"); + if (time_is_before_jiffies(timeout_time)) { + sif_log(sdev, SIF_INFO, "timeout waiting for mmu cache flush to finish, val = %lld", + val); + return -ETIMEDOUT; + } + ret = sif_write_global_csr(sdev, TSU_MMU_FLUSH_CACHES_ADDR, 0x0); + ms = jiffies_to_msecs(jiffies - start_time); + if (ret) + sif_log(sdev, SIF_INFO, "failed to turn off mmu cache flush mode in %d ms", ms); + else + sif_log(sdev, SIF_INFO_V, "flushing completed in %d ms, cnt %d", + ms, cnt); + return ret; +} diff --git a/drivers/infiniband/hw/sif/sif_mmu.h b/drivers/infiniband/hw/sif/sif_mmu.h new file mode 100644 index 0000000000000..6624f9455856c --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_mmu.h @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_mmu.h: API for management of sif's on-chip mmu. + */ + +#ifndef _SIF_MMU_H +#define _SIF_MMU_H + +#include +#include "psif_hw_data.h" +#include "sif_user.h" + +struct sif_mem; +struct psif_mmu_cntx; +struct sif_dev; + +enum wr_mode { + PCM_POST, /* Post WR without requesting send completion */ + PCM_POST_COMPL, /* Post WR requesting send completion but do not wait(poll) for it */ + PCM_WAIT /* Post WR requesting send completion and wait(poll) for it to arrive */ +}; + +enum post_mode { + PM_WRITE, /* Write the WR into the SQ but don't trigger any posting */ + PM_DOORBELL, /* Post request and trigger doorbell (send queue mode) */ + PM_CB, /* "Normal" collect buffer mode */ +}; + +/* The driver's representation of an MMU context: + * The key is the only means for referring the MMU context wrt invalidation + * (TLB_INVALIDATE) but this is only necessary to do for GVA2GPA contexts + * [TBD: with level > 0 (?)] + */ + +struct sif_mmu_ctx { + u64 base; /* Start of mapping (byte resolution) */ + u64 size; /* Size of mapping (byte resolution) */ + u32 lkey; /* Key to use for invalidation - only valid if nonzero */ + enum sif_mem_type mt; /* Logical type of mapping */ + enum psif_mmu_translation type; /* Defined in psif_hw_data */ + struct psif_mmu_cntx mctx; /* host order version of MMU context populated by sif_map_ctx */ + struct sif_pt *pt; /* sif page table this mmu context points into (only GVA2GPA types) */ + off_t uv2dma; /* For bypass: user_va + uv2dma = actual dma_addr */ + u64 phys_sz; /* Only used by SIFMT_ZERO mappings */ +}; + + +/* Prepare a new mmu context + * ctx points to storage for this mmu context + * mem points to a DMA mapped memory object to map + * + * - prepare any page tables needed for dma + * and/or allocate private structures + * - fill in information for hw in ctx->hw_ctx + * + * NB! 
hw_ctx is assumed to be set to values for + * MMU_PASS_THROUGH (all null bytes) by default + * + * Return 0 upon success or -errno + */ +int sif_map_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + u64 virt_base, u64 size, + bool write); + +/* Release any resources associated with + * the mmu context c. This will typically be + * any driver managed page tables and any I/O mappings + * (pinning) of page table memory + */ +void sif_unmap_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *c); + +/* Populate/invalidate a pte range in an already existing context's page table + * Only GVA2GPA memory types supports this: + * page_list should contain the corresponding list of dma_addresses to map: + */ +int sif_map_ctx_part(struct sif_dev *sdev, + struct sif_mmu_ctx *c, + struct sif_mem *mem, + u64 virt_base, u64 size); + +int sif_unmap_gva_ctx_part(struct sif_dev *sdev, struct sif_mmu_ctx *c, + u64 virt_base, u64 size); + +/* Remap an existing context to a new memory object + * (of the same size) + */ +int sif_map_fmr_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *c, + struct sif_mem *mem); + +void sif_unmap_fmr_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, enum wr_mode mode); + +/*** internal mmu code - used by sif_xmmu.h ***/ + +void sif_unmap_gva_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx); + +const char *sif_mem_type_str(enum sif_mem_type mem_type); + +void set_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + enum psif_table_level level, + u64 val); + +/* Return an mmu context in passthrough mode */ +struct psif_mmu_cntx sif_mmu_ctx_passthrough(bool write); + +/* The I/O side virtual address as seen from sif */ +static inline u64 sif_mmu_vaddr(struct sif_mmu_ctx *ctx, off_t offset) +{ + return ctx->base + offset; +} + +/* Post a command to flush the TLBs PTE cache. + * If @ptw_cache is set, also flush the PTW cache. + */ +int sif_post_flush_tlb(struct sif_dev *sdev, bool ptw_cache); + +/* Wait for a previously posted flush_tlb to complete */ +int sif_complete_flush_tlb(struct sif_dev *sdev); + +/* Flush the TLB and wait for the flush to complete */ +static inline int sif_flush_tlb(struct sif_dev *sdev) +{ + int ret = sif_post_flush_tlb(sdev, true); + + if (ret) + return ret; + return sif_complete_flush_tlb(sdev); +} + +#endif diff --git a/drivers/infiniband/hw/sif/sif_mr.c b/drivers/infiniband/hw/sif/sif_mr.c new file mode 100644 index 0000000000000..9632f1e759ac8 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_mr.c @@ -0,0 +1,511 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_mr.c: Implementation of memory regions support for SIF + */ + +#include +#include + +#include "sif_dev.h" +#include "psif_hw_data.h" +#include "psif_hw_setget.h" +#include "sif_defs.h" +#include "sif_base.h" +#include "sif_mr.h" +#include "sif_pd.h" +#include "sif_mmu.h" +#include "sif_pt.h" +#include "sif_user.h" +#include +#include "sif_user.h" + +struct sif_mr *sif_alloc_invalid_mr(struct sif_pd *pd) +{ + struct sif_dev *sdev = to_sdev(pd->ibpd.device); + u64 bad_addr = (~0ull) ^ (PAGE_SIZE-1); + struct sif_mem *mem = + sif_mem_create_ref(sdev, SIFMT_NOMEM, bad_addr, 0, GFP_KERNEL); + if (!mem) + return ERR_PTR(-ENOMEM); + + return alloc_mr(sdev, pd, mem, 0, 0); +} + +struct sif_mr *create_dma_mr(struct sif_pd *pd, int acc_fl) +{ + /* Use a common MR (in bypass mode) + * covering the whole memory space (for each pd which needs it) + */ + struct sif_dev *sdev = to_sdev(pd->ibpd.device); + struct sif_mr *mr; + struct sif_mem *mem = + sif_mem_create_ref(sdev, SIFMT_NOMEM, 0ull, (~0ull) ^ (PAGE_SIZE-1), GFP_KERNEL); + if (!mem) + return ERR_PTR(-ENOMEM); + + mr = alloc_mr(sdev, pd, mem, 0, acc_fl); + if (IS_ERR(mr)) + goto alloc_mr_failed; + return mr; + +alloc_mr_failed: + sif_mem_free(mem); + return mr; +} + + +struct ib_mr *sif_get_dma_mr(struct ib_pd *ibpd, int acc_fl) +{ + struct sif_mr *mr = create_dma_mr(to_spd(ibpd), acc_fl); + + return mr ? &mr->ibmr : NULL; +} + + +struct ib_mr *sif_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 virt_addr, int acc_fl, + struct ib_udata *udata) +{ + enum sif_mem_type mem_type = SIFMT_UMEM; + struct sif_dev *sdev = to_sdev(ibpd->device); + struct sif_mr *mr; + void *ret; + struct ib_umem *umem; + struct sif_mem *mem; + ulong user_flags = 0; + u64 map_length = 0; + u64 phys_length = 0; + u64 umem_length = length; + enum dma_data_direction dma_dir = DMA_BIDIRECTIONAL; + DEFINE_DMA_ATTRS(attrs); + + if (udata) { + struct sif_reg_mr_ext cmd; + int rv; + + rv = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + if (rv) + return ERR_PTR(-EFAULT); + user_flags = cmd.flags; + if (sif_vendor_enable(MMU_special, user_flags)) { + mem_type = + sdev->mt_override == SIFMT_UMEM ? 
cmd.mem_type : sdev->mt_override; + map_length = cmd.map_length; + phys_length = cmd.phys_length; + if (mem_type == SIFMT_BYPASS_RO || mem_type == SIFMT_UMEM_RO) + dma_dir = DMA_TO_DEVICE; + if (mem_type == SIFMT_CS) + umem_length = phys_length; + } + } + + sif_log(sdev, SIF_MR, "start 0x%llx len 0x%llx virt_addr 0x%llx flags 0x%lx", + start, length, virt_addr, user_flags); + + /* Pin user memory */ + umem = ib_umem_get_attrs(ibpd->uobject->context, start, umem_length, acc_fl, + dma_dir, &attrs); + + if (IS_ERR(umem)) { + int ev = PTR_ERR(umem); + + ret = (void *)umem; + sif_log(sdev, SIF_MR, + "#### Failed to get umem [err %d] (start %llx length %llx vaddr %llx, udata at %p)", + ev, start, length, virt_addr, udata); + return ret; + } + + if (map_length) { + if (map_length < length) { + sif_log(sdev, SIF_INFO, "illegal map_length 0x%llx - must be > length 0x%llx", + map_length, length); + return ERR_PTR(-EINVAL); + } + length = map_length; + } + + mem = sif_mem_create_umem(sdev, umem, mem_type, GFP_KERNEL, dma_dir); + if (!mem) { + mr = (void *)ERR_PTR(-ENOMEM); + goto err_create_mem; + } + + mr = alloc_mr(sdev, to_spd(ibpd), mem, start, acc_fl); + if (IS_ERR(mr)) + goto err_mmu_ctx; + + if (udata) { + struct sif_reg_mr_resp_ext resp; + int rv; + + memset(&resp, 0, sizeof(resp)); + resp.uv2dma = mr->mmu_ctx.uv2dma; + rv = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rv) { + /* Exit here as ib_umem_release is implicit via dealloc_mr */ + dealloc_mr(sdev, mr); + return ERR_PTR(-EFAULT); + } + } + + sif_log(sdev, SIF_MR, "Exit: ibmr 0x%p - uv2dma %lx", &mr->ibmr, mr->mmu_ctx.uv2dma); + return &mr->ibmr; + +err_mmu_ctx: + sif_mem_free(mem); /* owns and frees the umem as well */ + return (void *)mr; +err_create_mem: + ib_umem_release(umem); + return (void *)mr; +} + + +struct ib_mr *sif_reg_phys_mr(struct ib_pd *ibpd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int acc_fl, u64 *iova_start) +{ + struct sif_dev *sdev = to_sdev(ibpd->device); + struct sif_mr *mr; + struct sif_mem *mem; + + if ((num_phys_buf <= 0) || !phys_buf_array) { + sif_log(sdev, SIF_INFO, "input error: num_phys_buf 0%x phys_buf_array %p", + num_phys_buf, phys_buf_array); + mr = ERR_PTR(-EINVAL); + goto param_err; + } + + sif_log(sdev, SIF_MR, " num_phys_buf %d, flags 0x%x, iova_start %p", + num_phys_buf, acc_fl, iova_start); + + mem = sif_mem_create_phys(sdev, iova_start, phys_buf_array, num_phys_buf, + GFP_KERNEL); + if (!mem) { + sif_log(sdev, SIF_INFO, "Failed to create mem object (ENOMEM)"); + mr = ERR_PTR(-ENOMEM); + goto param_err; + } + + mr = alloc_mr(sdev, to_spd(ibpd), mem, (u64)iova_start, acc_fl); + if (IS_ERR(mr)) + goto alloc_mr_failed; + + return &mr->ibmr; +alloc_mr_failed: + sif_mem_free(mem); +param_err: + return (void *)mr; +} + + +int sif_rereg_phys_mr(struct ib_mr *ibmr, int mr_rereg_mask, + struct ib_pd *ibpd, + struct ib_phys_buf *phys_buf_array, int num_phys_buf, + int mr_access_flags, u64 *iova_start) +{ + struct sif_dev *sdev = to_sdev(ibpd->device); + + sif_log(sdev, SIF_INFO, "Not implemented"); + return -EOPNOTSUPP; +} + + + +struct sif_mr *alloc_mr(struct sif_dev *sdev, struct sif_pd *pd, + struct sif_mem *mem, u64 map_start, int acc_fl) +{ + struct sif_mr *mr; + volatile struct psif_key *key; + struct psif_key lkey; + bool write; + int index; + int ret = 0; + u64 length = mem ? 
mem->size : ((~0ull) ^ (PAGE_SIZE-1)); + + index = sif_alloc_key_idx(sdev); + if (index < 0) { + sif_log(sdev, SIF_MR, "Failed to allocate key idx"); + ret = -ENOMEM; + goto err_reg_mr; + } + + mr = kzalloc(sizeof(struct sif_mr), GFP_KERNEL); + if (!mr) { + sif_log(sdev, SIF_MR, "Failed to allocate memory for sif_mr"); + ret = -ENOMEM; + goto err_mr_alloc; + } + + memset(mr, 0, sizeof(struct sif_mr)); + memset(&lkey, 0, sizeof(struct psif_key)); + mr->index = index; + mr->mem = mem; + set_sif_mr(sdev, index, mr); + key = get_key(sdev, index); + + if (length) { + /* MR will always have L/R keys associated with them.*/ + lkey.lkey_state = PSIF_DMA_KEY_VALID; + lkey.rkey_state = PSIF_DMA_KEY_VALID; + } else { + /* Allocation is for a special invalid key */ + lkey.lkey_state = PSIF_DMA_KEY_INVALID; + lkey.rkey_state = PSIF_DMA_KEY_INVALID; + } + + /* Access flags */ + lkey.local_access_rd = 1; + if (acc_fl & IB_ACCESS_LOCAL_WRITE) + lkey.local_access_wr = 1; + if (acc_fl & IB_ACCESS_REMOTE_READ) + lkey.remote_access_rd = 1; + if (acc_fl & IB_ACCESS_REMOTE_WRITE) + lkey.remote_access_wr = 1; + if (acc_fl & IB_ACCESS_REMOTE_ATOMIC) + lkey.remote_access_atomic = 1; + /* TBD: IB_ACCESS_MW_BIND (what to do with that?) + * and also conditonal_wr + */ + + write = (lkey.local_access_wr ? 1:0) || (lkey.remote_access_wr ? 1:0); + + lkey.pd = pd->idx; + + ret = sif_map_ctx(sdev, &mr->mmu_ctx, mem, map_start, length, write); + if (ret) + goto err_map_ctx; + + mr->mmu_ctx.lkey = index; + if (length) + lkey.base_addr = mr->mmu_ctx.base; + else + lkey.base_addr = (u64)-1LL; + lkey.length = mr->mmu_ctx.size; + lkey.mmu_context = mr->mmu_ctx.mctx; + + sif_logs(SIF_DUMP, write_struct_psif_key(NULL, 0, &lkey)); + + /* Write to HW descriptor */ + copy_conv_to_hw(key, &lkey, sizeof(lkey)); + + mr->ibmr.lkey = mr->ibmr.rkey = mr->index; + + sif_log(sdev, SIF_MR, "type %s - key %d (pd %d) - success", + sif_mem_type_str(mem->mem_type), + mr->index, pd->idx); + return mr; +err_map_ctx: + kfree(mr); + set_sif_mr(sdev, index, NULL); +err_mr_alloc: + sif_clear_key(sdev, index); + sif_free_key_idx(sdev, index); +err_reg_mr: + sif_log(sdev, SIF_MR, "Exit: failed with status %d", ret); + return ERR_PTR(ret); +} + +int sif_query_mr(struct ib_mr *ibmr, struct ib_mr_attr *mr_attr) +{ + sif_logi(ibmr->device, SIF_MR, "Not implemented"); + return -EOPNOTSUPP; +} + + +/* If the MMU is involved (not pass-through mode) + * PSIF MR deregistration is asyncronous and five-step (see #2002): + * 1) Invalidate associated dma validation entry but first + * make sure it is in the special MMU_VALID state which does not + * allow uses of it from IB but allows it to be used for invalidation + * operations. The invalidate req causes a flush of the entry in + * VAL's cache. + * 2) Invalidate MMU context (TLB_INVALIDATE) + * This will lead to a fetch of the key again, this time with + * state == MMU_VALID. 
+ * 3) Issue another key invalidate + * 4) NIL validation entry - make valid = 0 + * 5) Unpin/release memory associated with it + */ + +void dealloc_mr(struct sif_dev *sdev, struct sif_mr *mr) +{ + int index = mr->index; + int sts; + struct psif_key *key = get_key(sdev, index); + bool need_5_step = mr->mmu_ctx.type == MMU_GVA2GPA_MODE; + + /* We do not invalidate the invalid key at index 0 */ + bool do_invalidate_key = index != 0 && !sif_feature(disable_invalidate_key); + + if (do_invalidate_key) { + if (need_5_step) { + set_psif_key__lkey_state(key, PSIF_DMA_KEY_MMU_VALID); + set_psif_key__rkey_state(key, PSIF_DMA_KEY_MMU_VALID); + } else { + set_psif_key__lkey_state(key, PSIF_DMA_KEY_INVALID); + set_psif_key__rkey_state(key, PSIF_DMA_KEY_INVALID); + } + + /* Flush this DMA validation entry */ + sts = sif_invalidate_key(sdev, index, PCM_WAIT); + if (sts) { + sif_log(sdev, SIF_INFO, + "Invalidate key failed"); + } + } + + /* Invalidate and unmap MMU context */ + sif_unmap_ctx(sdev, &mr->mmu_ctx); + + if (need_5_step && do_invalidate_key) { + set_psif_key__lkey_state(key, PSIF_DMA_KEY_INVALID); + set_psif_key__rkey_state(key, PSIF_DMA_KEY_INVALID); + + /* Flush this DMA validation entry - the final operation, must be synchronous: */ + sts = sif_invalidate_key(sdev, index, PCM_WAIT); + if (sts) { + sif_log(sdev, SIF_INFO, + "Invalidate key failed"); + } + } + + kfree(mr); + set_sif_mr(sdev, index, NULL); + + if (!sif_feature(disable_invalidate_key)) { + /* Release memory associated with this key */ + sif_clear_key(sdev, index); + sif_free_key_idx(sdev, index); + } +} + + +void sif_dealloc_mr(struct sif_dev *sdev, struct sif_mr *mr) +{ + struct sif_mem *mem = mr->mem; + + dealloc_mr(sdev, mr); + sif_mem_free(mem); +} + + +int sif_dereg_mr(struct ib_mr *ibmr) +{ + struct sif_mr *mr = to_smr(ibmr); + struct sif_dev *sdev = to_sdev(ibmr->device); + int index = mr->ibmr.lkey; + + sif_logi(ibmr->device, SIF_MR, "Enter: mr 0x%p key 0x%x", mr, + index); + + sif_dealloc_mr(sdev, mr); + sif_log(sdev, SIF_MR, "Exit: success"); + return 0; +} + +struct ib_mr *sif_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list_len) +{ + sif_logi(ibpd->device, SIF_FMR, "Not implemented"); + return ERR_PTR(-EOPNOTSUPP); +} + +struct ib_fast_reg_page_list *sif_alloc_fast_reg_page_list(struct ib_device + *ibdev, + int page_list_len) +{ + sif_logi(ibdev, SIF_FMR, "Not implemented"); + return ERR_PTR(-EOPNOTSUPP); +} + +void sif_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl) +{ + sif_logi(pl->device, SIF_FMR, "Not implemented"); +} + + +/* Line printer for debugfs file */ +void sif_dfs_print_key(struct seq_file *s, struct sif_dev *sdev, loff_t pos) +{ + struct psif_key *key; + struct psif_key lkey; + const char *typestr; + char l_state, r_state; + + if (unlikely(pos < 0)) { + seq_printf(s, "# %61s State %s\n", "", "Page table info"); + seq_printf(s, "# Index %18s %18s %16s LR %s\n", + "Base address(hex)", "Length(hex)", "MMU ctx type", " top leaf pages"); + return; + } + + key = get_key(sdev, pos); + copy_conv_to_sw(&lkey, key, sizeof(struct psif_key)); + typestr = string_enum_psif_mmu_translation(lkey.mmu_context.translation_type) + 4; + l_state = string_enum_psif_dma_vt_key_states(lkey.lkey_state)[13]; + r_state = string_enum_psif_dma_vt_key_states(lkey.rkey_state)[13]; + + seq_printf(s, "%7lld %18llx %18llx %16s %c%c ", pos, lkey.base_addr, lkey.length, + typestr, l_state, r_state); + sif_pt_dfs_print(s, sdev, pos); +} + + +/* API to allocate/release a key for TLB invalidation only + * Note that 0 is 
considered an invalid key! + */ +u32 allocate_invalidate_key(struct sif_mmu_ctx *ctx) +{ + /* This call is only meaningful for contexts with a valid page table: */ + struct sif_dev *sdev = ctx->pt->sdev; + int index; + struct psif_key lkey; + volatile struct psif_key *key; + + index = sif_alloc_key_idx(sdev); + if (index < 0) + return 0; + + key = get_key(sdev, index); + memset(&lkey, 0, sizeof(struct psif_key)); + lkey.lkey_state = PSIF_DMA_KEY_MMU_VALID; + lkey.rkey_state = PSIF_DMA_KEY_MMU_VALID; + lkey.base_addr = ctx->base; + lkey.length = ctx->size; + lkey.mmu_context = ctx->mctx; + + /* Write to HW descriptor */ + copy_conv_to_hw(key, &lkey, sizeof(lkey)); + return (u32)index; +} + +/* Release and invalidate a previously allocated TLB invalidation key */ +void release_invalidate_key(struct sif_dev *sdev, u32 index) +{ + int sts; + struct psif_key *key = get_key(sdev, index); + + set_psif_key__lkey_state(key, PSIF_DMA_KEY_INVALID); + set_psif_key__rkey_state(key, PSIF_DMA_KEY_INVALID); + + /* Flush this DMA validation entry - we do not really depend on the result + * so safe to make it asynchronous: + */ + sts = sif_invalidate_key(sdev, index, PCM_POST); + if (sts) + sif_log(sdev, SIF_INFO, + "Invalidate key failed"); + + /* Release memory associated with this key */ + sif_clear_key(sdev, index); + sif_free_key_idx(sdev, index); +} diff --git a/drivers/infiniband/hw/sif/sif_mr.h b/drivers/infiniband/hw/sif/sif_mr.h new file mode 100644 index 0000000000000..959f8b407887f --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_mr.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_mr.h: Interface to internal IB memory registration logic for SIF + */ + +#ifndef __SIF_MR_H +#define __SIF_MR_H +#include "sif_mmu.h" + +struct ib_umem; +struct sif_mem; + +struct sif_mr { + struct ib_mr ibmr; + int index; + struct sif_mem *mem; + struct sif_mmu_ctx mmu_ctx; +}; + +static inline struct sif_mr *to_smr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct sif_mr, ibmr); +} + +struct ib_mr *sif_get_dma_mr(struct ib_pd *ibpd, int mr_access_flags); +struct sif_mr *sif_alloc_invalid_mr(struct sif_pd *pd); +struct ib_mr *sif_reg_phys_mr(struct ib_pd *ibpd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, int mr_access_flags, + u64 *iova_start); + +struct ib_mr *sif_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata); + +int sif_query_mr(struct ib_mr *ibmr, struct ib_mr_attr *mr_attr); +int sif_dereg_mr(struct ib_mr *ibmr); + +struct ib_mr *sif_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list_len); +struct ib_fast_reg_page_list *sif_alloc_fast_reg_page_list(struct ib_device + *ibdev, + int page_list_len); + +void sif_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); +int sif_rereg_phys_mr(struct ib_mr *ibmr, + int mr_rereg_mask, + struct ib_pd *ibpd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, int mr_access_flags, u64 *iova_start); + +/* Deallocate MR - assumes ownership of mr->mem and deletes that as well. 
+ * To be used with high level mr allocation operations that create their own + * sif_mem object: + */ +void sif_dealloc_mr(struct sif_dev *sdev, struct sif_mr *mr); + +struct sif_dev; +struct seq_file; +struct sif_pd; +enum psif_mmu_translation; + +/* Line printer for debugfs file */ +void sif_dfs_print_key(struct seq_file *s, struct sif_dev *sdev, loff_t pos); + +/* Internal mr allocation/deallocation functions: + * Allocate an IB MR for the memory object @mem + * If length == 0, allocate an invalid map. + * The mr does not own the @mem object + */ +struct sif_mr *alloc_mr(struct sif_dev *sdev, struct sif_pd *pd, + struct sif_mem *mem, u64 map_start, int acc_fl); +struct sif_mr *create_dma_mr(struct sif_pd *pd, int acc_fl); + +void dealloc_mr(struct sif_dev *sdev, struct sif_mr *mr); + + +/* API to allocate/release a key for TLB invalidation only + * Note that 0 is considered an invalid key! + */ +u32 allocate_invalidate_key(struct sif_mmu_ctx *ctx); + +/* Release and invalidate a previously allocated TLB invalidation key */ +void release_invalidate_key(struct sif_dev *sdev, u32 lkey); + + +#endif diff --git a/drivers/infiniband/hw/sif/sif_mw.c b/drivers/infiniband/hw/sif/sif_mw.c new file mode 100644 index 0000000000000..a9099c1f16dfc --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_mw.c @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_mw.c: Implementation of memory windows for SIF + */ + +#include +#include "sif_mw.h" +#include "sif_dev.h" + +struct ib_mw *sif_alloc_mw(struct ib_pd *ibpd) +{ + sif_logi(ibpd->device, SIF_INFO, "Not implemented"); + return ERR_PTR(-EOPNOTSUPP); +} + +int sif_bind_mw(struct ib_qp *ibqp, + struct ib_mw *ibmw, struct ib_mw_bind *mw_bind) +{ + sif_logi(ibqp->device, SIF_INFO, "Not implemented"); + return -EOPNOTSUPP; +} + +int sif_dealloc_mw(struct ib_mw *ibmw) +{ + sif_logi(ibmw->device, SIF_INFO, "Not implemented"); + return -EOPNOTSUPP; +} diff --git a/drivers/infiniband/hw/sif/sif_mw.h b/drivers/infiniband/hw/sif/sif_mw.h new file mode 100644 index 0000000000000..4067f36ec0de6 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_mw.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_mw.h: Interface to internal IB memory window logic for SIF + */ + +#ifndef __SIF_MW_H +#define __SIF_MW_H + +struct ib_mw *sif_alloc_mw(struct ib_pd *ibpd); +int sif_bind_mw(struct ib_qp *ibqp, struct ib_mw *ibmw, + struct ib_mw_bind *mw_bind); +int sif_dealloc_mw(struct ib_mw *ibmw); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_pd.c b/drivers/infiniband/hw/sif/sif_pd.c new file mode 100644 index 0000000000000..e1fc92f5fa33f --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_pd.c @@ -0,0 +1,544 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. 
+ * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_pd.c: Implementation of IB protection domains for SIF + */ + +#include +#include "sif_dev.h" +#include "sif_ibpd.h" +#include "sif_pd.h" +#include "sif_defs.h" +#include "sif_base.h" +#include "sif_mmu.h" +#include "sif_mr.h" +#include "sif_xrc.h" +#include "sif_query.h" + + +int sif_init_pd(struct sif_dev *sdev) +{ + /* Avoid using pd == 0 to have HW trap use of blank AHs: */ + return sif_idr_init(&sdev->pd_refs, 1, SIF_MAX_PD_INDEX); +} + + +void sif_deinit_pd(struct sif_dev *sdev) +{ + sif_idr_deinit(&sdev->pd_refs); +} + + +inline void cancel_cb(struct psif_cb __iomem *cb) +{ + u64 __iomem *c_adr = (u64 __iomem *)((u8 __iomem *)cb + 0xff8); + u64 c_val = PSIF_WR_CANCEL_CMD_BE; + + __raw_writeq(cpu_to_be64(c_val), c_adr); +} + + +struct sif_pd *alloc_pd(struct sif_dev *sdev) +{ + struct sif_pd *pd = kzalloc(sizeof(struct sif_pd), GFP_KERNEL); + + if (!pd) + return NULL; + + pd->idx = sif_idr_alloc(&sdev->pd_refs, pd, GFP_KERNEL); + spin_lock_init(&pd->lock); + INIT_LIST_HEAD(&pd->qp_list); + INIT_LIST_HEAD(&pd->cq_list); + INIT_LIST_HEAD(&pd->rq_list); + + sif_log(sdev, SIF_PD, "pd idx %d", pd->idx); + return pd; +} + + +int dealloc_pd(struct sif_pd *pd) +{ + struct sif_dev *sdev = to_sdev(pd->ibpd.device); + + sif_log(sdev, SIF_PD, "pd idx %d", pd->idx); + + if (!list_empty(&pd->qp_list)) { + sif_log(sdev, SIF_INFO, "pd idx %d: failed - still active qp blocks", pd->idx); + return -EBUSY; + } + if (!list_empty(&pd->cq_list)) { + sif_log(sdev, SIF_INFO, "pd idx %d: failed - still active cq blocks", pd->idx); + return -EBUSY; + } + if (!list_empty(&pd->rq_list)) { + sif_log(sdev, SIF_INFO, "pd idx %d: failed - still active rq blocks", pd->idx); + return -EBUSY; + } + + sif_idr_remove(&sdev->pd_refs, pd->idx); + kfree(pd); + return 0; +} + + +/* IB Verbs level interfaces (sif_ibpd.h) */ + + +struct ib_pd *sif_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, struct ib_udata *udata) +{ + struct sif_dev *sdev = to_sdev(ibdev); + struct sif_pd *pd; + int ret; + + pd = alloc_pd(sdev); + if (!pd) + return ERR_PTR(-ENOMEM); + + /* For bw comp with libsif */ + if (udata) { + struct sif_ucontext *uc = to_sctx(context); + struct sif_alloc_pd_resp_ext resp; + + memset(&resp, 0, sizeof(resp)); + resp.cb_idx = uc->cb->idx; + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) { + dealloc_pd(pd); + return ERR_PTR(-EFAULT); + } + } + return &pd->ibpd; +} + +int sif_dealloc_pd(struct ib_pd *ibpd) +{ + return ibpd->shpd ? 
0 : dealloc_pd(to_spd(ibpd)); +} + +struct ib_shpd *sif_alloc_shpd(struct ib_device *ibdev, + struct ib_pd *ibpd, + struct ib_udata *udata) +{ + struct sif_dev *sdev = to_sdev(ibdev); + struct sif_pd *pd = to_spd(ibpd); + struct sif_shpd *shpd; + + shpd = kzalloc(sizeof(struct sif_shpd), GFP_KERNEL); + if (!shpd) + return ERR_PTR(-ENOMEM); + + shpd->ibshpd.device = &sdev->ib_dev; + shpd->pd = pd; + + return &shpd->ibshpd; +} + +struct ib_pd *sif_share_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata, + struct ib_shpd *ibshpd) +{ + struct sif_shpd *shpd = to_sshpd(ibshpd); + struct sif_pd *pd = shpd->pd; + int ret; + + if (udata) { + struct sif_ucontext *uc = to_sctx(context); + struct sif_share_pd_resp_ext resp; + + memset(&resp, 0, sizeof(resp)); + resp.cb_idx = uc->cb->idx; + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) + return ERR_PTR(-EFAULT); + } + + return &pd->ibpd; +} + +int sif_remove_shpd(struct ib_device *ibdev, + struct ib_shpd *ibshpd, + int atinit) +{ + struct sif_shpd *shpd = to_sshpd(ibshpd); + + if (!atinit && shpd->pd) + dealloc_pd(shpd->pd); + + kfree(ibshpd); + + return 0; +} + +/* Collect buffer management */ + + +/* Obtain information about lat_cb and bw_cb resources + * We cannot use the ba structs yet as they are not initialized at this point: + */ +static void sif_cb_init(struct sif_dev *sdev) +{ + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + + /* EPSC supports the new requests starting from v.0.36 */ + if (eps_version_ge(es, 0, 37)) { + int ret = 0; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_QUERY; + req.u.query.data.op = EPSC_QUERY_CAP_VCB_LO; + req.u.query.info.op = EPSC_QUERY_CAP_VCB_HI; + ret = sif_epsc_wr(sdev, &req, &rsp); + if (ret) + sif_log(sdev, SIF_INFO, "Request for VCB info failed with %d", ret); + else { + sdev->bw_cb_cnt = rsp.data; + sdev->lat_cb_cnt = rsp.info; + sif_log(sdev, SIF_INFO, "Got %ld bw_cbs and %ld lat_cbs", + sdev->bw_cb_cnt, sdev->lat_cb_cnt); + } + } +} + + +/* Called from sif_base.c to initialize each of the cb tables */ +void sif_cb_table_init(struct sif_dev *sdev, enum sif_tab_type type) +{ + struct sif_table *tp; + + BUG_ON(!is_cb_table(type)); + tp = &sdev->ba[type]; + + /* Update table values with EPSC data: */ + if (type == bw_cb) { + sif_cb_init(sdev); + if (sdev->bw_cb_cnt) { + tp->entry_cnt = sdev->bw_cb_cnt; + tp->table_sz = tp->ext_sz * tp->entry_cnt; + } + tp->sif_off = sdev->cb_base; + } else { + /* lat_cb */ + if (sdev->lat_cb_cnt) { + tp->entry_cnt = sdev->lat_cb_cnt; + tp->table_sz = tp->ext_sz * tp->entry_cnt; + tp->sif_off = sdev->cb_base + sdev->ba[bw_cb].table_sz; + } else + tp->entry_cnt = 0; + } + + tp->mem = sif_mem_create_ref(sdev, SIFMT_NOMEM, tp->sif_base, + tp->table_sz, GFP_KERNEL); +} + + +struct sif_cb *alloc_cb(struct sif_dev *sdev, bool lat_cb) +{ + int idx; + struct sif_cb *cb = kzalloc(sizeof(struct sif_cb), GFP_KERNEL); + + if (!cb) + return NULL; + + if (unlikely(lat_cb)) { + idx = sif_alloc_lat_cb_idx(sdev); + if (idx < 0) { + sif_log(sdev, SIF_INFO, "Unable to allocate lat_cb - trying bw_cb instead"); + lat_cb = false; + } else + cb->cb = get_lat_cb(sdev, idx); + } + + if (likely(!lat_cb)) { + idx = sif_alloc_bw_cb_idx(sdev); + if (idx < 0) + goto err_index; + cb->cb = get_bw_cb(sdev, idx); + } + + /* Reset Collect buffer */ + cb->idx = idx; + cb->is_lat_cb = lat_cb; + + cancel_cb(cb->cb); + + spin_lock_init(&cb->lock); + return cb; +err_index: + kfree(cb); + 
return NULL; +} + + +void release_cb(struct sif_dev *sdev, struct sif_cb *cb) +{ + cancel_cb(cb->cb); + if (unlikely(cb->is_lat_cb)) + sif_free_lat_cb_idx(sdev, cb->idx); + else + sif_free_bw_cb_idx(sdev, cb->idx); + kfree(cb); +} + + +/* Find the driver struct for a collect buffer index, if associated with @uc + */ +struct sif_cb *sif_cb_from_uc(struct sif_ucontext *uc, u32 index) +{ + if (uc->cb->idx == index) + return uc->cb; + return NULL; +} + + +/* + * Write a prepared work request (in wqe) to the associated collect buffer: + * Return 0 on success otherwise -EBUSY if lock is held + */ +int sif_cb_write(struct sif_qp *qp, struct psif_wr *wqe, int cp_len) +{ + unsigned long flags; + struct sif_cb *cb = get_cb(qp); + + if (!spin_trylock_irqsave(&cb->lock, flags)) + return -EBUSY; + + wmb(); /* Previous memory writes must be ordered wrt the I/O writes */ + copy_conv_to_mmio(cb->cb, wqe, cp_len); + wc_wmb(); /* I/O writes must be completed before we let go of the lock! */ + spin_unlock_irqrestore(&cb->lock, flags); + + return 0; +} + + +#define SQS_START_DOORBELL 0xfc0 +#define SQS_STOP_DOORBELL 0xf80 + +/* + * Notify about a work request to the cb doorbell - triggering SQ mode: + */ +void sif_doorbell_write(struct sif_qp *qp, struct psif_wr *wqe, bool start) +{ + unsigned long flags; + u16 doorbell_offset = start ? SQS_START_DOORBELL : SQS_STOP_DOORBELL; + struct sif_cb *cb = get_cb(qp); + struct sif_dev *sdev = to_sdev(qp->ibqp.pd->device); + + sif_log(sdev, SIF_QP, "%s sqs for qp %d sq_seq %d", (start ? "start" : "stop"), + qp->qp_idx, wqe->sq_seq); + spin_lock_irqsave(&cb->lock, flags); + wmb(); + copy_conv_to_mmio((u8 __iomem *)cb->cb + doorbell_offset, wqe, 8); + + /* Flush write combining */ + wc_wmb(); + spin_unlock_irqrestore(&cb->lock, flags); +} + + +/* + * Force the SQS to process an already posted WR: + */ + +void sif_doorbell_from_sqe(struct sif_qp *qp, u16 seq, bool start) +{ + u16 doorbell_offset = start ? SQS_START_DOORBELL : SQS_STOP_DOORBELL; + struct sif_cb *cb = get_cb(qp); + struct sif_dev *sdev = to_sdev(qp->ibqp.pd->device); + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + u64 *wqe = (u64 *)get_sq_entry(sq, seq); + + /* Pick the 1st 8 bytes directly from the sq entry: */ + wmb(); + __raw_writeq(*wqe, ((u8 __iomem *)cb->cb + doorbell_offset)); + + /* Flush write combining */ + wc_wmb(); + sif_log(sdev, SIF_QP, "%s sqs for qp %d sq_seq %d", (start ? 
"start" : "stop"), + qp->qp_idx, seq); +} + + +static struct list_head *type_to_list(struct sif_pd *pd, enum sif_tab_type type) +{ + switch (type) { + case cq_hw: + return &pd->cq_list; + case rq_hw: + return &pd->rq_list; + case qp: + return &pd->qp_list; + default: + BUG(); + } + return NULL; +} + + +/* Allocate a free index from a block: + * The index is a global index + */ +static int alloc_from_block(struct sif_table_block *b, enum sif_tab_type type) +{ + int next = 0; + int index; + int loc_idx; + + struct sif_table *table = b->table; + + if (table->alloc_rr) + next = (b->last_used + 1) & (table->entry_per_block - 1); + loc_idx = find_next_zero_bit(b->bitmap, table->entry_per_block, next); + if (table->alloc_rr && loc_idx >= table->entry_per_block) + loc_idx = find_next_zero_bit(b->bitmap, table->entry_per_block, 0); + if (loc_idx < table->entry_per_block) { + set_bit(loc_idx, b->bitmap); + if (table->alloc_rr) + b->last_used = loc_idx; + index = loc_idx + b->offset; + sif_log(table->sdev, SIF_IDX2, + "%s[%d:%d] -> %d ", sif_table_name(type), + b->offset / table->entry_per_block, loc_idx, index); + return index; + } + return -1; +} + + +/* Free a used index back to a block: + * The index is a global index + */ +static void free_to_block(struct sif_table_block *b, enum sif_tab_type type, int index) +{ + struct sif_table *table = b->table; + size_t ext_sz = table->ext_sz; + char *desc = sif_mem_kaddr(table->mem, index * ext_sz); + + /* Get from global index to block index */ + index -= b->offset; + + /* Clean descriptor entry for reuse: + * note that we clean the whole extent here which + * includes all of sif_##type for inlined types: + */ + if (type == rq_hw) /* only zero out driver data structure */ + memset(desc + sizeof(struct psif_rq_hw), 0, ext_sz - sizeof(struct psif_rq_hw)); + else if (!is_cb_table(type) && type != qp && type != cq_hw) + memset(desc, 0, ext_sz); + + sif_log(table->sdev, SIF_IDX2, + "%s[%d:%d] ", sif_table_name(type), + b->offset / table->entry_per_block, index); + clear_bit(index, b->bitmap); +} + + +/* Support for per protection domain table index allocations (2nd level allocation): + * Invariants: + * - sif_table_block entries are 0-initialized, and initialized to real values on demand. + * - We keep a list of blocks and try to allocate starting from the first in the list + * assuming that the last added block has the most free entries. 
+ */ + +int sif_pd_alloc_index(struct sif_pd *pd, enum sif_tab_type type) +{ + struct sif_dev *sdev = to_sdev(pd->ibpd.device); + struct sif_table *tp = &sdev->ba[type]; + struct list_head *list = type_to_list(pd, type); + struct sif_table_block *b; + int idx = -1; + + if (tp->entry_per_block == 1) /* Handle 1-level alloc case */ + return sif_alloc_index(sdev, type); + + spin_lock(&pd->lock); + list_for_each_entry(b, list, pd_list) { + idx = alloc_from_block(b, type); + if (idx >= 0) + break; + } + if (idx < 0) { + /* Allocate a new block */ + int blk_idx = sif_alloc_index(sdev, type); + + if (blk_idx >= 0) { + b = sif_get_block(tp, blk_idx); + sif_log(sdev, SIF_IDX2, "%s blk_idx %d: %p [%ld/%d]", + sif_table_name(type), blk_idx, b, + sizeof(struct sif_table_block), tp->block_ext); + b->table = tp; + b->pd = pd; + b->offset = blk_idx * tp->entry_per_block; + /* Don't modify last_used as we want it to survive (de)allocations */ + list_add(&b->pd_list, list); + idx = alloc_from_block(b, type); + } + } + spin_unlock(&pd->lock); + return idx; +} + + +void sif_pd_free_index(struct sif_pd *pd, enum sif_tab_type type, int index) +{ + struct sif_dev *sdev = to_sdev(pd->ibpd.device); + struct sif_table *tp = &sdev->ba[type]; + struct sif_table_block *b; + int bits_used; + int blk_idx = index / tp->entry_per_block; + + if (tp->entry_per_block == 1) /* Handle 1-level alloc case */ + return sif_free_index(sdev, type, index); + + b = sif_get_block(tp, blk_idx); + if (!b->table) { + /* BUG */ + sif_log(sdev, SIF_INFO, "index %d: block table ptr NULL - blk_idx %d table %s", + index, blk_idx, sif_table_name(type)); + return; + } + spin_lock(&pd->lock); + free_to_block(b, type, index); + bits_used = bitmap_weight(b->bitmap, tp->entry_per_block); + if (!bits_used) { + list_del(&b->pd_list); + sif_free_index(sdev, type, blk_idx); + } + spin_unlock(&pd->lock); +} + + +bool sif_pd_index_used(struct sif_table *tp, int idx) +{ + struct sif_table_block *b; + int blk_idx = idx / tp->entry_per_block; + + if (!test_bit(blk_idx, tp->bitmap)) + return false; + b = sif_get_block(tp, blk_idx); + return test_bit(idx % tp->entry_per_block, b->bitmap); +} + + +bool sif_is_user_pd(struct sif_pd *pd) +{ + if (pd->ibpd.uobject) + return true; + /* TBD: We don't know if an XRC domain originates from user space, + * as it does not get any uobject + */ + if (pd->xrcd) /* TBD: && pd->xrcd->ib_xrcd.uobject) */ + return true; + return false; +} diff --git a/drivers/infiniband/hw/sif/sif_pd.h b/drivers/infiniband/hw/sif/sif_pd.h new file mode 100644 index 0000000000000..aa0277a80b12f --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_pd.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_pd.h: Internal interface to protection domains + * and collect buffer management for SIF + */ + +#ifndef __SIF_PD_H +#define __SIF_PD_H + +struct sif_dev; +struct sif_pd; +struct sif_cb; +struct sif_qp; +struct sif_ucontext; + +/**** Protection domains ****/ + +/* SIF supports a 24 bit PD index: */ +#define SIF_MAX_PD_INDEX ((1 << 24) - 1) + +struct sif_pd { + struct ib_pd ibpd; + int idx; /* index of this pd */ + struct sif_xrcd *xrcd; /* If set, this pd is owned by an xrcd */ + spinlock_t lock; /* Protects lists and their bitmaps while owned by us */ + /* List of blocks of descriptor entries owned by this pd */ + struct list_head qp_list; + struct list_head cq_list; + struct list_head rq_list; +}; + +struct sif_shpd { + struct ib_shpd ibshpd; + struct sif_pd *pd; +}; + +/* Initialize/deinitialize the pd subsystem */ +int sif_init_pd(struct sif_dev *sdev); +void sif_deinit_pd(struct sif_dev *sdev); + +struct sif_pd *alloc_pd(struct sif_dev *sdev); +int dealloc_pd(struct sif_pd *pd); + + +/* Per protection domain table index allocations (2nd level allocation) */ +int sif_pd_alloc_index(struct sif_pd *pd, enum sif_tab_type type); +void sif_pd_free_index(struct sif_pd *pd, enum sif_tab_type type, int index); + +/* 2-level and 1-level safe index usage check: + * idx is the entry index (not block index) + * and is assumed to be within bounds: + * + */ +bool sif_pd_index_used(struct sif_table *tp, int idx); + +bool sif_is_user_pd(struct sif_pd *pd); + + +/**** Collect buffers ****/ + +static inline bool is_cb_table(enum sif_tab_type type) +{ + return type == bw_cb || type == lat_cb; +} + + +/* Called from sif_base.c to initialize the cb tables */ +void sif_cb_table_init(struct sif_dev *sdev, enum sif_tab_type type); + + +/* per collect buffer struct */ +struct sif_cb { + int idx; /* index of this cb */ + bool is_lat_cb; /* High bandwidth or low latency cb */ + spinlock_t lock; /* Serializes access to this cb */ + u64 reqs; /* Number of requests on this cb */ + struct psif_cb __iomem *cb; /* Pointer to the actual collect buffer space */ +}; + +/* Allocation and deallocation of collect buffers + * If @lat_cb is set, allocate low latency CB instead of high bandwidth one: + */ +struct sif_cb *alloc_cb(struct sif_dev *sdev, bool lat_cb); +void release_cb(struct sif_dev *sdev, struct sif_cb *cb); + +/* Find the driver struct for a collect buffer index, if associated with @uc + */ +struct sif_cb *sif_cb_from_uc(struct sif_ucontext *uc, u32 index); + + +/* + * Write a prepared work request (in wqe) to the associated collect buffer: + * Return 0 on success otherwise -EBUSY if lock is held + */ +int sif_cb_write(struct sif_qp *qp, struct psif_wr *wqe, int cp_len); + + +/* + * Notify about a work request to the cb doorbell - triggering SQ mode: + */ +void sif_doorbell_write(struct sif_qp *qp, struct psif_wr *wqe, bool start); + + +/* + * Force the SQS to process an already posted WR: + */ +void sif_doorbell_from_sqe(struct sif_qp *qp, u16 seq, bool start); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_pqp.c b/drivers/infiniband/hw/sif/sif_pqp.c new file mode 100644 index 0000000000000..fd22824e29af6 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_pqp.c @@ -0,0 +1,1048 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. 
+ * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_pqp.c: Privileged QP handling + * The privileged QPs are SIFs internal send only QPs for management operations + */ + +#include "sif_dev.h" +#include "sif_cq.h" +#include "sif_sq.h" +#include "sif_base.h" +#include "psif_hw_data.h" +#include "psif_hw_setget.h" +#include "sif_pqp.h" +#include "sif_qp.h" +#include "sif_hwi.h" +#include "sif_ibqp.h" +#include "sif_checksum.h" +#include "sif_defs.h" + +static inline struct sif_qp *__create_init_qp(struct sif_dev *sdev, struct sif_cq *cq) +{ + struct sif_qp *qp; + struct ib_qp_init_attr init_attr = { + .event_handler = NULL, + .send_cq = &cq->ibcq, + .recv_cq = NULL, /* receive side not used */ + .srq = NULL, + .cap = { + .max_send_wr = sif_max_pqp_wr, + .max_recv_wr = 0, + .max_send_sge = 0, + .max_recv_sge = 0, + .max_inline_data = 0 + }, + .qp_type = IB_QPT_UD, + }; + struct sif_qp_init_attr sif_attr = { + .pd = sdev->pd, + .qp_type = PSIF_QP_TRANSPORT_MANSP1, + .qosl = QOSL_LOW_LATENCY, + .sq_hdl_sz = sizeof(struct sif_sq_hdl), + }; + + qp = create_qp(sdev, &init_attr, &sif_attr); + if (!IS_ERR(qp)) + qp->ibqp.pd = &sdev->pd->ibpd; + return qp; +} + + + +static struct sif_pqp *_sif_create_pqp(struct sif_dev *sdev, size_t alloc_sz, int comp_vector) +{ + struct sif_pqp *pqp; + struct sif_cq *cq; + struct sif_qp *qp; + struct sif_sq *sq = NULL; + int ret = 0; + + /* The privileged QP only supports state in modify_qp */ + struct ib_qp_attr mod_attr = { + .qp_state = IB_QPS_INIT + }; + + pqp = kzalloc(alloc_sz, GFP_KERNEL); + if (!pqp) { + sif_log(sdev, SIF_INFO, "Failed to allocate memory for priv.qp"); + return NULL; + } + + cq = create_cq(sdev->pd, sif_max_pqp_wr, comp_vector, SIFPX_OFF, false); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + goto cq_alloc_failed; + } + cq->ibcq.device = &sdev->ib_dev; + pqp->cq = cq; + cq->pqp = pqp; + init_completion(&pqp->nonfull); + + /* Now create a queue pair. + * TBD: Use a separate pqp for req_notify_cq and use low latency.. 
+ */ + qp = __create_init_qp(sdev, cq); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto qp_alloc_failed; + } + + pqp->qp = qp; + sq = get_sif_sq(sdev, qp->qp_idx); + /* Reserve 1/2 or at least 1 entry for pqp requests with completion on the PQP */ + pqp->lowpri_lim = sq->entries - min_t(int, sq->entries/2, 2); + + /* Run the required qp modify sequence */ + ret = sif_modify_qp(&qp->ibqp, &mod_attr, + IB_QP_STATE, NULL); + if (ret) + goto qp_alloc_failed; + + mod_attr.qp_state = IB_QPS_RTR; + ret = sif_modify_qp(&qp->ibqp, &mod_attr, + IB_QP_STATE, NULL); + if (ret) + goto qp_alloc_failed; + + mod_attr.qp_state = IB_QPS_RTS; + mod_attr.sq_psn = 0; + ret = sif_modify_qp(&qp->ibqp, &mod_attr, + IB_QP_STATE, NULL); + if (ret) + goto qp_alloc_failed; + + atomic64_set(&pqp->qp->arm_srq_holdoff_time, 0); + + sif_log(sdev, SIF_QP, "success"); + return pqp; + +qp_alloc_failed: + /* Special destruction order, see below: */ + destroy_cq(cq); + if (sq) + sq->cq_idx = -1; + + if (pqp->qp) + destroy_qp(sdev, qp); +cq_alloc_failed: + kfree(pqp); + sif_log(sdev, SIF_QP, "failed with %d", ret); + return ERR_PTR(ret); +} + + +int sif_destroy_pqp(struct sif_dev *sdev, struct sif_pqp *pqp) +{ + struct sif_sq *sq = get_sif_sq(sdev, pqp->qp->qp_idx); + bool self_destruct = get_pqp(sdev) == pqp; + /* For the last pqp we make an exception from the IB std reqs + * in that we keep the PQP itself up to invalidate the CQ using the + * PQP to send the invalidate, **before** we take down the QP itself. + * The hardware will make sure that for this special case + * the completion is sent before the CQ entry is invalidated. + */ + int ret; + + if (self_destruct) { + sif_log(sdev, SIF_PQP, "self destruct CQ %d", pqp->cq->index); + ret = destroy_cq(pqp->cq); + if (ret < 0) + return ret; + + if (sq) + sq->cq_idx = -1; + } + + ret = destroy_qp(sdev, pqp->qp); + if (ret < 0) + return ret; + + /* Support the normal destruction order as long as we have + * other PQPs in the system: + */ + if (!self_destruct) { + ret = destroy_cq(pqp->cq); + if (ret < 0) + return ret; + + if (sq) + sq->cq_idx = -1; + } + kfree(pqp); + return 0; +} + + +struct sif_pqp *sif_create_pqp(struct sif_dev *sdev, int comp_vector) +{ + return _sif_create_pqp(sdev, sizeof(struct sif_pqp), comp_vector); +} + + +static void pqp_complete_nonfull(struct sif_pqp *pqp) +{ + int ql; + unsigned long flags; + struct sif_dev *sdev = to_sdev(pqp->cq->ibcq.device); + struct sif_sq *sq = get_sif_sq(sdev, pqp->qp->qp_idx); + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, pqp->qp->qp_idx); +return; + spin_lock_irqsave(&sq->lock, flags); + ql = sq_length(sq, sq_sw->head_seq, sq_sw->last_seq); + if (ql <= sq->mask && atomic_read(&pqp->waiters)) + complete(&pqp->nonfull); + spin_unlock_irqrestore(&sq->lock, flags); +} + + +static inline void __pqp_complete_sq(struct sif_sq *sq, u32 sq_seq) +{ + /* TBD: Allow pqp posters to wait for completions */ +} + + + +static void pqp_reset_cmpl(struct sif_cqe *lcqe) +{ + struct sif_pqp *pqp = lcqe->pqp; + struct sif_cq *cq = pqp->cq; + struct sif_dev *sdev = to_sdev(cq->ibcq.device); + struct sif_sq *sq = get_sif_sq(sdev, pqp->qp->qp_idx); + struct sif_sq_hdl *wh = get_sq_hdl(sq, lcqe->sq_seq); + unsigned long flags; + + spin_lock_irqsave(&cq->lock, flags); + wh->wr_id = 0; + wh->used = false; + spin_unlock_irqrestore(&cq->lock, flags); +} + + + +/* Process all received completions on @cq - must be only PQP completions! + * Return the number processed, or -errno upon errors: + * Assumes the cq lock is held. 
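+ *
+ * Completions are matched back to the poster through the sq handle
+ * (struct sif_sq_hdl): sif_pqp_write_send() stores a pointer to the poster's
+ * struct sif_cqe in wh->wr_id, and this function copies the hardware CQE into
+ * that struct in host order, sets ->written and, if requested, signals ->cmpl.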
+ * If first_err is set, check for completion errors and return the first one with errors: + */ + +/* TBD: Clean up memory barriers in this function */ +static int __pqp_process_cqe(struct sif_pqp *pqp, struct sif_cqe *first_err) +{ + struct sif_cq *cq = pqp->cq; + struct sif_dev *sdev = to_sdev(cq->ibcq.device); + struct sif_sq_sw *sq_sw; + volatile struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index); + struct sif_sq *sq; + u32 seqno = cq_sw->next_seq; + volatile struct psif_cq_entry *cqe_be = get_cq_entry(cq, seqno); + int npolled = 0; + int cqe_cnt = 0; + u64 wci; + struct psif_send_completion_id *wc_id = (struct psif_send_completion_id *)&wci; + int sq_seq; + struct sif_cqe *lcqe; + struct sif_sq_hdl *wh; + int ql = 0; + u64 dbg_mask; + bool err_seen = false; + + for (; seqno == get_psif_cq_entry__seq_num(cqe_be); npolled++) { + enum psif_wc_status status = get_psif_cq_entry__status(cqe_be); + int sq_idx = get_psif_cq_entry__qp(cqe_be); + bool dump_it = false; + + sq = get_sif_sq(sdev, sq_idx); + sq_sw = get_sif_sq_sw(sdev, sq_idx); + wci = get_psif_cq_entry__wc_id(cqe_be); + sq_seq = wc_id->sq_seq_num; + wh = get_sq_hdl(sq, sq_seq); + + if (unlikely(status != PSIF_WC_STATUS_SUCCESS)) { + sif_log(sdev, SIF_INFO, "error completion polled"); + dump_it = true; + } + + if (pqp->qp->flags & SIF_QPF_KI_STENCIL) + goto cont_check_first_err; + + if (unlikely(!wh)) { + sif_log(sdev, SIF_INFO, + "cqe %d for cq %d refers sq(qp) %d which has not been initialized", + seqno, cq->index, sq_idx); + dump_it = true; + goto cont_no_wh; + } + if (unlikely(!wh->used)) { + sif_log(sdev, SIF_INFO, + "ignoring unused cqe %d for cq %d, sq %d, sq_seq %d", + seqno, cq->index, sq_idx, sq_seq); + dump_it = true; + goto cont; + } + if (unlikely(wh->sq_seq != sq_seq)) { + sif_log(sdev, SIF_INFO, + "wrong cqe %d for cq %d: got sq_seq %d, expected %d", + seqno, cq->index, sq_seq, wh->sq_seq); + dump_it = true; + goto cont; + } + + lcqe = (struct sif_cqe *)wh->wr_id; + if (lcqe) { + wh->wr_id = 0; + cqe_cnt++; + mb(); + sif_log(sdev, SIF_PQP, "copying to caller cqe at %p", &lcqe->cqe); + copy_conv_to_sw(&lcqe->cqe, cqe_be, sizeof(struct psif_cq_entry)); + wmb(); + lcqe->written = true; + if (lcqe->need_complete) + complete(&lcqe->cmpl); + } +cont_check_first_err: + if (unlikely(first_err && (status != PSIF_WC_STATUS_SUCCESS))) { + sif_log(sdev, SIF_PQP, "error completion received - aborting"); + copy_conv_to_sw(&first_err->cqe, cqe_be, sizeof(struct psif_cq_entry)); + err_seen = true; + first_err->written = true; + npolled++; + } +cont: + wh->used = 0; +cont_no_wh: + if (dump_it) { + sif_logs(SIF_INFO, + write_struct_psif_cq_entry(NULL, 1, + (const struct psif_cq_entry *)cqe_be); + printk("\n")); + } + + mb(); + sq_sw->head_seq = sq_seq; + seqno = ++cq_sw->next_seq; + + if (cq_length(cq, cq_sw->cached_head, seqno) >= cq->high_watermark) { + /* Update CQ hardware pointer */ + set_psif_cq_sw__head_indx(&cq_sw->d, seqno); + cq_sw->cached_head = seqno; + } + + ql = sq_length(sq, sq_seq, sq_sw->last_seq); + if (ql <= sq->mask) + pqp_complete_nonfull(pqp); + mb(); + if (unlikely(err_seen)) + break; + cqe_be = get_cq_entry(cq, seqno); + } + + dbg_mask = npolled ? 
SIF_PQP : SIF_IPOLL; + sif_log(sdev, dbg_mask, "processed %d (%d with waiters) requests - seqno 0x%x, ql %d", + npolled, atomic_read(&pqp->waiters), + seqno, ql); + + if (npolled > 0) { + /* reset timeout each time we see a new completion: */ + pqp->timeout = jiffies + sdev->min_resp_ticks * 4; + } + return npolled; +} + + +static int pqp_process_cqe(struct sif_pqp *pqp, struct sif_cqe *first_err) +{ + unsigned long flags; + int npolled; + struct sif_cq *cq = pqp->cq; + + /* If someone else holds the lock, the CQEs are handled */ + if (!spin_trylock_irqsave(&cq->lock, flags)) + return -EBUSY; + npolled = __pqp_process_cqe(pqp, first_err); + spin_unlock_irqrestore(&cq->lock, flags); + return npolled; +} + + +static struct sif_pqp *find_any_pqp(struct sif_dev *sdev) +{ + int cpu; + + for (cpu = 0; cpu < sdev->pqp_cnt; cpu++) + if (sdev->pqp[cpu]) + return sdev->pqp[cpu]; + return NULL; +} + +/* Get the right PQP for the same EQ*/ +struct sif_pqp *get_pqp_same_eq(struct sif_dev *sdev, int comp_vector) +{ + unsigned int pqp_index = comp_vector - 2; + struct sif_pqp *pqp = sdev->pqp_cnt ? sdev->pqp[pqp_index % sdev->pqp_cnt] : NULL; + + if (unlikely(!pqp)) { + /* Typically during take down */ + return find_any_pqp(sdev); + } + return pqp; +} + + +/* Get the right PQP for the current CPU */ +struct sif_pqp *get_pqp(struct sif_dev *sdev) +{ + unsigned int cpu = smp_processor_id(); + struct sif_pqp *pqp = sdev->pqp_cnt ? sdev->pqp[cpu % sdev->pqp_cnt] : NULL; + + if (unlikely(!pqp)) { + /* Typically during take down */ + return find_any_pqp(sdev); + } + return pqp; +} + +/* Get the next PQP in a round robin fashion */ +struct sif_pqp *get_next_pqp(struct sif_dev *sdev) +{ + struct sif_pqp *pqp; + int next = atomic_inc_return(&sdev->next_pqp) % sdev->pqp_cnt; + + pqp = sdev->pqp[next]; + if (unlikely(!pqp)) { + /* Typically during take down */ + return find_any_pqp(sdev); + } + return pqp; +} + +struct sif_cb *get_cb(struct sif_qp *qp) +{ + struct sif_dev *sdev = to_sdev(qp->ibqp.pd->device); + unsigned int cpu = smp_processor_id(); + return sdev->kernel_cb[qp->qosl][cpu % sdev->kernel_cb_cnt]; +} + + +inline bool pqp_req_gets_completion(struct sif_pqp *pqp, struct psif_wr *wr, enum post_mode mode) +{ + return mode == PM_WRITE || (wr->op != PSIF_WR_GENERATE_COMPLETION && wr->completion) || + wr->cq_desc_vlan_pri_union.cqd_id == pqp->cq->index; +} + +/* Fill in common parts and post a work request to the management QP for the current CPU + * If @cqe is non-null, a completion will be requested and the result put there in + * host order when it is found (by __pqp_process_cqe()) + */ +int sif_pqp_write_send(struct sif_pqp *pqp, struct psif_wr *wr, struct sif_cqe *cqe, + enum post_mode mode) +{ + struct sif_qp *qp = pqp->qp; + u32 qp_idx = qp->qp_idx; + struct sif_dev *sdev = to_sdev(pqp->qp->ibqp.device); + struct sif_pd *pd = sdev->pd; + struct sif_sq *sq = get_sif_sq(sdev, qp_idx); + struct psif_sq_entry *sqe; + struct sif_sq_hdl *wh; + unsigned long flags; + bool ring_doorbell; + int q_sz; + int ret = 0; + u16 head, sq_seq; + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp_idx); + unsigned long timeout = sdev->min_resp_ticks * 4; + u16 limit = pqp_req_gets_completion(pqp, wr, mode) ? sq->entries : pqp->lowpri_lim; + /* Per IBTA 11.4.1.1, error is only returned + * when the QP is in the RESET, INIT or RTR states. 
+ */
+	if (qp->last_set_state < IB_QPS_RTS)
+		return -EINVAL; /* The pqp is not ready */
+
+	pqp->timeout = jiffies + timeout;
+
+	wr->local_qp = qp_idx;
+	wr->tsu_qosl = qp->qosl;
+	wr->tsu_sl = qp->tsl;
+
+restart:
+	/* Make sure emptying the queue takes preference over filling it up: */
+	if (mode != PM_WRITE)
+		ret = pqp_process_cqe(pqp, NULL);
+	if (ret > 0 || ret == -EBUSY)
+		ret = 0; /* Got some reqs */
+	else if (ret < 0)
+		return ret;
+
+	spin_lock_irqsave(&sq->lock, flags);
+	sq_seq = sq_sw->last_seq;
+	head = sq_sw->head_seq;
+	q_sz = sq_length(sq, head, sq_seq);
+
+	if (q_sz >= limit) {
+		if (sq_seq != pqp->last_full_seq) {
+			sif_log(sdev, SIF_PQP,
+				"Privileged qp full - head %d sq_seq %d q_sz %d/%d",
+				head, sq_seq, q_sz, sq->entries);
+			pqp->last_full_seq = sq_seq;
+		}
+		spin_unlock_irqrestore(&sq->lock, flags);
+
+		if (limit < sq->entries && sq_seq != pqp->last_nc_full) {
+			/* Avoid spinning creating more sync completions
+			 * - block on next try unless sequence number has changed:
+			 */
+			pqp->last_nc_full = sq_seq;
+			return -EAGAIN;
+		}
+
+		/* PQP requests to a full queue should not be generated at interrupt level */
+		BUG_ON(in_interrupt());
+		if (time_is_after_jiffies(pqp->timeout)) {
+			if (sq_seq != pqp->last_full_seq)
+				sif_log(sdev, SIF_PQP, "priv.qp %d: spin waiting for slot in queue",
+					pqp->qp->qp_idx);
+			goto restart;
+		} else {
+			sif_log(sdev, SIF_INFO,
+				"Timeout waiting for previous response (seq %d) to complete",
+				sq_sw->head_seq);
+			return -ETIMEDOUT;
+		}
+	}
+	sq_seq = ++sq_sw->last_seq;
+
+	/* Store longest send queue observed */
+	if (unlikely(q_sz > sq->max_outstanding && mode != PM_WRITE))
+		sq->max_outstanding = q_sz;
+
+	/* For GENERATE_COMPLETION the CQ id to generate in is put here
+	 * and no completion is expected on the PQP.
+	 */
+	if (wr->op == PSIF_WR_GENERATE_COMPLETION) {
+		/* Are we generating a completion on our own QP? 
*/ + if (wr->details.su.u2.target_qp == pqp->qp->qp_idx) + wr->details.su.wc_id.sq_id.sq_seq_num = sq_seq; + } else + wr->cq_desc_vlan_pri_union.cqd_id = sq->cq_idx; + + wh = get_sq_hdl(sq, sq_seq); + wh->wr_id = (u64)cqe; + wh->sq_seq = sq_seq; + wh->used = true; + + if (cqe) { + if ((wr->op != PSIF_WR_GENERATE_COMPLETION) || (wr->se)) { + cqe->sq_seq = sq_seq; + wr->completion = 1; + } + BUG_ON(cqe->written); + } + + sqe = get_sq_entry(sq, sq_seq); + + sif_log(sdev, SIF_PQP, "pd %d cq_idx %d sq_idx %d sq.seqn %d op %s", + pd->idx, wr->cq_desc_vlan_pri_union.cqd_id, sq->index, sq_seq, + string_enum_psif_wr_type(wr->op)); + + if (likely(mode != PM_WRITE)) { + u64 csum; + + wr->sq_seq = sq_seq; + + /* Collect_length is always 0 for privileged wr's - they have no data */ + csum = csum32_partial(wr, sizeof(*wr), qp->magic); + csum = csum32_fold(csum); + wr->checksum = csum; + + sif_log(sdev, SIF_PQP, "PQP checksum %x", wr->checksum); + } + + sif_logs(SIF_DUMP, write_struct_psif_wr(NULL, 0, wr)); + + /* update send queue */ + copy_conv_to_hw(sqe, wr, sizeof(struct psif_wr)); + + if (likely(mode != PM_WRITE)) { + /* Flush writes before updating the sw pointer, + * This is necessary to ensure that the sqs do not see + * an incomplete entry: + */ + wmb(); + + /* Update sw pointer visible to hw */ + set_psif_sq_sw__tail_indx(&sq_sw->d, sq_seq); + + /* Finally write to collect buffer - implicit barriers before/after I/O writes + * + * Workaround #3595: ring doorbell if SQS in SQ-mode + */ + ring_doorbell = qp->flags & SIF_QPF_FORCE_SQ_MODE || + !(get_psif_sq_hw__sq_next(&sq->d) & 0x1) || + mode == PM_DOORBELL; + + if (ring_doorbell) + sif_doorbell_from_sqe(qp, sq_seq, true); + else if (sif_cb_write(qp, wr, sizeof(struct psif_wr))) { + /* vcb lock busy, use db mode instead */ + sif_doorbell_from_sqe(qp, sq_seq, true); + } + } + + spin_unlock_irqrestore(&sq->lock, flags); + return ret; +} + + +int sif_pqp_post_send(struct sif_dev *sdev, struct psif_wr *wr, struct sif_cqe *cqe) +{ + struct sif_pqp *pqp = cqe ? cqe->pqp : get_pqp(sdev); + enum post_mode mode = pqp->qp->flags & SIF_QPF_FORCE_SQ_MODE ? PM_DOORBELL : PM_CB; + + return sif_pqp_write_send(pqp, wr, cqe, mode); +} + +int sif_pqp_poll_wr(struct sif_dev *sdev, struct psif_wr *wr, struct sif_cqe *cqe) +{ + int ret = sif_pqp_post_send(sdev, wr, cqe); + + if (ret) { + sif_log(sdev, SIF_INFO, "PQP wr %d post failed on QP %d, CQ %d", + cqe->pqp->qp->qp_idx, cqe->pqp->cq->index, wr->sq_seq); + return ret; + } + + ret = poll_cq_waitfor(cqe); + if (ret < 0) + sif_log(sdev, SIF_INFO, "poll_cq_waitfor, pqp QP %d, CQ %d failed with %d", + cqe->pqp->qp->qp_idx, cqe->pqp->cq->index, ret); + return ret; +} + + +/* Poll and process incoming (internal) completions + * while waiting for this particular completion + */ +int poll_cq_waitfor(struct sif_cqe *lcqe) +{ + struct sif_pqp *pqp = lcqe->pqp; + struct sif_cq *cq = pqp->cq; + struct sif_dev *sdev = to_sdev(cq->ibcq.device); + int ret = 0; + volatile bool *written = &lcqe->written; + u64 min_resp_ticks = sdev->min_resp_ticks; + + /* TBD: This timeout is unsafe - we just keep it now to allow runs be aborted + * without having to reboot. 
Keep value for it a factor larger than other timeouts: + */ + pqp->timeout = jiffies + min_resp_ticks * 4; + + while (!(*written)) { + ret = pqp_process_cqe(pqp, NULL); + if (ret == -EBUSY) { + ret = 0; + continue; + } else if (ret < 0) + break; + else if (ret == 0) { + if (time_is_before_jiffies(pqp->timeout)) { + if (sif_feature(pcie_trigger)) + force_pcie_link_retrain(sdev); + sif_log(sdev, SIF_INFO, + "cq %d: poll for cqe %p timed out", cq->index, lcqe); + atomic_inc(&cq->timeout_cnt); + + sif_logs(SIF_PQPT, + struct sif_sq *sq = get_sif_sq(sdev, pqp->qp->qp_idx); + struct psif_sq_entry *sqe = + get_sq_entry(sq, lcqe->sq_seq); + write_struct_psif_sq_entry(NULL, 1, sqe)); + ret = -ETIMEDOUT; + break; + } + if (!in_interrupt()) /* TBD: Fix this as well */ + cond_resched(); + else + cpu_relax(); + + if (sdev->min_resp_ticks != min_resp_ticks) { + /* Give us a quick way out by changing min_resp_ticks */ + pqp->timeout -= (min_resp_ticks - sdev->min_resp_ticks) * 4; + min_resp_ticks = sdev->min_resp_ticks; + } + continue; + } + } + + if (ret < 0) + pqp_reset_cmpl(lcqe); + return ret; +} + + +/* Poll for any pqp completion, return the number of completions polled */ +static int poll_cq_waitfor_any(struct sif_pqp *pqp, struct sif_cqe *first_err) +{ + struct sif_cq *cq = pqp->cq; + struct sif_dev *sdev = to_sdev(cq->ibcq.device); + int ret = 0; + u64 min_resp_ticks = sdev->min_resp_ticks; + + pqp->timeout = jiffies + min_resp_ticks * 4; + + while (!ret) { + ret = pqp_process_cqe(pqp, first_err); + if (ret == -EBUSY) { + ret = 0; + continue; + } else if (ret < 0) + break; + else if (ret == 0) { + if (time_is_before_jiffies(pqp->timeout)) { + if (sif_feature(pcie_trigger)) + force_pcie_link_retrain(sdev); + sif_log(sdev, SIF_INFO, + "cq %d: poll timed out", cq->index); + atomic_inc(&cq->timeout_cnt); + ret = -ETIMEDOUT; + break; + } + if (!in_interrupt()) /* TBD: Fix this as well */ + cond_resched(); + else + cpu_relax(); + + if (sdev->min_resp_ticks != min_resp_ticks) { + /* Give us a quick way out by changing min_resp_ticks */ + pqp->timeout -= (min_resp_ticks - sdev->min_resp_ticks) * 4; + min_resp_ticks = sdev->min_resp_ticks; + } + } + } + sif_log(sdev, SIF_PQP, "ret = %d", ret); + return ret; +} + + +/***** Generic completion generation *****/ + +static int __gen_cqe(struct sif_dev *sdev, u32 target_cq, u64 wc_id, u32 target_qp, + enum psif_wc_opcode opcode, enum psif_wc_status status, struct sif_cqe *cqe, + bool event) +{ + struct psif_wr wr; + + memset(&wr, 0, sizeof(struct psif_wr)); + wr.op = PSIF_WR_GENERATE_COMPLETION; + wr.cq_desc_vlan_pri_union.cqd_id = target_cq; + wr.details.su.completion_status = status; + wr.details.su.completion_opcode = opcode; + + if (opcode >= PSIF_WC_OPCODE_RECEIVE_SEND) + wr.details.su.wc_id.rq_id = wc_id; + else + wr.details.su.wc_id.sq_id.sq_seq_num = wc_id; + + wr.details.su.u2.target_qp = target_qp; + /* set the IB_CQ_SOLICITED flag because the CQ might be armed + * and the consumer might be interested in getting these events. + * Setting IB_CQ_SOLICITED is generally safe because it is a + * subset of IB_CQ_NEXT_COMP. 
+ */ + if (event) + wr.se = 1; + + return sif_pqp_post_send(sdev, &wr, cqe); +} + + +/* Generate a SUCCESS completion on the PQP itself + * We use this to be able to wait for a set of generated completions to other + * CQs to have been completed: + */ +int gen_pqp_cqe(struct sif_cqe *cqe) +{ + struct sif_pqp *pqp = cqe->pqp; + struct sif_dev *sdev = to_sdev(pqp->cq->ibcq.device); + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, pqp->qp->qp_idx); + + if (cqe) + cqe->written = false; + + sif_log(sdev, SIF_PQP, " for sq %d, last_nc_full %d, head_seq %d last_seq %d", + pqp->qp->qp_idx, pqp->last_nc_full, sq_sw->head_seq, sq_sw->last_seq); + return __gen_cqe(sdev, pqp->cq->index, 0, pqp->qp->qp_idx, + PSIF_WC_OPCODE_GENERATE_COMPLETION, PSIF_WC_STATUS_SUCCESS, + cqe, true); +} + + +/* Post a request to generate a completion with the given values + * on the cq identified by @target_cq. + * This request generates no completion on the PQP itself: + */ +static int sif_gen_cqe(struct sif_dev *sdev, u32 target_cq, u64 wc_id, u32 target_qp, + enum psif_wc_opcode opcode, enum psif_wc_status status, bool event) +{ + return __gen_cqe(sdev, target_cq, wc_id, target_qp, opcode, status, NULL, event); +} + +/* Post a request to generate a completion for an outstanding rq entry + * on the given qp. This request generates no completion on the PQP itself: + */ + +static int sif_gen_rq_cqe(struct sif_dev *sdev, struct sif_rq *rq, u32 rq_seq, + struct sif_qp *target_qp, enum psif_wc_opcode opcode, + enum psif_wc_status status) +{ + struct psif_rq_entry *rqe = get_rq_entry(rq, rq_seq); + u64 wc_id = get_psif_rq_entry__rqe_id(rqe); + u32 cq_idx = get_psif_qp_core__rcv_cq_indx(&target_qp->d.state); + + sif_log(sdev, SIF_PQP, "on rq %d, rq_seq %d, wc_id %llx, cq %d (target_qp %d)", + rq->index, rq_seq, wc_id, cq_idx, target_qp->qp_idx); + + return sif_gen_cqe(sdev, cq_idx, wc_id, target_qp->qp_idx, opcode, status, true); +} + + +int sif_gen_rq_flush_cqe(struct sif_dev *sdev, struct sif_rq *rq, + u32 rq_seq, struct sif_qp *target_qp) +{ + return sif_gen_rq_cqe(sdev, rq, rq_seq, target_qp, + PSIF_WC_OPCODE_RECEIVE_SEND, PSIF_WC_STATUS_WR_FLUSH_ERR); +} + +/* Post a request to generate a completion for an outstanding sq entry + * on the given qp. This request generates no completion on the PQP itself: + */ + +static int sif_gen_sq_cqe(struct sif_dev *sdev, struct sif_sq *sq, u32 sq_seq, u32 target_qp, + enum psif_wc_opcode opcode, enum psif_wc_status status, bool event) +{ + struct psif_sq_entry *sqe = get_sq_entry(sq, sq_seq); + u64 wc_id = get_psif_wr__sq_seq(&sqe->wr); + + sif_log(sdev, SIF_PQP, "on sq %d, sq_seq %d, wc_id %llx, cq %d (target_qp %d)", + sq->index, sq_seq, wc_id, sq->cq_idx, target_qp); + + return sif_gen_cqe(sdev, sq->cq_idx, wc_id, target_qp, opcode, status, event); +} + + +int sif_gen_sq_flush_cqe(struct sif_dev *sdev, struct sif_sq *sq, + u32 sq_seq, u32 target_qp, bool event) +{ + return sif_gen_sq_cqe(sdev, sq, sq_seq, target_qp, + PSIF_WC_OPCODE_SEND, PSIF_WC_STATUS_WR_FLUSH_ERR, event); +} + + +/***** Stencil PQP support **** + * + * A stencil PQP is a PQP set up fully populated with WRs ready + * for parallel batch processing (using SQSes) of particularly performance + * critical PQP operations. 
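+ *
+ * From the consumer side the intended usage is roughly as follows (sketch
+ * only - see sif_alloc_ki_spqp/sif_inv_key_update_st in sif_pqp.h; the
+ * key_index value and the fallback path are placeholders):
+ *
+ *	struct sif_st_pqp *spqp = sif_alloc_ki_spqp(sdev);
+ *
+ *	if (!spqp) {
+ *		... fall back to a plain PQP invalidate request ...
+ *	} else {
+ *		ret = sif_inv_key_update_st(spqp, key_index, PCM_WAIT);
+ *		sif_release_ki_spqp(spqp);
+ *	}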
+ * + * The idea is to lay this out to allow the WRs to be reused with minimal + * updates: + */ + +struct sif_st_pqp *sif_create_inv_key_st_pqp(struct sif_dev *sdev) +{ + int i; + struct sif_st_pqp *spqp = (struct sif_st_pqp *)_sif_create_pqp(sdev, sizeof(*spqp), 0); + struct sif_pqp *pqp; + int qp_idx; + struct sif_sq *sq; + struct sif_sq_sw *sq_sw; + struct psif_sq_entry *sqe; + struct psif_wr lwr; + u16 max_db_int; + + if (IS_ERR(spqp)) + return spqp; + + pqp = &spqp->pqp; + qp_idx = pqp->qp->qp_idx; + sq = get_sif_sq(sdev, qp_idx); + sq_sw = get_sif_sq_sw(sdev, qp_idx); + max_db_int = (sq->entries >> 3); + + /* Pre-populate the SQ */ + for (i = 0; i < sq->entries; i++) + sif_write_invalidate(pqp, key, 0, NULL, PCM_POST, PM_WRITE); + + /* Now, to start using the stencil at seq.1 (as normal SQs) + * we must reset the sw tail pointer which + * was updated by sif_write_invalidate: + */ + sq_sw->last_seq = 0; + spqp->doorbell_seq = 1; + + spqp->doorbell_interval = min_t(u16, SPQP_DOORBELL_INTERVAL, max_db_int); + spqp->next_doorbell_seq = spqp->doorbell_interval + 1; + spqp->req_compl = 0; + spqp->next_poll_seq = (sq->entries >> 1); + spqp->sq = sq; + spqp->sq_sw = sq_sw; + spqp->pqp.qp->flags |= SIF_QPF_KI_STENCIL; + + /* Calculate a partial checksum + * - they are all the same since the fields we change + * are calculated with 0-values to ease checksum mod. later: + */ + sqe = get_sq_entry(sq, 0); + copy_conv_to_sw(&lwr, &sqe->wr, sizeof(lwr)); + spqp->checksum = csum32_partial(&lwr, sizeof(lwr), pqp->qp->magic); + + sif_log(sdev, SIF_PQPT, "done qp %d, sq sz %d, next_poll_seq %d", qp_idx, + sq->entries, spqp->next_poll_seq); + return spqp; +} + + +int sif_destroy_st_pqp(struct sif_dev *sdev, struct sif_st_pqp *spqp) +{ + return sif_destroy_pqp(sdev, &spqp->pqp); +} + + +/* Update a new invalidate key request into a preconfigured stencil pqp + * Assumes exclusive access to the PQP SQ. + */ +int sif_inv_key_update_st(struct sif_st_pqp *spqp, int index, enum wr_mode mode) +{ + struct sif_sq *sq = spqp->sq; + struct sif_sq_sw *sq_sw = spqp->sq_sw; + u16 sq_seq = ++sq_sw->last_seq; + struct psif_sq_entry *sqe = get_sq_entry(sq, sq_seq); + struct sif_dev *sdev = to_sdev(spqp->pqp.cq->ibcq.device); + bool poll_prev = false; + int ret = 1; + u64 csum_inc = (u64)index + (u64)sq_seq; + u64 csum; + int q_sz; + u16 head; + DECLARE_SIF_CQE_POLL(sdev, first_err); + + /* Modify the request to our need */ + set_psif_wr_su__key(&sqe->wr.details.su, index); + set_psif_wr__sq_seq(&sqe->wr, sq_seq); + + head = sq_sw->head_seq; + q_sz = sq_length(sq, head, sq_seq); + + if (unlikely(q_sz > (int)sq->entries)) { + sif_log(sdev, SIF_INFO, "Error: Stencil pqp (qp %d) is full at seq %d, head %d", + sq->index, sq_seq, sq_sw->head_seq); + sq_sw->last_seq--; + return -ENOMEM; + } + + /* Store longest send queue observed */ + if (unlikely(q_sz > sq->max_outstanding)) + sq->max_outstanding = q_sz; + + if (unlikely(mode == PCM_WAIT || sq_seq == spqp->next_poll_seq)) { + set_psif_wr__completion(&sqe->wr, 1); + spqp->req_compl++; + sif_log(sdev, SIF_PQPT, "sq %d: requesting completion for seq %d (%d)", + sq->index, sq_seq, spqp->req_compl); + poll_prev = spqp->req_compl > 1; + if (sq_seq == spqp->next_poll_seq) + spqp->next_poll_seq += (sq->entries >> 1); + csum_inc += 0x80000000; + } else { + /* Reset the completion bit in case it was set in the previous generation! 
*/ + set_psif_wr__completion(&sqe->wr, 0); + } + + /* Add the changes to the checksum */ + csum = csum32_partial(&csum_inc, 8, spqp->checksum); + csum = csum32_fold(csum); + set_psif_wr__checksum(&sqe->wr, csum); + + sif_log(sdev, SIF_PQP, "cq %d, sq %d, sq seq %d%s", spqp->pqp.cq->index, + sq->index, sq_seq, (poll_prev ? " (poll prev)" : "")); + + if (unlikely(mode == PCM_WAIT || sq_seq == spqp->next_doorbell_seq)) { + sif_log(sdev, SIF_PQPT, "sq %d: writing doorbell at seq %d - tail at %d%s", + sq->index, spqp->doorbell_seq, sq_seq, (mode == PCM_WAIT ? " [wait]" : "")); + wmb(); + set_psif_sq_sw__tail_indx(&sq_sw->d, sq_seq); + sif_doorbell_from_sqe(spqp->pqp.qp, spqp->doorbell_seq, true); + spqp->doorbell_seq = sq_seq + 1; + spqp->next_doorbell_seq = sq_seq + spqp->doorbell_interval + 1; + } + + if (poll_prev) { + sif_log(sdev, SIF_PQPT, "enter wait (poll_prev) (%d)", spqp->req_compl); + ret = poll_cq_waitfor_any(&spqp->pqp, &first_err); + if (ret < 0) + goto out; + if (unlikely(first_err.written)) { + sif_log(sdev, SIF_INFO, "error completion with status %s", + string_enum_psif_wc_status(first_err.cqe.status)); + goto out; + } + sif_log(sdev, SIF_PQPT, "polled %d completions", ret); + spqp->req_compl -= ret; + } + + if (unlikely(mode == PCM_WAIT)) { + while (sq_sw->head_seq != sq_seq) { + sif_log(sdev, SIF_PQPT, "enter wait (%d) seq %d/%d", + spqp->req_compl, sq_sw->head_seq, sq_seq); + ret = poll_cq_waitfor_any(&spqp->pqp, &first_err); + if (ret < 0) + break; + spqp->req_compl -= ret; + sif_log(sdev, SIF_PQPT, "done wait - head now %d - rem.cmpl %d", + sq_sw->head_seq, spqp->req_compl); + } + } + + if (ret == 0) + ret = -ENOMEM; + else if (ret > 0) + ret = 0; + +out: + sif_log(sdev, SIF_PQP, "done ret = %d", ret); + return ret; +} + + +/* get exclusive access to a stencil pqp */ +struct sif_st_pqp *sif_alloc_ki_spqp(struct sif_dev *sdev) +{ + int index; + struct sif_st_pqp *spqp = NULL; + + mutex_lock(&sdev->ki_spqp.lock); + index = find_next_zero_bit(sdev->ki_spqp.bitmap, sdev->ki_spqp.pool_sz, 0); + if (index < sdev->ki_spqp.pool_sz) { + set_bit(index, sdev->ki_spqp.bitmap); + spqp = sdev->ki_spqp.spqp[index]; + } + mutex_unlock(&sdev->ki_spqp.lock); + sif_log(sdev, SIF_PQPT, "bit index %d", index); + return spqp; +} + +void sif_release_ki_spqp(struct sif_st_pqp *spqp) +{ + struct sif_dev *sdev = to_sdev(spqp->pqp.cq->ibcq.device); + + mutex_lock(&sdev->ki_spqp.lock); + clear_bit(spqp->index, sdev->ki_spqp.bitmap); + mutex_unlock(&sdev->ki_spqp.lock); + sif_log(sdev, SIF_PQPT, "bit index %d", spqp->index); +} diff --git a/drivers/infiniband/hw/sif/sif_pqp.h b/drivers/infiniband/hw/sif/sif_pqp.h new file mode 100644 index 0000000000000..55bcd7ce38809 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_pqp.h @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_pqp.h: Privileged QP handling + */ + +#ifndef __SIF_PQP_H +#define __SIF_PQP_H + +struct sif_qp; +struct sif_cq; +struct sif_rq; +struct sif_sq; +struct completion; +enum post_mode; + +/* Data structure used by PQP requesters to get the completion information, + * and optionally block waiting for it to arrive: + */ +struct sif_cqe { + struct psif_cq_entry cqe; /* host order copy of hw cqe */ + struct completion cmpl; /* a completion to wait on for response */ + struct sif_pqp *pqp; /* Priv.qp to wait on */ + bool need_complete; /* cmpl is initialized and a waiter is present */ + bool written; /* Set to true when a completion has been copied here */ + u16 sq_seq; /* set by post_send to allow us to reset ourselves */ +}; + +/* + * Declare and initialize data structure to receive a poll completion + * cqe.status initialized tosomething != SUCCESS + */ +#define DECLARE_SIF_CQE_POLL(d_, c_)\ + struct sif_cqe c_ = { \ + .cqe.status = PSIF_WC_STATUS_FIELD_MAX,\ + .pqp = get_pqp(d_),\ + .need_complete = false,\ + .written = false,\ + } + +#define DECLARE_SIF_CQE_WITH_SAME_EQ(d_, c_, e_) \ + struct sif_cqe c_ = { \ + .cqe.status = PSIF_WC_STATUS_FIELD_MAX,\ + .pqp = get_pqp_same_eq(d_, e_), \ + .need_complete = false,\ + .written = false,\ + } + + +#define DECLARE_SIF_CQE_WAIT(d_, c_)\ + struct sif_cqe c_ = { \ + .cqe.status = PSIF_WC_STATUS_FIELD_MAX,\ + .pqp = get_pqp(d_),\ + .need_complete = true,\ + .written = false,\ + };\ + init_completion(&c_.cmpl) + +#define DECLARE_SIF_CQE_POLL_WITH_RR_PQP(d_, c_)\ + struct sif_cqe c_ = { \ + .cqe.status = PSIF_WC_STATUS_FIELD_MAX,\ + .pqp = get_next_pqp(d_),\ + .need_complete = false,\ + .written = false,\ + } + + +struct sif_pqp { + struct sif_qp *qp; /* The qp used */ + struct sif_cq *cq; /* Associated completion queue for this priv.QP */ + unsigned long timeout; /* rescheduled when new completions observed */ + struct completion nonfull; /* allow a poster to wait for a cred */ + atomic_t waiters; /* number of waiters on nonfull */ + u16 last_full_seq; /* For logging purposes, record when last observed full */ + u16 last_nc_full; /* Track when to return EAGAIN to flush non-compl.entries */ + u16 lowpri_lim; /* Max number of outstanding low priority reqs */ +}; + +struct sif_pqp *sif_create_pqp(struct sif_dev *sdev, int comp_vector); +int sif_destroy_pqp(struct sif_dev *sdev, struct sif_pqp *pqp); + +/* Get the right PQP for the current CPU */ +struct sif_pqp *get_pqp(struct sif_dev *sdev); + +/* Get the right PQP with the same EQ */ +struct sif_pqp *get_pqp_same_eq(struct sif_dev *sdev, int comp_vector); + +/* Get the next PQP in round robin fashion */ +struct sif_pqp *get_next_pqp(struct sif_dev *sdev); + +/* Get the right CB for the current CPU for the given QP */ +struct sif_cb *get_cb(struct sif_qp *qp); + +static inline struct sif_cq *pqp_cq(struct sif_dev *sdev) +{ + return (get_pqp(sdev))->cq; +} + +static inline struct sif_qp *pqp_qp(struct sif_dev *sdev) +{ + return (get_pqp(sdev))->qp; +} + +/* Fill in common parts and post a work request to the management QP for the current CPU + * If @cqe is non-null, a completion will be requested and eventually reflected to @cqe + * in host order. 
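+ *
+ * Minimal usage sketch (for illustration only - this is essentially what
+ * sif_pqp_poll_wr() below does; the wr setup is elided):
+ *
+ *	struct psif_wr wr;
+ *	DECLARE_SIF_CQE_POLL(sdev, lcqe);
+ *
+ *	memset(&wr, 0, sizeof(wr));
+ *	... fill in wr ...
+ *	ret = sif_pqp_post_send(sdev, &wr, &lcqe);
+ *	if (!ret)
+ *		ret = poll_cq_waitfor(&lcqe);
+ *
+ * (on success lcqe.written is true and lcqe.cqe holds the host order CQE)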
+ */ +int sif_pqp_post_send(struct sif_dev *sdev, struct psif_wr *wr, struct sif_cqe *cqe); + +/* Same as post send but allow post_mode - sif_pqp_post_send uses PM_CB */ +int sif_pqp_write_send(struct sif_pqp *pqp, struct psif_wr *wr, struct sif_cqe *cqe, + enum post_mode mode); + + +/* Poll and process incoming (internal) completions + * while waiting for this particular completion + */ +int poll_cq_waitfor(struct sif_cqe *lcqe); + +int sif_pqp_poll_wr(struct sif_dev *sdev, struct psif_wr *wr, struct sif_cqe *cqe); + + + +/* Generate a SUCCESS completion on the PQP itself + * We use this to be able to wait for a set of generated completions to other + * CQs to have been completed: + */ +int gen_pqp_cqe(struct sif_cqe *cqe); + +/* Post a request to generate a flushed-in-error completion for an outstanding rq entry + * on the given qp. This request generates no completion on the PQP itself: + */ +int sif_gen_rq_flush_cqe(struct sif_dev *sdev, struct sif_rq *rq, + u32 rq_seq, struct sif_qp *target_qp); + +/* Post a request to generate a flushed-in-error completion for an outstanding sq entry + * on the given qp. This request generates no completion on the PQP itself: + */ +int sif_gen_sq_flush_cqe(struct sif_dev *sdev, struct sif_sq *sq, + u32 sq_seq, u32 target_qp, bool notify_ev); + +/* Stencil PQP support - pre-populated PQPs for special performance sensitive use cases */ + +#define SPQP_DOORBELL_INTERVAL 8192 + +struct sif_st_pqp { + struct sif_pqp pqp; /* The PQP to use - must be first */ + struct sif_sq *sq; /* Short path to sq */ + struct sif_sq_sw *sq_sw;/* Short path to sq_sw */ + int index; /* The index of this st_pqp within it's pool */ + u16 doorbell_interval; /* Interval between each doorbell write */ + u16 doorbell_seq; /* Seq.no to use in next doorbell */ + u16 next_doorbell_seq; /* Next seqno to ring doorbell */ + u16 req_compl; /* Number of completions requested */ + u16 next_poll_seq; /* Next seqno to set completion and wait/poll for one */ + u64 checksum; /* Host endian partial checksum of stencil WR entries */ +}; + + +/* Stencil PQP management */ +struct sif_spqp_pool { + struct mutex lock; /* Protects access to this pool */ + struct sif_st_pqp **spqp; /* Key invalidate stencil PQPs */ + u32 pool_sz; /* Number of stencil PQPs set up */ + ulong *bitmap; /* Bitmap for allocation from spqp */ +}; + + +struct sif_st_pqp *sif_create_inv_key_st_pqp(struct sif_dev *sdev); + +/* get exclusive access to a stencil pqp */ +struct sif_st_pqp *sif_alloc_ki_spqp(struct sif_dev *sdev); +void sif_release_ki_spqp(struct sif_st_pqp *spqp); + +/* Update a new invalidate key request into a preconfigured stencil pqp + * Assumes exclusive access to the PQP SQ. + */ +int sif_inv_key_update_st(struct sif_st_pqp *spqp, int index, enum wr_mode mode); + + +int sif_destroy_st_pqp(struct sif_dev *sdev, struct sif_st_pqp *spqp); + +#endif
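
A usage note on gen_pqp_cqe(): as its comment above says, it is used to wait
for a set of generated completions to other CQs to have been completed. A
sketch of that pattern follows; it is illustrative only (sq, first_seq, n and
target_qp are placeholders, error handling is omitted):

	DECLARE_SIF_CQE_POLL(sdev, lcqe);
	int i, ret;

	for (i = 0; i < n; i++)
		sif_gen_sq_flush_cqe(sdev, sq, first_seq + i, target_qp, false);

	/* A completion on the PQP itself indicates that the flush requests
	 * above have been processed:
	 */
	ret = gen_pqp_cqe(&lcqe);
	if (!ret)
		ret = poll_cq_waitfor(&lcqe);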