From: Knut Omang
Date: Wed, 25 May 2016 09:01:10 +0000 (+0200)
Subject: sif driver initial commit part 1
X-Git-Tag: v4.1.12-92~148^2~11
X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=ed2b400dc4f85d922f31c25b66dff750b9620fd2;p=users%2Fjedix%2Flinux-maple.git

sif driver initial commit part 1

sif_ah.c: Implementation of IB address handles for SIF
sif_ah.h: Interface to internal IB address handle logic for SIF
sif_base.c: Basic hardware setup of SIF
sif_base.h: Basic hardware setup of SIF
sif_checksum.c: Utilities for SIF specific 32 bit checksums
sif_checksum.h: Utilities for SIF specific 32 bit checksums
sif_cq.c: Implementation of completion queue logic for SIF
sif_cq.h: Internal interface to psif completion queue logic
sif_debug.c: Use of debugfs for dumping internal data structure info
sif_debug.h: Use of debugfs for dumping internal data structure info
sif_defs.c: IB-to-SIF Mapper.
sif_defs.h: Div. utility definitions and auxiliary data structures
sif_dev.h: Driver specific data structure definitions
sif_dma.c: DMA memory mapping
sif_dma.h: DMA memory mapping
sif_drvapi.h: Device specific operations available via the FWA access path
sif_elog.c: Log over PCIe support for firmware
sif_elog.h: Misc device for capturing log from the EPSC
sif_enl.h: Protocol definitions for the netlink protocol for EPSC access from
sif_epsc.c: Implementation of API for communication with the EPSC
sif_epsc.h: API for communication with the EPSC (and EPS-A's)
sif_eq.c: Setup of event queues and interrupt handling
sif_eq.h: Event queues and interrupt handling
sif_fmr.c: Implementation of fast memory registration for SIF
sif_fmr.h: Interface to internal IB Fast Memory Registration (FMR)

Credits:
The sif driver supports Oracle’s new Dual Port EDR and QDR IB Adapters
and the integrated IB devices on the new SPARC SoC.
The driver is placed under drivers/infiniband/hw/sif

This patch set is the result of direct or indirect contribution by
several people:

Code contributors:
Knut Omang, Vinay Shaw, Haakon Bugge, Wei Lin Guay, Lars Paul Huse,
Francisco Trivino-Garcia.

Minor patch/bug fix contributors:
Hans Westgaard Ry, Jesus Escudero, Robert Schmidt, Dag Moxnes,
Andre Wuttke, Predrag Hodoba, Roy Arntsen

Initial architecture adaptations:
Khalid Aziz (sparc64), Gerd Rausch (arm64)

Testing, Test development, Continuous integration, Bug haunting, Code review:
Knut Omang, Hakon Bugge, Åsmund Østvold, Francisco Trivino-Garcia,
Wei Lin Guay, Vinay Shaw, Hans Westgaard Ry,
+ numerous other people within Oracle.

Simulator development:
Andrew Manison, Hans Westgaard Ry, Knut Omang, Vinay Shaw

Orabug: 22529577

Reviewed-by: Hakon Bugge
Signed-off-by: Knut Omang
---

diff --git a/drivers/infiniband/hw/sif/Kconfig b/drivers/infiniband/hw/sif/Kconfig
new file mode 100644
index 0000000000000..6cea193a78085
--- /dev/null
+++ b/drivers/infiniband/hw/sif/Kconfig
@@ -0,0 +1,5 @@
+config INFINIBAND_SIF
+	tristate "Oracle Infiniband HCA support"
+	depends on PCI && 64BIT && HAS_DMA
+	---help---
+	  Low level driver for Oracle's family of Infiniband HCAs
diff --git a/drivers/infiniband/hw/sif/sif_ah.c b/drivers/infiniband/hw/sif/sif_ah.c
new file mode 100644
index 0000000000000..6d76057b43c9e
--- /dev/null
+++ b/drivers/infiniband/hw/sif/sif_ah.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_ah.c: Implementation of IB address handles for SIF + */ + +#include +#include +#include "sif_dev.h" +#include "psif_hw_data.h" +#include "sif_defs.h" +#include "sif_base.h" +#include "sif_ah.h" + + +struct ib_ah *sif_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *ah_attr, + struct ib_udata *udata) +{ + struct sif_ah *ah; + struct sif_dev *sdev = to_sdev(ibpd->device); + struct sif_pd *pd = to_spd(ibpd); + struct ib_ah *ret; + + volatile struct psif_ah *ah_p; + struct psif_ah lah; + int index; + + sif_log(sdev, SIF_AH, "for pd %d", pd->idx); + + index = sif_alloc_ah_idx(sdev); + if (index < 0) { + ret = ERR_PTR(-ENOMEM); + goto err_create_ah; + } + ah = get_sif_ah(sdev, index); + memset(ah, 0, sizeof(struct sif_ah)); + ah->index = index; + ah_p = &ah->d; + + /* TBD: Many attrs should come from device cap-limits and + * as provided by user + */ + + /* Update hw */ + memset(&lah, 0, sizeof(lah)); + lah.sl = ah_attr->sl; + lah.port = ah_attr->port_num - 1; + lah.pd = pd->idx; + lah.remote_lid = ah_attr->dlid; + lah.local_lid_path = ah_attr->src_path_bits; + lah.ipd = ah_attr->static_rate; /* TBD: Encoding + is this right? */ + lah.loopback = + (sdev->port[lah.port].lid | lah.local_lid_path) == ah_attr->dlid ? + LOOPBACK : NO_LOOPBACK; + + + if (ah_attr->ah_flags & IB_AH_GRH) { + lah.use_grh = USE_GRH; + /* We need to byte swap these an extra time as we are receiving + * them in big endian format, and they are subject to copy/convert as well: + */ + lah.grh_remote_gid_0 = cpu_to_be64(ah_attr->grh.dgid.global.subnet_prefix); + lah.grh_remote_gid_1 = cpu_to_be64(ah_attr->grh.dgid.global.interface_id); + lah.grh_flowlabel = ah_attr->grh.flow_label; + lah.grh_hoplmt = ah_attr->grh.hop_limit; + /* TBD: ah_attr->grh.sgid_index? */ + + sif_log(sdev, SIF_AH, " - with grh dgid %llx.%llx", + lah.grh_remote_gid_0, + lah.grh_remote_gid_1); + } + + copy_conv_to_hw(ah_p, &lah, sizeof(lah)); + + sif_log(sdev, SIF_AH, "ah %d - remote_lid 0x%x src_path_bits 0x%x sl %d, %s", + ah->index, lah.remote_lid, lah.local_lid_path, lah.sl, + (lah.loopback ? 
"(loopback)" : "")); + sif_logs(SIF_DUMP, write_struct_psif_ah(NULL, 0, &lah)); + + + if (udata) { + struct sif_create_ah_resp_ext resp; + int ret; + + memset(&resp, 0, sizeof(resp)); + resp.index = ah->index; + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) { + sif_destroy_ah(&ah->ibah); + return ERR_PTR(ret); + } + } + return &ah->ibah; +err_create_ah: + return ret; +} + +int sif_destroy_ah(struct ib_ah *ibah) +{ + struct sif_ah *ah = to_sah(ibah); + struct sif_dev *sdev = to_sdev(ibah->device); + int index = ah->index; + + sif_logi(ibah->device, SIF_AH, "index 0x%x", index); + + sif_clear_ah(sdev, index); + sif_free_ah_idx(sdev, index); + + return 0; +} + +int sif_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + sif_logi(ibah->device, SIF_AH, "Not implemented"); + return -EOPNOTSUPP; +} + +int sif_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + + struct sif_ah *ah = to_sah(ibah); + struct psif_ah lah; + + ah_attr->ah_flags = 0; + copy_conv_to_sw(&lah, &ah->d, sizeof(lah)); + ah_attr->sl = lah.sl; + ah_attr->port_num = lah.port + 1; + ah_attr->static_rate = lah.ipd; + ah_attr->dlid = lah.remote_lid; + + if (lah.use_grh == USE_GRH) { + ah_attr->ah_flags |= IB_AH_GRH; + ah_attr->grh.dgid.global.subnet_prefix = lah.grh_remote_gid_0; + ah_attr->grh.dgid.global.interface_id = lah.grh_remote_gid_1; + ah_attr->grh.flow_label = lah.grh_flowlabel; + ah_attr->grh.hop_limit = lah.grh_hoplmt; + } + + sif_logi(ibah->device, SIF_AH, "ah %d - remote_lid 0x%x src_path_bits 0x%x %s", + ah->index, lah.remote_lid, lah.local_lid_path, + (lah.loopback ? "(loopback)" : "")); + return 0; +} + + +void sif_dfs_print_ah(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + if (unlikely(pos < 0)) + seq_puts(s, "# Index Port PD Rem.lid\n"); + else { + struct psif_ah *ah_p = get_ah(sdev, pos); + struct psif_ah lah; + + copy_conv_to_sw(&lah, ah_p, sizeof(struct psif_ah)); + seq_printf(s, "%7lld %5d %5d %7d\n", + pos, lah.port + 1, lah.pd, lah.remote_lid); + } +} diff --git a/drivers/infiniband/hw/sif/sif_ah.h b/drivers/infiniband/hw/sif/sif_ah.h new file mode 100644 index 0000000000000..8ccbdf80ce543 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_ah.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_ah.h: Interface to internal IB address handle logic for SIF + */ + +#ifndef __SIF_AH_H +#define __SIF_AH_H + +struct sif_ah { + volatile struct psif_ah d; + struct ib_ah ibah; + int index; +}; + +static inline struct sif_ah *to_sah(struct ib_ah *ibah) +{ + return container_of(ibah, struct sif_ah, ibah); +} + +struct ib_ah *sif_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *ah_attr, + struct ib_udata *udata); +int sif_destroy_ah(struct ib_ah *ibah); +int sif_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); +int sif_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); + +struct seq_file; +struct sif_dev; + +/* Line printer for debugfs file */ +void sif_dfs_print_ah(struct seq_file *s, struct sif_dev *sdev, loff_t pos); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_base.c b/drivers/infiniband/hw/sif/sif_base.c new file mode 100644 index 0000000000000..a911d2991c5ee --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_base.c @@ -0,0 +1,1163 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_base.c: Basic hardware setup of SIF + */ +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include "sif_base.h" +#include "sif_hwi.h" +#include "sif_mmu.h" +#include "sif_dma.h" +#include "psif_hw_csr.h" +#include "sif_epsc.h" +#include "sif_query.h" +#include "sif_defs.h" + +/* Pretty printers for debugfs defined here: */ +#include "sif_qp.h" +#include "sif_sq.h" +#include "sif_ah.h" +#include "sif_mr.h" +#include "sif_eq.h" +#include "sif_cq.h" + +static int sif_init_bitmap(struct sif_table *table); +static void sif_free_bitmap(struct sif_table *table); + +#define psif_xrq_sw psif_rq_sw + +/* fallback cases for special entries below */ +static uint dummy_bw_cb_size = 16383; +static uint dummy_lat_cb_size = 1; + +/* Macro for generating parameter values for queues + * They are all read only after driver load + */ + +#define add_qsz_parameter(type, hwtype, initsize) \ +uint sif_##type##_size = initsize;\ +module_param_named(type##_size, sif_##type##_size, uint, S_IRUGO);\ +MODULE_PARM_DESC(type##_size, "Size of the " #type " descriptor table") + + +/* These are the queue size parameters we support + * e.g. for instance qp_size=2048 or ah_size=100 + * (all sizes will be rounded up to a power of two value) + */ +add_qsz_parameter(mr, key, 524288); +add_qsz_parameter(epsc, epsc_csr_req, 2048); +add_qsz_parameter(qp, qp, 131072); +add_qsz_parameter(rq, rq_hw, 131072); +add_qsz_parameter(cq, cq_hw, 131072); +add_qsz_parameter(ah, ah, 524288); +add_qsz_parameter(sq_ring, sq_ring, 262144); +add_qsz_parameter(sq_tvl, sq_tvl, 128); + +/* These sizes must be equal to QP size */ +#define sif_sq_rspq_size sif_qp_size +#define sif_rqsp_size sif_qp_size +#define sif_atsp_size sif_qp_size + +/* These can be set from the command line - no parameter needed */ +static uint sif_epsa0_size = 64; +static uint sif_epsa1_size = 64; +static uint sif_epsa2_size = 64; +static uint sif_epsa3_size = 64; + +/* This defines how small the smallest (sw) pointers can get. + * If set to <= 8, 512 sw descriptors will fit in one page. 
+ * This gives the smallest amount of internal overhead in each software descriptor + * but will yield a much larger block size which will require a larger amount of + * entries from both software and hardware descriptors to be reserved for each + * protection domain: + */ +uint sif_min_extent = 128; +module_param_named(min_extent, sif_min_extent, uint, S_IRUGO); +MODULE_PARM_DESC(min_extent, "The smallest entry size to use for descriptors"); + +/* These vars defines a minimal value for the number of extra eq entries + * to allocate. The driver will only update the EQ_SW_INDEX pointer + * when necessary. Necessary is defined by the absolute requirement that + * there must at any time be enough space in the event queue to store all possible + * sets of events occuring simultaenously. During setup, the driver will allocate + * enough entries to have at least @epsc_eq_headroom extra entries such that EQ_SW_INDEX + * need not be updated more often than for every @epsc_eq_headroom event: + */ +uint sif_epsc_eq_headroom = 64; +module_param_named(epsc_eq_headroom, sif_epsc_eq_headroom, uint, S_IRUGO); +MODULE_PARM_DESC(epsc_eq_headroom, "Minimal amount of extra headroom in the EPSC event queue"); + +uint sif_tsu_eq_headroom = 64; +module_param_named(tsu_eq_headroom, sif_tsu_eq_headroom, uint, S_IRUGO); +MODULE_PARM_DESC(tsu_eq_headroom, "Minimal amount of extra headroom in TSU event queue 0"); + + +/* sif_table_layout is a static struct used to organize + * base pointer size/layout data in a way that allows + * them to be configured by iteration: + */ + +struct sif_table_layout { + off_t off; /* Off. to corr. psif_base_addr within psif_csr */ + const char *name; /* Corresponding to enum name */ + const char *desc; /* Textual table desc (for logging) */ + uint *e_cnt_ref; /* Driver parameter ref for no.of entries to allocate */ + u32 entry_sz; /* Real size of entries in this table */ + u32 ext; /* Actual extent of (stride between) entries in this table */ + sif_dfs_printer dfs_printer; /* entry printing in debugfs */ + enum sif_tab_type xref; /* -1: No xref, else xref bitmap (read only) */ + bool wr_access; /* Whether or not PSIF should have write access */ + bool drv_ref; /* Keep track of driver structs via separate pointer array */ +}; + +/* Composition of static entries into the base_layout table below: + * + * This setup defines the memory layout of descriptors and inlined + * driver data structures. 
+ * + * add_layout : base layout of descriptors with no inlined struct and no debugfs print + * - a version: Include separate array of pointers to driver struct + * add_x_layout: layout with alternative type to define extent (inlined driver struct) + * - p version: provide a printer function for debugfs + * - d version: default naming of printer function + * - r version: "cross reference" the bitmap of another map - no separate allocation + */ + +#define add_xpr_layout(type, ec, _desc, _e_type, _dfs_printer, _xref, _wr_acc, _drv_ref) { \ + .off = offsetof(struct psif_csr_be, base_addr_##type),\ + .name = #type,\ + .desc = _desc,\ + .e_cnt_ref = &sif_##ec##_size,\ + .entry_sz = sizeof(struct _e_type),\ + .ext = roundup_pow_of_two(sizeof(struct _e_type)),\ + .dfs_printer = _dfs_printer,\ + .xref = _xref, \ + .wr_access = _wr_acc, \ + .drv_ref = _drv_ref, \ +} + +#define add_xp_layout(type, ec, _desc, _e_type, _dfs_printer, _wr_acc) \ + add_xpr_layout(type, ec, _desc, _e_type, _dfs_printer, -1, _wr_acc, false) + +#define add_x_layout(type, ec, _desc, _e_type, _wr_acc) \ + add_xp_layout(type, ec, _desc, _e_type, NULL, _wr_acc) + +#define add_xd_layout(type, ec, _desc, _e_type, _wr_acc) \ + add_xp_layout(type, ec, _desc, _e_type, sif_dfs_print_##type, _wr_acc) + +#define add_xdr_layout(type, ec, _desc, _e_type, _xref, _wr_acc) \ + add_xpr_layout(type, ec, _desc, _e_type, sif_dfs_print_##type, _xref, _wr_acc, false) + +#define add_layout(type, ec, _desc, _wr_acc) \ + add_x_layout(type, ec, _desc, psif_##type, _wr_acc) + +#define add_a_layout(type, ec, _desc, _wr_acc) \ + add_xpr_layout(type, ec, _desc, psif_##type, sif_dfs_print_##type, -1, _wr_acc, true) + +#define add_r_layout(type, ec, _desc, _xref, _wr_acc) \ + add_xpr_layout(type, ec, _desc, sif_##type, NULL, _xref, _wr_acc, false) + +#define add_d_layout(type, ec, _desc, _wr_acc) \ + add_xp_layout(type, ec, _desc, psif_##type, sif_dfs_print_##type, _wr_acc) + +/* For use with eps req */ +#define add_e_req_layout(type, _suff) { \ + .off = 0, \ + .name = #type "_csr_req", \ + .desc = "EPS" #_suff " Request queue", \ + .e_cnt_ref = &sif_##type##_size, \ + .entry_sz = sizeof(struct psif_epsc_csr_req),\ + .ext = roundup_pow_of_two(sizeof(struct psif_epsc_csr_req)), \ + .dfs_printer = sif_dfs_print_##type, \ + .xref = -1, \ + .wr_access = false, \ + .drv_ref = false, \ +} + +/* For use with eps rsp */ +#define add_e_rsp_layout(type, _suff) { \ + .off = 0, \ + .name = #type "_csr_rsp", \ + .desc = "EPS" #_suff " Response queue", \ + .e_cnt_ref = &sif_##type##_size, \ + .entry_sz = sizeof(struct psif_epsc_csr_rsp),\ + .ext = roundup_pow_of_two(sizeof(struct psif_epsc_csr_rsp)), \ + .dfs_printer = NULL, \ + .xref = type##_csr_rsp, \ + .wr_access = true,\ + .drv_ref = false,\ +} + + +/* This array is indexed by the sif_tab_type enum + * NB! 
If you change anything here (including order) + * remember to update + * - enum sif_tab_type in sif_dev.h + * - define_funcs call list in sif_base.h + */ + +static struct sif_table_layout base_layout[] = { + add_e_req_layout(epsc, C), + add_e_rsp_layout(epsc, C), + add_a_layout(key, mr, "Key validation", false), + add_xd_layout(qp, qp, "QP descriptor", sif_qp, true), + add_layout(rqsp, rqsp, "RQ scratch pad", true), + add_layout(atsp, atsp, "Atomic replay data", true), + add_xd_layout(ah, ah, "Address handle", sif_ah, false), + add_xd_layout(cq_hw, cq, "Compl.desc (hw)", sif_cq, true), + add_r_layout(cq_sw, cq, "Compl.desc (sw)", cq_hw, false), + add_xd_layout(rq_hw, rq, "Recv.queue (hw)", sif_rq, true), + add_r_layout(rq_sw, rq, "Recv.queue (sw)", rq_hw, false), + add_xdr_layout(sq_hw, qp, "Send queue (hw)", sif_sq, qp, true), + add_r_layout(sq_sw, qp, "Send queue (sw)", qp, false), + { + /* Special handling of the completion block's + * special send queue address map - see #944 + */ + .off = offsetof(struct psif_csr_be, base_addr_sq_cmpl), + .name = "sq_cmpl", + .desc = "cq: SQ addr.map", + .e_cnt_ref = &sif_qp_size, + .entry_sz = 0, /* Calculated later */ + .ext = 0, /* Calculated later */ + .dfs_printer = sif_dfs_print_sq_cmpl, + .xref = qp, /* Reference QP to have flat setup (used by dfs only) */ + .wr_access = false, + .drv_ref = false, + }, + add_layout(sq_ring, sq_ring, "SQS Ring buffer", true), + add_layout(sq_tvl, sq_tvl, "SQS Resp.queue TVL", true), + add_layout(sq_rspq, sq_rspq, "SQS Resp.queue", true), + { + /* Special handling of collect buffer entries */ + .off = 0, + .name = "bw_cb", + .desc = "High bandwith collect buffer", + .e_cnt_ref = &dummy_bw_cb_size, + .entry_sz = sizeof(struct psif_cb), + .ext = 4096, + .dfs_printer = NULL, + .xref = -1, + .wr_access = false, + .drv_ref = false, + }, + { + /* Special handling of collect buffer entries */ + .off = 0, + .name = "lat_cb", + .desc = "Low latency collect buffer", + .e_cnt_ref = &dummy_lat_cb_size, + .entry_sz = sizeof(struct psif_cb), + .ext = 4096, + .dfs_printer = NULL, + .xref = -1, + .wr_access = false, + .drv_ref = false, + }, + add_e_req_layout(epsa0, A-0), + add_e_rsp_layout(epsa0, A-0), + add_e_req_layout(epsa1, A-1), + add_e_rsp_layout(epsa1, A-1), + add_e_req_layout(epsa2, A-2), + add_e_rsp_layout(epsa2, A-2), + add_e_req_layout(epsa3, A-3), + add_e_rsp_layout(epsa3, A-3) +}; + + +const char *sif_table_name(enum sif_tab_type type) +{ + return base_layout[type].name; +} + + +static bool is_eps_req(enum sif_tab_type type) +{ + switch (type) { + case epsc_csr_req: + case epsa0_csr_req: + case epsa1_csr_req: + case epsa2_csr_req: + case epsa3_csr_req: + return true; + default: + break; + } + return false; +} + + +static bool is_eps_rsp(enum sif_tab_type type) +{ + switch (type) { + case epsc_csr_rsp: + case epsa0_csr_rsp: + case epsa1_csr_rsp: + case epsa2_csr_rsp: + case epsa3_csr_rsp: + return true; + default: + break; + } + return false; +} + + +sif_dfs_printer sif_table_dfs_printer(enum sif_tab_type type) +{ + /* At this point we have one common implementation: */ + return base_layout[type].dfs_printer; +} + + +static enum sif_tab_type get_sw_type(enum sif_tab_type type) +{ + switch (type) { + case cq_hw: + return cq_sw; + case rq_hw: + return rq_sw; + case qp: + case sq_hw: + return sq_sw; + default: + break; + } + return (enum sif_tab_type)0; +} + +static enum sif_tab_type get_hw_type(enum sif_tab_type type) +{ + switch (type) { + case cq_sw: + return cq_hw; + case rq_sw: + return rq_hw; + case sq_sw: + 
return sq_hw; + default: + break; + } + return (enum sif_tab_type)0; +} + +static bool is_sw_type(enum sif_tab_type type) +{ + switch (type) { + case cq_sw: + case rq_sw: + case sq_sw: + return true; + default: + break; + } + return false; +} + + +/* The user mapped types we need to adjust extent for + * based on min_extent + * qp is exempt from this list as it is not mapped to + * user space although part of two-level alloc: + */ +static bool is_user_mapped_type(enum sif_tab_type type) +{ + switch (type) { + case cq_sw: + case rq_sw: + case sq_sw: + case cq_hw: + case rq_hw: + case sq_hw: + return true; + default: + break; + } + return false; +} + + +static int init_blocks(struct sif_dev *sdev, enum sif_tab_type type) +{ + struct sif_table *tp = &sdev->ba[type]; + enum sif_tab_type sw_type; + size_t sw_eb; /* sw type's required minimal entries per block */ + + if (is_sw_type(type)) { + /* Pick up block settings from the hw type which has already been initialized */ + enum sif_tab_type hw_type = get_hw_type(type); + struct sif_table *tph = &sdev->ba[hw_type]; + + tp->entry_per_block = tph->entry_per_block; + tp->block_ext = tph->block_ext; + tp->block_cnt = tph->block_cnt; + tp->block = tph->block; + return 0; + } + + sw_type = get_sw_type(type); + /* Only the tables with a software type requires 2-level alloc */ + if (sw_type) + sw_eb = PAGE_SIZE / base_layout[sw_type].ext; + else + return 0; + + if (type == qp) { + /* Only relate to sq_hw and sq_sw + * (which hasn't been setup yet) for block size calc + */ + tp->entry_per_block = max(sw_eb, PAGE_SIZE / base_layout[sq_hw].ext); + } else { + /* blocks must fill a page of the smallest of the sw and hw pointer */ + tp->entry_per_block = max(sw_eb, PAGE_SIZE / tp->ext_sz); + } + tp->block_cnt = tp->entry_cnt / tp->entry_per_block; + + if (tp->entry_per_block > 1) { + /* Allocate an 8 byte aligned/end aligned room for the local bitmap + * right after the block struct: + */ + int bitmap_bytes = (((tp->entry_per_block + 7) >> 3) + 7) & ~7; + + sif_log(sdev, SIF_INFO, + "%s uses two-level alloc: entry_per_block %d, block_cnt %d bitmap_bytes %d", + sif_table_name(type), tp->entry_per_block, tp->block_cnt, + bitmap_bytes); + + tp->block_ext = sizeof(struct sif_table_block) + bitmap_bytes; + + if (unlikely(type == sq_hw)) /* Uses QP bitmap */ + tp->block = sdev->ba[qp].block; + else { + /* Zero-initialize the block struct - real initialize + * upon first allocation + */ + tp->block = kzalloc(tp->block_ext * tp->block_cnt, GFP_KERNEL); + } + if (!tp->block) + return -ENOMEM; + } + + if (tp->alloc_rr) { + size_t i; + /* Make sure we start at index 0 for readability + reserve QP 0 */ + for (i = 0; i < tp->block_cnt; i++) { + struct sif_table_block *b = sif_get_block(tp, i); + + b->last_used = tp->entry_per_block - 1; + } + } + return 0; +} + + +static void deinit_blocks(struct sif_dev *sdev, enum sif_tab_type type) +{ + struct sif_table *tp = &sdev->ba[type]; + + if (tp->block) { + /* SQ uses QP bitmap and sw types refs the corresponding hw type */ + if (likely(type != sq_hw && !is_sw_type(type))) + kfree(tp->block); + tp->block = NULL; + } +} + + +/* Set up the memory mapped table type given by @type + * with SIF based on information in the base_layout table. 
+ */ +int sif_table_init(struct sif_dev *sdev, enum sif_tab_type type) +{ + struct sif_table *tp = &sdev->ba[type]; + int extent; /* As log2 */ + int ret = 0; + struct psif_epsc_csr_req req; /* local epsc wr copy */ + struct psif_epsc_csr_rsp resp; + u64 alloc_sz; + u32 cfg_sz; + + memset(tp, 0, sizeof(*tp)); + tp->type = type; + tp->sdev = sdev; + cfg_sz = (u32)(*base_layout[type].e_cnt_ref); + if (cfg_sz & 0x80000000 || cfg_sz == 0) { + sif_log(sdev, SIF_INFO, "%s(%u): table size %#x out of bounds", + base_layout[type].desc, type, cfg_sz); + return -EINVAL; + } + + /* Only 2^n sized number of entries allowed: */ + tp->entry_cnt = roundup_pow_of_two(cfg_sz); + tp->ext_sz = base_layout[type].ext; + tp->table_sz = (size_t)tp->ext_sz * tp->entry_cnt; + + /* Set aside room for a sif_epsc_data struct at the end of + * the eps completion vectors so they can use the same mmu context in psif: + */ + alloc_sz = (is_eps_rsp_tab(type) ? + tp->table_sz + sizeof(struct sif_epsc_data) + sif_eps_log_size : + tp->table_sz); + + if (unlikely(type == sq_cmpl)) + sif_sq_cmpl_setup(tp); + else if (unlikely(is_cb_table(type))) + sif_cb_table_init(sdev, type); + else + sif_alloc_table(tp, alloc_sz); + + if (!tp->mem) { + sif_log(sdev, SIF_INFO, + "Failed to allocate 0x%lx bytes of memory for the %s table", + tp->table_sz, base_layout[type].desc); + return -ENOMEM; + } + + extent = order_base_2(tp->ext_sz); + + if (type == ah) /* Address handles can be allocated from intr.context */ + tp->from_interrupt = true; + + /* Allocate descriptors in a round robin fashion */ + tp->alloc_rr = is_cb_table(type) ? + sif_feature(alloc_cb_round_robin) : !sif_feature(disable_alloc_round_robin); + + /* single level defaults - then check for 2-level setup.. */ + tp->block_cnt = tp->entry_cnt; + tp->entry_per_block = 1; + + /* Enable one or two-level allocation */ + if (!sif_feature(flat_alloc)) + ret = init_blocks(sdev, type); + + if (ret) + goto err_init_blocks; + + if (tp->alloc_rr) + tp->last_used = tp->block_cnt - 1; /* Next will be the first entry */ + + sif_log(sdev, SIF_INFO, "%s(%d): entry cnt %d, entry sz %d, ext sz %d, extent %d, [%s]", + base_layout[type].desc, type, tp->entry_cnt, base_layout[type].entry_sz, tp->ext_sz, + extent, (base_layout[type].wr_access ? "writable" : "readonly")); + sif_log(sdev, SIF_INIT, " - table sz 0x%lx %s sif_base 0x%llx csr off 0x%lx", + tp->table_sz, sif_mem_type_str(tp->mem->mem_type), + tp->sif_base, base_layout[type].off); + + /* If xref is set to something other than -1 it means + * this table is not being allocated from individually, and thus + * need no bitmap, but rather is implicitly allocated from the referenced + * table entry (which must be lower in enum value to ensure that it is + * already allocated!) + * Also a table that references another this way is not allowed to allocate + * any indices.. 
+ */ + if (base_layout[type].xref != -1) + tp->bitmap = sdev->ba[base_layout[type].xref].bitmap; + else if (sif_init_bitmap(tp) != 0) { + ret = -ENOMEM; + goto err_init_bitmap; + } + + spin_lock_init(&tp->lock); + + if (is_cb_table(type)) + return 0; /* No base addr setup for CBs */ + + /* Base address setup - inform the EPS */ + memset(&req, 0, sizeof(req)); + + if (is_eps_req(type)) { + /* Both req and rsp gets posted when rsp is set up */ + ret = 0; + } else if (is_eps_rsp(type)) { + /* req,rsp and eq setup taken care of here: */ + ret = sif_eps_init(sdev, type); + if (ret) + goto err_map_ctx; /* No context mapped in this case */ + } else { + req.opcode = EPSC_SET_BASEADDR; + req.u.base_addr.address = tp->sif_base; + req.u.base_addr.num_entries = tp->entry_cnt; + req.u.base_addr.extent_log2 = extent; + ret = sif_map_ctx(sdev, &tp->mmu_ctx, tp->mem, + tp->sif_base, tp->table_sz, + base_layout[type].wr_access); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to set up mmu context for %s", + base_layout[type].desc); + goto err_map_ctx; + } + req.addr = base_layout[type].off; + + /* Fill in the mmu context from sif_map_ctx before submitting to the EPSC */ + req.u.base_addr.mmu_context = tp->mmu_ctx.mctx; + + ret = sif_epsc_wr_poll(sdev, &req, &resp); + if (ret) + goto err_epsc_comm; + } + return 0; + + +err_epsc_comm: + sif_unmap_ctx(sdev, &tp->mmu_ctx); +err_map_ctx: + if (base_layout[type].xref == -1) + sif_free_bitmap(tp); +err_init_bitmap: + deinit_blocks(sdev, type); +err_init_blocks: + sif_free_table(tp); + tp->mem = NULL; + return ret; +} + +static void sif_table_deinit(struct sif_dev *sdev, enum sif_tab_type type) +{ + struct sif_table *tp = &sdev->ba[type]; + + if (tp->mem) { + if (is_eps_rsp(type)) + sif_eps_deinit(sdev, type); + sif_unmap_ctx(sdev, &tp->mmu_ctx); + if (base_layout[type].xref == -1) + sif_free_bitmap(tp); + deinit_blocks(sdev, type); + sif_free_table(tp); + tp->mem = NULL; + } +} + + +static void sif_base_deinit_partly(struct sif_dev *sdev, int level) +{ + int i; + + for (i = level - 1; i >= 0; i--) + sif_table_deinit(sdev, i); +} + + +int sif_base_init(struct sif_dev *sdev) +{ + /* Setting up base registers */ + int ret = 0; + int i; + + /* extent less than 8 bytes not supported by hw */ + if (sif_min_extent < 8) + sif_min_extent = 8; + else + sif_min_extent = roundup_pow_of_two(sif_min_extent); + + if (!sif_feature(flat_alloc) && sif_min_extent > 2048) { + sif_log(sdev, SIF_INFO, + "cap'ing min_extent to 2048 - largest supported with two -level alloc"); + sif_min_extent = 2048; + } + + /* Update sw table extents with min_extent: */ + for (i = 0; i < sif_tab_init_max; i++) + if (is_user_mapped_type(i) && base_layout[i].ext < sif_min_extent) + base_layout[i].ext = sif_min_extent; + + for (i = 0; i < sif_tab_init_max; i++) { + ret = sif_table_init(sdev, i); + /* Allow some base address setup calls to fail. 
+ * This should allow us to work around some cases very old firmware + * just to perform firmware flash upgrade: + */ + if (ret) { + sif_log(sdev, SIF_INFO, "table init failed for the \"%s\" table", + sif_table_name(i)); + if (i <= epsc_csr_rsp || i == qp || i == key) + goto bi_failed; + } + } + + /* We rely upon 0-initialized table structs for the EPS-A entries as well */ + for (i = sif_tab_init_max; i < sif_tab_max; i++) { + struct sif_table *tp = &sdev->ba[i]; + + memset(tp, 0, sizeof(*tp)); + } + + /* Init complete */ + return 0; + +bi_failed: + sif_base_deinit_partly(sdev, i); + return ret; +} + + +void sif_base_deinit(struct sif_dev *sdev) +{ + sif_base_deinit_partly(sdev, sif_tab_max); +} + + +/* Send a base addr request to a given EPSA with address information for @type */ +int sif_table_update(struct sif_dev *sdev, enum psif_mbox_type eps_num, + enum sif_tab_type type) +{ + int ret; + struct sif_table *tp = &sdev->ba[type]; + struct psif_epsc_csr_req req; /* local epsc wr copy */ + struct psif_epsc_csr_rsp resp; + int extent = order_base_2(tp->ext_sz); + + /* GVA2GPA not supported by EPSes in rev2: */ + if (PSIF_REVISION(sdev) <= 2 && tp->mem->mem_type != SIFMT_BYPASS) + return -EOPNOTSUPP; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_SET_BASEADDR; + req.u.base_addr.address = tp->sif_base; + req.u.base_addr.num_entries = tp->entry_cnt; + req.u.base_addr.extent_log2 = extent; + req.u.base_addr.mmu_context = tp->mmu_ctx.mctx; + req.addr = base_layout[type].off; /* This is the type of request */ + + ret = sif_eps_wr(sdev, eps_num, &req, &resp); + return ret; +} + + +/* Write an invalidate request to the pqp. + * using the given modes. Note that if @lcqe is set, wr_mode must be + * set to PCM_WAIT, to avoid the cqe from living beyond it's caller's scope! + */ +int sif_write_invalidate(struct sif_pqp *pqp, enum sif_tab_type type, int index, + struct sif_cqe *lcqe, enum wr_mode wr_mode, enum post_mode p_mode) +{ + struct psif_wr wr; + enum psif_wr_type inv_op; + int ncompleted; + u32 sq_entry_idx; + int pqp_sq_idx; + struct sif_sq *sq; + struct psif_cq_entry *cqe; + bool self_destruct; + struct sif_dev *sdev = to_sdev(pqp->qp->ibqp.device); + + self_destruct = (type == cq_hw) && (index == pqp->cq->index); + + /* Figure out if an invalidate request is necessary */ + inv_op = sif_invalidate_opcode(type); + BUG_ON(inv_op == -1); + BUG_ON(lcqe && wr_mode != PCM_WAIT); + if (inv_op == -1) + return -ENODEV; + + sif_log(sdev, SIF_PQP, "sending inv.req. 
type %s (0x%x) target queue index %d", + sif_table_name(type), inv_op, index); + + memset(&wr, 0, sizeof(struct psif_wr)); + /* For this table type we need to send an explicit + * invalidate work request + */ + wr.op = inv_op; + switch (inv_op) { + case PSIF_WR_INVALIDATE_RKEY: + case PSIF_WR_INVALIDATE_LKEY: + case PSIF_WR_INVALIDATE_BOTH_KEYS: + wr.details.su.key = index; + break; + case PSIF_WR_INVALIDATE_RQ: + wr.details.su.u2.rq_id = index; + break; + case PSIF_WR_INVALIDATE_XRCSRQ: + wr.details.su.u2.xrq_id = index; + break; + case PSIF_WR_INVALIDATE_CQ: + wr.details.su.u2.cq_id = index; + break; + case PSIF_WR_INVALIDATE_SGL_CACHE: + wr.details.su.u2.target_qp = index; + break; + default: + /* Should never get here */ + return -ENODEV; + } + + if (self_destruct) { + /* A self destruct does not receive any completion + * instead we must poll for descriptor write-back + */ + int ret = 0; + int sts = sif_pqp_post_send(sdev, &wr, NULL); + + if (sts) { + sif_log(sdev, SIF_INFO, + "Posted self-destruct request on cq %d failed, sts %d", + index, sts); + } + + sif_log(sdev, SIF_INFO_V, "Posted self-destruct request on cq %d", index); + ret = poll_wait_for_cq_writeback(sdev, pqp->cq); + return ret; + } + + if (wr_mode != PCM_WAIT) { + int sts; + + wr.completion = (wr_mode == PCM_POST) ? 0 : 1; + sts = sif_pqp_write_send(pqp, &wr, NULL, p_mode); + if (sts != -EAGAIN) + return sts; + /* In the EAGAIN case, fall through to post a new request with completion + * to be able to use the quota beyond lowpri_lim + */ + } + + wr.completion = 1; + ncompleted = sif_pqp_poll_wr(sdev, &wr, lcqe); + + if (ncompleted < 0) { + sif_log(sdev, SIF_INFO, "pqp request failed with errno %d", ncompleted); + return ncompleted; + } + + /* Note that we operate on 3 different indices here! */ + cqe = &lcqe->cqe; + pqp_sq_idx = pqp->qp->qp_idx; + sq = get_sif_sq(sdev, pqp_sq_idx); + + /* sq_id.sq_seq_num contains the send queue sequence number for this completion + * and by this driver's definition the index into the send queue will + * be this number modulo the length of the send queue: + */ + sq_entry_idx = cqe->wc_id.sq_id.sq_seq_num & sq->mask; + + if (cqe->status != PSIF_WC_STATUS_SUCCESS) { + sif_log(sdev, SIF_INFO, "failed with status %s(%d) for cq_seq %d", + string_enum_psif_wc_status(cqe->status), cqe->status, cqe->seq_num); + sif_logs(SIF_INFO, write_struct_psif_cq_entry(NULL, 0, cqe)); + atomic_inc(&pqp->cq->error_cnt); + return -EIO; + } + + sif_log(sdev, SIF_PQP, "cq_seq %d sq_seq %d, sq_entry_idx %d", + cqe->seq_num, cqe->wc_id.sq_id.sq_seq_num, sq_entry_idx); + + return ncompleted < 0 ? 
ncompleted : 0; +} + +int sif_invalidate(struct sif_dev *sdev, enum sif_tab_type type, int index, + enum wr_mode wr_mode) +{ + struct sif_cqe *cqe = NULL; + DECLARE_SIF_CQE_POLL(sdev, lcqe); + struct sif_pqp *pqp = lcqe.pqp; + + if (unlikely(!pqp)) + return 0; /* Failed before any PQPs were set up */ + + if (wr_mode == PCM_WAIT) + cqe = &lcqe; + return sif_write_invalidate(pqp, type, index, cqe, wr_mode, PM_CB); +} + +#define table_lock(table, flags) \ + do {\ + if (unlikely(table->from_interrupt)) \ + spin_lock_irqsave(&table->lock, flags); \ + else \ + spin_lock(&table->lock); \ + } while (0) + + +#define table_unlock(table, flags) \ + do { \ + if (unlikely(table->from_interrupt)) \ + spin_unlock_irqrestore(&table->lock, flags); \ + else \ + spin_unlock(&table->lock); \ + } while (0) + + +/* 1st level bitmap index allocation scheme */ +static int sif_init_bitmap(struct sif_table *table) +{ + /* Allocate 1 bit for each block of entries */ + size_t bsz = max(sizeof(ulong), table->block_cnt / sizeof(ulong)); + + if (bsz > SIF_MAX_CONT) + table->bitmap = vzalloc(bsz); + else + table->bitmap = kzalloc(bsz, GFP_KERNEL); + if (!table->bitmap) { + sif_log0(SIF_INIT, + "Failed to allocate 0x%lx bytes of alloc.bitmap", bsz); + return -ENOMEM; + } + return 0; +} + +int sif_alloc_index(struct sif_dev *sdev, enum sif_tab_type type) +{ + int index; + int next = 0; + struct sif_table *table = &sdev->ba[type]; + unsigned long flags = 0; + + table_lock(table, flags); + if (table->alloc_rr) + next = (table->last_used + 1) & (table->block_cnt - 1); + + index = find_next_zero_bit(table->bitmap, table->block_cnt, next); + if (table->alloc_rr && index >= table->block_cnt) + index = find_next_zero_bit(table->bitmap, table->block_cnt, 0); + if (index < table->block_cnt) { + set_bit(index, table->bitmap); + if (table->alloc_rr) + table->last_used = index; + } else + index = -1; + table_unlock(table, flags); + sif_log(sdev, SIF_IDX, "%s[%d] (entries per block %d)", sif_table_name(type), index, + table->entry_per_block); + return index; +} + +void sif_free_index(struct sif_dev *sdev, enum sif_tab_type type, int index) +{ + struct sif_table *table = &sdev->ba[type]; + size_t ext_sz = table->ext_sz; + char *desc = sif_mem_kaddr(table->mem, index * ext_sz); + unsigned long flags = 0; + + if (!test_bit(index, table->bitmap)) { + /* This should not happen - inconsistency somewhere */ + sif_log(sdev, SIF_INFO, "XZW: index %d, table type %d/%d was not marked as used!", + index, type, sif_tab_init_max); + BUG(); + return; + } + + + if (table->entry_per_block == 1) { + /* Clean descriptor entry for reuse: + * note that we clean the whole extent here which + * includes all of sif_##type for inlined types: + */ + if (table->type == rq_hw) /* only zero out driver data structure */ + memset(desc + sizeof(struct psif_rq_hw), 0, ext_sz - sizeof(struct psif_rq_hw)); + else if (!is_cb_table(table->type) && table->type != qp && table->type != cq_hw) + memset(desc, 0, ext_sz); + } + + table_lock(table, flags); + clear_bit(index, table->bitmap); + table_unlock(table, flags); + sif_log(sdev, SIF_IDX, "%s[%d]", sif_table_name(type), index); +} + + +bool sif_index_used(struct sif_table *table, int index) +{ + if (unlikely(index < 0 || index >= table->entry_cnt)) + return NULL; + return test_bit(index, table->bitmap); +} + + +u32 sif_entries_used(struct sif_table *table) +{ + int bits_used = 0; + int i = 0; + unsigned long flags = 0; + + table_lock(table, flags); + if (table->entry_per_block == 1) + bits_used = 
bitmap_weight(table->bitmap, table->block_cnt); + else + for (;;) { + i = sif_next_used(table, i); + if (i < 0) + break; + bits_used++; + i++; + } + + table_unlock(table, flags); + return bits_used; +} + +static void sif_free_bitmap(struct sif_table *table) +{ + if (table->bitmap) { + size_t bsz = table->block_cnt / sizeof(ulong); + + if (bsz > SIF_MAX_CONT) + vfree(table->bitmap); + else + kfree(table->bitmap); + table->bitmap = NULL; + } +} + + +/* This function is used to traverse tables for the debugfs file system. + * @index is the descriptor index (not block index) so in case of + * two-level allocation (table->entry_per_block > 1) + * a two-level traversal is needed here: + */ +int sif_next_used(struct sif_table *table, int index) +{ + ulong *map = NULL; + int blk_idx, new_blk_idx, epb, old_idx; + struct sif_table_block *b; + + /* This is a queue - no bitmap */ + if (unlikely(table->type == epsc_csr_req)) + return sif_eps_next_used(table, index); + + /* TBD: Quick hack for now - the bitmap reference stuff does not work + * properly with two-level alloc: + */ + if (unlikely(table->type == sq_cmpl)) + table = &table->sdev->ba[qp]; + + map = table->bitmap; + if (!map) + return -1; + + if (table->entry_per_block == 1) { + index = find_next_bit(map, table->block_cnt, index); + if (index < table->block_cnt) + return index; + else + return -1; + } + old_idx = index; + + /* Two level allocation */ + epb = table->entry_per_block; + blk_idx = index / epb; +next_block: + index = index % epb; + new_blk_idx = find_next_bit(map, table->block_cnt, blk_idx); + if (new_blk_idx >= table->block_cnt) + return -1; + if (new_blk_idx != blk_idx) + index = 0; + + b = sif_get_block(table, new_blk_idx); + index = find_next_bit(b->bitmap, epb, index); + if (index >= epb) { + blk_idx++; + goto next_block; + } + index += b->offset; + return index; +} + +static int sif_alloc_sg_table(struct sif_table *tp, size_t size) +{ + struct sif_dev *sdev = tp->sdev; + size_t sg_size = size >> PMD_SHIFT; + enum sif_mem_type memtype = sif_feature(no_huge_pages) ? 
SIFMT_4K : SIFMT_2M; + + tp->mem = sif_mem_create(sdev, sg_size, size, memtype, + GFP_KERNEL, DMA_BIDIRECTIONAL); + if (!tp->mem) + return -ENOMEM; + return 0; +} + +int sif_alloc_table(struct sif_table *tp, size_t size) +{ + struct sif_dev *sdev = tp->sdev; + int ret; + + /* TBD: handle eqs in a better way */ + if (!tp->is_eq && base_layout[tp->type].drv_ref) { + size_t ref_tbl_sz = sizeof(void *) * tp->entry_cnt; + + tp->drv_ref = vzalloc(ref_tbl_sz); + if (!tp->drv_ref) { + sif_log(sdev, SIF_INFO, "unable to allocate %ld bytes of ref.table for table %s", + ref_tbl_sz, sif_table_name(tp->type)); + return -ENOMEM; + } + } + + /* The sqs ring buffer must be phys.cont to avoid PCIe deadlocks (#3477) + * and do not need to be zero initialized, its written by HW and read by HW + */ + if (size <= SIF_MAX_CONT || (tp->type == sq_ring && !tp->is_eq)) { + gfp_t flags = GFP_KERNEL; + + if (tp->type != sq_ring) + flags |= __GFP_ZERO; + + tp->mem = sif_mem_create_dmacont(sdev, size, flags, DMA_BIDIRECTIONAL); + if (!tp->mem) { + ret = -ENOMEM; + goto t_alloc_failed; + } + tp->sif_base = sif_mem_dma(tp->mem, 0); + if (tp->type == sq_ring) { + /* Avoid deadlocks on PCIe (#3484) */ + tp->mmu_ctx.mctx.ro = 1; + tp->mmu_ctx.mctx.ns = 1; + + /* + * BZ #3618: Make sure no dirty cache lines + * exists, which might be flushed out and + * overwrite the ring-buffer, after it has + * been written by PSIF + */ +#ifdef CONFIG_X86 + clflush_cache_range(tp->mem->vmap_base, size); +#else + sif_log(sdev, SIF_INFO, "Warning: implement flush cache for this architecture"); +#endif + } + return 0; + } + + ret = sif_alloc_sg_table(tp, size); + if (ret) + goto t_alloc_failed; + + /* Use some easily identifiable (nonzero) high virtual address range on the sif side */ + tp->sif_base = tp->is_eq ? + SIF_BASE_ADDR_EQ_START(tp->index) : + SIF_BASE_ADDR_START(tp->type); + return 0; + +t_alloc_failed: + if (tp->drv_ref) { + vfree(tp->drv_ref); + tp->drv_ref = NULL; + } + return ret; +} + + +void sif_free_table(struct sif_table *tp) +{ + sif_mem_free(tp->mem); + tp->mem = NULL; + + if (tp->drv_ref) { + vfree(tp->drv_ref); + tp->drv_ref = NULL; + } +} diff --git a/drivers/infiniband/hw/sif/sif_base.h b/drivers/infiniband/hw/sif/sif_base.h new file mode 100644 index 0000000000000..df91a7e392c42 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_base.h @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_base.h: Basic hardware setup of SIF + */ + +#ifndef __SIF_BASE_H +#define __SIF_BASE_H +#include "sif_dev.h" +#include "sif_debug.h" +#include "sif_pd.h" +#include "sif_qp.h" +#include "sif_cq.h" +#include "sif_ah.h" +#include "sif_int_user.h" + +/* Establish contact with the EPS and initialize the base descriptor setup */ +int sif_base_init(struct sif_dev *sdev); + +void sif_base_deinit(struct sif_dev *sdev); + +int sif_alloc_index(struct sif_dev *sdev, enum sif_tab_type type); +void sif_free_index(struct sif_dev *sdev, enum sif_tab_type type, int index); +u32 sif_entries_used(struct sif_table *table); + +bool sif_index_used(struct sif_table *table, int index); + +/* Find next used entry, starting at (and including) index + */ +int sif_next_used(struct sif_table *table, int index); + +int sif_invalidate(struct sif_dev *sdev, enum sif_tab_type type, int index, enum wr_mode mode); + +int sif_write_invalidate(struct sif_pqp *pqp, enum sif_tab_type type, int index, + struct sif_cqe *lcqe, enum wr_mode wr_mode, enum post_mode p_mode); + +#define sif_define_funcs(type) \ +static inline int sif_invalidate_##type(struct sif_dev *sdev, int index, \ + enum wr_mode mode)\ +{ \ + return sif_invalidate(sdev, type, index, mode); \ +} \ +static inline u32 sif_##type##_usage(struct sif_dev *sdev)\ +{\ + return sif_entries_used(&sdev->ba[type]); \ +} \ +static inline struct psif_##type *get_##type(struct sif_dev *sdev, int index)\ +{ \ + return (struct psif_##type *)(sif_mem_kaddr(sdev->ba[type].mem, \ + index * sdev->ba[type].ext_sz)); \ +} \ +static inline void sif_clear_##type(struct sif_dev *sdev, int index)\ +{ \ + struct psif_##type *p = get_##type(sdev, index);\ + memset(p, 0, sizeof(*p));\ +} + + +#define sif_def_pd_index_alloc(type)\ +static inline int sif_alloc_##type##_idx(struct sif_pd *pd)\ +{ \ + return sif_pd_alloc_index(pd, type); \ +} \ +static inline void sif_free_##type##_idx(struct sif_pd *pd, int index)\ +{ \ + sif_pd_free_index(pd, type, index); \ +} + +#define sif_def_global_index_alloc(type)\ +static inline int sif_alloc_##type##_idx(struct sif_dev *sdev)\ +{ \ + return sif_alloc_index(sdev, type); \ +} \ +static inline void sif_free_##type##_idx(struct sif_dev *sdev, int index)\ +{ \ + sif_free_index(sdev, type, index); \ +} + +const char *sif_table_name(enum sif_tab_type type); + +/* Exposed to sif_epsc only! */ + +/* Set up the table type @type and send a base addr request to the EPSC */ +int sif_table_init(struct sif_dev *sdev, enum sif_tab_type type); + +/* Send a base addr request to a given EPSA with address information for @type */ +int sif_table_update(struct sif_dev *sdev, enum psif_mbox_type eps_num, + enum sif_tab_type type); + +sif_dfs_printer sif_table_dfs_printer(enum sif_tab_type type); + +#define psif_bw_cb psif_cb __iomem +#define psif_lat_cb psif_cb __iomem + +sif_define_funcs(key) +sif_define_funcs(qp) +sif_define_funcs(cq_hw) +sif_define_funcs(cq_sw) +sif_define_funcs(ah) +sif_define_funcs(rq_sw) +sif_define_funcs(rq_hw) +sif_define_funcs(sq_sw) +sif_define_funcs(sq_hw) +sif_define_funcs(sq_rspq) +sif_define_funcs(bw_cb) +sif_define_funcs(lat_cb) + +/* These descriptors use 2-level alloc, + * 2nd level resource management is done by the protection domain. 
+ * The purpose of this is that elements that fits within the same page will always be + * owned by the same protection domain, to avoid that an ill-behaved application + * may accidentially modify the descriptors of an unrelated application. + * Changes in allocation levels here must be accompanied by changes in init_blocks + * in sif_base.c and type changes sdev <-> pd in the index allocation functions. + */ +sif_def_pd_index_alloc(qp) +sif_def_pd_index_alloc(rq_hw) +sif_def_pd_index_alloc(sq_hw) +sif_def_pd_index_alloc(cq_hw) + +/* These use global, single level alloc. + * CBs are unproblematic since they each occupy a full page. + * The rest is only used from kernel space + */ + +sif_def_global_index_alloc(key) +sif_def_global_index_alloc(ah) +sif_def_global_index_alloc(bw_cb) +sif_def_global_index_alloc(lat_cb) + +/* Lookup functions for sif structs inlined with hw descs */ +#define sif_define_lookup_funcs(type, hwtype)\ +static inline struct sif_##type *get_sif_##type(struct sif_dev *sdev, int idx)\ +{ \ + return container_of(get_##hwtype(sdev, idx),\ + struct sif_##type, d);\ +} \ +static inline struct sif_##type *safe_get_sif_##type(struct sif_dev *sdev, int idx)\ +{ \ + struct sif_table *tp = &sdev->ba[hwtype];\ + if (unlikely(idx < 0 || idx >= tp->entry_cnt)) \ + return NULL;\ + if (!sif_pd_index_used(tp, idx))\ + return NULL;\ + return get_sif_##type(sdev, idx);\ +} \ +extern uint sif_##type##_size + +sif_define_lookup_funcs(rq, rq_hw); +sif_define_lookup_funcs(rq_sw, rq_sw); +sif_define_lookup_funcs(sq, sq_hw); +sif_define_lookup_funcs(sq_sw, sq_sw); +sif_define_lookup_funcs(cq, cq_hw); +sif_define_lookup_funcs(cq_sw, cq_sw); +sif_define_lookup_funcs(qp, qp); +sif_define_lookup_funcs(ah, ah); + +/* Lookup functions for sif structs accessed via the + * "side-array" table->drv_ref + */ +#define sif_def_ref_lookup_funcs(type, hwtype) \ +static inline struct sif_##type *get_sif_##type(struct sif_dev *sdev, int idx) \ +{ \ + return ((struct sif_##type **)sdev->ba[hwtype].drv_ref)[idx]; \ +} \ +static inline void set_sif_##type(struct sif_dev *sdev, int idx, struct sif_##type *v) \ +{ \ + ((struct sif_##type **)sdev->ba[hwtype].drv_ref)[idx] = v; \ +} \ +static inline struct psif_##hwtype *safe_get_##hwtype(struct sif_dev *sdev, int idx)\ +{ \ + struct sif_table *tp = &sdev->ba[hwtype]; \ + if (!sif_index_used(tp, idx)) \ + return NULL;\ + return get_##hwtype(sdev, idx);\ +} \ +static inline struct sif_##type *safe_get_sif_##type(struct sif_dev *sdev, int idx)\ +{ \ + struct sif_table *tp = &sdev->ba[hwtype]; \ + if (!sif_index_used(tp, idx)) \ + return NULL;\ + return get_sif_##type(sdev, idx);\ +} \ +extern uint sif_##type##_size + +sif_def_ref_lookup_funcs(mr, key); + +static inline struct sif_table_block *sif_get_block(struct sif_table *tp, int index) +{ + return (struct sif_table_block *)(tp->block + tp->block_ext * index); +} + +extern uint sif_xrq_size; +extern uint sif_epsc_size; +extern uint sif_epsc_eq_headroom; +extern uint sif_tsu_eq_headroom; +extern uint sif_sq_ring_size; +extern uint sif_sq_tvl_size; +extern uint sif_min_extent; + +/* Multi-strategy allocation of table memory */ +int sif_alloc_table(struct sif_table *tp, size_t size); + +void sif_free_table(struct sif_table *tp); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_checksum.c b/drivers/infiniband/hw/sif/sif_checksum.c new file mode 100644 index 0000000000000..b64ef70797b99 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_checksum.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its 
affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_checksum.c: Utilities for SIF specific 32 bit checksums + * + */ +#include +#include +#include "sif_checksum.h" +#include + +/* + * 32 bit "IP/TCP"-like checksumming - modified from 16 to 32 bit + * from kernel/lib/checksum.c: + */ + +static inline u32 from64to32(u64 x) +{ + /* add up 32-bit and 32-bit for 32+c bit */ + x = (x & 0xffffffff) + (x >> 32); + /* add up carry.. */ + x = (x & 0xffffffff) + (x >> 32); + return x; +} + + +static u64 do_csum32(const unsigned char *buff, int len) +{ + int unaligned; + u64 result = 0; + + if (len <= 0) + goto out; + unaligned = 3 & (unsigned long) buff; + if (1 & (unsigned long) buff) { +#ifdef __LITTLE_ENDIAN + result += (*buff << 24); +#else + result = *buff; +#endif + len--; + buff++; + } + if (len >= 2) { + if (2 & (unsigned long) buff) { +#ifdef __LITTLE_ENDIAN + result += (*(u32 *) buff) << 16; +#else + result += *(u32 *) buff; +#endif + len -= 2; + buff += 2; + } + if (len >= 4) { + if (4 & (unsigned long) buff) { + result += *(u32 *) buff; + len -= 4; + buff += 4; + } + if (len >= 8) { + const unsigned char *end = buff + ((unsigned int)len & ~7); + unsigned int carry = 0; + + do { + u64 w = *(u64 *) buff; + + buff += 8; + result += carry; + result += w; + carry = (w > result); + } while (buff < end); + result += carry; + result = (result & 0xffffffff) + (result >> 32); + } + if (len & 4) { + result += *(u32 *) buff; + len -= 4; + buff += 4; + } + } + if (len & 2) { +#ifdef __LITTLE_ENDIAN + result += (*(unsigned short *) buff) << 16; +#else + result += *(unsigned short *) buff; +#endif + buff += 2; + } + } + if (len & 1) +#ifdef __LITTLE_ENDIAN + result += *buff; +#else + result += (*buff << 24); +#endif + result = from64to32(result); + switch (unaligned) { + case 1: + result = ((result >> 8) & 0xffffff) | ((result & 0xff) << 24); + break; + case 2: + result = ((result >> 16) & 0xffff) | ((result & 0xffff) << 16); + break; + case 3: + result = ((result >> 24) & 0xff) | ((result & 0xffffff) << 8); + break; + default: + break; + } +out: + return result; +} + + +u64 csum32_partial(const void *buff, int len, u64 wsum) +{ + u64 sum = (__force u64)wsum; + u64 result = do_csum32(buff, len); + + /* add in old sum, and carry.. */ + result += sum; + if (sum > result) + result += 1; + return (__force u64)result; +} +EXPORT_SYMBOL(csum32_partial); diff --git a/drivers/infiniband/hw/sif/sif_checksum.h b/drivers/infiniband/hw/sif/sif_checksum.h new file mode 100644 index 0000000000000..aaa17a1a5536b --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_checksum.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_checksum.h: Utilities for SIF specific 32 bit checksums + */ +#ifndef _SIF_CHECKSUM_H +#define _SIF_CHECKSUM_H + +/* + * 32 bit "IP/TCP"-like checksumming - modified from 16 to 32 bit + * from kernel/lib/checksum.c: + */ + +u64 csum32_partial(const void *buff, int len, u64 wsum); + +/* + * Fold a partial checksum + */ +static inline u32 csum32_fold(u64 csum) +{ + u64 sum = (__force u64)csum; + + sum = (sum & 0xffffffff) + (sum >> 32); + sum = (sum & 0xffffffff) + (sum >> 32); + return (__force u32)~sum; +} + +#endif diff --git a/drivers/infiniband/hw/sif/sif_cq.c b/drivers/infiniband/hw/sif/sif_cq.c new file mode 100644 index 0000000000000..26502cef38236 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_cq.c @@ -0,0 +1,1010 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_cq.c: Implementation of completion queue logic for SIF + */ +#include +#include +#include +#include + +#include "sif_dev.h" +#include "psif_hw_data.h" +#include "psif_hw_setget.h" +#include "sif_defs.h" +#include "sif_base.h" +#include "sif_mmu.h" +#include "sif_ibcq.h" +#include "sif_cq.h" +#include "sif_hwi.h" +#include "sif_dma.h" +#include "sif_user.h" +#include "sif_qp.h" +#include "sif_pqp.h" +#include "sif_hwi.h" +#include "sif_ibqp.h" +#include +#include + +static inline int translate_wr_id( + uint64_t *wr_id, + struct sif_dev *sdev, + struct sif_cq *cq, + struct sif_sq *sq, + struct psif_cq_entry *cqe, + u32 sq_seq_num, int qpn) +{ + struct sif_sq_hdl *wh = get_sq_hdl(sq, sq_seq_num); + + if (unlikely(!wh)) { + sif_log(sdev, SIF_INFO, + "cqe 0x%x for cq %d refers sq(qp) %d (not initialized), sts %d opc 0x%x", + cqe->seq_num, cq->index, qpn, cqe->status, cqe->opcode); + return -EFAULT; + } + if (!unlikely(wh->used)) { + if (sq_seq_num == wh->sq_seq) + sif_log(sdev, SIF_INFO, + "dupl cqe 0x%x for cq %d: got sq_seq 0x%x, last exp.0x%x, sts %d opc 0x%x", + cqe->seq_num, cq->index, sq_seq_num, wh->sq_seq, + cqe->status, cqe->opcode); + else + sif_log(sdev, SIF_INFO, + "unexp. cqe 0x%x for cq %d: got sq_seq 0x%x, last exp.0x%x, sts %d opc 0x%x", + cqe->seq_num, cq->index, sq_seq_num, wh->sq_seq, + cqe->status, cqe->opcode); + return -EFAULT; + } + if (unlikely(wh->sq_seq != sq_seq_num)) { + sif_log(sdev, SIF_INFO, + "wrong cqe 0x%x for cq %d: got sq_seq 0x%x, expected 0x%x, sts %d opc 0x%x", + cqe->seq_num, cq->index, sq_seq_num, wh->sq_seq, cqe->status, cqe->opcode); + return -EFAULT; + } + *wr_id = wh->wr_id; + wh->used = false; + + return 0; +} + + +struct ib_cq *sif_create_cq(struct ib_device *ibdev, int entries, + int comp_vector, + struct ib_ucontext *context, + struct ib_udata *udata, + enum sif_proxy_type proxy) +{ + struct sif_cq *cq = NULL; + struct sif_dev *sdev = to_sdev(ibdev); + struct sif_ucontext *uc = to_sctx(context); + struct sif_pd *pd = context ? 
uc->pd : sdev->pd; + ulong user_flags = 0; + bool user_mode = udata != NULL; + + if (entries < 1) + return ERR_PTR(-EINVAL); + + if (udata) { + struct sif_create_cq_ext cmd; + int rv = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + + if (rv) + return ERR_PTR(rv); + user_flags = cmd.flags; + if (sif_vendor_enable(proxy_mode, user_flags)) + proxy = cmd.proxy; + if (sif_vendor_enable(SVF_kernel_mode, user_flags)) + user_mode = false; + if (uc->abi_version < 0x0302) /* TBD: Remove - bw comp */ + user_mode = !user_mode; + } + + cq = create_cq(pd, entries, comp_vector, proxy, user_mode); + if (IS_ERR(cq)) + return (struct ib_cq *)cq; + + if (udata) { + struct sif_create_cq_resp_ext resp; + int ret; + + memset(&resp, 0, sizeof(resp)); + resp.cq_idx = cq->index; + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) { + destroy_cq(cq); + return ERR_PTR(-EFAULT); + } + } + atomic_inc(&sdev->cq_count); + sif_log(sdev, SIF_CQ, "new cq at %p entries %d (used %d)%s", + cq, entries, atomic_read(&sdev->cq_count), + (user_mode ? " (user mode)" : "")); + return &cq->ibcq; +} + + +struct sif_cq *create_cq(struct sif_pd *pd, int entries, + int comp_vector, + enum sif_proxy_type proxy, + bool user_mode) +{ + struct sif_dev *sdev = to_sdev(pd->ibpd.device); + struct sif_cq_sw *cq_sw; + struct psif_cq_sw lcq_sw; + struct psif_cq_entry *cqe; + struct sif_cq *cq; + struct sif_cq *ecq; + u32 entries_log2; + u64 alloc_sz; + int ret; + int index = sif_alloc_cq_hw_idx(pd); + + if (index < 0) { + ecq = ERR_PTR(-ENOMEM); + goto err_alloc_index; + } + + cq = get_sif_cq(sdev, index); + /* Use entries field to determine if entry has been used before */ + if (cq->entries) { + ret = poll_wait_for_cq_writeback(sdev, cq); + if (ret) + return ERR_PTR(ret); + } + + memset(cq, 0, sizeof(*cq)); + cq->pd = pd; + cq->index = index; + + cq_sw = get_sif_cq_sw(sdev, index); + cq_sw->next_seq = 0; + cq_sw->last_hw_seq = 0; + + /* Make sure we never fill the CQ completely on rev 1-3 - Bug #3657 */ + if (PSIF_REVISION(sdev) <= 3) + entries++; + + cq->entries = roundup_pow_of_two(entries); + cq->ibcq.cqe = cq->entries; + entries_log2 = order_base_2(cq->entries); + + /* Adjust available cqes on rev 1-3 - Bug #3657 */ + if (PSIF_REVISION(sdev) <= 3) + cq->ibcq.cqe--; + + /* See #2965: 5 bit size_log2 field in cq desc + * but counter is 32 bit. For simplicity to distinguish full from empty + * SIF can allow allocation of up to 2^30 (size_log2 = 0x1e) entries. + * Use the largest value tested, which should be enough + * + * TBD: Should perhaps limit to some fraction of physical memory available? 
+ */ + if (entries_log2 > SIF_SW_MAX_CQE_LOG2) { + sif_log(sdev, SIF_INFO, + "requested %d entries -> %d but sif only supports %d", + entries, cq->entries, 1 << SIF_SW_MAX_CQE_LOG2); + return ERR_PTR(-ENFILE); + } + + cq->mask = cq->entries - 1; + cq->extent = sizeof(struct psif_cq_entry); + + alloc_sz = cq->entries * cq->extent; + + /* Only whole pages must be exposed to user space */ + if (user_mode && (alloc_sz & ~PAGE_MASK)) + alloc_sz = (alloc_sz + PAGE_SIZE) & PAGE_MASK; + cq->user_mode = user_mode; + + if (alloc_sz <= SIF_MAX_CONT) + cq->mem = sif_mem_create_dmacont(sdev, alloc_sz, GFP_KERNEL | __GFP_ZERO, DMA_BIDIRECTIONAL); + else + cq->mem = sif_mem_create(sdev, alloc_sz >> PMD_SHIFT, + alloc_sz, SIFMT_2M, GFP_KERNEL | __GFP_ZERO, DMA_BIDIRECTIONAL); + if (!cq->mem) { + sif_log(sdev, SIF_INFO, "Failed to allocate %d CQ entries", entries); + ecq = ERR_PTR(-ENOMEM); + goto err_cdt_invalid; + } + + sif_log(sdev, SIF_CQ, "CQ: hw %p sw %p, base_adr %p, alloc_sz 0x%llx", + cq, cq_sw, sif_mem_kaddr(cq->mem, 0), alloc_sz); + + /* Since we assume seq.0 as the first valid sequence number, + * we must assume that the first entry we poll against is invalid to + * start with: + */ + cqe = get_cq_entry(cq, 0); + set_psif_cq_entry__seq_num(cqe, (u32)-1); + cq->cq_hw.size_log2 = entries_log2; + + /* Prefetch cq_sw when queue is half full: */ + cq->cq_hw.prefetch_threshold_log2 = entries_log2 - 1; + + cq->cq_hw.valid = 1; + cq->cq_hw.base_addr = sif_mem_dma(cq->mem, 0); + cq->cq_hw.sequence_number = cq_sw->next_seq; + + if (proxy != SIFPX_OFF) { + /* This is a proxy CQ */ + cq->cq_hw.proxy_en = 1; + cq->cq_hw.eps_core = (enum psif_eps_a_core)(proxy - 1); + } + + /* Allocate mmu context */ + ret = sif_map_ctx(sdev, &cq->mmu_ctx, cq->mem, cq->cq_hw.base_addr, + alloc_sz, true); + if (ret) { + ecq = ERR_PTR(-ENOMEM); + goto err_map_ctx; + } + + /* Designate an EQ to this CQ: + * Note that the two first queues as seen by the driver in rev2 + * - index 0 and 1, is reserved for EPSC and async events respectively. + * The index here refers to the first "normal" eq, e.g. eq[2] in + * driver sense: + */ + cq->cq_hw.int_channel = (sif_check_valid_eq_channel(sdev, comp_vector)) ? + comp_vector : sif_get_eq_channel(sdev, cq); + cq->eq_idx = cq->cq_hw.int_channel + 2; + + cq->next_logtime = jiffies; + init_completion(&cq->cleanup_ok); + cq->cq_hw.mmu_cntx = cq->mmu_ctx.mctx; + + copy_conv_to_hw(&cq->d, &cq->cq_hw, sizeof(cq->cq_hw)); + + /* Initialize sw part of descriptor */ + memset(&lcq_sw, 0, sizeof(lcq_sw)); + lcq_sw.head_indx = cq_sw->next_seq; + copy_conv_to_hw(&cq_sw->d, &lcq_sw, sizeof(lcq_sw)); + + spin_lock_init(&cq->lock); + + wmb(); + + /* to sync with event handling. + * NB! 
Must be the final operation here as there may events + * pending that only handles either a fully valid CQ or refcnt == 0 + */ + atomic_set(&cq->refcnt, 1); + + sif_log(sdev, SIF_CQ, "Exit: success cq %p index %d", cq, + cq->index); + return cq; + +err_map_ctx: + sif_mem_free(cq->mem); +err_cdt_invalid: + sif_free_cq_hw_idx(pd, cq->index); +err_alloc_index: + return ecq; +} + +int sif_modify_cq(struct ib_cq *ibcq, u16 cq_count, u16 cq_period) +{ + struct sif_dev *sdev = to_sdev(ibcq->device); + + sif_log(sdev, SIF_CQ, "Not implemented"); + return -EOPNOTSUPP; +} + + +int sif_destroy_cq(struct ib_cq *ibcq) +{ + struct sif_cq *cq = to_scq(ibcq); + struct sif_dev *sdev = to_sdev(ibcq->device); + int ret = destroy_cq(cq); + + if (!ret) + atomic_dec(&sdev->cq_count); + return ret; +} + + +int destroy_cq(struct sif_cq *cq) +{ + struct sif_dev *sdev = to_sdev(cq->ibcq.device); + u32 index = cq->index; + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, index); + int ret = 0; + u32 miss_cnt = cq_sw->miss_cnt; + u32 miss_occ = cq_sw->miss_occ; + + BUG_ON(atomic_read(&cq->ibcq.usecnt)); + + if (cq_sw->miss_cnt) { + atomic_add(miss_cnt, &sdev->cq_miss_cnt); + atomic_add(miss_occ, &sdev->cq_miss_occ); + } + ret = sif_invalidate_cq_hw(sdev, index, PCM_WAIT); + if (ret) { + sif_log(sdev, SIF_INFO, + "Releasing index %d in dirty state - ret %d", index, ret); + return 0; + } + + ret = sif_release_cq(sdev, index); + + sif_log(sdev, SIF_CQ, "Exit index %d ret %d miss cnt/occ %d/%d", + index, ret, miss_cnt, miss_occ); + return ret; +} + + + +int sif_release_cq(struct sif_dev *sdev, int index) +{ + struct sif_cq *cq = get_sif_cq(sdev, index); + struct sif_pd *pd = cq->pd; + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, index); + + /* Wait for any in-progress event queue entry for this CQ to be finished */ + if (atomic_dec_and_test(&cq->refcnt)) + complete(&cq->cleanup_ok); + wait_for_completion(&cq->cleanup_ok); + + /* Make sure any completions on the cq TLB invalidate + * for priv.qp does arrive before the cq is destroyed.. + */ + sif_unmap_ctx(sdev, &cq->mmu_ctx); + sif_mem_free(cq->mem); + + /* Clear sw descriptor - hw descriptor is cleared by hw write-back + * We verify that the write-back has been received before making + * use of the cq again. 
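+ * (create_cq() performs that verification: a descriptor whose entries
+ * field is still non-zero has been used before, so it calls
+ * poll_wait_for_cq_writeback() before reinitializing the entry.)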
+ */ + memset(cq_sw, 0, sizeof(*cq_sw)); + + if (!sif_feature(disable_invalidate_cq)) + sif_free_cq_hw_idx(pd, index); + return 0; +} + + +int sif_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +{ + sif_logi(ibcq->device, SIF_CQ, "Not implemented"); + return -EOPNOTSUPP; +} + + +/* @cqe contains little endian local copy of the associated + * completion queue entry + */ +static int handle_send_wc(struct sif_dev *sdev, struct sif_cq *cq, + struct ib_wc *wc, struct psif_cq_entry *cqe, bool qp_is_destroyed) +{ + /* send queue descriptor aligned with qp */ + int sq_idx = cqe->qp; + int ret; + struct sif_sq *sq = get_sif_sq(sdev, sq_idx); + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, sq_idx); + + /* This is a full 32 bit seq.num */ + u32 sq_seq_num = cqe->wc_id.sq_id.sq_seq_num; + + if (qp_is_destroyed) { + wc->wr_id = cqe->wc_id.rq_id; + + /* No more work, when QP is gone */ + return 0; + } + + ret = translate_wr_id(&wc->wr_id, sdev, cq, sq, cqe, sq_seq_num, cqe->qp); + if (ret) + return ret; + + wmb(); + /* Update head_seq after we have marked entry as unused since + * head_seq is used by post_send in the queue full check: + */ + sq_sw->head_seq = sq_seq_num; + + sif_log(sdev, SIF_CQ, + "wr_id 0x%llx on qp/sq %d sq_seq_num %d", + wc->wr_id, cqe->qp, sq_seq_num); + return 0; +} + +/* @cqe contains a host endian local copy of the associated + * completion queue entry. + */ +static struct sif_rq *find_rq(struct sif_dev *sdev, struct sif_cq *cq, + struct psif_cq_entry *cqe) +{ + struct sif_qp *qp = get_sif_qp(sdev, cqe->qp); + + if (qp->type == PSIF_QP_TRANSPORT_XRC) + return cq->xsrq; + else + return get_sif_rq(sdev, qp->rq_idx); +} + +/* @cqe contains a host endian local copy of the associated + * completion queue entry + */ +static int handle_recv_wc(struct sif_dev *sdev, struct sif_cq *cq, struct ib_wc *wc, + struct psif_cq_entry *cqe, bool qp_is_destroyed) +{ + struct sif_rq *rq = find_rq(sdev, cq, cqe); + struct sif_rq_sw *rq_sw = get_sif_rq_sw(sdev, rq->index); + u32 rq_len; + + wc->wr_id = cqe->wc_id.rq_id; + + /* If no QP, no further work */ + if (qp_is_destroyed) + return 0; + + rq_len = atomic_dec_return(&rq_sw->length); + + /* WA #622: For Responder Class A & C error, QP should have been + * marked in ERROR, flush RQ for remaining posted entries. + * + * Note: PSIF doesn't generate FLUSH_ERR completions, we see + * them due to s/w WA #622, do not flush again. 
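+ * (Hence the IB_WC_WR_FLUSH_ERR exclusion below: flushed-in-error
+ * completions are already the product of that software flush and must
+ * not trigger another one.)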
+ */ + if ((wc->status != IB_WC_WR_FLUSH_ERR) && + (wc->status != IB_WC_SUCCESS)) { + struct sif_qp *qp = to_sqp(wc->qp); + + if (is_regular_qp(qp) && !rq->is_srq + && IB_QPS_ERR == get_qp_state(qp)) { + if (sif_flush_rq(sdev, rq, qp, rq_len)) + sif_log(sdev, SIF_INFO, + "failed to flush RQ %d", rq->index); + } + } + + sif_log(sdev, SIF_CQ, "wr_id 0x%llx queue len %d", wc->wr_id, rq_len); + return 0; +} + +static bool fatal_err(enum ib_qp_type type, struct ib_wc *wc) +{ + if (wc->opcode == IB_WC_SEND || + wc->opcode == IB_WC_RDMA_WRITE || + wc->opcode == IB_WC_RDMA_READ || + wc->opcode == IB_WC_COMP_SWAP || + wc->opcode == IB_WC_FETCH_ADD || + wc->opcode == IB_WC_RECV || + wc->opcode == IB_WC_RECV_RDMA_WITH_IMM) { + switch (type) { + case IB_QPT_UD: + return wc->status == IB_WC_LOC_QP_OP_ERR || + wc->status == IB_WC_LOC_PROT_ERR; + case IB_QPT_RC: + return wc->status == IB_WC_LOC_LEN_ERR || + wc->status == IB_WC_LOC_QP_OP_ERR || + wc->status == IB_WC_LOC_PROT_ERR || + wc->status == IB_WC_BAD_RESP_ERR || + wc->status == IB_WC_REM_INV_REQ_ERR || + wc->status == IB_WC_REM_ACCESS_ERR || + wc->status == IB_WC_REM_OP_ERR || + wc->status == IB_WC_RETRY_EXC_ERR || + wc->status == IB_WC_RNR_RETRY_EXC_ERR; + case IB_QPT_UC: + return wc->status == IB_WC_LOC_QP_OP_ERR; + default: + /* Any other supported QP transport? */ + return false; + } + } else if (wc->status == IB_WC_FATAL_ERR || + wc->status == IB_WC_REM_ABORT_ERR) { + return true; + } + return false; +} + +/* Handle a single completion queue entry at pos @head + */ +static int handle_wc(struct sif_dev *sdev, struct sif_cq *cq, + volatile struct psif_cq_entry *cqe_p, struct ib_wc *wc) +{ + int ret = 0; + struct psif_cq_entry lcqe; + struct sif_qp *qp; + int qpn; + bool qp_is_destroyed; + + mb(); + + /* Read into local copy in host memory and order */ + copy_conv_to_sw(&lcqe, cqe_p, sizeof(lcqe)); + + /* Completion status ok - store generic info + * in ib_wc + */ + qpn = lcqe.qp; + + /* For qp 0/1 decode actual qp index: */ + if (qpn < 2) { + /* pkey_index only valid for qp 1 */ + if (qpn == IB_QPT_GSI) + wc->pkey_index = lcqe.pkey_indx; + qpn |= (lcqe.port << 1); + lcqe.qp = qpn; + } + + qp = get_sif_qp(sdev, qpn); + + sif_log(sdev, SIF_CQ, "CQ %d: Received cq seqn %d for QP %d opcode %s status %s", + cq->index, lcqe.seq_num, qpn, + string_enum_psif_wc_opcode(lcqe.opcode), + string_enum_psif_wc_status(lcqe.status)); + + wc->qp = &qp->ibqp; + wc->status = sif2ib_wc_status(lcqe.status); + qp_is_destroyed = lcqe.opcode & SIF_WC_QP_DESTROYED; + lcqe.opcode &= ~SIF_WC_QP_DESTROYED; + wc->opcode = sif2ib_wc_opcode(lcqe.opcode); + wc->wc_flags = 0; + + if (unlikely(is_epsa_tunneling_qp(qp->ibqp.qp_type))) { + /* if this is EPSA tunneling QP, always return 0. */ + wc->vendor_err = lcqe.vendor_err; + wc->wr_id = lcqe.wc_id.rq_id; + return 0; + } + + if (wc->status != IB_WC_SUCCESS) { + /* + * IBTA: only wr_id, status, qp_num, and vendor_err are valid + * when status != SUCCESS. + * + * Magne 2015-08-25: opcode is also always valid (this + * is required in order to deliver wr_id correct for + * sends when status != SUCCESS) + */ + + /* WA #3850: generate LAST_WQE event on SRQ*/ + struct sif_rq *rq = get_sif_rq(sdev, qp->rq_idx); + + int log_level = + (wc->status == IB_WC_WR_FLUSH_ERR) ? 
SIF_WCE_V : SIF_WCE; + + + if (!qp_is_destroyed && is_regular_qp(qp) && rq->is_srq) { + if (fatal_err(qp->ibqp.qp_type, wc)) { + struct ib_event ibe = { + .device = &sdev->ib_dev, + .event = IB_EVENT_QP_LAST_WQE_REACHED, + .element.qp = &qp->ibqp + }; + + if (qp->ibqp.event_handler) + qp->ibqp.event_handler(&ibe, qp->ibqp.qp_context); + } + } + + sif_log(sdev, log_level, + "Err.compl on cq %d seq %d raw wr_id %lld raw stat %s(%d) sif op %s(0x%x) qp# %d vendor_err 0x%x %s", + cq->index, lcqe.seq_num, lcqe.wc_id.rq_id, + string_enum_psif_wc_status(lcqe.status)+15, lcqe.status, + string_enum_psif_wc_opcode(lcqe.opcode)+15, lcqe.opcode, + qpn, lcqe.vendor_err, string_enum_psif_tsu_error_types(lcqe.vendor_err)); + + sif_logs(SIF_DUMP, write_struct_psif_cq_entry(NULL, 0, &lcqe)); + atomic_inc(&cq->error_cnt); + } + + /* then handle different types */ + switch (lcqe.opcode) { + case PSIF_WC_OPCODE_LSO: + case PSIF_WC_OPCODE_SEND: + case PSIF_WC_OPCODE_RDMA_WR: + /* Do send completions pass immd data ? */ + /* Answer: Send completions do not report back immediate data */ + if (lcqe.with_imm) + wc->wc_flags |= IB_WC_WITH_IMM; + case PSIF_WC_OPCODE_RDMA_READ: + case PSIF_WC_OPCODE_CMP_SWAP: + case PSIF_WC_OPCODE_FETCH_ADD: + ret = handle_send_wc(sdev, cq, wc, &lcqe, qp_is_destroyed); + break; + case PSIF_WC_OPCODE_RECEIVE_SEND: + case PSIF_WC_OPCODE_RECEIVE_RDMA_WR_IMM: + /* A heuristic mechanism to determine the traffic pattern. */ + qp->traffic_patterns.mask = (qp->traffic_patterns.mask << 1) & + HEUR_RX_DIRECTION; + ret = handle_recv_wc(sdev, cq, wc, &lcqe, qp_is_destroyed); + if (lcqe.with_imm) { + wc->ex.imm_data = be32_to_cpu(lcqe.seq_num_imm.imm); + wc->wc_flags |= IB_WC_WITH_IMM; + } + break; + + case PSIF_WC_OPCODE_MASKED_CMP_SWAP: + case PSIF_WC_OPCODE_MASKED_FETCH_ADD: + case PSIF_WC_OPCODE_INVALIDATE_RKEY: + case PSIF_WC_OPCODE_INVALIDATE_LKEY: + case PSIF_WC_OPCODE_INVALIDATE_BOTH_KEYS: + case PSIF_WC_OPCODE_INVALIDATE_TLB: + case PSIF_WC_OPCODE_RESIZE_CQ: + case PSIF_WC_OPCODE_SET_SRQ_LIM: + case PSIF_WC_OPCODE_SET_XRCSRQ_LIM: + case PSIF_WC_OPCODE_CMPL_NOTIFY_RCVD: + case PSIF_WC_OPCODE_REARM_CMPL_EVENT: + case PSIF_WC_OPCODE_INVALIDATE_RQ: + case PSIF_WC_OPCODE_INVALIDATE_CQ: + case PSIF_WC_OPCODE_INVALIDATE_RB: + case PSIF_WC_OPCODE_INVALIDATE_XRCSRQ: + case PSIF_WC_OPCODE_INVALIDATE_SGL_CACHE: + default: + sif_log(sdev, SIF_INFO, + "Unhandled wc opcode %s", string_enum_psif_wc_opcode(lcqe.opcode)); + ret = -EINVAL; + break; + } + + /* Need sif2ib_flags() */ + if (lcqe.grh == 1) { + wc->wc_flags |= IB_WC_GRH; + sif_log(sdev, SIF_CQ, "GRH present in payload"); + } + + wc->vendor_err = lcqe.vendor_err; + wc->byte_len = lcqe.byte_len; + + /* + * Brian Manula 2-august-2015: src_qp is zero on connected QP transports. + * + * IBTA: Remote node address and QP. Returned only for Datagram services. + */ + wc->src_qp = lcqe.src_qp; + wc->slid = lcqe.slid; + wc->sl = lcqe.sl; + wc->dlid_path_bits = lcqe.dlid_path_bits; + wc->port_num = lcqe.port + 1; /* Sif port numbers start at 0 */ + + if (qp->flags & (SIF_QPF_IPOIB | SIF_QPF_EOIB)) { + bool do_l3_csum; + bool do_l4_csum; + bool csum_l3_ok; + bool csum_l4_ok; + bool csum_ok; + struct psif_offload_info *oinfo; + + oinfo = &lcqe.offload_wc_id.offload; + do_l3_csum = + oinfo->packet_classification_ipv4 || + oinfo->packet_classification_ipv6; + do_l4_csum = + oinfo->packet_classification_tcp || + oinfo->packet_classification_udp; + + csum_l3_ok = do_l3_csum ? oinfo->l3_checksum_ok : true; + csum_l4_ok = do_l4_csum ? 
oinfo->l4_checksum_ok : true; + csum_ok = csum_l3_ok & csum_l4_ok; + + qp->ipoib_rx_csum_l3_ok += !!(do_l3_csum && csum_l3_ok); + qp->ipoib_rx_csum_l3_err += !!(do_l3_csum && !csum_l3_ok); + + qp->ipoib_rx_csum_l4_ok += !!(do_l4_csum && csum_l4_ok); + qp->ipoib_rx_csum_l4_err += !!(do_l4_csum && !csum_l4_ok); + /* set flag; could be ignored by next level if disabled */ + wc->wc_flags |= (csum_ok) ? IB_WC_IP_CSUM_OK : 0; + if (!csum_ok) { + sif_log(sdev, + SIF_WCE, + "checksum not ok for ipv4/ipv6 eth2 %d ip4 %d ip6 %d frag %d options %d arp %d arp_reply %d exthdr %d tcp %d udp %d l3_ok %d l4_ok %d", + oinfo->packet_classification_eth2, + oinfo->packet_classification_ipv4, + oinfo->packet_classification_ipv6, + oinfo->packet_classification_ip_frag, + oinfo->packet_classification_ip_options, + oinfo->packet_classification_arp, + oinfo->packet_classification_arp_reply, + oinfo->packet_classification_ip6_unsupported_exthdr, + oinfo->packet_classification_tcp, + oinfo->packet_classification_udp, + oinfo->l3_checksum_ok, + oinfo->l4_checksum_ok + ); + } + } + return ret; +} + + +/* + * When a QP is taken down and it has send completions that are not + * polled, we need to walk through the send CQ and update the wr_id, + * before the QP's SQ handle are de-allocated. To signal that the + * wr_id is correct, we set the SIF_WC_QP_DESTROYED bit in the wc + * opcode. + * + * Further, for a receive completion, we normally need the QP in order + * to retrieve the RQ number. Again, the QP might not exist. Hence, we + * mark receive CQEs the same way. + * + * Negative return implies an error, errno is set. Zero or greater + * return indicates numbers of CQEs that were marked with + * SIF_WC_QP_DESTROYED. + */ + +int sif_fixup_cqes(struct sif_cq *cq, struct sif_sq *sq, struct sif_qp *qp) +{ + volatile struct psif_cq_entry *cqe; + struct sif_dev *sdev = to_sdev(cq->ibcq.device); + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index); + u32 seqno; + u32 polled_value; + int n = 0; + int ret = 0; + unsigned long flags = 0; + + + spin_lock_irqsave(&cq->lock, flags); + + for (seqno = cq_sw->next_seq;; ++seqno) { + struct psif_cq_entry lcqe; + uint64_t wr_id_host_order = 0; + + cqe = get_cq_entry(cq, seqno); + polled_value = get_psif_cq_entry__seq_num(cqe); + + /* More CQEs to check? */ + if (seqno != polled_value) + break; + + /* Fixup only for this QP */ + if (get_psif_cq_entry__qp(cqe) != qp->qp_idx) + continue; + + /* Read into local copy in host memory order */ + copy_conv_to_sw(&lcqe, cqe, sizeof(lcqe)); + + /* Receive completion? 
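+ * (the test below relies on PSIF receive completion opcodes having
+ * bit 7 (0x80) set, while send side opcodes do not)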
*/ + if (lcqe.opcode & 0x80) { + struct sif_post_mortem_qp_info_in_cqe *post_mortem_info = + (struct sif_post_mortem_qp_info_in_cqe *) cqe->reserved + 0; + + /* if a receive completion, record some info to be used when cqe is polled */ + post_mortem_info->was_srq = has_srq(sdev, qp); + post_mortem_info->srq_idx = qp->rq_idx; + post_mortem_info->qpn = qp->qp_idx; + } else { + /* If a send completion, handle the wr_id */ + ret = translate_wr_id(&wr_id_host_order, sdev, cq, sq, &lcqe, + lcqe.wc_id.sq_id.sq_seq_num, lcqe.qp); + if (ret) + goto err; + + set_psif_cq_entry__wc_id(cqe, wr_id_host_order); + } + + /* Tell sub-sequent poll_cq() that the wr_id is OK */ + set_psif_cq_entry__opcode(cqe, get_psif_cq_entry__opcode(cqe) | SIF_WC_QP_DESTROYED); + ++n; + } + + ret = n; + +err: + spin_unlock_irqrestore(&cq->lock, flags); + + + return ret; +} + + +/* standard poll function called from ib_poll_cq + * driver internal completion handling uses special logic in sif_pqp.c + * + * All types of QP ownership can use this function for peek operations + * [ via sif_peek_cq (with @wc = NULL) ] + */ +int sif_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct sif_cq *cq = to_scq(ibcq); + struct sif_dev *sdev = to_sdev(ibcq->device); + volatile struct psif_cq_entry *cqe; + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index); + + u32 seqno; + u32 polled_value = 0; + int npolled = 0; + unsigned long flags = 0; + int ret = 0; + /* TBD: Replace lock with atomic ops */ + spin_lock_irqsave(&cq->lock, flags); + + seqno = cq_sw->next_seq; + cqe = get_cq_entry(cq, seqno); + + sif_log_cq(cq, SIF_POLL, "cq %d (requested %d entries), next_seq %d %s", + cq->index, num_entries, cq_sw->next_seq, (wc ? "" : "(peek)")); + + while (npolled < num_entries) { + /* TBD - maybe should hide this as a function in sif_r3.c */ + if ((test_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags))) + break; + + polled_value = get_psif_cq_entry__seq_num(cqe); + + if ((test_bit(CQ_POLLING_IGNORED_SEQ, &cq_sw->flags)) && ~seqno == polled_value) { + seqno = ++cq_sw->next_seq; + clear_bit(CQ_POLLING_IGNORED_SEQ, &cq_sw->flags); + continue; + } + + if (seqno == polled_value) + npolled++; + else + break; + + if (likely(wc)) { + ret = handle_wc(sdev, cq, cqe, wc); + if (ret < 0) + goto handle_failed; + wc++; + seqno = ++cq_sw->next_seq; + } else /* peek_cq semantics */ + ++seqno; + + cqe = get_cq_entry(cq, seqno); + } + + if (likely(wc)) { + if (cq_length(cq, cq_sw->cached_head, seqno) >= cq->high_watermark) { + /* Update CQ software pointer */ + set_psif_cq_sw__head_indx(&cq_sw->d, seqno); + cq_sw->cached_head = seqno; + } + } + +handle_failed: + spin_unlock_irqrestore(&cq->lock, flags); + + if (npolled) + sif_log(sdev, SIF_CQ, "done - %d completions - seq_no of next entry: %d", + npolled, polled_value); + else + sif_log_cq(cq, SIF_POLL, "no completions polled - seq_no of next entry: %d", + polled_value); + return !ret ? 
npolled : ret; +} + + +int sif_peek_cq(struct ib_cq *ibcq, int wc_cnt) +{ + return sif_poll_cq(ibcq, wc_cnt, NULL); +} + + +int sif_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct sif_cq *cq = to_scq(ibcq); + struct sif_dev *sdev = to_sdev(ibcq->device); + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index); + struct psif_wr wr; + int ret; + DECLARE_SIF_CQE_WITH_SAME_EQ(sdev, lcqe, cq->eq_idx); + + sif_log(sdev, SIF_NCQ, "cq_idx %d, flags 0x%x", cq->index, flags); + + memset(&wr, 0, sizeof(struct psif_wr)); + + if (flags & IB_CQ_SOLICITED) + wr.se = 1; + + /* We should never miss events in psif so we have no need for a separate + * handling of IB_CQ_REPORT_MISSED_EVENTS - ignore it. + */ + + wr.op = cq->rcn_sent ? PSIF_WR_REARM_CMPL_EVENT : PSIF_WR_REQ_CMPL_NOTIFY; + wr.completion = 1; + wr.details.su.u2.cq_id = cq->index; + + ret = sif_pqp_poll_wr(sdev, &wr, &lcqe); + + cq->rcn_sent = ret >= 0; + + if (lcqe.cqe.status != PSIF_WC_STATUS_SUCCESS) { + if (ret >= 0) + ret = -EINVAL; + sif_log(sdev, SIF_INFO, + " cq %d: last_hw_seq %u next_seq %u failed with status %s", + cq->index, cq_sw->last_hw_seq, cq_sw->next_seq, + string_enum_psif_wc_status(lcqe.cqe.status)); + } else + sif_log(sdev, SIF_NCQ, "cq %d: last_hw_seq %u next_seq %u status %s", + cq->index, cq_sw->last_hw_seq, cq_sw->next_seq, + string_enum_psif_wc_status(lcqe.cqe.status)); + + if ((ret > 0) && (flags & IB_CQ_REPORT_MISSED_EVENTS)) { + /* peek to see if there is any outstanding completion. + * By checking for this flag, the application + * does not required to call poll_cq again to + * avoid race condition. + */ + return sif_peek_cq(ibcq, 1); + } + + return ret > 0 ? 0 : ret; +} + + +int sif_req_ncomp_notif(struct ib_cq *ibcq, int wc_cnt) +{ + struct sif_dev *sdev = to_sdev(ibcq->device); + + sif_log(sdev, SIF_VERBS, "Not implemented"); + return -EOPNOTSUPP; +} + + +void sif_dfs_print_cq_hw(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + struct sif_cq *cq; + volatile struct psif_cq_hw *cq_hw_p; + volatile struct sif_cq_sw *cq_sw; + int qlen; + + if (unlikely(pos < 0)) { + seq_printf(s, "# Destroyed cq miss_cnt/occ %u/%u\n", + atomic_read(&sdev->cq_miss_cnt), + atomic_read(&sdev->cq_miss_occ)); + + seq_puts(s, "# Index actual_head cached_head hw_tail entries "); + seq_puts(s, "queue_len next_seq eq #events timeouts errors miss_cnt/occ\n"); + return; + } + + cq = get_sif_cq(sdev, pos); + cq_hw_p = &cq->d; + cq_sw = get_sif_cq_sw(sdev, cq->index); + + /* TBD: Must peek for new entries to report accurately, but it is unsafe + * unless we ref.cnt the cq + */ + qlen = 0; + + seq_printf(s, "%7llu %12u %12d %8u %8u %9u %8u %2u %8u %8u %8u %8u %4u", pos, + get_psif_cq_sw__head_indx(&cq_sw->d), cq_sw->cached_head, + get_psif_cq_hw__tail_indx(cq_hw_p), + cq->entries, qlen, cq_sw->next_seq, cq->eq_idx, atomic_read(&cq->event_cnt), + atomic_read(&cq->timeout_cnt), + atomic_read(&cq->error_cnt), + cq_sw->miss_cnt, cq_sw->miss_occ); + + if (get_psif_cq_hw__proxy_en(cq_hw_p)) + seq_printf(s, " [proxy to %s]", + string_enum_psif_eps_a_core(get_psif_cq_hw__eps_core(cq_hw_p))); + if (cq_sw->armed) + seq_puts(s, " [armed]\n"); + else + seq_puts(s, "\n"); +} + + +/* Poll wait for a cq descriptor to be written back in invalid state */ +int poll_wait_for_cq_writeback(struct sif_dev *sdev, struct sif_cq *cq) +{ + int ret = 0; + ulong timeout = jiffies + sdev->min_resp_ticks * 2; + u8 valid; + + while ((valid = get_psif_cq_hw__valid(&cq->d))) { + if (time_after(jiffies, timeout)) { + sif_log(sdev, 
SIF_INFO, + "timeout waiting for cq_hw write-back cq %d", cq->index); + atomic_inc(&cq->timeout_cnt); + return -ETIMEDOUT; + } + cpu_relax(); + } + sif_log(sdev, SIF_CQ, "exit - write-back observed on cq %d", cq->index); + return ret; +} diff --git a/drivers/infiniband/hw/sif/sif_cq.h b/drivers/infiniband/hw/sif/sif_cq.h new file mode 100644 index 0000000000000..402db2bd5b7f9 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_cq.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_cq.h: Internal interface to psif completion queue logic + */ + +#ifndef __SIF_CQ_H +#define __SIF_CQ_H +#include "psif_hw_data.h" +#include "sif_user.h" +#include "sif_mmu.h" + +struct sif_dev; +struct sif_cqe; +struct sif_compl; +struct sif_pd; +struct sif_qp; +struct sif_sq; + +struct sif_cq { + volatile struct psif_cq_hw d; /* Hardware descriptor */ + struct ib_cq ibcq ____cacheline_internodealigned_in_smp; + struct sif_pd *pd; /* Unlike the rest of ofed we tie a CQ to a PD */ + struct sif_mem *mem; /* Allocated queue memory */ + int index; + u32 entries; + u32 mask; /* entries - 1 for modulo using & */ + u32 extent; + atomic_t refcnt; /* refc.count on this object */ + struct completion cleanup_ok; /* Used to synchronize cleanup with event handling */ + u32 high_watermark; /* if < used entries (as seen by hw), update hw: head */ + struct psif_cq_hw cq_hw; /* Local copy of cq_hw, as initialized, in host endianness */ + struct sif_mmu_ctx mmu_ctx; + /* lock protects the below data structure and access/freeing of sq elems */ + spinlock_t lock ____cacheline_internodealigned_in_smp; + bool user_mode; /* Set if this is a CQ to be mapped to user space */ + bool pd_is_set; /* Whether or not this cq has a pd set in it's descriptor */ + bool rcn_sent; /* Set if ib_req_notify_cq() has been called on this cq */ + u8 eq_idx; /* Index of the event queue that gets completion events for this cq */ + atomic_t error_cnt; /* No. of error completions observed on this cq */ + atomic_t timeout_cnt; /* No. of completion timeouts observed on this cq */ + atomic_t event_cnt; /* No. of completion events observed for this cq (will wrap..) */ + u32 log_cnt; /* Number of suppressed log messages since last print */ + unsigned long next_logtime; /* timeout for when to print next message */ + struct sif_rq *xsrq; /* The XRC SRQ using this completion queue (see #3521) */ + struct sif_pqp *pqp; /* The PQP using this completion queue (for dfs reporting..) */ +}; + +static inline struct sif_cq *to_scq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct sif_cq, ibcq); +} + +/* Poll wait for a cq descriptor to be written back in invalid state */ +int poll_wait_for_cq_writeback(struct sif_dev *sdev, struct sif_cq *cq); + + +struct sif_cq *create_cq(struct sif_pd *pd, int cqe, + int comp_vector, + enum sif_proxy_type proxy, + bool user_mode); + + +/* internal poll/peek of completion queue: + * - Return value: 0 - @num_entries representing + * the number of ready completions on the queue. + * + * If @wc is set, @poll_cq processes entries and updates the local cq state. + * If @wc is NULL @poll_cq behaves as a peek, not modifying + * the local completion queue state. 
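+ *
+ * Illustrative use (a sketch only, not lifted from an actual caller):
+ *   ready = poll_cq(sdev, cq, max_entries, NULL);  - peek, no state change
+ *   if (ready > 0)
+ *           ready = poll_cq(sdev, cq, ready, cqe); - consume the entries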
+ * + * Note that @poll_cq does not modify any state shared with + * hardware except the head pointer + */ +int poll_cq(struct sif_dev *sdev, struct sif_cq *cq, int num_entries, + struct sif_cqe *cqe); + +int destroy_cq(struct sif_cq *cq); + + +/* Clean up resource usage associated with this cq + * If return value is -EIDRM it means that this cq was used with a privileged + * QP. In that case no more polls can be made at this point since the completion queue + * polled just self destructed.. + */ +int sif_release_cq(struct sif_dev *sdev, int index); + + +/* Printer for debugfs cq_hw file */ +void sif_dfs_print_cq_hw(struct seq_file *s, struct sif_dev *sdev, + loff_t pos); + +extern int sif_fixup_cqes(struct sif_cq *cq, struct sif_sq *sq, struct sif_qp *qp); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_debug.c b/drivers/infiniband/hw/sif/sif_debug.c new file mode 100644 index 0000000000000..763f6b4172635 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_debug.c @@ -0,0 +1,636 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_debug.c: Use of debugfs for dumping internal data structure info + */ + +#include +#include +#include +#include "sif_dev.h" +#include "sif_debug.h" +#include "sif_base.h" +#include "sif_query.h" +#include "sif_qp.h" +#include "sif_defs.h" + +/* A 'reference' element to identify each table type + */ +struct sif_dfs_ref { + struct sif_dev *sdev; + bool is_eq; + enum sif_tab_type type; + sif_dfs_printer dfs_print; +}; + + +/* Our private data within driver struct + */ +struct sif_dfs { + struct dentry *root; /* The root of the debugfs tree, if set up (pci id name) */ + struct dentry *root_link; /* A symlink from ib device name to pci id name */ + struct dentry *raw_qp; /* Ref to directory with raw qp info, if set up */ + struct sif_dfs_ref sd[sif_tab_init_max]; + struct sif_dfs_ref sd_eq; + struct sif_dfs_ref sd_irq_ch; + struct sif_dfs_ref sd_ipoffload; +}; + +/* A simple iterator */ + +struct sif_dfs_iter { + loff_t pos; /* Current "virtual" offset */ + bool started; /* If header has been printed */ +}; + + +static void *sif_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct sif_dfs_iter *it = (struct sif_dfs_iter *) v; + struct sif_dfs_ref *sd = (struct sif_dfs_ref *) s->private; + struct sif_table *tp = &sd->sdev->ba[sd->type]; + + ++(*pos); + *pos = sif_next_used(tp, *pos); + sif_log(sd->sdev, SIF_DFS, "%lld -> %lld", it->pos, *pos); + if (*pos < 0) { + kfree(it); + return NULL; + } + it->pos = *pos; + return it; +} + +static void *sif_seq_start(struct seq_file *s, loff_t *pos) +{ + struct sif_dfs_iter *it; + struct sif_dfs_ref *sd = (struct sif_dfs_ref *) s->private; + struct sif_table *tp = &sd->sdev->ba[sd->type]; + + sif_log(sd->sdev, SIF_DFS, " at %lld", *pos); + *pos = sif_next_used(tp, *pos); + if (*pos < 0) + return NULL; + it = kmalloc(sizeof(struct sif_dfs_iter), GFP_KERNEL); + if (!it) + return NULL; + it->pos = *pos; + it->started = false; + return it; +} + +static void sif_seq_stop(struct seq_file *s, void *v) +{ + struct sif_dfs_ref *sd = (struct sif_dfs_ref *) s->private; + + if (v) { + sif_log(sd->sdev, SIF_DFS, "sif_seq_stop at %p", v); + kfree(v); + } + sif_log(sd->sdev, SIF_DFS, " [at 
end]"); +} + +static int sif_seq_show(struct seq_file *s, void *v) +{ + struct sif_dfs_iter *it = (struct sif_dfs_iter *) v; + struct sif_dfs_ref *sd = (struct sif_dfs_ref *) s->private; + + sif_log(sd->sdev, SIF_DFS, "%lld", it->pos); + if (!it->pos || !it->started) { + seq_printf(s, "# %s state:\n", sif_table_name(sd->type)); + if (sd->dfs_print) + sd->dfs_print(s, sd->sdev, -1); + else + seq_puts(s, "# Index\tValues\n"); + it->started = true; + } + if (sd->dfs_print) + sd->dfs_print(s, sd->sdev, it->pos); + else + seq_printf(s, "%lld\n", it->pos); + return 0; +} + + +static const struct seq_operations seq_ops = { + .start = sif_seq_start, + .next = sif_seq_next, + .stop = sif_seq_stop, + .show = sif_seq_show +}; + + +/* Specific support for eq reporting which has slightly different logic: */ +static void *sif_eq_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct sif_dfs_iter *it = (struct sif_dfs_iter *) v; + struct sif_dfs_ref *sd = (struct sif_dfs_ref *) s->private; + struct sif_dev *sdev = sd->sdev; + u32 cnt = sdev->es[sdev->mbox_epsc].eqs.cnt; + + if (*pos > cnt - 2) + *pos = -1; + else + ++(*pos); + + sif_log(sdev, SIF_DFS, "%lld -> %lld", it->pos, *pos); + if (*pos < 0) { + kfree(it); + return NULL; + } + it->pos = *pos; + return it; +} + +static void *sif_eq_seq_start(struct seq_file *s, loff_t *pos) +{ + struct sif_dfs_iter *it; + struct sif_dfs_ref *sd = (struct sif_dfs_ref *) s->private; + struct sif_dev *sdev = sd->sdev; + u32 cnt = sdev->es[sdev->mbox_epsc].eqs.cnt; + + sif_log(sdev, SIF_DFS, " at %lld", *pos); + if (*pos > cnt - 2) { + *pos = -1; + return NULL; + } + it = kmalloc(sizeof(struct sif_dfs_iter), GFP_KERNEL); + if (!it) + return NULL; + it->pos = *pos; + it->started = false; + return it; +} + +static const struct seq_operations eq_seq_ops = { + .start = sif_eq_seq_start, + .next = sif_eq_seq_next, + .stop = sif_seq_stop, + .show = sif_seq_show +}; + +static int sif_seq_open(struct inode *inode, struct file *file) +{ + int ret; + struct sif_dfs_ref *sd = (struct sif_dfs_ref *)inode->i_private; + struct seq_file *seq; + + if (!try_module_get(THIS_MODULE)) + return -EIO; + + if (unlikely(sd->is_eq)) + ret = seq_open(file, &eq_seq_ops); + else + ret = seq_open(file, &seq_ops); + if (!ret) { + seq = file->private_data; + seq->private = inode->i_private; + } + return ret; +}; + +static int sif_seq_release(struct inode *inode, struct file *file) +{ + int stat = seq_release(inode, file); + + module_put(THIS_MODULE); + return stat; +} + + +static const struct file_operations table_fops = { + .owner = THIS_MODULE, + .open = sif_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = sif_seq_release +}; + +static ssize_t irq_ch_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + struct sif_dfs_ref *sd = (struct sif_dfs_ref *) seq->private; + struct sif_dev *sdev = sd->sdev; + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + u32 channels = es->eqs.cnt; + + struct sif_eq *eq = &es->eqs.eq[1]; + struct psif_epsc_csr_interrupt_channel *settings; + struct psif_epsc_csr_req req; /* local epsc wr copy */ + struct psif_epsc_csr_rsp resp; + + char buffer[256] = ""; /* make a writable copy of const buf*/ + char *str, *token, *param[2]; + int ret; + + if (!eps_version_ge(es, 0, 36)) + goto opcode_not_available; + + if (count >= sizeof(buffer)) + return -ENOSPC; + + ret = simple_write_to_buffer(buffer, sizeof(buffer), ppos, buf, count); + if (ret < 0) { + sif_log(sd->sdev, SIF_INFO, 
"Not able to read input parameters from userspace"); + return ret; + } + buffer[ret] = '\0'; + str = buffer; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_HOST_INT_CHANNEL_CTRL; + req.uf = 0; + settings = &req.u.int_channel; + + while ((token = strsep(&str, ";")) != NULL) { + param[0] = strsep(&token, "="); + if (param[0]) { + param[1] = strsep(&token, "="); + if (!param[1]) + continue; + } else { + continue; + } + + if (strcmp(param[0], "channel") == 0) { + u16 value; + + ret = kstrtou16(param[1], 10, &value); + if (ret == 0 && value > 0 && value < channels) { + settings->int_channel = value; + eq = &es->eqs.eq[value]; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid irq channel: %hu", + value); + goto sif_invalid_channel; + } + } else if (strcmp(param[0], "adaptive") == 0) { + u8 value; + + ret = kstrtou8(param[1], 10, &value); + if (ret == 0 && value == 0) { + settings->attributes.enable_adaptive = 1; + settings->enable_adaptive = 0; + } else if (ret == 0 && value > 0) { + settings->attributes.enable_adaptive = 1; + settings->enable_adaptive = 1; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_adaptive value: %hu", + value); + } + } else if (strcmp(param[0], "rx_scale") == 0) { + u16 value; + + ret = kstrtou16(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_rx_scale = 1; + settings->channel_rx_scale = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_rx_scale value: %hu", + value); + } + } else if (strcmp(param[0], "rate_low") == 0) { + u32 value; + + ret = kstrtou32(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_rate_low = 1; + settings->channel_rate_low = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_rate_low value: %u", + value); + } + } else if (strcmp(param[0], "rate_high") == 0) { + u32 value; + + ret = kstrtou32(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_rate_high = 1; + settings->channel_rate_high = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_rate_high value: %u", + value); + } + } else if (strcmp(param[0], "ausec") == 0) { + u16 value; + + ret = kstrtou16(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_ausec = 1; + settings->channel_ausec = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_ausec value: %hu", + value); + } + } else if (strcmp(param[0], "ausec_low") == 0) { + u16 value; + + ret = kstrtou16(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_ausec_low = 1; + settings->channel_ausec_low = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_ausec_low value: %hu", + value); + } + } else if (strcmp(param[0], "ausec_high") == 0) { + u16 value; + + ret = kstrtou16(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_ausec_high = 1; + settings->channel_ausec_high = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_ausec_high value: %hu", + value); + } + } else if (strcmp(param[0], "pusec") == 0) { + u16 value; + + ret = kstrtou16(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_pusec = 1; + settings->channel_pusec = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_pusec value: %hu", + value); + } + } else if (strcmp(param[0], "pusec_low") == 0) { + u16 value; + + ret = kstrtou16(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_pusec_low = 1; + settings->channel_pusec_low = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_pusec_low value: %hu", + value); + } + } else if 
(strcmp(param[0], "pusec_high") == 0) { + u16 value; + + ret = kstrtou16(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_pusec_high = 1; + settings->channel_pusec_high = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_pusec_high value: %hu", + value); + } + } else { + sif_log(sd->sdev, SIF_INTR, "Omitting invalid irq coalesce parameter %s", + param[0]); + } + } + + if (!settings->int_channel) { + sif_log(sd->sdev, SIF_INTR, "Missing irq channel"); + goto sif_invalid_channel; + } + + ret = sif_epsc_wr_poll(sd->sdev, &req, &resp); + if (ret) { + sif_log(sd->sdev, SIF_INFO, "Failed to configure the coalescing settings for irq channel %d", + settings->int_channel); + goto err_epsc_comm; + } + /* Update the driver device settings */ +#define UPDATE_DRIVER_INT_CTRL_SETTING(attr) { \ + if (settings->attributes.attr) \ + eq->irq_ch.attr = settings->attr; \ + } + UPDATE_DRIVER_INT_CTRL_SETTING(enable_adaptive); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_rx_scale); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_rate_low); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_rate_high); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_ausec); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_ausec_low); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_ausec_high); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_pusec); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_pusec_low); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_pusec_high); + /* Update the irq_ch debug file*/ + sd->dfs_print(seq, sd->sdev, *ppos); + + return count; + +opcode_not_available: +sif_invalid_channel: + return -EINVAL; +err_epsc_comm: + return ret; +} + +static const struct file_operations table_fops_rw = { + .owner = THIS_MODULE, + .open = sif_seq_open, + .read = seq_read, + .write = irq_ch_write, + .llseek = seq_lseek, + .release = sif_seq_release +}; + + +/* Setup/teardown */ + +/* Called before sif_hw_init in main since needed by pqp setup */ +int sif_dfs_register(struct sif_dev *sdev) +{ + struct dentry *df; + struct sif_dfs_ref *sdr; + int i; + char name[100]; + + sprintf(name, "%s", dev_name(&sdev->pdev->dev)); + sdev->dfs = kzalloc(sizeof(struct sif_dfs), GFP_KERNEL); + if (sdev->dfs) + sdev->dfs->root = debugfs_create_dir(name, NULL); + if (!sdev->dfs || !sdev->dfs->root) { + sif_log(sdev, SIF_INFO, + "Unable to set up debugfs file system for %s", name); + goto sif_dfs_reg_failed; + } + + for (i = 0; i < sif_tab_init_max; i++) { + sdr = &sdev->dfs->sd[i]; + sdr->sdev = sdev; + sdr->is_eq = false; + sdr->type = i; + sdr->dfs_print = sif_table_dfs_printer(i); + df = debugfs_create_file(sif_table_name(i), S_IRUGO, sdev->dfs->root, + (void *)sdr, &table_fops); + if (!df) { + sif_log(sdev, SIF_INFO, "Unable to set up debugfs file %s", + sif_table_name(i)); + goto sif_dfs_reg_failed; + } + } + + /* Single file for the event queues */ + sdr = &sdev->dfs->sd_eq; + sdr->sdev = sdev; + sdr->is_eq = true; + sdr->dfs_print = sif_dfs_print_eq; + df = debugfs_create_file("eq", S_IRUGO, sdev->dfs->root, + (void *)sdr, &table_fops); + if (!df) { + sif_log(sdev, SIF_INFO, "Unable to set up debugfs file for event queues"); + return -ENOMEM; + } + /* Single file for the ipoffload qp-statistics */ + sdr = &sdev->dfs->sd_ipoffload; + sdr->sdev = sdev; + sdr->dfs_print = sif_dfs_print_ipoffload; + sdr->type = qp; + df = debugfs_create_file("ipoffload", S_IRUGO, sdev->dfs->root, + (void *)sdr, &table_fops); + if (!df) { + sif_log(sdev, SIF_INFO, "Unable to set up debugfs file for ipoffload qp stat"); + return -ENOMEM; + } + /* Single file for the int channel coalescing 
settings */ + sdr = &sdev->dfs->sd_irq_ch; + sdr->sdev = sdev; + sdr->is_eq = true; + sdr->dfs_print = sif_dfs_print_irq_ch; + df = debugfs_create_file("irq_ch", S_IWUSR | S_IRUGO, sdev->dfs->root, + (void *)sdr, &table_fops_rw); + if (!df) { + sif_log(sdev, SIF_INFO, + "Unable to set up debugfs file for interrupt channels coalescing settings"); + return -ENOMEM; + } + + /* Create a directory for raw qp dump info */ + sdev->dfs->raw_qp = debugfs_create_dir("raw_qp", sdev->dfs->root); + if (!sdev->dfs->raw_qp) { + sif_log(sdev, SIF_INFO, "Unable to set up debugfs directory for raw QP information"); + goto sif_dfs_reg_failed; + } + return 0; + +sif_dfs_reg_failed: + sif_dfs_unregister(sdev); + return -ENOMEM; +} + + +/* Symlink ib device name to debugfs root node - named by PCI id */ +void sif_dfs_link_to_ibdev(struct sif_dev *sdev) +{ + sdev->dfs->root_link = + debugfs_create_symlink(sdev->ib_dev.name, NULL, sdev->dfs->root->d_iname); + if (!sdev->dfs->root_link) + sif_log(sdev, SIF_INFO, "Failed to create link %s -> %s", + sdev->dfs->root->d_iname, sdev->ib_dev.name); +} + + +void sif_dfs_unregister(struct sif_dev *sdev) +{ + if (!sdev->dfs) + return; + debugfs_remove(sdev->dfs->root_link); + debugfs_remove_recursive(sdev->dfs->root); + kfree(sdev->dfs); + sdev->dfs = NULL; +} + + +/**** support for raw QP state dump */ + + +static int rqp_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EIO; + + file->private_data = inode->i_private; + return 0; +}; + + +static ssize_t rqp_read(struct file *file, char __user *buf, size_t sz, loff_t *off) +{ + struct sif_qp *qp = (struct sif_qp *)file->private_data; + struct psif_query_qp lqqp; + int ret; + size_t len = 0; + struct xchar xc; + size_t dump_size = 12000; /* enough space for allocating the qp dump*/ + char *dump; + + sif_log0(SIF_QP, "rqp_read idx %d, sz %ld offset 0x%llx", qp->qp_idx, sz, *off); + if (*off > 0) + return 0; + + dump = kmalloc(dump_size, GFP_KERNEL); + if (!dump) { + sif_log0(SIF_INFO, "Error allocating temp.storage for raw qp read"); + return -ENOMEM; + } + + memset(dump, 0, dump_size*sizeof(char)); + xc.buf = dump; + + ret = epsc_query_qp(qp, &lqqp); + if (ret) { + len = snprintf(xc.buf, sz, + "[query_qp failed with status %d - returning last cached state]\n", + ret); + xc.buf += len; + sz -= len; + } + /* TBD: Could cause buffer overflow in theory: see #2738 */ + write_struct_psif_query_qp(&xc, 0, &lqqp); + sprintf(xc.buf, "\n"); + len = simple_read_from_buffer(buf, sz, off, dump, strlen(dump)); + kfree(dump); + + return len; +} + + +static int rqp_release(struct inode *inode, struct file *file) +{ + module_put(THIS_MODULE); + return 0; +} + + +static const struct file_operations qp_fops = { + .owner = THIS_MODULE, + .open = rqp_open, + .read = rqp_read, + .release = rqp_release, +}; + + +/* TBD: Ref.cnt or other protection probably needed to protect agains "take down" while + * a query is in progress + */ +int sif_dfs_add_qp(struct sif_dev *sdev, struct sif_qp *qp) +{ + char tmp[20]; + + sprintf(tmp, "%d", qp->qp_idx); + qp->dfs_qp = debugfs_create_file(tmp, S_IRUGO, sdev->dfs->raw_qp, + (void *)qp, &qp_fops); + if (!qp->dfs_qp) + return -ENOMEM; + return 0; +} + + +void sif_dfs_remove_qp(struct sif_qp *qp) +{ + debugfs_remove(qp->dfs_qp); + qp->dfs_qp = NULL; +} diff --git a/drivers/infiniband/hw/sif/sif_debug.h b/drivers/infiniband/hw/sif/sif_debug.h new file mode 100644 index 0000000000000..b95ed7893c357 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_debug.h @@ -0,0 +1,36 
@@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_debug.h: Use of debugfs for dumping internal data structure info + */ + +#ifndef __SIF_DEBUG_H +#define __SIF_DEBUG_H + +struct sif_dev; + +/* Set up/tear down the debugfs structures */ +int sif_dfs_register(struct sif_dev *sdev); +void sif_dfs_unregister(struct sif_dev *sdev); + +/* Symlink to ib device name (to be called after ib_register_device */ +void sif_dfs_link_to_ibdev(struct sif_dev *sdev); + +int sif_dfs_add_qp(struct sif_dev *sdev, struct sif_qp *qp); +void sif_dfs_remove_qp(struct sif_qp *qp); + +/* A generic callback function for printing a table entry + * in a debug fs file: + */ +typedef void (*sif_dfs_printer)(struct seq_file *s, + struct sif_dev *, + loff_t pos); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_defs.c b/drivers/infiniband/hw/sif/sif_defs.c new file mode 100644 index 0000000000000..63a6ecd3d3ee0 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_defs.c @@ -0,0 +1,562 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_defs.c: IB-to-SIF Mapper. + */ +#include +#include +#include "sif_dev.h" +#include "sif_defs.h" +#include "psif_hw_setget.h" +#include "sif_qp.h" + +/* This is where we build and define kernel utilities for logging psif structures: */ +#define copy_convert copy_conv_to_sw +#define copy_convert_to_sw copy_conv_to_sw +#define copy_convert_to_hw copy_conv_to_hw +#define assert(x) BUG_ON(!(x)) +#include "psif_hw_print.c" + +enum psif_wr_type sif_invalidate_opcode(enum sif_tab_type type) +{ + switch (type) { + case rq_sw: + case rq_hw: + return PSIF_WR_INVALIDATE_RQ; + case cq_sw: + case cq_hw: + return PSIF_WR_INVALIDATE_CQ; + case key: + return PSIF_WR_INVALIDATE_BOTH_KEYS; + case qp: + return PSIF_WR_INVALIDATE_SGL_CACHE; + default: + /* This function is used to figure out if an invalidate + * request is needed so ending here is a normal case + */ + break; + } + return (enum psif_wr_type)-1; +} + + +enum psif_wr_type ib2sif_wr_op(enum ib_wr_opcode op, bool is_dr) +{ + switch (op) { + case IB_WR_RDMA_WRITE: + return PSIF_WR_RDMA_WR; + case IB_WR_RDMA_WRITE_WITH_IMM: + return PSIF_WR_RDMA_WR_IMM; + case IB_WR_SEND: + return !is_dr ? 
PSIF_WR_SEND : PSIF_WR_QP0_SEND_DR_LOOPBACK; + case IB_WR_SEND_WITH_IMM: + return PSIF_WR_SEND_IMM; + case IB_WR_RDMA_READ: + return PSIF_WR_RDMA_RD; + case IB_WR_ATOMIC_CMP_AND_SWP: + return PSIF_WR_CMP_SWAP; + case IB_WR_ATOMIC_FETCH_AND_ADD: + return PSIF_WR_FETCH_ADD; + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + return PSIF_WR_MASK_CMP_SWAP; + case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: + return PSIF_WR_MASK_FETCH_ADD; + case IB_WR_LSO: + return PSIF_WR_LSO; + case IB_WR_SEND_WITH_INV: + case IB_WR_RDMA_READ_WITH_INV: + case IB_WR_LOCAL_INV: + case IB_WR_FAST_REG_MR: + default: + break; + } + sif_log0(SIF_INFO, "Unsupported opcode %d", op); + return (enum psif_wr_type)-1; +} + +enum ib_wr_opcode sif2ib_wr_op(enum psif_wr_type op) +{ + switch (op) { + case PSIF_WR_SEND: + return IB_WR_SEND; + case PSIF_WR_SEND_IMM: + return IB_WR_SEND_WITH_IMM; + case PSIF_WR_RDMA_WR: + return IB_WR_RDMA_WRITE; + case PSIF_WR_RDMA_WR_IMM: + return IB_WR_RDMA_WRITE_WITH_IMM; + case PSIF_WR_RDMA_RD: + return IB_WR_RDMA_READ; + case PSIF_WR_CMP_SWAP: + return IB_WR_ATOMIC_CMP_AND_SWP; + case PSIF_WR_FETCH_ADD: + return IB_WR_ATOMIC_FETCH_AND_ADD; + case PSIF_WR_MASK_CMP_SWAP: + return IB_WR_MASKED_ATOMIC_CMP_AND_SWP; + case PSIF_WR_MASK_FETCH_ADD: + return IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; + case PSIF_WR_LSO: + return IB_WR_LSO; + case PSIF_WR_INVALIDATE_RKEY: + case PSIF_WR_INVALIDATE_LKEY: + case PSIF_WR_INVALIDATE_BOTH_KEYS: + case PSIF_WR_INVALIDATE_TLB: + case PSIF_WR_RESIZE_CQ: + case PSIF_WR_SET_SRQ_LIM: + case PSIF_WR_SET_XRCSRQ_LIM: + case PSIF_WR_INVALIDATE_RQ: + case PSIF_WR_INVALIDATE_CQ: + case PSIF_WR_INVALIDATE_XRCSRQ: + default: + break; + } + sif_log0(SIF_INFO, "Unable to convert opcode %d", op); + return (enum ib_wr_opcode)-1; +} + +/* TBD: These should map directly - must add test first */ +enum ib_wc_opcode sif2ib_wc_opcode(enum psif_wc_opcode opcode) +{ + switch (opcode) { + case PSIF_WC_OPCODE_SEND: + return IB_WC_SEND; + case PSIF_WC_OPCODE_RDMA_WR: + return IB_WC_RDMA_WRITE; + case PSIF_WC_OPCODE_RDMA_READ: + return IB_WC_RDMA_READ; + case PSIF_WC_OPCODE_CMP_SWAP: + return IB_WC_COMP_SWAP; + case PSIF_WC_OPCODE_FETCH_ADD: + return IB_WC_FETCH_ADD; + case PSIF_WC_OPCODE_LSO: + return IB_WC_LSO; + case PSIF_WC_OPCODE_MASKED_CMP_SWAP: + return IB_WC_MASKED_COMP_SWAP; + case PSIF_WC_OPCODE_MASKED_FETCH_ADD: + return IB_WC_MASKED_FETCH_ADD; + case PSIF_WC_OPCODE_RECEIVE_SEND: + return IB_WC_RECV; + case PSIF_WC_OPCODE_RECEIVE_RDMA_WR_IMM: + return IB_WC_RECV_RDMA_WITH_IMM; + case PSIF_WC_OPCODE_INVALIDATE_SGL_CACHE: + return PSIF_WR_INVALIDATE_SGL_CACHE; + case PSIF_WC_OPCODE_INVALIDATE_RKEY: + case PSIF_WC_OPCODE_INVALIDATE_LKEY: + case PSIF_WC_OPCODE_INVALIDATE_BOTH_KEYS: + case PSIF_WC_OPCODE_INVALIDATE_TLB: + case PSIF_WC_OPCODE_RESIZE_CQ: + case PSIF_WC_OPCODE_SET_SRQ_LIM: + case PSIF_WC_OPCODE_REQ_CMPL_NOTIFY: + case PSIF_WC_OPCODE_CMPL_NOTIFY_RCVD: + case PSIF_WC_OPCODE_REARM_CMPL_EVENT: + case PSIF_WC_OPCODE_SET_XRCSRQ_LIM: + case PSIF_WC_OPCODE_INVALIDATE_RQ: + case PSIF_WC_OPCODE_INVALIDATE_CQ: + case PSIF_WC_OPCODE_INVALIDATE_RB: + case PSIF_WC_OPCODE_INVALIDATE_XRCSRQ: + case PSIF_WC_OPCODE_GENERATE_COMPLETION: + case PSIF_WC_OPCODE_RECEIVE_CONDITIONAL_WR_IMM: + break; + } + return -1; +} + +enum psif_wc_opcode ib2sif_wc_opcode(enum ib_wc_opcode opcode) +{ + switch (opcode) { + case IB_WC_SEND: + return PSIF_WC_OPCODE_SEND; + case IB_WC_RDMA_WRITE: + return PSIF_WC_OPCODE_RDMA_WR; + case IB_WC_RDMA_READ: + return PSIF_WC_OPCODE_RDMA_READ; + case IB_WC_COMP_SWAP: + return 
PSIF_WC_OPCODE_CMP_SWAP; + case IB_WC_FETCH_ADD: + return PSIF_WC_OPCODE_FETCH_ADD; + case IB_WC_LSO: + return PSIF_WC_OPCODE_LSO; + case IB_WC_MASKED_COMP_SWAP: + return PSIF_WC_OPCODE_MASKED_CMP_SWAP; + case IB_WC_MASKED_FETCH_ADD: + return PSIF_WC_OPCODE_MASKED_FETCH_ADD; + case IB_WC_RECV: + return PSIF_WC_OPCODE_RECEIVE_SEND; + case IB_WC_RECV_RDMA_WITH_IMM: + return PSIF_WC_OPCODE_RECEIVE_RDMA_WR_IMM; + case IB_WC_BIND_MW: + case IB_WC_LOCAL_INV: + case IB_WC_FAST_REG_MR: + break; + } + sif_log0(SIF_INFO, "IB opcode %d not implemented", opcode); + return -1; +} + +enum ib_wc_status sif2ib_wc_status(enum psif_wc_status status) +{ + switch (status) { + case PSIF_WC_STATUS_SUCCESS: + return IB_WC_SUCCESS; + case PSIF_WC_STATUS_LOC_LEN_ERR: + return IB_WC_LOC_LEN_ERR; + case PSIF_WC_STATUS_LOC_QP_OP_ERR: + return IB_WC_LOC_QP_OP_ERR; + case PSIF_WC_STATUS_LOC_EEC_OP_ERR: + return IB_WC_LOC_EEC_OP_ERR; + case PSIF_WC_STATUS_LOC_PROT_ERR: + return IB_WC_LOC_PROT_ERR; + case PSIF_WC_STATUS_WR_FLUSH_ERR: + return IB_WC_WR_FLUSH_ERR; + case PSIF_WC_STATUS_MW_BIND_ERR: + return IB_WC_MW_BIND_ERR; + case PSIF_WC_STATUS_BAD_RESP_ERR: + return IB_WC_BAD_RESP_ERR; + case PSIF_WC_STATUS_LOC_ACCESS_ERR: + return IB_WC_LOC_ACCESS_ERR; + case PSIF_WC_STATUS_REM_INV_REQ_ERR: + return IB_WC_REM_INV_REQ_ERR; + case PSIF_WC_STATUS_REM_ACCESS_ERR: + return IB_WC_REM_ACCESS_ERR; + case PSIF_WC_STATUS_REM_OP_ERR: + return IB_WC_REM_OP_ERR; + case PSIF_WC_STATUS_RETRY_EXC_ERR: + return IB_WC_RETRY_EXC_ERR; + case PSIF_WC_STATUS_RNR_RETRY_EXC_ERR: + return IB_WC_RNR_RETRY_EXC_ERR; + case PSIF_WC_STATUS_LOC_RDD_VIOL_ERR: + return IB_WC_LOC_RDD_VIOL_ERR; + case PSIF_WC_STATUS_REM_INV_RD_REQ_ERR: + return IB_WC_REM_INV_RD_REQ_ERR; + case PSIF_WC_STATUS_REM_ABORT_ERR: + return IB_WC_REM_ABORT_ERR; + case PSIF_WC_STATUS_INV_EECN_ERR: + return IB_WC_INV_EECN_ERR; + case PSIF_WC_STATUS_INV_EEC_STATE_ERR: + return IB_WC_INV_EEC_STATE_ERR; + case PSIF_WC_STATUS_FATAL_ERR: + return IB_WC_FATAL_ERR; + case PSIF_WC_STATUS_RESP_TIMEOUT_ERR: + return IB_WC_RESP_TIMEOUT_ERR; + case PSIF_WC_STATUS_GENERAL_ERR: + return IB_WC_GENERAL_ERR; + case PSIF_WC_STATUS_FIELD_MAX: + return -1; + } + return -1; +} + +enum psif_wc_status ib2sif_wc_status(enum ib_wc_status status) +{ + switch (status) { + case IB_WC_SUCCESS: + return PSIF_WC_STATUS_LOC_LEN_ERR; + case IB_WC_LOC_LEN_ERR: + return PSIF_WC_STATUS_LOC_LEN_ERR; + case IB_WC_LOC_QP_OP_ERR: + return PSIF_WC_STATUS_LOC_QP_OP_ERR; + case IB_WC_LOC_EEC_OP_ERR: + return PSIF_WC_STATUS_LOC_EEC_OP_ERR; + case IB_WC_LOC_PROT_ERR: + return PSIF_WC_STATUS_LOC_PROT_ERR; + case IB_WC_WR_FLUSH_ERR: + return PSIF_WC_STATUS_WR_FLUSH_ERR; + case IB_WC_MW_BIND_ERR: + return PSIF_WC_STATUS_MW_BIND_ERR; + case IB_WC_BAD_RESP_ERR: + return PSIF_WC_STATUS_BAD_RESP_ERR; + case IB_WC_LOC_ACCESS_ERR: + return PSIF_WC_STATUS_LOC_ACCESS_ERR; + case IB_WC_REM_INV_REQ_ERR: + return PSIF_WC_STATUS_REM_INV_REQ_ERR; + case IB_WC_REM_ACCESS_ERR: + return PSIF_WC_STATUS_REM_ACCESS_ERR; + case IB_WC_REM_OP_ERR: + return PSIF_WC_STATUS_REM_OP_ERR; + case IB_WC_RETRY_EXC_ERR: + return PSIF_WC_STATUS_RETRY_EXC_ERR; + case IB_WC_RNR_RETRY_EXC_ERR: + return PSIF_WC_STATUS_RNR_RETRY_EXC_ERR; + case IB_WC_LOC_RDD_VIOL_ERR: + return PSIF_WC_STATUS_LOC_RDD_VIOL_ERR; + case IB_WC_REM_INV_RD_REQ_ERR: + return PSIF_WC_STATUS_REM_INV_RD_REQ_ERR; + case IB_WC_REM_ABORT_ERR: + return PSIF_WC_STATUS_REM_ABORT_ERR; + case IB_WC_INV_EECN_ERR: + return PSIF_WC_STATUS_INV_EECN_ERR; + case IB_WC_INV_EEC_STATE_ERR: + return 
PSIF_WC_STATUS_INV_EEC_STATE_ERR; + case IB_WC_FATAL_ERR: + return PSIF_WC_STATUS_FATAL_ERR; + case IB_WC_RESP_TIMEOUT_ERR: + return PSIF_WC_STATUS_RESP_TIMEOUT_ERR; + case IB_WC_GENERAL_ERR: + return PSIF_WC_STATUS_GENERAL_ERR; + } + return -1; +} + + +enum psif_qp_trans ib2sif_qp_type(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_RC: + return PSIF_QP_TRANSPORT_RC; + case IB_QPT_UC: + return PSIF_QP_TRANSPORT_UC; + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + return PSIF_QP_TRANSPORT_UD; + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + break; + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: + return PSIF_QP_TRANSPORT_XRC; + case IB_QPT_MAX: + case IB_QPT_RAW_PACKET: + /* IB_QPT_EPSA_TUNNELING = IB_QPT_RESERVED1; */ + break; + case IB_QPT_EPSA_TUNNELING: + return PSIF_QP_TRANSPORT_UD; + + case IB_QPT_RESERVED2: + case IB_QPT_RESERVED3: + case IB_QPT_RESERVED4: + case IB_QPT_RESERVED5: + case IB_QPT_RESERVED6: + case IB_QPT_RESERVED7: + case IB_QPT_RESERVED8: + case IB_QPT_RESERVED9: + case IB_QPT_RESERVED10: + break; + } + /* map to a value we don't support as the + * error status value for now.. + */ + return (enum psif_qp_trans)(-1); +} + + +enum psif_qp_state ib2sif_qp_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: + return PSIF_QP_STATE_RESET; + case IB_QPS_INIT: + return PSIF_QP_STATE_INIT; + case IB_QPS_RTR: + return PSIF_QP_STATE_RTR; + case IB_QPS_RTS: + return PSIF_QP_STATE_RTS; + case IB_QPS_ERR: + return PSIF_QP_STATE_ERROR; + case IB_QPS_SQE: + return PSIF_QP_STATE_SQERR; + case IB_QPS_SQD: /* TBD: Is this right? */ + break; + } + return PSIF_QP_STATE_INVALID; +} + + +enum ib_qp_state sif2ib_qp_state(enum psif_qp_state state) +{ + switch (state) { + case PSIF_QP_STATE_RESET: + return IB_QPS_RESET; + case PSIF_QP_STATE_INIT: + return IB_QPS_INIT; + case PSIF_QP_STATE_RTR: + return IB_QPS_RTR; + case PSIF_QP_STATE_RTS: + return IB_QPS_RTS; + case PSIF_QP_STATE_ERROR: + return IB_QPS_ERR; + case PSIF_QP_STATE_SQERR: + return IB_QPS_SQE; + case PSIF_QP_STATE_INVALID: + break; + } + return IB_QPS_ERR; +} + +enum psif_migration ib2sif_mig_state(enum ib_mig_state mstate) +{ + switch (mstate) { + case IB_MIG_MIGRATED: + return APM_MIGRATED; + case IB_MIG_REARM: + return APM_REARM; + case IB_MIG_ARMED: + return APM_ARMED; + } + return APM_OFF; +} + +enum ib_mig_state sif2ib_mig_state(enum psif_migration mstate) +{ + switch (mstate) { + case APM_MIGRATED: + return IB_MIG_MIGRATED; + case APM_REARM: + return IB_MIG_REARM; + case APM_ARMED: + return IB_MIG_ARMED; + default: + return (enum ib_mig_state)-1; + } +} + +enum psif_path_mtu ib2sif_path_mtu(enum ib_mtu mtu) +{ + switch (mtu) { + case IB_MTU_256: + return MTU_256B; + case IB_MTU_512: + return MTU_512B; + case IB_MTU_1024: + return MTU_1024B; + case IB_MTU_2048: + return MTU_2048B; + case IB_MTU_4096: + return MTU_4096B; + } + return MTU_INVALID; +} + +enum ib_mtu sif2ib_path_mtu(enum psif_path_mtu mtu) +{ + switch (mtu) { + case MTU_256B: + return IB_MTU_256; + case MTU_512B: + return IB_MTU_512; + case MTU_1024B: + return IB_MTU_1024; + case MTU_2048B: + return IB_MTU_2048; + case MTU_4096B: + return IB_MTU_4096; + default: + return (enum ib_mtu)0; + } +} + + +/* TBD: IB datastructure dump functions - remove/replace? 
*/ + +const char *ib_event2str(enum ib_event_type e) +{ + switch (e) { + case IB_EVENT_CQ_ERR: + return "IB_EVENT_CQ_ERR"; + case IB_EVENT_QP_FATAL: + return "IB_EVENT_QP_FATAL"; + case IB_EVENT_QP_REQ_ERR: + return "IB_EVENT_QP_REQ_ERR"; + case IB_EVENT_QP_ACCESS_ERR: + return "IB_EVENT_QP_ACCESS_ERR"; + case IB_EVENT_COMM_EST: + return "IB_EVENT_COMM_EST"; + case IB_EVENT_SQ_DRAINED: + return "IB_EVENT_SQ_DRAINED"; + case IB_EVENT_PATH_MIG: + return "IB_EVENT_PATH_MIG"; + case IB_EVENT_PATH_MIG_ERR: + return "IB_EVENT_PATH_MIG_ERR"; + case IB_EVENT_DEVICE_FATAL: + return "IB_EVENT_DEVICE_FATAL"; + case IB_EVENT_PORT_ACTIVE: + return "IB_EVENT_PORT_ACTIVE"; + case IB_EVENT_PORT_ERR: + return "IB_EVENT_PORT_ERR"; + case IB_EVENT_LID_CHANGE: + return "IB_EVENT_LID_CHANGE"; + case IB_EVENT_PKEY_CHANGE: + return "IB_EVENT_PKEY_CHANGE"; + case IB_EVENT_SM_CHANGE: + return "IB_EVENT_SM_CHANGE"; + case IB_EVENT_SRQ_ERR: + return "IB_EVENT_SRQ_ERR"; + case IB_EVENT_SRQ_LIMIT_REACHED: + return "IB_EVENT_SRQ_LIMIT_REACHED"; + case IB_EVENT_QP_LAST_WQE_REACHED: + return "IB_EVENT_QP_LAST_WQE_REACHED"; + case IB_EVENT_CLIENT_REREGISTER: + return "IB_EVENT_CLIENT_REREGISTER"; + case IB_EVENT_GID_CHANGE: + return "IB_EVENT_GID_CHANGE"; + default: + return "(Undefined event type)"; + } +} + +static inline enum kernel_ulp_type find_ulp_type_from_address(void *ptr) +{ + if (ptr) { +#if defined(__x86_64__) || defined(__sparc__) + char symbol_name[100]; + + snprintf(symbol_name, sizeof(symbol_name), "%ps", ptr); + if (strstr(symbol_name, "rds_")) + return RDS_ULP; + else if (strstr(symbol_name, "ipoib_cm_")) + return IPOIB_CM_ULP; + else if (strstr(symbol_name, "ipoib_")) + return IPOIB_ULP; +#endif + } + return OTHER_ULP; +} + +static inline enum kernel_ulp_type find_ulp_type_via_stack_unwind(const int level) +{ +/* __builtin_return_address argument must be a constant */ +#define STACK_UNWIND_CASE_LEVEL(n) \ + case (n): { \ + enum kernel_ulp_type type = OTHER_ULP; \ + void *ptr = __builtin_return_address(n);\ + type = find_ulp_type_from_address(ptr); \ + if (type != OTHER_ULP) \ + return type; \ + } + + switch (level) { + default: + STACK_UNWIND_CASE_LEVEL(7); + STACK_UNWIND_CASE_LEVEL(6); + STACK_UNWIND_CASE_LEVEL(5); + STACK_UNWIND_CASE_LEVEL(4); + STACK_UNWIND_CASE_LEVEL(3); + STACK_UNWIND_CASE_LEVEL(2); + STACK_UNWIND_CASE_LEVEL(1); + STACK_UNWIND_CASE_LEVEL(0); + } +#undef STACK_UNWIND_CASE_LEVEL + return OTHER_ULP; +} + +enum kernel_ulp_type sif_find_kernel_ulp_caller(void) +{ + enum kernel_ulp_type type = OTHER_ULP; + + if (!(__builtin_return_address(0))) { + /* if current function returns NULL, + * there is no reason to check further. + */ + goto error; + } + type = find_ulp_type_via_stack_unwind(STACK_UNWIND_LEVEL); +error: + return type; +} diff --git a/drivers/infiniband/hw/sif/sif_defs.h b/drivers/infiniband/hw/sif/sif_defs.h new file mode 100644 index 0000000000000..f0a06db3fe8b1 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_defs.h @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_defs.h: Div. 
utility definitions and auxiliary data structures + */ + +#ifndef __SIF_DEFS_H +#define __SIF_DEFS_H +#include "psif_hw_data.h" +#include "sif_mmu.h" +#include "sif_pd.h" +#include "sif_sq.h" +#include "sif_cq.h" +#include "sif_mem.h" +#include "sif_rq.h" +#include "sif_ireg.h" + +/* Needed by print funcs */ +#define xprintf(x, format, arg...) \ + do {\ + if (x) \ + (x)->buf += sprintf((x)->buf, format, ## arg); \ + else \ + printk(format, ## arg); \ + } while (0) + +struct xchar { + char *buf; +}; + +#define GREATER_16(a, b) ((s16)((s16)(a) - (s16)(b)) > 0) + + +#define XFILE struct xchar +#include "psif_hw_print.h" + +enum sif_tab_type; + +enum psif_wr_type sif_invalidate_opcode(enum sif_tab_type type); + +enum ib_wc_opcode sif2ib_wc_opcode(enum psif_wc_opcode opcode); +enum psif_wc_opcode ib2sif_wc_opcode(enum ib_wc_opcode opcode); + +enum ib_wc_status sif2ib_wc_status(enum psif_wc_status status); +enum psif_wc_status ib2sif_wc_status(enum ib_wc_status status); + +enum ib_wr_opcode sif2ib_wr_op(enum psif_wr_type op); +enum psif_wr_type ib2sif_wr_op(enum ib_wr_opcode op, bool is_dr); + +enum psif_qp_trans ib2sif_qp_type(enum ib_qp_type type); + +enum psif_qp_state ib2sif_qp_state(enum ib_qp_state state); +enum ib_qp_state sif2ib_qp_state(enum psif_qp_state state); + +enum ib_mig_state sif2ib_mig_state(enum psif_migration mstate); +enum psif_migration ib2sif_mig_state(enum ib_mig_state mstate); + +enum ib_mtu sif2ib_path_mtu(enum psif_path_mtu mtu); +enum psif_path_mtu ib2sif_path_mtu(enum ib_mtu mtu); +enum kernel_ulp_type sif_find_kernel_ulp_caller(void); + +/* TBD: IB datastructure dump functions - remove/replace? */ +const char *ib_event2str(enum ib_event_type e); + +static inline struct sif_pd *to_spd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct sif_pd, ibpd); +} + +static inline struct sif_shpd *to_sshpd(struct ib_shpd *ibshpd) +{ + return container_of(ibshpd, struct sif_shpd, ibshpd); +} + +/* Generic table handling functions: + * For xx in cq,rq,sq: + * + * Return element# @index in the xx queue referred by q: + * + * struct psif_xx_entry *get_xx_entry(struct sif_xx *q, int index); + * + * @ptr: Kernel virtual address offset into an entry in the xx queue @q + * Return value: The corresponding dma address. + * + * u64 xxe_to_dma(struct sif_xx *q, void* ptr); + + * TBD: Document the rest of the macro defined generic calls + */ + + +#define sif_define_entry_funcs(type, dtype) \ +static inline struct psif_##type##_entry \ + *get_##type##_entry(struct sif_##type *q, unsigned dtype seq)\ +{\ + return (struct psif_##type##_entry *) sif_mem_kaddr(q->mem, (seq & q->mask) * q->extent); \ +} \ +static inline u64 get_##type##e_dma(struct sif_##type *q, unsigned dtype seq) \ +{\ + return sif_mem_dma(q->mem, (seq & q->mask) * q->extent); \ +} \ +static inline int type##_is_empty(struct sif_##type *q, unsigned dtype head, unsigned dtype tail)\ +{\ + return (head == tail); \ +} \ +static inline dtype type##_length(struct sif_##type *q, dtype head, dtype tail)\ +{\ + return tail - head;\ +} \ + +sif_define_entry_funcs(cq, int) +sif_define_entry_funcs(rq, int) +sif_define_entry_funcs(sq, short) + +static inline void *sq_sgl_offset(struct sif_sq *sq, struct psif_sq_entry *sqe) +{ + return (u8 *)sqe + sq->sgl_offset; +} + +/* Define an architecture independent write combining flush: + * According to documentation, we should have been able to use + * mmiowb() but on x86_64 mmiowb does not contain the necessary sfence instruction. 
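+ * We therefore use an explicit fence on the architectures below to make
+ * sure write-combined stores have been drained to the device before any
+ * subsequent operation depends on them.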
+ */ + +#if defined(__i386__) +#define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") +#elif defined(__x86_64__) +#define wc_wmb() asm volatile("sfence" ::: "memory") +#elif defined(__ia64__) +#define wc_wmb() asm volatile("fwb" ::: "memory") +#else +#define wc_wmb() wmb() +#endif + +#endif diff --git a/drivers/infiniband/hw/sif/sif_dev.h b/drivers/infiniband/hw/sif/sif_dev.h new file mode 100644 index 0000000000000..2ea33817cffaf --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_dev.h @@ -0,0 +1,728 @@ +/* + * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_dev.h: Driver specific data structure definitions + */ + +#ifndef __SIF_DEV_H +#define __SIF_DEV_H + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sif_idr.h" +#include "sif_fwa.h" +#include "sif_mmu.h" +#include "sif_pqp.h" +#include "sif_mem.h" + + +#include "sif_verbs.h" + +#define PCI_VENDOR_ID_SUN 0x108e +#define PCI_DEVICE_ID_PSIF_PF 0x2088 +#define PCI_DEVICE_ID_PSIF_VF 0x2089 +#define PCI_DEVICE_ID_SN1_PF 0x2188 +#define PCI_DEVICE_ID_SN1_VF 0x2189 +#define PCI_DEVICE_ID_SN2_PF 0x2198 +#define PCI_DEVICE_ID_SN2_VF 0x2199 +#define PCI_DEVICE_ID_SN3_PF 0x21A8 +#define PCI_DEVICE_ID_SN3_VF 0x21A9 + +#define PSIF_DEVICE(sdevice) ((sdevice)->pdev->device) +#define PSIF_SUBSYSTEM(sdevice) ((sdevice)->pdev->subsystem_device) + +#define IS_PSIF(sdevice) (PSIF_DEVICE(sdevice) == PCI_DEVICE_ID_PSIF_PF || \ + PSIF_DEVICE(sdevice) == PCI_DEVICE_ID_PSIF_VF) + +#define IS_SIBS(sdevice) (PSIF_DEVICE(sdevice) == PCI_DEVICE_ID_SN1_PF || \ + PSIF_DEVICE(sdevice) == PCI_DEVICE_ID_SN1_VF || \ + PSIF_DEVICE(sdevice) == PCI_DEVICE_ID_SN2_PF || \ + PSIF_DEVICE(sdevice) == PCI_DEVICE_ID_SN2_VF || \ + PSIF_DEVICE(sdevice) == PCI_DEVICE_ID_SN3_PF || \ + PSIF_DEVICE(sdevice) == PCI_DEVICE_ID_SN3_VF) + +/* Sonoma rev 1 most closely resembles PSIF rev 2 + * TBD: Need a more fine grained solution to feature/bug checking as we move on.. + */ +#define PSIF_REVISION(sdevice) \ + (IS_SIBS(sdevice) ? (sdevice)->pdev->revision + 1 : (sdevice)->pdev->revision) + + +/* Tested limit on #of CQEs - may support 2^30 but + * need machine with lots of memory to test it! + */ +#define SIF_SW_MAX_CQE_LOG2 0x18 /* = 16 MB - tested and should cover most use cases.. */ +#define SIF_SW_MAX_CQE (1 << SIF_SW_MAX_CQE_LOG2) + +#define SIF_SW_MAX_SQE_LOG2 0xf /* = 32K */ +#define SIF_SW_MAX_SQE (1 << SIF_SW_MAX_SQE_LOG2) + +/* Start offset of the special sq_cmpl mapping: + * each queue have at most 1 << SIF_SW_MAX_SQE_LOG2 entries + * Maximal extent of elements in a queue is 1 << 1f + * We then shift an additional bit to get to an unused upper bit + * to set just to avoid starting at vaddr 0: + */ +#define SIF_SQ_CMPL_SHIFT (SIF_SW_MAX_SQE_LOG2 + 0x1f + 1) +#define SIF_SQ_CMPL_START (1ULL << SIF_SQ_CMPL_SHIFT) + +/* Use easily identifiable high addresses to map descriptor arrays + * when GVA2GPA mapping is needed. These are virtual addresses + * that will only be used by sif. 
+ * For debug purposes, encode the sif_tab_type index in the address: + */ +#define SIF_BASE_ADDR_START(queue) \ + ((1ULL << (SIF_SQ_CMPL_SHIFT + 1)) + ((u64)(queue) << (SIF_SQ_CMPL_SHIFT - 6))) +#define SIF_BASE_ADDR_EQ_START(queue) \ + (SIF_BASE_ADDR_START(queue) + (1ULL << SIF_SQ_CMPL_SHIFT)) + +/* TBD: Software emulation of UD send SGEs - hardware is limited to 16 */ +#define SIF_SW_MAX_UD_SEND_SGE 32 +#define SIF_HW_MAX_SEND_SGE 16 + +/* This defines the defaults for implicit timers within the driver */ +#define SIF_HW_TIMEOUT 5000 + +/* BAR indices for SIF */ +#define SIF_MSIX_BAR 0 +#define SIF_CBU_BAR 2 +#define SIF_EPS_BAR 4 + +struct sif_mmu_ctx; /* See sif_mmu.h */ + +/* Hardware/firmware accessible tables in memory + * NB! If you change anything here (including order) + * remember to update + * - struct sif_table_layout in sif_base.c + * - define_funcs call list in sif_base.h + */ +#define sif_tab_init_max epsa0_csr_req + +enum sif_tab_type { + epsc_csr_req, /* EPSC request queue */ + epsc_csr_rsp, /* EPSC response queue (EPSC completions) */ + key, /* Key validation table */ + qp, /* QP descriptor table (hw owned) */ + rqsp, /* RQ scratch pad data */ + atsp, /* Atomic replay data */ + ah, /* Address handle table (sw owned) */ + cq_hw, /* Compl desc (read only for sw) */ + cq_sw, /* Compl desc (writable for sw) */ + rq_hw, /* Receive queue (read only for sw) */ + rq_sw, /* Receive queue (writable for sw) */ + sq_hw, /* Send queue (readable for sw) */ + sq_sw, /* Send queue (writable for sw)*/ + sq_cmpl, /* sqe cache for cq block (used by hw only) */ + sq_ring, /* Send queue scheduler ring buffer */ + sq_tvl, /* Send queue scheduler (TBD-what is this?) */ + sq_rspq, /* Send queue scheduler response queue */ + bw_cb, /* High bandwidth collect buffers (NB! Device addr space) */ + lat_cb, /* Low latency collect buffers (NB! 
Device addr space) */ + epsa0_csr_req, /* EPSA-n request queue */ + epsa0_csr_rsp, /* EPSA-n response queue (EPSC completions) */ + epsa1_csr_req, + epsa1_csr_rsp, + epsa2_csr_req, + epsa2_csr_rsp, + epsa3_csr_req, + epsa3_csr_rsp, + sif_tab_max +}; + +/* Depends on sif_tab_type: */ +#include "sif_epsc.h" + +/* Driver record of a block of entries associated with a particular PD + * Used for tables that have entry_per_block > 1: + */ +struct sif_table_block { + struct sif_pd *pd; /* Owning protection domain, if allocated */ + struct sif_table *table; /* Pointer back to table this is a block within */ + struct list_head pd_list; /* Used by pd to chain it's allocated blocks */ + u32 offset; /* Index offset that this block starts at */ + u32 last_used; /* Last alloc'ed entry - used to support round-robin alloc */ + ulong bitmap[0]; /* Used bitmap for entries, follows right after struct */ +}; + +/* Driver record of a sif in-memory table */ +struct sif_table { + bool is_eq; + union { + enum sif_tab_type type; /* Our type (and index within sdev->ba) */ + u32 index; /* index of this eq - valid iff @is_eq */ + }; + bool from_interrupt; /* If set, alloc/free must be permitted from intr.ctxt */ + bool alloc_rr; /* Set if round-robin allocation is to be used */ + spinlock_t lock; /* Protects bitmap */ + ulong *bitmap; /* Used bitmap for blocks of entries */ + struct sif_mem *mem; /* Allocated memory for the table */ + void *drv_ref; /* array of driver struct pointers for non-inline structs */ + union { + u64 sif_base; /* Virtual base address as seen from SIF */ + void __iomem *sif_off; /* Used for collect buffer mgmt */ + }; + size_t table_sz; /* Size in byte of the table */ + u32 ext_sz; /* Dist.in bytes between start of each entry */ + u32 entry_cnt; /* Number of entries in table */ + u32 block_cnt; /* No.of blocks (1st level alloc granularity) in table */ + u32 entry_per_block; /* entry_per_block = entry_cnt / block_cnt */ + u32 last_used; /* Last alloc'ed entry - used to support round-robin alloc */ + struct sif_mmu_ctx mmu_ctx; /* MMU context bookkeeping */ + void *block; /* Space for array with block_cnt elems + bitmap iff entry_per_block > 1 */ + u32 block_ext; /* Dist in bytes between sif_table_block elements in block */ + struct sif_dev *sdev; /* Pointer back to main driver struct */ +}; + +/* Driver management of event queues and interrupt channel coalescing settings*/ + +#define SIF_EQ_NAME_LEN 15 + +struct sif_irq_ch { + bool enable_adaptive; /* Adaptive coalescing */ + u16 channel_rx_scale; /* rx-to-tx timer scaling factor, 2-exponent value */ + u32 channel_rate_low; /* Message rate in messages per second. Low rate threshold. */ + u32 channel_rate_high; /* Message rate in messages per second. High rate threshold. */ + u16 channel_ausec; /* How many usecs to delay after first packet. */ + u16 channel_ausec_low; /* How many usecs to delay after first packet. Low rate value. */ + u16 channel_ausec_high; /* How many usecs to delay after first packet. High rate value. */ + u16 channel_pusec; /* How many usecs to delay after packet. */ + u16 channel_pusec_low; /* How many usecs to delay after packet. Low rate value. */ + u16 channel_pusec_high; /* How many usecs to delay after packet. High rate value. */ + u32 entries; + u32 mask; /* entries - 1 for modulo using & */ + u32 extent; + struct sif_mem *mem; /* Ref. 
to ba.mem to implement macro patterns */ +}; + +struct sif_eq { + struct sif_table ba; /* Layout of hardware exposed table */ + struct sif_eps *eps; /* Pointer back to controlling EPS */ + u32 index; /* EQ index - EPS is 0, hw starts at 1 */ + u32 next_seq; /* Next seq to look for in eq */ + u32 entries; + u32 extent; /* Size in byte of each entry */ + u32 mask; /* entries - 1 for modulo using & */ + struct sif_mem *mem; /* Ref. to ba.mem to implement macro patterns */ + int intr_vec; /* Index into s->entries[..] for the interrupt vector used */ + u32 sw_index_interval; /* No. of events we can receive before the sw index must be updated */ + u32 sw_index_next_update; /* Next scheduled update point */ + atomic_t intr_cnt; /* Number of interrupts for the interrupt vector for this eq */ + atomic_t work_cnt; /* No. of work queue elements processed */ + char name[SIF_EQ_NAME_LEN+1]; /* Storage for name visible from /proc/interrupts */ + struct sif_irq_ch irq_ch; /* Per channel interrupt coalescing settings */ + cpumask_var_t affinity_mask; /* cpu affinity_mask for set_irq_hints. */ +}; + +/* Driver specific per instance data */ + +struct sif_dfs; /* Declared in sif_debug.c */ +struct sif_compl; /* Declared in sif_cq.h */ + +struct sif_dev { + struct ib_device ib_dev; + struct sif_verbs sv; + struct pci_dev *pdev; + struct sif_dfs *dfs; /* Optional debugfs info, if enabled in kernel */ + struct sif_mem_info mi; /* Used by sif_mem.c - configured SIF page sizes etc */ + struct sif_fwa fwa; /* Used by sif_fwa.c - firmware access API */ + u8 __iomem *cb_base; /* Collect buffer space base address */ + u8 __iomem *msi_base; /* Base for the MSI-X vector table */ + u8 __iomem *eps_base; /* "Raw" pointer to EPSC BAR space */ + u32 num_vfs; /* #of virtual functions to enable */ + int fw_vfs; /* #of virtual functions enabled in firmware */ + bool is_vf; /* Set if this is a VF instance */ + u8 mbox_epsc; /* EPSC mailbox index (differs between SIBS and PSIF) */ + u8 eps_cnt; /* Number of EPSes on the chip */ + int cbu_mtrr; /* mtrr register for the cbu - save for cleanup */ + struct psif_pcie_mbox __iomem *eps; /* Pointer to EPS-* mailboxes */ + struct workqueue_struct *wq; /* Used a.o. 
for async event processing */ + struct sif_mr *dma_mr; /* Privileged kernel mem MR (bypass mode) used for local_lkey */ + struct sif_mr *dma_inv_mr; /* Invalid MR for key 0 */ + struct sif_pd *pd; /* PD used for driver private table resources */ + + /* BAR space sizes */ + size_t cb_sz; + size_t msi_sz; + size_t eps_sz; + + /* Interrupt allocation */ + size_t intr_req; /* Number of irqs requested */ + size_t intr_cnt; /* Number of irqs allocated */ + size_t bw_cb_cnt; /* No.of virtual collect buffers available */ + size_t lat_cb_cnt; /* No.of virtual collect buffers available */ + size_t msix_entries_sz; /* Size of the allocated msix_entries array */ + spinlock_t msix_lock; /* Protects intr_used */ + struct msix_entry *msix_entries; /* MSI-X vector info */ + ulong *intr_used; /* Bitmap for allocation of irqs */ + + atomic_t sqp_usecnt[4]; /* track if someone has created QP 0/1 for port 1/2 */ + atomic_t cq_count; /* Track #used CQs to better scale (internal debug) timeouts */ + atomic_t cq_miss_cnt; /* Historic #completions sif_poll_cq had to busy wait for */ + atomic_t cq_miss_occ; /* Global #times sif_poll_cq had to busy wait (upd.by destroy_cq) */ + struct sif_eps *es; /* State for the EPS comm (sif_epsc.h) */ + struct sif_table ba[sif_tab_max]; /* Base address setup structures */ + struct sif_pqp **pqp; /* PSIF management QPs */ + struct sif_cb **kernel_cb[2]; /* cb's for the kernel (bw and low latency per cpu) */ + int pqp_cnt; /* Number of PQPs set up */ + atomic_t next_pqp; /* Used for round robin assignment of pqp */ + int kernel_cb_cnt; /* Number of pairs of CBs set up for the kernel */ + struct sif_idr xrcd_refs; /* Mgmt of sif_xrcd allocations */ + struct sif_idr pd_refs; /* Mgmt of sif_pd allocations */ + struct sif_spqp_pool ki_spqp; /* Stencil PQPs for key invalidates */ + /* Misc settings */ + bool registered; /* Set when we are registered with the verbs layer */ + u64 min_resp_ticks; /* expected min. hw resp.time in ticks */ + + u16 jiffies_sampling_cnt; /* 1/N counter used to display performance measurement. */ + /* Support for workaround for #3552 - feature_mask create_do_not_evict_qp: */ + u32 dne_qp; + + /* Support for workaround for #3713 */ + u32 flush_qp; + struct mutex flush_lock; + + /* Support for PMA proxy QP (indexes for port 1 and 2) bug #3357 */ + u32 pma_qp_idxs[2]; + + /* Support for WA for bug #4096 */ + bool single_pte_pt; /* If set, use a level + 1 page table even for a single pte */ + + enum sif_mem_type mt_override; /* Special memory type override available from sysfs */ + /* TBD: Make sure it gets updated upon value changes (handle error events) */ + struct ib_port_attr port[2]; /* cached port info. */ + + /* SL to TSL map. 
Indexed by sl, port (0-1 range) and qosl */ + char sl2tsl[16][2][2]; + + /* qosl hint for regular qps, indexed by sl and port (0-1 range) */ + enum psif_tsu_qos qp_qosl_hint[16][2]; + + /* tsl for pqps, latency sensitive (RCN) and bulk (non-critical) per port */ + char pqp_rcn_tsl[2]; + char pqp_bulk_tsl[2]; + + /* pqp qosl hint per port */ + enum psif_tsu_qos pqp_qosl_rcn_hint[2]; + enum psif_tsu_qos pqp_qosl_bulk_hint[2]; + + /* tsl for qp 0 (per port) */ + char qp0_tsl[2]; + + /* qp 0 qosl hint (per port) */ + enum psif_tsu_qos qp0_qosl_hint[2]; + + /* limited mode for device, no IB traffic possible */ + bool limited_mode; + /* PSIF is degraded */ + bool degraded; + +}; + +/* TBD: These should probably come from common pci headers + */ +#ifndef PCI_MSIX_ENTRY_SIZE +#define PCI_MSIX_ENTRY_SIZE 16 +#endif +#ifndef PCI_MSIX_ENTRY_VECTOR_CTRL +#define PCI_MSIX_ENTRY_VECTOR_CTRL 12 +#endif + +/* SIF specific debugging facilities */ +extern ulong sif_debug_mask; +extern ulong sif_trace_mask; + +/* Defined classes */ +#define SIF_INFO 0x1L +#define SIF_INIT 0x2L +#define SIF_QPE 0x4L +#define SIF_INFO_V 0x8L +#define SIF_WCE 0x10L /* Log error completions */ +#define SIF_PQPT 0x20L /* Log WR upon PQP timeouts */ +#define SIF_NCQ 0x40L +#define SIF_XRC 0x80L +#define SIF_INTR 0x100L +#define SIF_VERBS 0x200L +#define SIF_PQP 0x400L +#define SIF_EPS 0x800L +#define SIF_PD 0x1000L +#define SIF_QP 0x2000L +#define SIF_CQ 0x4000L +#define SIF_MR 0x8000L +#define SIF_FMR 0x10000L +#define SIF_MEM 0x20000L +#define SIF_AH 0x40000L +#define SIF_SRQ 0x80000L +#define SIF_SND 0x100000L +#define SIF_RCV 0x200000L +#define SIF_DMA 0x400000L +#define SIF_RQ 0x800000L +#define SIF_WCE_V 0x1000000L +#define SIF_SQ 0x2000000L +#define SIF_POLL 0x4000000L +#define SIF_PT 0x8000000L +#define SIF_MMU 0x10000000L +#define SIF_IPOLL 0x20000000L +#define SIF_MMAP 0x40000000L +#define SIF_MC 0x80000000L +#define SIF_IDX 0x100000000L +#define SIF_IDX2 0x200000000L +#define SIF_MEM_SG 0x400000000L +#define SIF_DFS 0x800000000L +#define SIF_FWA 0x1000000000L +#define SIF_VERBS_V 0x2000000000L +#define SIF_DUMP 0x4000000000L +#define SIF_MMU_V 0x8000000000L +#define SIF_MEM_V 0x10000000000L +#define SIF_TSL 0x20000000000L +#define SIF_CSR 0x40000000000L +#define SIF_PT_V 0x80000000000L +#define SIF_PT_VV 0x100000000000L +#define SIF_QP_V 0x200000000000L +#define SIF_PERF_V 0x400000000000L + +#ifdef SIF_TRACE_MASK +#define sif_log_trace(class, format, arg...) \ + do { \ + if (unlikely((sif_trace_mask) & (class))) { \ + const char *cl = #class; \ + trace_printk("%5s " format "\n", &cl[4], ##arg); \ + } \ + } while (0) +#else +#define sif_log_trace(class, format, arg...) +#endif + +#define sif_log(sdev, class, format, arg...) \ + do { \ + sif_log_trace(class, format, ## arg); \ + if (unlikely((sif_debug_mask) & (class))) { \ + const char *cl = #class;\ + dev_info(&(sdev)->pdev->dev, \ + "[%d] %5s %s: " format "\n", \ + current->pid, &cl[4], __func__, \ + ## arg); \ + } \ + } while (0) + +#define sif_logi(ibdev, class, format, arg...) \ + do { \ + if (unlikely((sif_debug_mask) & (class))) { \ + const char *cl = #class;\ + dev_info((ibdev)->dma_device, \ + "[%d] %5s %s: " format "\n", \ + current->pid, &cl[4], __func__, \ + ## arg); \ + } \ + } while (0) + +#define sif_log0(class, format, arg...) 
\ + do { \ + if (unlikely((sif_debug_mask) & (class))) \ + pr_info("pid [%d] %s: " format "\n", \ + current->pid, __func__, \ + ## arg); \ + } while (0) + +#define sif_dump(class, txt, addr, len) \ + do { \ + if (unlikely((sif_debug_mask) & (class))) { \ + print_hex_dump(KERN_INFO, txt, \ + DUMP_PREFIX_ADDRESS, 8, 1, addr, len, 0); \ + } \ + } while (0) + +#define sif_logs(class, stmt_list) \ + do { \ + if (unlikely((sif_debug_mask) & (class))) { \ + stmt_list;\ + } \ + } while (0) + +#define sif_log_cq(cq, class, format, arg...) \ + do { \ + if (unlikely((sif_debug_mask) & (class))) { \ + struct sif_dev *sdev = \ + container_of(cq->ibcq.device, struct sif_dev, ib_dev); \ + if (time_before((cq)->next_logtime, jiffies)) { \ + (cq)->next_logtime = jiffies + max(1000ULL, sdev->min_resp_ticks); \ + } else { \ + (cq)->log_cnt++; \ + continue; \ + } \ + dev_info(&sdev->pdev->dev, \ + "pid [%d] %s (suppressed %d): " format "\n", \ + current->pid, __func__, (cq)->log_cnt, \ + ## arg); \ + (cq)->log_cnt = 0; \ + } \ + } while (0) + +#define sif_log_perf(sdev, class, format, arg...) \ + do { \ + if (unlikely((sif_debug_mask) & (class))) { \ + if ((sdev)->jiffies_sampling_cnt % sif_perf_sampling_threshold) { \ + (sdev)->jiffies_sampling_cnt++; \ + continue; \ + } \ + dev_info(&(sdev)->pdev->dev, \ + "pid [%d] %s: " format "\n", \ + current->pid, __func__, \ + ## arg); \ + } \ + } while (0) + + + +/* some convenience pointer conversion macros: */ +#define to_sdev(ibdev) container_of((ibdev), struct sif_dev, ib_dev) + +#include + +#define def_copy_conv(name, type1, type2) \ +static inline void copy_conv_to_##name(type1 void *dest, const type2 void *src, size_t n) \ +{ \ + int words = n / 8; \ + int i; \ + type1 u64 *dp = (type1 u64 *) dest; \ + type2 u64 *sp = (type2 u64 *) src; \ + for (i = 0; i < words; i++) \ + dp[i] = cpu_to_be64(sp[i]); \ + wmb(); \ +} + +/* make checkpatch happy */ +#define N + +def_copy_conv(hw, volatile, N) +def_copy_conv(sw, N, volatile) + +static inline void copy_conv_to_le(void *dest, const void *src, size_t n) +{ + int words = n / 8; + int i; + u64 *dp = (u64 *) dest; + u64 *sp = (u64 *) src; + + BUG_ON(n & 7); + for (i = 0; i < words; i++) + dp[i] = cpu_to_le64(sp[i]); + wmb(); +} + +static inline void copy_conv_to_mmio(void __iomem *dest, const void *src, size_t n) +{ + int words = n / 8; + int i; + u64 __iomem *dp = (u64 __iomem *) dest; + u64 *sp = (u64 *) src; + + BUG_ON(n & 7); + for (i = 0; i < words; i++) + __raw_writeq(cpu_to_be64(sp[i]), &dp[i]); +} + +/* Non-converting copy routines */ +#define def_copy_plain(name, type1, type2) \ +static inline void copy_to_##name(type1 void *dest, const type2 void *src, size_t n) \ +{ \ + int words = n / 8; \ + int i; \ + type1 u64 *dp = (type1 u64 *) dest; \ + type2 u64 *sp = (type2 u64 *) src; \ + for (i = 0; i < words; i++) \ + dp[i] = sp[i]; \ +} + +def_copy_plain(hw, volatile, N) +def_copy_plain(sw, N, volatile) + +static __always_inline void *sif_kmalloc(struct sif_dev *sdev, size_t size, gfp_t flags) +{ +#ifdef CONFIG_NUMA + void *m; + + m = kmalloc_node(size, flags, sdev->pdev->dev.numa_node); + if (m) + return m; + + sif_log(sdev, SIF_INFO, "Warning: unable to allocate memory on numa node %d", + sdev->pdev->dev.numa_node); +#endif + return kmalloc(size, flags); +} + +static inline const char *get_product_str(struct sif_dev *sdev) +{ + if (IS_PSIF(sdev)) + return + (PSIF_SUBSYSTEM(sdev) == 0x6278) ? "Oracle Dual-port QDR IB Adapter M4" : + (PSIF_SUBSYSTEM(sdev) == 0x6279) ? 
"Oracle Dual-port EDR IB Adapter" : + (PSIF_SUBSYSTEM(sdev) == 0x6280) ? "Oracle InfiniBand Switch IS2-46" : + (PSIF_SUBSYSTEM(sdev) == 0x6281) ? "Oracle InfiniBand Switch IS2-254" : + (PSIF_SUBSYSTEM(sdev) == 0x6282) ? "Oracle Fabric Interconnect F2-12" : + "Unknown PSIF based card"; + + switch (PSIF_DEVICE(sdev)) { + case PCI_DEVICE_ID_SN1_PF: + case PCI_DEVICE_ID_SN1_VF: + return "SPARC Integrated FDR IB M1"; + case PCI_DEVICE_ID_SN2_PF: + case PCI_DEVICE_ID_SN2_VF: + return "SPARC Integrated EDR IB M2"; + case PCI_DEVICE_ID_SN3_PF: + case PCI_DEVICE_ID_SN3_VF: + return "SPARC Integrated EDR IB M3"; + default: + return "Unknown Sonoma or PSIF based system"; + } +} + +/* Param feature_mask defines */ +extern ulong sif_feature_mask; + +/* Disable INVALIDATE_*KEY(S) */ +#define SIFF_disable_invalidate_key 0x1 + +/* Disable RQ flushing */ +#define SIFF_disable_rq_flush 0x2 + +/* Disable SRQ */ +#define SIFF_disable_srq 0x8 + +/* Disable INVALIDATE_CQ only: */ +#define SIFF_disable_invalidate_cq 0x10 + +/* Disable INVALIDATE_RQ only: */ +#define SIFF_disable_invalidate_rq 0x20 + +/* Disable INVALIDATE_TLB only: */ +#define SIFF_disable_invalidate_tlb 0x40 + +/* Disable support for use of huge pages + * This feature is necessary to avoid running into bugDB #21690736 + * on OVM: + */ +#define SIFF_no_huge_pages 0x80 + +/* Use stencil pqp for invalidation of FMR keys */ +#define SIFF_disable_stencil_invalidate 0x100 + +/* Force disable vpci iommu trapping (to operate as on real hardware..) */ +#define SIFF_disable_vpci_iommu 0x400 + +/* Toss all multipacket qp's instead of resetting and reusing, see #3334 */ +#define SIFF_no_multipacket_qp_reuse 0x800 + +/* Set PCI max payload size to the supported max payload size to avoid #2105 */ +#define SIFF_max_supported_payload 0x1000 + +/* Let driver do page table walk instead of EPSC for query QP - to avoid #3583 */ +#define SIFF_passthrough_query_qp 0x4000 + +/* Check all event queues on all interrupts */ +#define SIFF_check_all_eqs_on_intr 0x8000 + +/* Don't allocate vcbs in a round robin fashion */ +#define SIFF_alloc_cb_round_robin 0x20000 + +/* Don't allocate from all other queues (except cb and qp) in a round robin fashion */ +#define SIFF_disable_alloc_round_robin 0x40000 + +/* Default on rev1 is to force rnr_retry_init to 0 - this feature + * forces it to 7 (infinite retry) instead: + */ +#define SIFF_infinite_rnr 0x80000 + +/* Default is to allocate table entries + * from a two-level allocation where each pd reserves all entries + * within a page and allocates from within this. + * This disables the second level to revert to a + * flat 1-level allocation scheme: + */ +#define SIFF_flat_alloc 0x100000 + +/* SQS Atomics (only has effect for PSIF rev > 3) */ +#define SIFF_force_sqs_atomic_disable 0x200000 + +#define SIFF_force_ib_atomic_hca_mode 0x400000 + +/* Force link retraining upon some errors to ease PCIe triggering */ +#define SIFF_pcie_trigger 0x800000 + +/* Use 0 as magic value in qp setup to debug #3595 */ +#define SIFF_zero_magic 0x1000000 + +/* Use optimization of 2 sge_entries with the first being 48 */ +#define SIFF_disable_inline_first_sge 0x2000000 +/* disable Adaptive int coalescing */ +#define SIFF_dis_auto_int_coalesce 0x4000000 + +/* + * Bringup SIF a in limited mode, where no IB traffic and only + * limited mailbox traffic will be possible + */ +#define SIFF_force_limited_mode 0x8000000 + +/* + * Force WA for HW bug bug 3646, PSIF does not honor min_rnr_timer, + * assumes a homogenous PSIF cluster. 
+ */ +#define SIFF_force_wa_3646 0x10000000 + +#define SIFF_force_rc_2048_mtu 0x20000000 + +/* Configure PSIF to use the opposite base page size (e.g. 8K on x86 and 4K on sparc) */ +#define SIFF_toggle_page_size 0x40000000 + +#define SIFF_all_features 0x7ffeddfb + +#define sif_feature(x) (sif_feature_mask & (SIFF_##x)) + +extern ulong sif_vendor_flags; +#define sif_vendor_enable(x, uflags) ((sif_vendor_flags | uflags) & x) + +extern uint sif_vf_en; +extern uint sif_fwa_mr_en; + +extern uint sif_max_inline; + +extern uint sif_qp_size; +extern uint sif_mr_size; +extern uint sif_ah_size; +extern uint sif_cq_size; +extern uint sif_rq_size; + +extern ulong sif_eps_log_size; +extern ushort sif_eps_log_level; + +extern ushort sif_perf_sampling_threshold; +extern uint sif_fmr_cache_flush_threshold; + +/* Maximum number of outstanding privileged QP requests supported */ +extern uint sif_max_pqp_wr; + +/* Max number of stencil PQPs for (bulk) key invalidate to allocate */ +extern uint sif_ki_spqp_size; + +/* Max number of collect buffers supported */ +extern uint sif_cb_max; + +/* Initialized in init */ +extern struct kmem_cache *compl_cache; + +#endif diff --git a/drivers/infiniband/hw/sif/sif_dma.c b/drivers/infiniband/hw/sif/sif_dma.c new file mode 100644 index 0000000000000..18218d1e4a7e7 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_dma.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_dma.c: DMA memory mapping + */ +#include +#include "sif_dma.h" +#include "sif_dev.h" +#include "psif_hw_data.h" + +struct page *sif_alloc_pages(struct sif_dev *sdev, gfp_t gfp_mask, unsigned int order) +{ +#ifdef CONFIG_NUMA + if (sdev->pdev->dev.numa_node >= 0) { + struct page *page = alloc_pages_node(sdev->pdev->dev.numa_node, gfp_mask, order); + + if (page) + return page; + + sif_logi(&sdev->ib_dev, SIF_INFO, "Warning: unable to allocate order %d, on numa node %d", + order, sdev->pdev->dev.numa_node); + } +#endif + return alloc_pages(gfp_mask, order); +} + + + + +/* allocate/release aligned memory */ +void *sif_dma_alloc_aligned(struct ib_device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, + enum dma_data_direction dir) +{ + dma_addr_t ioaddr; + int ret; + void *cpu_addr; + struct sif_dev *sdev = to_sdev(dev); + struct page *page = sif_alloc_pages(sdev, flag, get_order(size)); + + if (!page) + return NULL; + + cpu_addr = page_address(page); + ioaddr = (dma_addr_t) ib_dma_map_single(dev, cpu_addr, size, dir); + ret = dma_mapping_error(dev->dma_device, ioaddr); + if (ret) { + sif_logi(dev, SIF_DMA, "DMA mapping %p sz %lx %sfailed", + cpu_addr, size, (dir == DMA_TO_DEVICE ? 
"read only " : "")); + free_pages((unsigned long)cpu_addr, get_order(size)); + return NULL; + } + *dma_handle = ioaddr; + return cpu_addr; +} + +void sif_dma_free_aligned(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle, + enum dma_data_direction dir) +{ + ib_dma_unmap_single(dev, dma_handle, size, dir); + free_pages((unsigned long)cpu_addr, get_order(size)); +} + + +void *sif_dma_alloc_readonly(struct ib_device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + return sif_dma_alloc_aligned(dev, size, dma_handle, flag, DMA_TO_DEVICE); +} + +void sif_dma_free_readonly(struct ib_device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle) +{ + sif_dma_free_aligned(dev, size, cpu_addr, dma_handle, DMA_TO_DEVICE); +} diff --git a/drivers/infiniband/hw/sif/sif_dma.h b/drivers/infiniband/hw/sif/sif_dma.h new file mode 100644 index 0000000000000..c9de27fd37ed9 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_dma.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_dma.h: DMA memory mapping + */ + +#ifndef __SIF_DMA_H +#define __SIF_DMA_H + +#include + +struct sif_dev; + +struct page *sif_alloc_pages(struct sif_dev *sdev, gfp_t gfp_mask, unsigned int order); + +void *sif_dma_alloc_coherent(struct ib_device *dev, size_t size, + u64 *dma_handle, gfp_t flag); +void sif_dma_free_coherent(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle); + +/* allocate/release readonly (and noncoherent?) memory */ +void *sif_dma_alloc_readonly(struct ib_device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag); + +void sif_dma_free_readonly(struct ib_device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle); + +/* Allocate/release memory that is naturally aligned according to size, + * eg. 2M gets 2M aligned etc: + */ +void *sif_dma_alloc_aligned(struct ib_device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, + enum dma_data_direction dir); + +void sif_dma_free_aligned(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle, + enum dma_data_direction dir); + + +struct sif_table; + +/* Largest single dma alloc we can get + * - if larger need, switch to vmalloc: + */ +#define SIF_MAX_CONT (PAGE_SIZE << (MAX_ORDER - 1)) + +#endif diff --git a/drivers/infiniband/hw/sif/sif_drvapi.h b/drivers/infiniband/hw/sif/sif_drvapi.h new file mode 100644 index 0000000000000..2e6ba7ac7bb14 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_drvapi.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_drvapi.h: Device specific operations available via the FWA access path + * + */ +#ifndef _SIF_DRVAPI_H +#define _SIF_DRVAPI_H + + +enum sif_drv_opcode { + SIF_DRV_CMD_EPSA_SETUP, /* Set up the standard communication link towards an EPS-A */ + SIF_DRV_CMD_EPSA_TEARDOWN, /* Terminate the communication link with an EPS-A */ +}; + +struct epsa_setup { + enum psif_eps_a_core epsa; /* Which EPS-A to operate on */ + u32 req_size; /* Size in number of reqs of the EPS-A req/rsp queues (only 2**n sizes supported) */ +}; + + +struct sif_drv_req { + enum sif_drv_opcode opcode; + union { + struct epsa_setup epsa; /* The EPS-A number for the operation */ + } u; +}; + +struct sif_drv_rsp { + enum sif_drv_opcode opcode; /* The opcode of the driver operation */ + struct psif_epsc_csr_rsp eps_rsp; /* If status != EPSC_SUCCESS an opt. err resp. from the EPSC */ +}; + + +static inline enum psif_mbox_type epsa_to_mbox(enum psif_eps_a_core epsa) +{ + switch (epsa) { + case PSIF_EPS_A_1: + return MBOX_EPSA0; + case PSIF_EPS_A_2: + return MBOX_EPSA1; + case PSIF_EPS_A_3: + return MBOX_EPSA2; + case PSIF_EPS_A_4: + return MBOX_EPSA3; + default: + break; + } + return (enum psif_mbox_type)-1; +} + + +#endif diff --git a/drivers/infiniband/hw/sif/sif_elog.c b/drivers/infiniband/hw/sif/sif_elog.c new file mode 100644 index 0000000000000..5547fd64e5595 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_elog.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_elog.c: Log over PCIe support for firmware + * TBD: Remove + */ + +#include +#include +#include +#include "sif_dev.h" +#include "sif_elog.h" +#include "sif_query.h" + +static int sif_elog_wait(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + int ret; + struct sif_eps *es = &sdev->es[eps_num]; + struct psif_epsc_csr_rsp resp; + struct psif_epsc_csr_req req; + + init_completion(&es->logdev_more_log); + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_LOG_REQ_NOTIFY; + ret = sif_eps_wr(sdev, eps_num, &req, &resp); + if (ret || resp.status != EPSC_SUCCESS) + return -EINVAL; + + /* data contains the last byte written by eps at the moment + * where the notify call was processed. 
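+ * If that byte is already beyond our current consume offset there is
+ * unread log data and we return immediately so the caller can re-read;
+ * otherwise we sleep interruptibly until sif_elog_intr() completes
+ * logdev_more_log to signal that the EPS has produced more log data.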
+ */ + + if (resp.data > be64_to_cpu(es->data->log.consume_offset)) + return 0; + + ret = wait_for_completion_interruptible(&es->logdev_more_log); + + return ret; +} + +void sif_elog_intr(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + complete(&sdev->es[eps_num].logdev_more_log); +} + +static int sif_elog_open(struct inode *inode, struct file *f) +{ + struct sif_eps *es = container_of(f->f_op, struct sif_eps, logdev_ops); + int ok = atomic_add_unless(&es->logdev_use, -1, 0); + + if (!ok) + return -EBUSY; + + return 0; +} + + +static int sif_elog_release(struct inode *inode, struct file *f) +{ + struct sif_eps *es = container_of(f->f_op, struct sif_eps, logdev_ops); + + atomic_inc(&es->logdev_use); + return 0; +} + + +static ssize_t sif_elog_read(struct file *f, char __user *user, size_t size, loff_t *offset) +{ + int stat; + struct sif_eps *es = container_of(f->f_op, struct sif_eps, logdev_ops); + struct sif_dev *sdev = es->sdev; + struct psif_epsc_log_stat ls; + u64 start_off, end_off, sz, len, start; +restart: + if (eps_version_ge(es, 0, 31)) + copy_conv_to_sw(&ls, &es->data->log, sizeof(ls)); + else + memcpy(&ls, &es->data->log, sizeof(ls)); + + start_off = ls.consume_offset; + end_off = ls.produce_offset; + sz = ls.size; + + len = min((u64)size, end_off - start_off); + start = start_off % sz; + + if (start + len > sz) + len = sz - start; + + if (len == 0) { + stat = sif_elog_wait(sdev, es->eps_num); + if (stat < 0) + return stat; + goto restart; + } + + sif_log(sdev, SIF_EPS, " requested sz %lx, off %llx. Queue: produce %llx, consume %llx - got %llx", + size, *offset, ls.produce_offset, ls.consume_offset, + len); + + if (copy_to_user(user, &es->data->log_data_area[start], len)) + return -EIO; + + ls.consume_offset += len; + es->data->log.consume_offset = cpu_to_be64(ls.consume_offset); + return len; +} + + + +int sif_elog_init(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + struct sif_eps *es = &sdev->es[eps_num]; + struct miscdevice *logdev = &es->logdev; + struct file_operations *logdev_ops = &es->logdev_ops; + struct pci_dev *pdev = sdev->pdev; + + snprintf(es->logdevname, MAX_LOGDEVNAME, "infiniband/sif_eps%s/%02x:%02x.%x", + eps_suffix(sdev, eps_num), pdev->bus->number, + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); + logdev_ops->read = sif_elog_read; + logdev_ops->open = sif_elog_open; + logdev_ops->release = sif_elog_release; + logdev_ops->owner = THIS_MODULE; + logdev->name = es->logdevname; + logdev->minor = MISC_DYNAMIC_MINOR; + logdev->fops = &es->logdev_ops; + atomic_set(&es->logdev_use, 1); + return misc_register(logdev); +} + +int sif_elog_deinit(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + return misc_deregister(&sdev->es[eps_num].logdev); +} diff --git a/drivers/infiniband/hw/sif/sif_elog.h b/drivers/infiniband/hw/sif/sif_elog.h new file mode 100644 index 0000000000000..8c0ecdaa7efe8 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_elog.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_elog.h: Misc device for capturing log from the EPSC + */ + +#ifndef _SIF_ELOG_H +#define _SIF_ELOG_H + +struct sif_dev; + +int sif_elog_init(struct sif_dev *sdev, enum psif_mbox_type eps_num); +int sif_elog_deinit(struct sif_dev *sdev, enum psif_mbox_type eps_num); + +void sif_elog_intr(struct sif_dev *sdev, enum psif_mbox_type eps_num); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_enl.h b/drivers/infiniband/hw/sif/sif_enl.h new file mode 100644 index 0000000000000..9fa605461cb8e --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_enl.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_enl.h: Protocol definitions for the netlink protocol for EPSC access from + * user space. Shared between kernel and user space. + */ + +#ifndef _SIF_ENL_H +#define _SIF_ENL_H + +/* Supported packet types */ +enum sif_enl_cmd_type { + SIF_ENL_CMD_NONE, + SIF_ENL_CMD_REQ, /* Request to an EPS */ + SIF_ENL_CMD_RSP, /* Response from an EPS */ + SIF_ENL_CMD_REQ_DRV, /* Driver requests */ + SIF_ENL_CMD_RSP_DRV, /* Driver response */ + SIF_ENL_CMD_MAX +}; + +/* Supported attributes */ +enum sif_test_attr { + SIF_ENL_A_CMD, + SIF_ENL_A_COMPLEX, + SIF_ENL_A_BUS, + SIF_ENL_A_DEVFN, + SIF_ENL_A_PAYLOAD, + SIF_ENL_A_DATA, + SIF_ENL_A_INDEX, + SIF_ENL_A_MAX +}; + + +/* attribute policy */ +static struct nla_policy sif_enl_policy[SIF_ENL_A_MAX] = { + [SIF_ENL_A_CMD] = { .type = NLA_U32 }, + [SIF_ENL_A_COMPLEX] = { .type = NLA_U16 }, + [SIF_ENL_A_BUS] = { .type = NLA_U16 }, + [SIF_ENL_A_DEVFN] = { .type = NLA_U16 }, + [SIF_ENL_A_PAYLOAD] = { .type = NLA_UNSPEC }, + [SIF_ENL_A_DATA] = { .type = NLA_UNSPEC }, + [SIF_ENL_A_INDEX] = { .type = NLA_U32 } +}; + + +#endif diff --git a/drivers/infiniband/hw/sif/sif_epsc.c b/drivers/infiniband/hw/sif/sif_epsc.c new file mode 100644 index 0000000000000..c38e9122e7a07 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_epsc.c @@ -0,0 +1,1739 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_epsc.c: Implementation of API for communication with the EPSC + * + * In general this module has to make sure that + * 1) we never have more packets outstanding with the EPS than hw_enties + * 2) we do not post more packets than we have completion entries for, + * eg. we must ensure that completions not yet forwarded as a result of + * a *waitfor* call is not overwritten by hw. 
+ */ + +#include "sif_epsc.h" +#include "sif_eq.h" +#include "sif_dev.h" +#include "sif_base.h" +#include "psif_hw_csr.h" +#include "psif_hw_data.h" +#include "psif_hw_setget.h" +#include "sif_dma.h" +#include "sif_query.h" +#include "sif_elog.h" +#include "sif_hwi.h" +#include "sif_spt.h" +#include "sif_defs.h" +#include +#include + +#define CSR_ONLINE_MASK 0x8000 + +#define EPSC_LOG_MODE_BUFFER EPSC_LOG_MODE_SCAT + + +static int write_csr(struct sif_dev *sdev, u32 addr, u64 val); +static u64 read_csr(struct sif_dev *sdev, u32 addr, bool local); + +union sif_mailbox { + u64 raw; + struct psif_epsc_csr_doorbell x; +}; + +static int __sif_eps_send_keep_alive(struct sif_dev *sdev, enum psif_mbox_type eps_num, + bool force); + +static enum psif_mbox_type sif_tab2mbox(struct sif_dev *sdev, enum sif_tab_type tab_type) +{ + return (tab_type & ~1) == epsc_csr_req ? sdev->mbox_epsc + : ((tab_type - epsa0_csr_req) >> 1); +} + + +static enum sif_tab_type sif_mbox2req_tab(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + return eps_num == sdev->mbox_epsc ? epsc_csr_req + : epsa0_csr_req + (eps_num << 1); +} + +static enum sif_tab_type sif_mbox2rsp_tab(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + return eps_num == sdev->mbox_epsc ? epsc_csr_rsp + : epsa0_csr_rsp + (eps_num << 1); +} + + +const char *eps_name(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + if (eps_num == sdev->mbox_epsc) + return "C"; + + switch (eps_num) { + case MBOX_EPSA0: + return "A-0"; + case MBOX_EPSA1: + return "A-1"; + case MBOX_EPSA2: + return "A-2"; + case MBOX_EPSA3: + return "A-3"; + default: + break; + } + return "(nonexisting eps)"; +} + + +const char *eps_suffix(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + if (eps_num == sdev->mbox_epsc) + return "c"; + + switch (eps_num) { + case MBOX_EPSA0: + return "a0"; + case MBOX_EPSA1: + return "a1"; + case MBOX_EPSA2: + return "a2"; + case MBOX_EPSA3: + return "a3"; + default: + break; + } + return "(nonexisting eps)"; +} + + +bool is_eps_rsp_tab(enum sif_tab_type type) +{ + switch (type) { + case epsc_csr_rsp: + case epsa0_csr_rsp: + case epsa1_csr_rsp: + case epsa2_csr_rsp: + case epsa3_csr_rsp: + return true; + default: + return false; + } +} + + +int eps_status_to_err(enum psif_epsc_csr_status status) +{ + switch (status) { + case EPSC_SUCCESS: + return 0; + case EPSC_EKEYREJECTED: + return -EKEYREJECTED; + case EPSC_EADDRNOTAVAIL: + return -EPERM; + case EPSC_EOPNOTSUPP: + return -EOPNOTSUPP; + case EPSC_ENOMEM: + return -ENOMEM; + case EPSC_ENODATA: /* ENODATA is not an error */ + return 0; + case EPSC_EAGAIN: + return -EAGAIN; + case EPSC_ECANCELED: + return -ECANCELED; + case EPSC_ECONNRESET: + return -ECONNRESET; + case EPSC_ECSR: + return -EACCES; + case EPSC_MODIFY_QP_OUT_OF_RANGE: + return -ERANGE; + case EPSC_MODIFY_QP_INVALID: + return -EINVAL; + case EPSC_MODIFY_CANNOT_CHANGE_QP_ATTR: + return -EBUSY; + case EPSC_MODIFY_INVALID_QP_STATE: + case EPSC_MODIFY_INVALID_MIG_STATE: + return -EINVAL; + case EPSC_MODIFY_TIMEOUT: + return -ETIMEDOUT; + case EPSC_ETEST_HEAD: + case EPSC_ETEST_TAIL: + case EPSC_ETEST_PATTERN: + return -EIO; + case EPSC_EADDRINUSE: + return -EADDRINUSE; + case EPSC_EINVALID_VHCA: + return -ECHRNG; + case EPSC_EINVALID_PORT: + return -ELNRNG; + case EPSC_EINVALID_ADDRESS: + return -EADDRNOTAVAIL; + case EPSC_EINVALID_PARAMETER: + return -EINVAL; + case EPSC_FAIL: + return -ENOTRECOVERABLE; + default: + return -EUCLEAN; /* If this is returned, this function needs corrections */ + } +} + + +struct 
psif_epsc_csr_req *get_eps_csr_req(struct sif_dev *sdev, + enum psif_mbox_type eps_num, int index) +{ + enum sif_tab_type type = sif_mbox2req_tab(sdev, eps_num); + + return (struct psif_epsc_csr_req *) + (sif_mem_kaddr(sdev->ba[type].mem, index * sdev->ba[type].ext_sz)); +} + +struct psif_epsc_csr_rsp *get_eps_csr_rsp(struct sif_dev *sdev, + enum psif_mbox_type eps_num, int index) +{ + enum sif_tab_type type = sif_mbox2rsp_tab(sdev, eps_num); + + return (struct psif_epsc_csr_rsp *) + (sif_mem_kaddr(sdev->ba[type].mem, index * sdev->ba[type].ext_sz)); +} + +static inline u16 get_eps_mailbox_seq_num(volatile struct psif_epsc_csr_rsp *rsp) +{ + return rsp->seq_num & (CSR_ONLINE_MASK - 1); +} + +/* Cond. call completion on an entry in the response queue + * Assumes the eps lock is held + */ +static inline void __epsc_complete(struct sif_dev *sdev, enum psif_mbox_type eps_num, int idx) +{ + struct sif_eps *es = &sdev->es[eps_num]; + struct sif_eps_cqe *cqe = es->cqe[idx]; + + if (cqe && cqe->need_complete) + complete(&cqe->cmpl); +} + +void epsc_complete(struct sif_dev *sdev, enum psif_mbox_type eps_num, int idx) +{ + unsigned long flags; + struct sif_eps *es = &sdev->es[eps_num]; + + spin_lock_irqsave(&es->lock, flags); + __epsc_complete(sdev, eps_num, idx); + spin_unlock_irqrestore(&es->lock, flags); +} + +static int sif_eps_api_version_ok(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + bool psif_version_ok; + bool epsc_version_ok; + struct sif_eps *es = &sdev->es[eps_num]; + + /* Validate that we have compatible versions */ + sif_log(sdev, SIF_INFO, "Connected to SIF version %d.%d, EPS%s version %d.%d", + es->ver.psif_major, es->ver.psif_minor, + eps_name(sdev, eps_num), + es->ver.epsc_major, es->ver.epsc_minor); + + psif_version_ok = + es->ver.psif_major == PSIF_MAJOR_VERSION && + es->ver.psif_minor == PSIF_MINOR_VERSION; + + if (!psif_version_ok) { + u32 ever, dver, rev1ver; + + sif_log(sdev, SIF_INFO, + " *** PSIF architecture version mismatch: driver expects v.%d.%d, fw supports v.%d.%d ***", + PSIF_MAJOR_VERSION, PSIF_MINOR_VERSION, + es->ver.psif_major, es->ver.psif_minor); + ever = PSIF_API_VERSION(es->ver.psif_major, es->ver.psif_minor); + rev1ver = PSIF_API_VERSION(4, 06); + dver = PSIF_VERSION; + if ((dver > rev1ver && ever <= rev1ver) || + (ever > rev1ver && dver <= rev1ver)) { + sif_log(sdev, SIF_INFO, "Wrong driver build for this chip revision!"); + return -ENOEXEC; + } + } + + epsc_version_ok = + es->ver.epsc_major == EPSC_MAJOR_VERSION && + es->ver.epsc_minor == EPSC_MINOR_VERSION; + + if (!epsc_version_ok) { + sif_log(sdev, SIF_INFO_V, + " *** EPS%s API version mismatch: driver expects v.%d.%d, firmware implements v.%d.%d ***", + eps_name(sdev, eps_num), + EPSC_MAJOR_VERSION, EPSC_MINOR_VERSION, + es->ver.epsc_major, es->ver.epsc_minor); + } + + /* PSIF version must match exactly, any EPSC version is ok */ + if (!psif_version_ok) + return -ENOEXEC; + return 0; +} + + +static int sif_eps_firmware_version_ok(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + int ret; + int i = 0, fi = 0; + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + char *p; + char *start; + char *vs; + struct sif_eps *es = &sdev->es[eps_num]; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_FW_VERSION; + req.u.fw_version.host_addr = + (u64)es->data_dma_hdl + offsetof(struct sif_epsc_data, fw_version); + + ret = sif_eps_wr_poll(sdev, eps_num, &req, &rsp); + if (ret) + return ret; + + /* Parse the string we got: */ + p = start = es->data->fw_version; + for (i = 0; i < 
MAX_FW_VERSION_INFO_SZ; i++) { + if (p[i] == '\0') { + sif_log(sdev, SIF_VERBS, "fw_version[%d]: %s", + fi, start); + es->ver.fw_version[fi++] = start; + /* skip 0 byte */ + start = p + i + 1; + if (fi >= FWV_MAX) + break; + } + } + sif_log(sdev, SIF_INFO, "EPSC firmware image revision string %s", + es->ver.fw_version[FWV_EPS_REV_STRING]); + sif_log(sdev, SIF_INFO, "EPSC firmware version tag:\n%s", + es->ver.fw_version[FWV_EPS_GIT_LAST_COMMIT]); + if (es->ver.fw_version[FWV_EPS_GIT_STATUS][0] != '\0') + sif_log(sdev, SIF_INFO, " *** epsfw git status at build time: ***\n%s", + es->ver.fw_version[FWV_EPS_GIT_STATUS]); + + vs = es->ver.fw_version[FWV_EPS_REV_STRING]; + if (sscanf(vs, "%hu.%hu", &es->ver.fw_major, &es->ver.fw_minor) != 2) + return -EINVAL; + + if (vs[0] == 'R' && es->ver.fw_minor == 0) + es->ver.fw_minor = 1; + + sif_log(sdev, SIF_INFO, "EPSC interpreted firmware revision: %hu.%hu", + es->ver.fw_major, es->ver.fw_minor); + return 0; +} + + +static int sif_eps_log_ctrl(struct sif_dev *sdev, enum psif_mbox_type eps_num, + enum psif_epsc_log_mode mode, + enum psif_epsc_log_level level) +{ + int ret; + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + struct sif_eps *es = &sdev->es[eps_num]; + + if (eps_num != sdev->mbox_epsc) { + /* TBD: Data area has not been allocated for EPSAs! */ + return -ENOMEM; + } + + if (!es->data->log.size) { + sif_log(sdev, SIF_INFO, "cannot redirect - no data buffer configured"); + return -ENOMEM; + } + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_LOG_CTRL; + /* TBD: Higher log levels than debug will give a feedback loop... */ + req.u.log_ctrl.level = level > EPS_LOG_DEBUG ? EPS_LOG_DEBUG : level; + req.u.log_ctrl.mode = mode; + req.u.log_ctrl.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx; + + if (mode == EPSC_LOG_MODE_HOST) { + req.u.log_ctrl.stat_base = + (u64)es->data_dma_hdl + offsetof(struct sif_epsc_data, log); + req.u.log_ctrl.base = + (u64)es->data_dma_hdl + offsetof(struct sif_epsc_data, log_data_area); + req.u.log_ctrl.length = + es->data->log.size; + } + + ret = sif_eps_wr_poll(sdev, eps_num, &req, &rsp); + if (!ret) { + if (mode == EPSC_LOG_MODE_HOST) { + sif_log(sdev, SIF_INFO, + "Enabled EPS log redirect to buffer at %p (sz 0x%llx)", + es->data->log_data_area, + es->data->log.size); + ret = sif_elog_init(sdev, eps_num); + if (ret) + sif_log(sdev, SIF_INFO, "Failed to create eps logging device for EPS%s", + eps_name(sdev, eps_num)); + es->log_redir_en = true; + } else { + if (es->log_redir_en) { + sif_elog_deinit(sdev, eps_num); + es->log_redir_en = false; + } + sif_log(sdev, SIF_INFO, "Disabled EPS log redirect"); + } + } + return ret; +} + + +int epsc_set_mmu_upper(struct sif_dev *sdev, u16 value) +{ + int ret; + + if (eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 103)) { + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_SET; + req.u.set.data.op = EPSC_QUERY_TA_UPPER_TWELVE; + req.u.set.info.op = EPSC_QUERY_PA_UPPER_TWELVE; + req.u.set.data.value = value; + req.u.set.info.value = value; + ret = sif_epsc_wr_poll(sdev, &req, &rsp); + } else { + u64 v = read_csr(sdev, 0x200000, false); + + v &= ~((0xfffull << 48) | (0xfffull << 32)); + v |= ((u64)value << 48) | ((u64)value << 32); + ret = write_csr(sdev, 0x200000, v); + } + if (ret) + sif_log(sdev, SIF_INFO, "Failed to set mmu_upper bits!"); + + if (PSIF_REVISION(sdev) <= 3) + /* Enable WA for Bug #4096: TA/PA upper has no effect on level0 contexts */ + sdev->single_pte_pt = true; + return ret; +} + + 
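The legacy branch of epsc_set_mmu_upper() above programs the two upper-twelve-bit
fields by a read-modify-write of CSR 0x200000. Below is a minimal sketch of just
that bit manipulation, assuming the same field positions (bits 32-43 and 48-59)
as in the code above; the helper name merge_upper_twelve is illustrative only and
is not part of the driver.

/* Illustrative only: merge a 12-bit "upper" value into both upper-twelve
 * fields of the CSR word, exactly as the legacy path above does.
 */
static inline u64 merge_upper_twelve(u64 csr, u16 value)
{
	/* clear both 12-bit fields: bits 48..59 and bits 32..43 */
	csr &= ~((0xfffull << 48) | (0xfffull << 32));
	/* write the same value into both fields */
	return csr | ((u64)value << 48) | ((u64)value << 32);
}

With value = 0x7ff, for example, the result holds 0x7ff in bits 48-59 and in
bits 32-43 while leaving all other bits of the CSR word unchanged.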
+ +/* special epsc initialization */ +static void eps_struct_init(struct sif_dev *sdev) +{ + struct sif_eps *es; + u8 i; + + for (i = 0; i < sdev->eps_cnt; i++) { + memset(sdev->es, 0, sizeof(*sdev->es)); + es = &sdev->es[i]; + es->sdev = sdev; + es->eps_num = i; + spin_lock_init(&es->lock); + + if (i != sdev->mbox_epsc) + continue; + + /* EPSC is implicitly started at power on */ + if (es->state == ES_NOT_RUNNING) + es->state = ES_RUNNING; + } +} + + +static int eps_set_state(struct sif_dev *sdev, enum psif_mbox_type eps_num, + enum sif_eps_state new_state) +{ + unsigned long flags; + struct sif_eps *es = &sdev->es[eps_num]; + int ret = 0; + + spin_lock_irqsave(&es->lock, flags); + switch (es->state) { + case ES_NOT_RUNNING: + case ES_RUNNING: + if (new_state == ES_INIT || new_state == ES_NOT_RUNNING) + break; + ret = -EINVAL; + goto init_failed; + case ES_INIT: + if (new_state == ES_ACTIVE || new_state == ES_NOT_RUNNING) + break; + ret = -ENODEV; + goto init_failed; + case ES_ACTIVE: + if (new_state == ES_RUNNING) + break; + ret = -EBUSY; + goto init_failed; + } + es->state = new_state; + spin_unlock_irqrestore(&es->lock, flags); + return 0; +init_failed: + sif_log(sdev, SIF_INIT, "Invalid EPS%s state transition (%d -> %d)", + eps_name(sdev, eps_num), es->state, new_state); + spin_unlock_irqrestore(&es->lock, flags); + return ret; +} + + +/* Define the atomic op completer device capabilites and device control here as + * not able to find it in the pci_reg.h. This should be get into the pci_reg.h. + */ +#define ATOMIC_OP_32_BIT_COMPLETER_SUPPORTED (1ULL << 7) +#define ATOMIC_OP_64_BIT_COMPLETER_SUPPORTED (1ULL << 8) +#define CAS_OP_128_BIT_COMPLETER_SUPPORTED (1ULL << 9) +#define ATOMIC_OP_REQUESTER_ENABLE (1ULL << 6) +static enum psif_epsc_csr_atomic_op sif_get_atomic_config(struct sif_dev *sdev, + enum psif_mbox_type eps_num) +{ + struct pci_dev *parent; + int pcie_cap, pcie_parent_cap; + u16 pdevcap2, devctrl2; + int ret = 0; + enum psif_epsc_csr_atomic_op atomic_op_flags = PSIF_PCIE_ATOMIC_OP_NONE; + parent = pci_upstream_bridge(sdev->pdev); + + if (!parent) { + sif_log(sdev, SIF_INFO, + "No parent bridge device, cannot determine atomic capabilities!"); + return PSIF_PCIE_ATOMIC_OP_NONE; + } + + pcie_parent_cap = pci_find_capability(parent, PCI_CAP_ID_EXP); + + if (!pcie_parent_cap) { + sif_log(sdev, SIF_INFO, + "PCIe capability in parent device not found, cannot determine atomic capabilities!"); + return PSIF_PCIE_ATOMIC_OP_NONE; + } + + ret = pci_read_config_word(parent, pcie_parent_cap + PCI_EXP_DEVCAP2, &pdevcap2); + if (ret) { + /* set to PSIF_PCIE_ATOMIC_OP_NONE if pci read fails*/ + return atomic_op_flags; + } + if (pdevcap2 & (ATOMIC_OP_32_BIT_COMPLETER_SUPPORTED | + ATOMIC_OP_64_BIT_COMPLETER_SUPPORTED | + CAS_OP_128_BIT_COMPLETER_SUPPORTED)) { + pcie_cap = pci_find_capability(sdev->pdev, PCI_CAP_ID_EXP); + ret = pci_read_config_word(sdev->pdev, pcie_cap + PCI_EXP_DEVCTL2, &devctrl2); + /* check whether PSIF set the ATOMIC_OP_REQUESTER_ENABLE bit */ + if (!(devctrl2 & ATOMIC_OP_REQUESTER_ENABLE)) { + ret = pci_write_config_word(sdev->pdev, pcie_cap + PCI_EXP_DEVCTL2, + (devctrl2 | ATOMIC_OP_REQUESTER_ENABLE)); + if (ret) { + /* set to PSIF_PCIE_ATOMIC_OP_NONE if pci write fails*/ + return atomic_op_flags; + } + sif_log(sdev, SIF_INFO, + "Set atomic_op_requester_enable in devctrl2 (%x)\n", devctrl2); + } + + /* Always enable SQS atomic and IB global atomic if RC supports atomicOp */ + atomic_op_flags = PSIF_PCIE_ATOMIC_OP_BOTH; + /* EPS-A cores do not need to worry about 
different IB atomic mode, as they only + * need to know whether PSIF has atomic_op_requester_enable set. + */ + if (eps_num == sdev->mbox_epsc) { + /* SQS atomics does not work in these revisions: */ + bool disable_sqs_atomics = PSIF_REVISION(sdev) <= 3 ? + true : sif_feature(force_sqs_atomic_disable); + + if (disable_sqs_atomics && + sif_feature(force_ib_atomic_hca_mode)) { + atomic_op_flags = PSIF_PCIE_ATOMIC_OP_NONE; + } else if (disable_sqs_atomics) { + atomic_op_flags = PSIF_PCIE_ATOMIC_OP_IB; + } else if (sif_feature(force_ib_atomic_hca_mode)) { + atomic_op_flags = PSIF_PCIE_ATOMIC_OP_SQS; + } + } + } + return atomic_op_flags; +} + + +/* Helper function to handle the legacy cases of endianness conversion for the + * initial config request (see #3804) + */ +static struct psif_epsc_csr_config *eps_init_config(struct sif_eps *es, struct psif_epsc_csr_config *lcfg) +{ +#ifdef __LITTLE_ENDIAN + switch (es->ver.seq_set_proto) { + case 0: + return lcfg; + case 1: + case 2: + /* Use a config struct in network byte order */ + copy_conv_to_hw(&es->ver.nb_cfg, lcfg, sizeof(*lcfg)); + return &es->ver.nb_cfg; + } +#else + struct sif_dev *sdev = es->sdev; + + switch (es->ver.seq_set_proto) { + case 0: + /* Legacy mode: + * Handling not endian neutral and becomes different depending on + * EPSC platform endianness.. + */ + if (IS_SIBS(sdev)) { + sif_log(sdev, SIF_INFO, "Using straight through mode"); + return lcfg; + } + sif_log(sdev, SIF_INFO, "Converting config to LE (bw comp mode)"); + copy_conv_to_le(&es->ver.nb_cfg, lcfg, sizeof(*lcfg)); + return &es->ver.nb_cfg; + case 1: + case 2: + return lcfg; + } +#endif + return NULL; +} + + +/* Initial setup of communication with the EPSC: + * The initial phase consists of using the mailbox to communicate + * about about where the request and response queues of the EPSC + * should be placed in memory, and a few basic configuration options. + * This is done via + * 1) A reset cycle + * 2) An optional (supported by all new firmware) protocol version negotiation + * 3) Transfer of the psif_epsc_csr_config request which informs EPSC about where to find the + * req and resp queues, which is used for all following communication + * for the rest of the driver instance's lifetime. 
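+ *
+ * Each of these steps is carried out as 64 bit mailbox exchanges: the driver
+ * writes a union sif_mailbox value (head/tail sequence fields plus a 32 bit
+ * data word) to the in-register and polls the out-register until the EPS
+ * echoes the expected value, or the operation times out.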
+ */ + +/* This driver supports all initial mailbox exchange protocol versions up to and + * including this version: + */ +#define MAILBOX_SUPPORTED_PROTOCOL 2 + +int sif_eps_init(struct sif_dev *sdev, enum sif_tab_type type) +{ + /* We get called with the response queue type */ + enum psif_mbox_type eps_num = sif_tab2mbox(sdev, type); + struct sif_table *req_tp = &sdev->ba[type - 1]; + struct sif_table *rsp_tp = &sdev->ba[type]; + struct psif_epsc_csr_config lconfig; + struct psif_epsc_csr_config *config; + struct sif_eps_cqe lcqe; + struct psif_epsc_csr_rsp lrsp; + union sif_mailbox set, get; + struct psif_epsc_csr_rsp *cqe; + struct sif_eps *es = &sdev->es[eps_num]; + int ret = 0; + u16 seq_num = 0; /* Init runs in separate seq.numbers */ + int i; + ulong timeout = es->keepalive_interval = sdev->min_resp_ticks * 2; + ulong timeout_time = jiffies + timeout; + u64 tries = 0; + size_t bsz; + size_t config_cycle_count = sizeof(struct psif_epsc_csr_config)/sizeof(u32); + bool restarted_reset = false; + + /* Max mailbox exchange protocol version supported by this driver */ + u16 mailbox_seq_version_to_use = 2; + + if (eps_num == sdev->mbox_epsc) + eps_struct_init(sdev); + + es->last_seq = 0; + + ret = eps_set_state(sdev, eps_num, ES_INIT); + if (ret) + return ret; + + es->last_seq = 0; + atomic_set(&es->cur_reqs, 1); /* The initial request is not "posted" */ + es->max_reqs = 0; + es->mask = req_tp->entry_cnt - 1; + es->lowpri_lim = req_tp->entry_cnt - min_t(int, req_tp->entry_cnt/2, 2); + + if (rsp_tp->entry_cnt != req_tp->entry_cnt) { + sif_log(sdev, SIF_INFO, + "Illegal config - EPS queues must have the same length"); + return -EINVAL; + } + + bsz = sizeof(struct sif_eps_cqe *) * rsp_tp->entry_cnt; + es->cqe = kzalloc(bsz, GFP_KERNEL); + if (!es->cqe) { + sif_log(sdev, SIF_INFO, + "Failed to allocate %ld bytes for EPS%s completions", bsz, + eps_suffix(sdev, eps_num)); + return -ENOMEM; + } + + /* Use extra allocated space at the end of the completion array for the data area + * TBD: This code is not safe if any of the data elements cross a 2M page boundary + * - should move it out as a separate allocation. 
+ */ + es->data = sif_mem_kaddr(rsp_tp->mem, rsp_tp->table_sz); + es->data_dma_hdl = sif_mem_dma(rsp_tp->mem, rsp_tp->table_sz); + es->data->log.size = sif_eps_log_size; + + /* Initialize the first response status to != 0 */ + cqe = get_eps_csr_rsp(sdev, eps_num, 0); + set_psif_epsc_csr_rsp__seq_num(cqe, (u64)-1); + + sif_log(sdev, SIF_INIT, "Data area for EPSC queries: %p (dma %pad) len %ld", + es->data, &es->data_dma_hdl, sizeof(struct sif_epsc_data)); + memset(&lconfig, 0, sizeof(lconfig)); + config = &lconfig; + memset(&lrsp, 0x6a, sizeof(struct psif_epsc_csr_rsp)); + lcqe.rsp = &lrsp; + lcqe.need_complete = false; + + lconfig.hwapi_major_ver = PSIF_MAJOR_VERSION; + lconfig.hwapi_minor_ver = PSIF_MINOR_VERSION; + lconfig.epsapi_major_ver = EPSC_MAJOR_VERSION; + lconfig.epsapi_minor_ver = EPSC_MINOR_VERSION; + + lconfig.request = req_tp->sif_base; + lconfig.response = rsp_tp->sif_base; + lconfig.extent_req = req_tp->ext_sz; + lconfig.extent_rsp = rsp_tp->ext_sz; + lconfig.entries = rsp_tp->entry_cnt; + if (!sdev->is_vf) + lconfig.atomic_support = sif_get_atomic_config(sdev, eps_num); + else + lconfig.atomic_support = PSIF_PCIE_ATOMIC_OP_NONE; + /* Ask the EPSC to reset the function we are accessing - starting from a clean state */ + lconfig.clean_state = 1; + +#ifndef __LITTLE_ENDIAN + /* Tell the EPSC that host is big endian */ + sif_log(sdev, SIF_INFO, "Configure for big endian host"); + lconfig.big_endian = 1; +#endif + lconfig.sparc_pages = (sdev->mi.page_size == 0x2000) ? 1 : 0; + if (rsp_tp->mem->mem_type != SIFMT_BYPASS) { + sif_log(sdev, SIF_INFO, + "Failed EPSC mappings: GVA2GPA mode not supported yet, consider reducing epsc_size"); + ret = -ENOMEM; + goto err_map_ctx; + } + + /* Allocate bypass mmu context (for responses) with wr_access set */ + ret = sif_map_ctx(sdev, &rsp_tp->mmu_ctx, rsp_tp->mem, rsp_tp->sif_base, + rsp_tp->table_sz, true); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to set mmu context for epsc_rsp"); + goto err_map_ctx; + } + + /* Pass the populated context on to the EPS */ + lconfig.mmu_cntx = rsp_tp->mmu_ctx.mctx; + +eps_reset: + sif_log(sdev, SIF_INIT, "Resetting EPS%s..", eps_name(sdev, eps_num)); + + /* 1) EPSC reset cycles: + * Special write cycle to reset EPS communication + */ + set.raw = MAILBOX_RESTART; + do { + tries++; + eps_mailbox_write(sdev, eps_num, set.raw); + get.raw = eps_mailbox_read(sdev, eps_num); + } while (get.raw != 0 && time_is_after_jiffies(timeout_time)); + + if (get.raw != MAILBOX_RESTART) { + sif_log(sdev, SIF_INFO, + "Failed to reset EPS%s after %lld tries (%ld ticks) - last read 0x%llx", + eps_name(sdev, eps_num), tries, timeout, get.raw); + ret = -ENODEV; + goto epsc_failed; + } + + /* 2) Meta protocol version negotiation: + * This step is basically used to determine how the initial config request + * should look: + */ + timeout_time = jiffies + timeout; + tries = 0; + + if (restarted_reset && mailbox_seq_version_to_use > 1) { + /* 2nd attempt - very old firmware - skip the protocol probing algo.. */ + goto proto_probing_done; + } + set.x.head = set.x.tail = MAILBOX_SEQ_SET_PROTOCOL; + + if (!restarted_reset) { + /* Handle bug #4101: + * Some old firmware versions will respond with the same mailbox protocol version + * as the one requested by the driver, no matter what. We must check that we don't have + * this version by trying version 0xffff which does not exist. If we get v.0xffff back + * we know we have this old firmware and can retry with v.0. + * v.2 and later will respond with the negotiated version. 
+ */ + set.x.data = 0xffff; + } else { + /* The meta protocol number we request - if this fails, we are at the legacy firmware + * version which does not support this stage, and where config data is + * expected in LE order (See #3804) + */ + set.x.data = mailbox_seq_version_to_use; + } + + do { + tries++; + eps_mailbox_write(sdev, eps_num, set.raw); + get.raw = eps_mailbox_read(sdev, eps_num); + } while (get.x.head != MAILBOX_SEQ_SET_PROTOCOL && get.raw != MAILBOX_IN_ERROR + && time_is_after_jiffies(timeout_time)); + + if (time_is_before_eq_jiffies(timeout_time)) { + sif_log(sdev, SIF_INFO, + "Failed to get seq.protocol info from EPS%s after %lld tries (%ld ticks) - last read 0x%llx", + eps_name(sdev, eps_num), tries, timeout, get.raw); + if (!restarted_reset) { + restarted_reset = true; + sif_log(sdev, SIF_INFO, + "- assuming very old firmware without protocol version probing: restarting.."); + goto eps_reset; + } else { + ret = -ESRCH; + goto epsc_failed; + } + } + + if (!restarted_reset && get.x.data == 0xffff) { + /* We have identified bug #4101 in firmware: + * Firmware that responds wrongly on the mailbox exchange protocol, + * retry with version 0: + */ + sif_log(sdev, SIF_INFO, + "- found old firmware which responds wrongly to protocol version probing: restarting.."); + restarted_reset = true; + mailbox_seq_version_to_use = 0; + goto eps_reset; + } + + if (get.x.head != MAILBOX_SEQ_SET_PROTOCOL) { + sif_log(sdev, SIF_INFO, "Legacy firmware found - no SEQ_SET_PROTOCOL supported"); + es->ver.seq_set_proto = 0; + } else if (get.x.data > MAILBOX_SUPPORTED_PROTOCOL) { + mailbox_seq_version_to_use = MAILBOX_SUPPORTED_PROTOCOL; + restarted_reset = true; + goto eps_reset; + } else + es->ver.seq_set_proto = get.x.data; + +proto_probing_done: + sif_log(sdev, SIF_INFO, "In contact with EPS%s with initial mailbox negotiation protocol v.%d", + eps_name(sdev, eps_num), es->ver.seq_set_proto); + if (!es->ver.seq_set_proto) + sif_log(sdev, SIF_INFO, + "***** Warning: firmware update necessary, support for this version discontinued! *****"); + + /* Set up the config struct correctly for transfer */ + config = eps_init_config(es, &lconfig); + if (!config) + goto epsc_failed; + + /* At this point it is safe to enable bus master for PSIF + * Firmware guarantees that we do not get here until all state + * from any previous runs have been cleared out + */ + pci_set_master(sdev->pdev); + + /* 3) Transfer the psif_epsc_csr_config request via the mailbox. 
+ * The result is then expected as response in the first response queue + * element in the area pointed to by the request transferred here: + */ + tries = 0; + sif_log(sdev, SIF_INIT, + "Setting up EPS%s: req at %llx, rsp at %llx, entries %d cycles %ld", + eps_name(sdev, eps_num), lconfig.request, lconfig.response, + lconfig.entries, sizeof(lconfig)/sizeof(u32)); + + + seq_num = 0; + for (i = 0; i < config_cycle_count; i++) { + set.x.head = set.x.tail = ++seq_num; + set.x.data = ((u32 *)(config))[i]; + tries = 0; + timeout_time = jiffies + timeout; + do { + tries++; + eps_mailbox_write_data(sdev, eps_num, set.raw); + get.raw = eps_mailbox_read_data(sdev, eps_num); + } while (((get.x.head != seq_num) || (get.x.tail != seq_num)) && + get.raw != MAILBOX_IN_ERROR && + time_is_after_jiffies(timeout_time)); + if (get.raw == MAILBOX_IN_ERROR && time_is_after_jiffies(timeout_time)) { + sif_log(sdev, SIF_INFO, + "Writing config data failed before timeout - retrying..."); + goto eps_reset; + } else if (seq_num > 0xa && time_is_before_eq_jiffies(timeout_time)) { + config_cycle_count = i; + sif_log(sdev, SIF_INFO, + "Unable to get part %d (%lld tries) - old firmware? - retrying...", + i, tries); + goto eps_reset; + } else if (set.x.data != get.x.data || time_is_before_eq_jiffies(timeout_time)) { + sif_log(sdev, SIF_INFO, + "Failed during init sequence for EPS%s, part %d (%lld tries) set %llx get %llx, expected seq %x %s", + eps_name(sdev, eps_num), i, tries, set.raw, get.raw, seq_num, + (time_is_before_jiffies(timeout_time) ? "[timeout]" : "")); + ret = -EIO; + goto epsc_failed; + } + } + + sdev->es[eps_num].timeout = timeout_time; + + /* Set storage for this initial request manually before polling */ + es->cqe[0] = &lcqe; + + /* At this point we expect to have a valid response in the first position: */ + ret = sif_eps_poll_cqe(sdev, eps_num, 0, &lcqe); + if (ret) { + goto epsc_failed; + } + /* We are up and running with the EPSC, figure out what + * this firmware offers. + */ + + /* in protocol version 2 bits 16-31 of the response sequence number contain + * an ID the driver has to provide in requests + */ + es->mbox_id = (lrsp.seq_num >> 16) & 0xffff; + + memcpy(&es->ver, &lrsp.data, sizeof(lrsp.data)); + + /* The addr field now contains the number of available event queues from this EPS */ + es->eqs.max_cnt = lrsp.addr & 0xffff; + /* minimum number of async EPSC EQ entries per port is in the higher 16 bits + * and is an offset to 16 + */ + es->eqs.min_sw_entry_cnt = ((lrsp.addr >> 16) & 0xffff) + 16; + + /* PSIF has flagged that it is running in degraded mode */ + if (lrsp.info & PSIF_INFO_FLAG_DEGRADED) { + sif_log(sdev, SIF_INFO, "PSIF device is degraded"); + sdev->degraded = true; + } + + if (sif_cq_eq_max < 1) + sif_cq_eq_max = 1; /* Adjust - need at least 1 completion event queue */ + + /* We only allocate resources for these */ + es->eqs.cnt = min_t(ulong, es->eqs.max_cnt, sif_cq_eq_max + 2); + + ret = sif_eps_api_version_ok(sdev, eps_num); + if (ret) + goto epsc_failed; + + /* APIs are ok - now request, report and possibly + * validate epsc firmware (build) version info + */ + ret = sif_eps_firmware_version_ok(sdev, eps_num); + if (ret) + goto epsc_failed; + +#if defined(CONFIG_ARCH_DMA_ADDR_T_64BIT) && defined(__sparc__) + /* The kernel is currently using iommu bypass mode in the sparc iommu, and + * the PSIF MMU requires a fixed configuration of the upper 12 bits of the + * DMA addresses: we need bit 63 set in all GVA2GPA accesses. 
+ */ + { + u16 upper_12 = sif_mem_dma(rsp_tp->mem, 0) >> PSIF_TABLE_PTR_SHIFT; + + ret = epsc_set_mmu_upper(sdev, upper_12); + if (ret) + goto epsc_failed; + } +#endif + + /* Interrupt setup */ + if (eps_num == sdev->mbox_epsc) { + ret = sif_enable_msix(sdev); + if (ret) + goto epsc_failed; + } + + /* Set up the event queues as a special case here */ + ret = sif_eq_init(sdev, es, &lrsp); + if (ret) + goto epsc_eq_init_failed; + + if (sif_eps_log_size) + ret = sif_eps_log_ctrl(sdev, eps_num, EPSC_LOG_MODE_HOST, sif_eps_log_level); + if (ret) + goto epsc_log_ctrl_failed; + + eps_set_state(sdev, eps_num, ES_ACTIVE); + return ret; + + +epsc_log_ctrl_failed: + sif_eq_deinit(sdev, es); +epsc_eq_init_failed: + if (eps_num == sdev->mbox_epsc) + sif_disable_msix(sdev); +epsc_failed: + sif_unmap_ctx(sdev, &rsp_tp->mmu_ctx); +err_map_ctx: + kfree(es->cqe); + return ret; +} + + +int sif_eps_deinit(struct sif_dev *sdev, enum sif_tab_type rsp_type) +{ + enum psif_mbox_type eps_num = sif_tab2mbox(sdev, rsp_type); + struct sif_eps *es = &sdev->es[eps_num]; + struct sif_table *rsp_tp = &sdev->ba[rsp_type]; + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + + if (es->data->log.size) + sif_eps_log_ctrl(sdev, eps_num, EPSC_LOG_MODE_BUFFER, sif_eps_log_level); + sif_eq_deinit(sdev, es); + + if (eps_num == sdev->mbox_epsc) + sif_disable_msix(sdev); + + /* Note that beyond this point the EQs no longer exists so we need to use poll + * mode for the remaining epsc communication. + */ + + /* Flush TLB for old FW version. On current FW versions this is done + * automatically by FW. + * During takedown TLB invalidate is not generally possible since it requires + * working privileged QPs. Instead flush the whole TLB in one go. + */ + if (!eps_fw_version_ge(es, 0, 54) && !sdev->is_vf) + sif_flush_tlb(sdev); + + /* Tell the EPSC that we have terminated cleanly: */ + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_TEARDOWN; + sif_epsc_wr_poll(sdev, &req, &rsp); + + sif_unmap_ctx(sdev, &rsp_tp->mmu_ctx); + kfree(es->cqe); + + return 0; +} + + +#define epsc_seq(x) (x & 0x7fff) + +/* process any queued responses from the EPS + * Return the number processed, or -errno upon errors: + * assumes es->lock is held + */ +static inline int __eps_process_cqe(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + struct sif_eps *es = &sdev->es[eps_num]; + int ret = 0; + int rsp_cnt = 0; + u64 seq_num_expected, seq_num; + u32 idx; + u16 ql; + struct psif_epsc_csr_rsp *cqe; + struct sif_eps_cqe *lcqe; + + for (;;) { + seq_num_expected = es->first_seq | CSR_ONLINE_MASK; + idx = es->first_seq & es->mask; + cqe = get_eps_csr_rsp(sdev, eps_num, idx); + seq_num = be64_to_cpu((volatile u64)(cqe->seq_num)) & 0xffff; + + if (seq_num != seq_num_expected) + break; + lcqe = es->cqe[idx]; + if (lcqe) { + rmb(); + sif_log(sdev, SIF_EPS, "copying to caller rsp at %p", lcqe->rsp); + copy_conv_to_sw(lcqe->rsp, cqe, sizeof(struct psif_epsc_csr_rsp)); + if (lcqe->rsp->status != EPSC_SUCCESS && sif_feature(pcie_trigger)) + force_pcie_link_retrain(sdev); + rsp_cnt++; + __epsc_complete(sdev, eps_num, idx); + es->cqe[idx] = NULL; + } + ql = atomic_dec_return(&es->cur_reqs); + es->first_seq = (es->first_seq + 1) & ~CSR_ONLINE_MASK; + ret++; + } + if (ret < 0) + sif_log(sdev, SIF_INFO, "failed with status %d", ret); + else if (ret > 0) { + sif_log(sdev, SIF_EPS, + "processed %d (%d with resp) requests - first_seq 0x%x, oustanding %d", + ret, rsp_cnt, es->first_seq, atomic_read(&es->cur_reqs)); + mb(); + } + + __sif_eps_send_keep_alive(sdev, 
eps_num, false); + + return ret; +} + + +static int eps_process_cqe(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + int ret; + unsigned long flags; + struct sif_eps *es = &sdev->es[eps_num]; + + spin_lock_irqsave(&es->lock, flags); + ret = __eps_process_cqe(sdev, eps_num); + spin_unlock_irqrestore(&es->lock, flags); + return ret; +} + + +static void eps_reset_cmpl(struct sif_dev *sdev, u16 seq_num, enum psif_mbox_type eps_num) +{ + struct sif_eps *es = &sdev->es[eps_num]; + struct sif_table *t = &sdev->ba[sif_mbox2rsp_tab(sdev, eps_num)]; + u16 idx = seq_num % t->entry_cnt; + unsigned long flags; + + /* Protect against nil'ing it while anyone accessing cqe */ + spin_lock_irqsave(&es->lock, flags); + es->cqe[idx] = NULL; + spin_unlock_irqrestore(&es->lock, flags); +} + + +/* Asynchronous post of an EPS work request. + * returns nonzero if there is no more room + * in completion queue for a new entry. + * If seq_num is nonzero, the caller is expected to handle the + * completion using sif_epsc_poll_cqe, otherwise the entry is marked as + * "response ignored by the caller". + * If wait is set, post with flag EPSC_FL_NOTIFY to receive an interrupt from the eps: + * + */ +static int __sif_post_eps_wr(struct sif_dev *sdev, enum psif_mbox_type eps_num, + struct psif_epsc_csr_req *lreq, u16 *seq_num, + struct sif_eps_cqe *lcqe, bool wait) +{ + struct psif_epsc_csr_req *req; + struct sif_table *t = &sdev->ba[sif_mbox2rsp_tab(sdev, eps_num)]; + struct sif_eps *es = &sdev->es[eps_num]; + u32 idx; + union sif_mailbox lmbx; + u16 cur_reqs; + u16 limit = in_interrupt() ? t->entry_cnt : es->lowpri_lim; + unsigned long timeout = sdev->min_resp_ticks * 8; + int ret = 0; + bool waiting = false; + + es->timeout = jiffies + timeout; +restart: + + if (atomic_read(&es->cur_reqs)) { + /* Make sure emptying the queue takes preference over filling it up: */ + ret = __eps_process_cqe(sdev, eps_num); + + if (ret > 0) + ret = 0; /* Got some rsps */ + else if (ret < 0) + return ret; + } + + /* Allocate a new seq.number */ + cur_reqs = atomic_inc_return(&es->cur_reqs); + if (cur_reqs > limit) { + u16 tried_seq_num = (es->last_seq + 1) & ~CSR_ONLINE_MASK; + + atomic_dec(&es->cur_reqs); + if (!waiting) + atomic_inc(&es->waiters); + if (es->first_seq != es->last_full_seq) { + sif_log(sdev, SIF_INFO_V, + "req.queue full: seq %d, first %d, cur_reqs %d, %slimit %d, epsc_req_size is %d", + tried_seq_num, es->first_seq, cur_reqs, + (in_interrupt() ? "" : "(low pri) "), limit, t->entry_cnt); + es->last_full_seq = es->first_seq; + } + + + if (in_interrupt()) { + /* Only the EVENT_INDEX updates are sent from interrupt level and + * they are high pri, and should have reserved space: + */ + sif_log(sdev, SIF_INFO, + "Warning: Interrupt level EPSC req. 
while over limit (%d/%d), tried seq %d!", + cur_reqs, limit, tried_seq_num); + sif_logs(SIF_INFO, write_struct_psif_epsc_csr_req(NULL, 0, lreq)); + return -EFAULT; + } + + if (time_is_after_jiffies(es->timeout)) + goto restart; + else { + sif_log(sdev, SIF_INFO, + "Timeout waiting for previous response (seq %d) to complete", + es->first_seq); + return -EAGAIN; + } + } + if (waiting) + atomic_dec(&es->waiters); + + if (cur_reqs > es->max_reqs) + es->max_reqs = cur_reqs; + + es->last_seq = (es->last_seq + 1) & ~CSR_ONLINE_MASK; + idx = es->last_seq & es->mask; + req = get_eps_csr_req(sdev, eps_num, idx); + + lreq->seq_num = es->last_seq | CSR_ONLINE_MASK; + if (wait) { + /* Request interrupt upon completion */ + lreq->flags |= EPSC_FL_NOTIFY; + } + + /* Tell where to copy the completion upon arrival: */ + es->cqe[idx] = lcqe; + if (lcqe) { + sif_log(sdev, SIF_EPS, "set cqe[%d] = %p", idx, lcqe); + + /* set the software host order copy seq_num to something useful for comparison + * in the poll routines: + */ + lcqe->rsp->seq_num = get_psif_epsc_csr_req__seq_num(req); + lcqe->need_complete = wait; + } + wmb(); + sif_log(sdev, SIF_EPS, "opcode %s seq.%d to addr %p %s", + string_enum_psif_epsc_csr_opcode(lreq->opcode), + es->last_seq, req, (wait ? "wait" : "")); + + /* Update hw accessible req */ + copy_conv_to_hw(req, lreq, sizeof(struct psif_epsc_csr_req)); + + /* Doorbell - notify hw */ + lmbx.x.head = CSR_ONLINE_MASK | lreq->seq_num; + if (es->ver.seq_set_proto == 2) { + lmbx.x.tail = es->mbox_id; + lmbx.x.data = lreq->opcode; + } else { + lmbx.x.tail = lmbx.x.head; + lmbx.x.data = 0x5a5a5a5a; /* Not used - just an easy recognizable pattern */ + } + eps_mailbox_write(sdev, eps_num, lmbx.raw); + + if (seq_num) + *seq_num = es->last_seq; + return ret; +} + +/* Asynchronous post of an EPS work request. + * returns nonzero if there is no more room + * in completion queue for a new entry. + * If seq_num is nonzero, the caller is expected to handle the + * completion using sif_epsc_poll_cqe, otherwise the entry is marked as + * "response ignored by the caller". + * If wait is set, post with flag EPSC_FL_NOTIFY to receive an interrupt from the eps: + * + */ +int sif_post_eps_wr(struct sif_dev *sdev, enum psif_mbox_type eps_num, + struct psif_epsc_csr_req *lreq, u16 *seq_num, + struct sif_eps_cqe *lcqe, bool wait) +{ + struct sif_eps *es = &sdev->es[eps_num]; + unsigned long flags; + int ret; + + spin_lock_irqsave(&es->lock, flags); + ret = __sif_post_eps_wr(sdev, eps_num, lreq, seq_num, lcqe, wait); + spin_unlock_irqrestore(&es->lock, flags); + return ret; +} + +int sif_post_epsc_wr(struct sif_dev *sdev, struct psif_epsc_csr_req *lreq, + u16 *seq_num, struct sif_eps_cqe *lcqe, bool wait) +{ + return sif_post_eps_wr(sdev, sdev->mbox_epsc, lreq, seq_num, lcqe, wait); +} + + +/* Poll waiting for response on request seq_num. 
+ * Polls for different completions may be executing this code in parallel: + */ +int sif_eps_poll_cqe(struct sif_dev *sdev, enum psif_mbox_type eps_num, + u16 seq_num, struct sif_eps_cqe *lcqe) +{ + struct sif_eps *es = &sdev->es[eps_num]; + int ret = 0; + ulong timeout = sdev->min_resp_ticks * 8; + int npolled = 0; + + es->timeout = jiffies + timeout; + while (seq_num != get_eps_mailbox_seq_num(lcqe->rsp)) { + ret = eps_process_cqe(sdev, eps_num); + if (ret < 0) + goto out; + + if (time_is_before_eq_jiffies(es->timeout)) { + if (sif_feature(pcie_trigger)) + force_pcie_link_retrain(sdev); + + sif_log(sdev, SIF_INFO, + "No response for req %#x from EPS (rsp->seq_num 0x%x) in %ld ms - #reqs outstanding %d", + seq_num, get_eps_mailbox_seq_num(lcqe->rsp), timeout, + atomic_read(&es->cur_reqs)); + ret = -ETIMEDOUT; + goto out; + } + cpu_relax(); + npolled += ret; + } + + ret = eps_status_to_err(lcqe->rsp->status); + + /* We got something, reset the timeout for all waiters */ + es->timeout = jiffies + timeout; +out: + if (ret < 0) { + int log_level = lcqe->rsp->opcode == EPSC_MODIFY_QP ? SIF_QPE : SIF_INFO; + + if (sif_feature(pcie_trigger)) + force_pcie_link_retrain(sdev); + if (ret != -ETIMEDOUT) + sif_log(sdev, log_level, + "Error response (%s) for req 0x%x from EPS (errno %d)", + string_enum_psif_epsc_csr_status(lcqe->rsp->status), + get_eps_mailbox_seq_num(lcqe->rsp), ret); + eps_reset_cmpl(sdev, seq_num, eps_num); + } else + sif_log(sdev, SIF_EPS, "seq 0x%x polled", seq_num); + return ret; +} + + +int sif_epsc_poll_cqe(struct sif_dev *sdev, u16 seq_num, struct sif_eps_cqe *lcqe) +{ + return sif_eps_poll_cqe(sdev, sdev->mbox_epsc, seq_num, lcqe); +} + + +/* Wait up to @timeout ticks for an earlier posted event + * with ID @seq_num to complete + */ +static int eps_waitfor_timeout(struct sif_dev *sdev, enum psif_mbox_type eps_num, + u16 seq_num, unsigned long timeout, + struct sif_eps_cqe *lcqe) +{ + struct completion *cmpl = &lcqe->cmpl; + unsigned long rem_time, wait_time; + volatile struct psif_epsc_csr_rsp *rsp = lcqe->rsp; + int ret; + unsigned int attempts = 4; + + + rem_time = wait_time = timeout/attempts; + for (;;) { + ret = eps_process_cqe(sdev, eps_num); + if (ret < 0) + goto out; + + if (get_eps_mailbox_seq_num(rsp) != seq_num) { + rem_time = wait_for_completion_interruptible_timeout(cmpl, rem_time); + if (!rem_time) { + rem_time = wait_time; + if (!--attempts) { + sif_log(sdev, SIF_INFO, "req %u timed out after %ld ms", + seq_num, timeout); + ret = -ETIMEDOUT; + goto out; + } + } + continue; + } + break; + } + + ret = eps_status_to_err(rsp->status); +out: + if (ret < 0) { + if (ret != -ETIMEDOUT) { + sif_log(sdev, SIF_INFO, + "Error response (%s) for req 0x%x from EPS", + string_enum_psif_epsc_csr_status(rsp->status), + get_eps_mailbox_seq_num(rsp)); + } + eps_reset_cmpl(sdev, seq_num, eps_num); + } + return ret; +} + +/* Wait for an earlier posted request with ID @seq_num to complete + */ +static int eps_waitfor(struct sif_dev *sdev, enum psif_mbox_type eps_num, + u16 seq_num, struct sif_eps_cqe *cqe) +{ + ulong timeout = sdev->min_resp_ticks * (1 + atomic_read(&sdev->es[eps_num].cur_reqs)) * 8; + + return eps_waitfor_timeout(sdev, eps_num, seq_num, timeout, cqe); +} + +int sif_epsc_waitfor(struct sif_dev *sdev, u16 seq_num, + struct sif_eps_cqe *cqe) +{ + return eps_waitfor(sdev, sdev->mbox_epsc, seq_num, cqe); +} + +/* Synchronous post of an EPS work request. + * Will wait until request completes and return the completion + * notification. Uses EPSC interrupts for wakeup. 
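+ * Typical use: zero a struct psif_epsc_csr_req on the stack, fill in the
+ * opcode and any operands, and pass a struct psif_epsc_csr_rsp that receives
+ * a host order copy of the completion (see e.g. sif_eps_log_ctrl() above for
+ * the polling variant of this pattern).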
+ */ + +int sif_eps_wr(struct sif_dev *sdev, enum psif_mbox_type eps_num, + struct psif_epsc_csr_req *req, struct psif_epsc_csr_rsp *cqe) +{ + u16 seq_num; + int ret; + struct sif_eps_cqe lcqe; + + lcqe.rsp = cqe; + init_completion(&lcqe.cmpl); +restart: + ret = sif_post_eps_wr(sdev, eps_num, req, &seq_num, &lcqe, true); + if (ret) + return ret; + + ret = eps_waitfor(sdev, eps_num, seq_num, &lcqe); + if (ret == -EAGAIN) { + sif_log(sdev, SIF_EPS, "EPS%s requests retry for req# %d", + eps_name(sdev, eps_num), seq_num); + goto restart; + } + sif_log(sdev, SIF_EPS, "Received EPS%s completion for req# %d", + eps_name(sdev, eps_num), seq_num); + return ret; +} + + +int sif_epsc_wr(struct sif_dev *sdev, struct psif_epsc_csr_req *req, + struct psif_epsc_csr_rsp *cqe) +{ + return sif_eps_wr(sdev, sdev->mbox_epsc, req, cqe); +} + + +/* Same as sif_eps_wr but poll for completion */ +int sif_eps_wr_poll(struct sif_dev *sdev, enum psif_mbox_type eps_num, + struct psif_epsc_csr_req *req, struct psif_epsc_csr_rsp *cqe) +{ + u16 seq_num; + int ret; + struct sif_eps_cqe lcqe; + + lcqe.rsp = cqe; +restart: + ret = sif_post_eps_wr(sdev, eps_num, req, &seq_num, &lcqe, false); + if (ret) + return ret; + + ret = sif_eps_poll_cqe(sdev, eps_num, seq_num, &lcqe); + if (ret == -EAGAIN) { + sif_log(sdev, SIF_EPS, "EPS%s requests retry for req# %d", + eps_name(sdev, eps_num), seq_num); + goto restart; + } + if (!ret) + sif_log(sdev, SIF_EPS, "Received EPS%s completion for req# %d", + eps_name(sdev, eps_num), seq_num); + return ret; +} + +int sif_epsc_wr_poll(struct sif_dev *sdev, struct psif_epsc_csr_req *req, + struct psif_epsc_csr_rsp *rsp) +{ + return sif_eps_wr_poll(sdev, sdev->mbox_epsc, req, rsp); +} + + + +/* EPS-A support */ +int sif_activate_epsa(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + enum sif_tab_type type = epsa0_csr_req + (eps_num * 2); + + /* First initiate communication protocol with the EPS# */ + int ret = sif_table_init(sdev, type); + + if (ret) + return ret; + ret = sif_table_init(sdev, type + 1); + if (ret) + return ret; + + /* The rest of the init operations does not involve any memory setup, + * it just communicates the table base pointers setup up with the EPSC + * on to the EPSA. 
+ */ + + /* Only key (DMA validation) is needed so far */ + ret = sif_table_update(sdev, eps_num, key); + return ret; +} + +inline bool sif_eps_keep_alive_timeout(struct sif_eps *es) +{ + return time_is_before_jiffies(es->last_req_posted + es->keepalive_interval); +} + + +static int __sif_eps_send_keep_alive(struct sif_dev *sdev, enum psif_mbox_type eps_num, + bool force) +{ + struct psif_epsc_csr_req req; + struct sif_eps *es = &sdev->es[eps_num]; + int ret = 0; + + if (sif_eps_keep_alive_timeout(es) || force) { + sif_log(sdev, SIF_INFO, "Sending keep-alive (force=%i)", force); + + /* prevent infinite loop with __sif_post_eps_wr */ + es->last_req_posted = jiffies; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_KEEP_ALIVE; + ret = __sif_post_eps_wr(sdev, eps_num, &req, NULL, NULL, false); + } + return ret; +} + +int sif_eps_send_keep_alive(struct sif_dev *sdev, enum psif_mbox_type eps_num, + int force) +{ + struct sif_eps *es = &sdev->es[eps_num]; + unsigned long flags; + int ret; + + spin_lock_irqsave(&es->lock, flags); + ret = __sif_eps_send_keep_alive(sdev, eps_num, force); + spin_unlock_irqrestore(&es->lock, flags); + return ret; +} + +/**** Low level mailbox handling ****/ + +u64 eps_mailbox_read(struct sif_dev *sdev, u8 epsno) +{ + return be64_to_cpu(__raw_readq(&sdev->eps->eps[epsno].out)); +} + +void eps_mailbox_write(struct sif_dev *sdev, u8 epsno, u64 value) +{ + sdev->es[epsno].last_req_posted = jiffies; + wmb(); + __raw_writeq(cpu_to_be64(value), &sdev->eps->eps[epsno].in); + wmb(); +} + +u64 eps_mailbox_read_data(struct sif_dev *sdev, u8 epsno) +{ + union sif_mailbox set; + + set.raw = eps_mailbox_read(sdev, epsno); + if (sdev->es[epsno].ver.seq_set_proto <= 1) + set.x.data = le32_to_cpu(set.x.data); + else + set.x.data = be32_to_cpu(set.x.data); + return set.raw; +} + +void eps_mailbox_write_data(struct sif_dev *sdev, u8 epsno, u64 value) +{ + union sif_mailbox set; + + set.raw = value; + if (sdev->es[epsno].ver.seq_set_proto <= 1) + set.x.data = cpu_to_le32(set.x.data); + else + set.x.data = cpu_to_be32(set.x.data); + value = set.raw; + eps_mailbox_write(sdev, epsno, value); +} + + +/**** High level synchronous CSR operations */ + +/* Read a 64 bit CSR register */ +static u64 read_csr(struct sif_dev *sdev, u32 addr, bool local) +{ + struct psif_epsc_csr_rsp resp; + struct psif_epsc_csr_req req; + int ret; + + memset(&req, 0, sizeof(req)); + req.opcode = local ? EPSC_GET_SINGLE : EPSC_GET_ONE_CSR; + req.addr = addr; + + ret = sif_epsc_wr_poll(sdev, &req, &resp); + if (ret) + return -1; + + sif_log(sdev, SIF_CSR, "%s address 0x%x value 0x%llx", + (local ? "UF local" : "global"), addr, resp.data); + return resp.data; +} + +/* Write a 64 bit EPS CSR register. Only valid for old FW. 
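+ * (newer firmware is driven via EPSC_SET requests instead - see
+ * epsc_set_mmu_upper(), which falls back to this path only on old firmware).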
*/ +static int write_csr(struct sif_dev *sdev, u32 addr, u64 val) +{ + struct psif_epsc_csr_rsp resp; + struct psif_epsc_csr_req req; + int ret; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_SET_ONE_CSR; + req.addr = addr; + req.u.single.data = val; + sif_log(sdev, SIF_CSR, "write address 0x%x value 0x%llx", + addr, val); + + ret = sif_epsc_wr_poll(sdev, &req, &resp); + if (ret) + return ret; + return ret; +} + + +/* Read a 64 bit CSR register (local UF mapping) */ +u64 sif_read_local_csr(struct sif_dev *sdev, u32 addr) +{ + return read_csr(sdev, addr, true); +} + +/* Read a 64 bit CSR register (global PSIF mapping - uf 0 only) */ +u64 sif_read_global_csr(struct sif_dev *sdev, u32 addr) +{ + return read_csr(sdev, addr, false); +} + +/* Write a 64 bit EPS CSR register (global PSIF mapping - uf 0 only) */ +int sif_write_global_csr(struct sif_dev *sdev, u32 addr, u64 val) +{ + return write_csr(sdev, addr, val); +} + + +/* Helper for dfs iteration */ +int sif_eps_next_used(struct sif_table *table, int index) +{ + struct sif_dev *sdev = table->sdev; + enum psif_mbox_type eps_num = sif_tab2mbox(sdev, table->type); + struct sif_eps *es = &sdev->es[eps_num]; + int first, last; + + first = es->first_seq & es->mask; + last = es->last_seq & es->mask; + + if (es->first_seq == es->last_seq + 1) + return -1; + if (first <= last) { + if (index <= first) + return first; + if (index > last) + return -1; + } else { + if (index >= table->entry_cnt) + return -1; + if (index > last && index < first) + return first; + } + return index; +} + + +static void sif_dfs_print_eps(struct seq_file *s, struct sif_dev *sdev, + loff_t pos, enum psif_mbox_type eps_num) +{ + struct psif_epsc_csr_req *req; + struct psif_epsc_csr_rsp *rsp; + struct sif_eps *es = &sdev->es[eps_num]; + u16 seq, rsp_seq; + + if (unlikely(pos < 0)) { + u32 sz = sdev->ba[epsc_csr_req].entry_cnt; + + seq_printf(s, + "# EPS%s Request queue, outstanding %d/%d max.%d waiters %d first/last seq. 
%d/%d\n" + "# %6s %15s %8s %15s %6s\n", + eps_suffix(sdev, eps_num), atomic_read(&es->cur_reqs), + sz, es->max_reqs, atomic_read(&es->waiters), + es->first_seq, es->last_seq, + "Entry", "req.opcode", "req.seq", "rsp.opcode", "rsp.seq"); + return; + } + + req = get_eps_csr_req(sdev, eps_num, pos); + seq = get_psif_epsc_csr_req__seq_num(req) & ~CSR_ONLINE_MASK; + + /* Correlate to response queue */ + rsp = get_eps_csr_rsp(sdev, eps_num, pos); + rsp_seq = get_psif_epsc_csr_rsp__seq_num(rsp) & ~CSR_ONLINE_MASK; + + seq_printf(s, "%8lld %15s %8d %15s %8d\n", pos, + string_enum_psif_epsc_csr_opcode(get_psif_epsc_csr_req__opcode(req)) + 5, + seq, + string_enum_psif_epsc_csr_opcode(get_psif_epsc_csr_rsp__opcode(rsp)) + 5, + rsp_seq); +} + + +void sif_dfs_print_epsc(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + sif_dfs_print_eps(s, sdev, pos, sdev->mbox_epsc); +} + +void sif_dfs_print_epsa0(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + sif_dfs_print_eps(s, sdev, pos, MBOX_EPSA0); +} + +void sif_dfs_print_epsa1(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + sif_dfs_print_eps(s, sdev, pos, MBOX_EPSA1); +} + +void sif_dfs_print_epsa2(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + sif_dfs_print_eps(s, sdev, pos, MBOX_EPSA2); +} + +void sif_dfs_print_epsa3(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + sif_dfs_print_eps(s, sdev, pos, MBOX_EPSA3); +} + +void epsc_report_degraded(struct sif_dev *sdev, u64 cause_mask) +{ + unsigned int cause; + + for (cause = 0; cause < 64; cause++) { + if ((1L << cause) & cause_mask) { + sif_log(sdev, SIF_INFO, "Device reports degraded cause %s", + string_enum_psif_epsc_degrade_cause((enum psif_epsc_degrade_cause)cause)); + } + } +} diff --git a/drivers/infiniband/hw/sif/sif_epsc.h b/drivers/infiniband/hw/sif/sif_epsc.h new file mode 100644 index 0000000000000..4a7c6682cbab5 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_epsc.h @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_epsc.h: API for communication with the EPSC (and EPS-A's) + */ + +#ifndef __SIF_EPSC_H +#define __SIF_EPSC_H +#include +#include +#include +#include "sif_eq.h" +#include "psif_hw_data.h" + +struct sif_dev; +struct sif_table; +struct psif_epsc_csr_req; +struct psif_epsc_csr_rsp; + +struct sif_epsc_data; /* sif_query.h */ +enum psif_mbox_type; /* psif_hw_data.h */ +enum sif_tab_type; /* sif_dev.h */ + +/* Max number of strings (including final NULL) + * we expect from the firmware version details: + */ +enum sif_eps_fw_info_idx { + FWV_EPS_REV_STRING, + FWV_EPS_GIT_REPO, + FWV_EPS_GIT_LAST_COMMIT, + FWV_EPS_GIT_STATUS, + FWV_EPS_BUILD_USER, + FWV_EPS_BUILD_GIT_TIME, + FWV_PSIF_GIT_REPO, + FWV_PSIF_GIT_COMMIT, + FWV_PSIF_GIT_STATUS, + FWV_MAX +}; + + +struct eps_version_data { +#ifdef __LITTLE_ENDIAN + u16 epsc_minor; + u16 epsc_major; + u16 psif_minor; + u16 psif_major; +#else + u16 psif_major; + u16 psif_minor; + u16 epsc_major; + u16 epsc_minor; +#endif + u16 fw_minor; + u16 fw_major; + int seq_set_proto; /* Protocol version of the initial setup meta protocol (0 == legacy) */ + struct psif_epsc_csr_config nb_cfg; /* "Network" byte order config storage (see #3804) */ + char *fw_version[FWV_MAX]; +}; + + +enum sif_eps_state { + ES_NOT_RUNNING, /* EPS core thread not started */ + ES_RUNNING, /* EPS core thread started but comm.protocol not initiated */ + ES_INIT, /* Driver is working to set up tables with this EPS */ + ES_ACTIVE /* Communication with this EPS is up and running */ +}; + + +struct sif_eps_cqe { + struct psif_epsc_csr_rsp *rsp; /* process_cqe places a host order copy of the response here */ + struct completion cmpl; /* a completion to wait on for response */ + bool need_complete; /* req was posted with EPSC_FL_NOTIFY */ +}; + + +#define EPS_TAG_FROM_HOST 0x8000 + +#define MAX_LOGDEVNAME 32 + +/* Internal bookkeeping for sif_epsc.c/h: */ +struct sif_eps { + struct sif_dev *sdev; + enum psif_mbox_type eps_num; /* Which EPS this is */ + enum sif_eps_state state; /* Current state of the EPS */ + struct eps_version_data ver; /* Minor/major version info of the epsc firmware */ + spinlock_t lock;/* Serializes CPU access to the epsc hw and sw resources */ + volatile u16 last_seq; /* Last used sequence number */ + volatile u16 first_seq; /* First sequence number not seen any completion on */ + u16 mask; /* req/rsp table sz - 1 */ + u16 max_reqs; /* Max outstanding reqs seen */ + u16 lowpri_lim; /* Max number of outstanding low priority reqs */ + u16 last_full_seq; /* notify when queue full was last logged to avoid repeating logs */ + u16 mbox_id; /* ID of the mailbox as provided by EPS */ + atomic_t cur_reqs; /* current outstanding req count */ + atomic_t waiters; /* Number of threads waiting for a slot in the queue */ + unsigned long timeout; /* EPSC resp timeout - rescheduled when new completions observed */ + unsigned long keepalive_interval; /* how long to wait before sending a keepalive */ + unsigned long last_req_posted; /* time the last request was posted */ + struct sif_eps_cqe **cqe; /* An of caller owned pointers indexed by req.index */ + struct sif_epsc_data *data; /* Ptr to data recv area for EPS/SMA queries */ + dma_addr_t data_dma_hdl; /* DMA address of data area for query device/port etc. 
*/ + struct sif_eq_base eqs; /* Setup of event queues */ + + /* log redirection support: */ + struct miscdevice logdev; /* Device for log rederect from the EPS, if enabled */ + struct file_operations logdev_ops; + char logdevname[MAX_LOGDEVNAME]; + bool log_redir_en; /* Set if log is currently redirected */ + atomic_t logdev_use; + struct completion logdev_more_log; /* elog reader will block on this one */ +}; + +/**** Low level mailbox handling ****/ +u64 eps_mailbox_read(struct sif_dev *sdev, u8 epsno); +void eps_mailbox_write(struct sif_dev *sdev, u8 epsno, u64 value); + +u64 eps_mailbox_read_data(struct sif_dev *sdev, u8 epsno); +void eps_mailbox_write_data(struct sif_dev *sdev, u8 epsno, u64 value); + +/* (De-)initialization necessary to communicate with the EPS */ +int sif_eps_init(struct sif_dev *sdev, enum sif_tab_type rsp_type); +int sif_eps_deinit(struct sif_dev *sdev, enum sif_tab_type rsp_type); + +const char *eps_name(struct sif_dev *sdev, enum psif_mbox_type eps_num); +const char *eps_suffix(struct sif_dev *sdev, enum psif_mbox_type eps_num); + +/* Convert EPSC status code to errno */ +int eps_status_to_err(enum psif_epsc_csr_status status); + +struct psif_epsc_csr_req *get_eps_csr_req(struct sif_dev *sdev, + enum psif_mbox_type eps_num, int index); + +struct psif_epsc_csr_rsp *get_eps_csr_rsp(struct sif_dev *sdev, + enum psif_mbox_type eps_num, int index); + +/* Returns true if this is the response table for any of the EPSes: */ +bool is_eps_rsp_tab(enum sif_tab_type type); + +/* Asynchronous post of an EPSC work request to psif. + * returns nonzero if #of outstanding requests + * exceed what the hardware offers or if there is no more room + * in completion queue for a new entry. + * if @seq_num is nonzero, the sequence number of the posted request will be placed there. + * If @lcqe is nonzero, a host endian copy of the response will be placed + * there when detected. + * + * If wait is set, it means that the epsc wr should be posted with + * flag EPSC_FL_NOTIFY to receive an interrupt from the epsc: + */ +int sif_post_eps_wr(struct sif_dev *sdev, enum psif_mbox_type eps_num, + struct psif_epsc_csr_req *lreq, u16 *seq_num, + struct sif_eps_cqe *lcqe, bool wait); + +int sif_post_epsc_wr(struct sif_dev *sdev, struct psif_epsc_csr_req *lreq, + u16 *seq_num, struct sif_eps_cqe *lcqe, bool wait); + +/* Get the seq.num from a epsc response in host order */ +u16 sif_epsc_get_seq(struct psif_epsc_csr_rsp *cqe); + +/* Wait up to @timeout ticks + * for an earlier posted request with ID @seq_num to complete + * return 0 if success, -errno else. @cqe will be populated with the response + * from the EPS. Uses EPSC interrupts for wakeup. + */ +int sif_epsc_waitfor_timeout(struct sif_dev *sdev, u16 seq_num, + unsigned long timeout, + struct sif_eps_cqe *cqe); + +/* Wait for an earlier posted request with ID @seq_num to complete + * return 0 if success, -errno else. @cqe will be populated with the response + * from the EPS. Uses EPSC interrupts for wakeup. + */ +int sif_epsc_waitfor(struct sif_dev *sdev, u16 seq_num, + struct sif_eps_cqe *cqe); + +/* Poll waiting for a response - in attach we cannot suspend or sleep.. + * return 0 if a successful operation, eg.EPSC_SUCCESS, + * otherwise a suitable -errno. @cqe will be populated with the response + * from the EPS + */ +int sif_epsc_poll_cqe(struct sif_dev *sdev, u16 seq_num, + struct sif_eps_cqe *cqe); + +/* Synchronous post of an EPSC work request. + * Will wait until request completes. 
@cqe will be populated with the response + * from the EPS. Return value: A suitable errno value that also captures the + * status code from the EPSC operation, if any. + */ +int sif_epsc_wr(struct sif_dev *sdev, struct psif_epsc_csr_req *req, + struct psif_epsc_csr_rsp *rsp); + +/* Same as sif_epsc_wr but poll for completion */ +int sif_epsc_wr_poll(struct sif_dev *sdev, struct psif_epsc_csr_req *req, + struct psif_epsc_csr_rsp *rsp); + +/* Generic EPS access (any EPS) */ +int sif_eps_wr(struct sif_dev *sdev, enum psif_mbox_type eps_num, + struct psif_epsc_csr_req *req, struct psif_epsc_csr_rsp *rsp); + +int sif_eps_wr_poll(struct sif_dev *sdev, enum psif_mbox_type eps_num, + struct psif_epsc_csr_req *req, struct psif_epsc_csr_rsp *rsp); + +int sif_eps_poll_cqe(struct sif_dev *sdev, enum psif_mbox_type eps_num, + u16 seq_num, struct sif_eps_cqe *lcqe); + +/* EPS-A support */ +int sif_activate_epsa(struct sif_dev *sdev, enum psif_mbox_type eps_num); + +/* Send a keep-alive request to an EPS */ +int sif_eps_send_keep_alive(struct sif_dev *sdev, enum psif_mbox_type eps_num, + int force); + +/**** High level synchronous CSR operations */ + +/* Read a 64 bit CSR register (local UF mapping) */ +u64 sif_read_local_csr(struct sif_dev *sdev, u32 addr); + +/* Read a 64 bit CSR register (global PSIF mapping - uf 0 only) */ +u64 sif_read_global_csr(struct sif_dev *sdev, u32 addr); + +/* Write a 64 bit EPS CSR register (global PSIF mapping - uf 0 only) */ +int sif_write_global_csr(struct sif_dev *sdev, u32 addr, u64 val); + +/* Helper for dfs iteration */ +int sif_eps_next_used(struct sif_table *table, int index); + +/* Sysfs entry printers */ +void sif_dfs_print_epsc(struct seq_file *s, struct sif_dev *sdev, + loff_t pos); +void sif_dfs_print_epsa0(struct seq_file *s, struct sif_dev *sdev, + loff_t pos); +void sif_dfs_print_epsa1(struct seq_file *s, struct sif_dev *sdev, + loff_t pos); +void sif_dfs_print_epsa2(struct seq_file *s, struct sif_dev *sdev, + loff_t pos); +void sif_dfs_print_epsa3(struct seq_file *s, struct sif_dev *sdev, + loff_t pos); + +/* completion invocation - called from sif_eq as result of epsc completion event processing */ +void epsc_complete(struct sif_dev *sdev, enum psif_mbox_type eps_num, int idx); + +/* Report cause for EPSC degraded mode */ +void epsc_report_degraded(struct sif_dev *sdev, u64 cause_mask); + +/* Set the SIF value to use for the 12 upper bits of a DMA address */ +int epsc_set_mmu_upper(struct sif_dev *sdev, u16 value); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_eq.c b/drivers/infiniband/hw/sif/sif_eq.c new file mode 100644 index 0000000000000..e52890dd27821 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_eq.c @@ -0,0 +1,1083 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_eq.c: Setup of event queues and interrupt handling + */ + +#include "sif_dev.h" +#include "sif_eq.h" +#include "sif_qp.h" +#include "sif_defs.h" +#include "sif_query.h" +#include "sif_base.h" +#include "sif_dma.h" +#include "sif_elog.h" +#include "sif_hwi.h" +#include "sif_ibqp.h" +#include "psif_hw_csr.h" +#include "psif_hw_setget.h" +#include + +static int sif_map_irq(struct sif_eq *eq); +static int sif_irq_coalesce(struct sif_eq *eq); + +static void sif_unmap_irq(struct sif_eq *eq); + +static int sif_eq_table_init(struct sif_dev *sdev, struct sif_eps *es, u16 eq_idx); +static void sif_eq_table_deinit(struct sif_dev *sdev, struct sif_eps *es, u16 eq_idx); + +static void sif_eq_deinit_tables(struct sif_dev *sdev, struct sif_eps *es); + +static int dispatch_eq(struct sif_eq *eq); + +static enum ib_event_type epsc2ib_event(struct psif_eq_entry *eqe); + +/* Work elements for dispatching events at non-interrupt level + */ +struct event_work { + struct work_struct ws; + struct ib_event ibe; + struct sif_eq *eq; +}; + +/* Define accessor functions - see sif_defs.h */ +sif_define_entry_funcs(eq, int) + +/* Set up the event queues using info about #of queues from the @cqe + * which contains a host byte order copy of the successful response + * to the configuration request to the EPS-C. + * The EPS-C event queue which receives the async events is always + * index 0 + */ +int sif_eq_init(struct sif_dev *sdev, struct sif_eps *es, struct psif_epsc_csr_rsp *cqe) +{ + int ret = 0; + int i; + int cnt; + struct sif_eq_base *eqb = &es->eqs; + struct sif_eq *eq; + + cnt = es->eqs.cnt; + sif_log(sdev, SIF_INIT, "setting up %d event queues for EPS%s", cnt, + eps_name(sdev, es->eps_num)); + + eq = (struct sif_eq *) + kzalloc(sizeof(struct sif_eq) * cnt, GFP_KERNEL); + if (!eq) + return -ENOMEM; + + eqb->eq = eq; + for (i = 0; i < cnt; i++) { + ret = sif_eq_table_init(sdev, es, i); + if (ret) { + eqb->cnt = i; + goto eqi_failed; + } + } + + eqb->cnt = cnt; + return 0; + +eqi_failed: + sif_eq_deinit_tables(sdev, es); + kfree(eqb->eq); + eqb->eq = NULL; + return ret; +} + + +static void sif_eq_deinit_tables(struct sif_dev *sdev, struct sif_eps *es) +{ + int i; + + for (i = es->eqs.cnt - 1; i >= 0; i--) + sif_eq_table_deinit(sdev, es, i); + es->eqs.cnt = 0; +} + + +void sif_eq_deinit(struct sif_dev *sdev, struct sif_eps *es) +{ + if (es->eqs.cnt > 0) + sif_eq_deinit_tables(sdev, es); + + kfree(es->eqs.eq); + es->eqs.eq = NULL; +} + +static int sif_set_affinity_mask_hint(struct sif_dev *sdev, struct sif_eq *eq) +{ + int numa_node = dev_to_node(&sdev->pdev->dev); + int cpu; + + if (!zalloc_cpumask_var(&eq->affinity_mask, GFP_KERNEL)) + return -ENOMEM; + + cpu = cpumask_local_spread(eq->index, numa_node); + cpumask_set_cpu(cpu, eq->affinity_mask); + return 0; +} + + +/* Bit field for #entries in hw is 5 bits wide */ +#define SIF_MAX_EQ_ENTRIES (1 << 0x1f) + +/* Set up of a single EQ requested by an EPS. + * This code is quite similar to base table setup in sif_base.c - sif_table_init + * but since we do not have the base_layout for each of these tables since + * we do not know the number of tables in advance, we cannot use the same code. 
+ * We also need separat accessor functions and use a dynamically allocated array + * of sif_eq objects with some more extra info in addition to the sif_table + */ +static int sif_eq_table_init(struct sif_dev *sdev, struct sif_eps *es, u16 eq_idx) +{ + struct sif_eq *eq = &es->eqs.eq[eq_idx]; + volatile struct psif_eq_entry *eqe; + struct sif_table *tp = &eq->ba; + int extent; /* As log2 */ + int ret = 0; + u32 min_entries, headroom; + + struct psif_epsc_csr_req req; /* local epsc wr copy */ + struct psif_epsc_csr_rsp resp; + + memset(eq, 0, sizeof(*eq)); + eq->eps = es; + eq->index = tp->type = eq_idx; /* We *reuse* type with a different meaning here */ + eq->next_seq = 0; + tp->sdev = sdev; + tp->ext_sz = roundup_pow_of_two(sizeof(struct psif_eq_entry)); + tp->is_eq = true; /* To distinguish namespace from other base tables */ + + /* Event queue sizes: It is critical that these are sized for worst case. + * The size of event queues used for completions must be large enough to + * receive at least one entry from each associated completion queue. + * The async event queue (queue 1) must be scaled to fit every possible event. + * See sec.36.2.3. Event Queue Sizing, page 361 in the PSIF PRM. + */ + + switch (eq_idx) { + case 0: /* Async + epsc events */ + headroom = sif_epsc_eq_headroom; + min_entries = es->eps_num == sdev->mbox_epsc ? + (sif_epsc_size + headroom + 2*es->eqs.min_sw_entry_cnt + 1) + : 64; + break; + case 1: + /* TSU - asynchronous events: */ + headroom = sif_tsu_eq_headroom; + min_entries = es->eps_num == sdev->mbox_epsc ? + 7 * sif_qp_size + 2 * sif_rq_size + sif_cq_size + 9 + headroom : 64; + break; + default: + /* completion notification events coming here + * TBD: We might want to scale the sizes of each of these queues and limit + * the number of CQs to handle by each of them instead: + */ + headroom = sif_tsu_eq_headroom; + min_entries = es->eps_num == sdev->mbox_epsc ? 
sif_cq_size + headroom : 64; + break; + } + + eq->entries = tp->entry_cnt = roundup_pow_of_two(min_entries); + eq->sw_index_interval = eq->entries - min_entries + headroom; + if (!eq->sw_index_interval) + eq->sw_index_interval = 1; /* Always update case */ + eq->sw_index_next_update = eq->sw_index_interval; + + if (eq->entries > SIF_MAX_EQ_ENTRIES) { + sif_log(sdev, SIF_INFO, + "requested %d entries but sif only supports %d", + eq->entries, SIF_MAX_EQ_ENTRIES); + return -ENFILE; /* 5 bit size_log2 field in eq descs in psif */ + } + + eq->mask = eq->entries - 1; + eq->extent = tp->ext_sz; + tp->table_sz = (size_t)tp->ext_sz * tp->entry_cnt; + extent = order_base_2(tp->ext_sz); + + sif_alloc_table(tp, tp->table_sz); + if (!tp->mem) { + sif_log(sdev, SIF_INIT, + "Failed to allocate 0x%lx bytes of memory for event queue table %d", + tp->table_sz, eq_idx); + return -ENOMEM; + } + + ret = sif_set_affinity_mask_hint(sdev, eq); + if (ret) + goto err_map_ctx; + + /* No MMU translations from EPS-C in PSIF Rev 2 or SIBS rev 1 */ + if (epsc_gva_permitted(sdev) && eq_idx == 0 && tp->mem->mem_type != SIFMT_BYPASS) { + sif_log(sdev, SIF_INFO, + "Rev 2.0 does not support MMU translations from EPS-C"); + ret = -EINVAL; + goto err_map_ctx; + } + + eq->mem = tp->mem; + + /* Make sure the initial value of entry 0's seq.no is is different from a real event */ + eqe = (struct psif_eq_entry *)get_eq_entry(eq, 0); + set_psif_eq_entry__seq_num(eqe, eq->entries); + + sif_log(sdev, SIF_INFO, + "Event queue %d: entry cnt %d (min.req.%d), ext sz %d, extent %d, sw_index_interval %d", + eq_idx, tp->entry_cnt, min_entries, tp->ext_sz, extent, eq->sw_index_interval); + sif_log(sdev, SIF_INIT, " - table sz 0x%lx %s sif_base 0x%llx", + tp->table_sz, sif_mem_type_str(tp->mem->mem_type), + tp->sif_base); + + spin_lock_init(&tp->lock); + + /* Set up HW descriptor */ + memset(&req, 0, sizeof(req)); + + req.opcode = EPSC_SET_BASEADDR_EQ; + req.u.base_addr.address = tp->sif_base; + req.u.base_addr.num_entries = tp->entry_cnt; + req.u.base_addr.extent_log2 = extent; + req.addr = eq_idx; /* The "CSR address" for this operation is the index of the queue */ + + /* Allocate mmu context with wr_access set */ + ret = sif_map_ctx(sdev, &tp->mmu_ctx, tp->mem, tp->sif_base, tp->table_sz, true); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to set mmu context for eq %d", + eq_idx); + goto err_map_ctx; + } + + /* Allocate an irq index */ + ret = sif_map_irq(eq); + if (ret) + goto err_map_irq; + + /* Pass the populated mmu context on to the EPS */ + req.u.base_addr.mmu_context = tp->mmu_ctx.mctx; + + req.u.base_addr.msix_index = eq->intr_vec; + + ret = sif_eps_wr_poll(sdev, es->eps_num, &req, &resp); + if (ret) + goto err_epsc_comm; + + /* Default interrupt channel coalescing settings */ + if (eq_idx != 0 && eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 36)) { + ret = sif_irq_coalesce(eq); + if (ret) + goto err_epsc_comm; + } + + return 0; + +err_epsc_comm: + sif_unmap_irq(eq); +err_map_irq: + sif_unmap_ctx(sdev, &tp->mmu_ctx); +err_map_ctx: + sif_free_table(tp); + return ret; +} + + +static void sif_eq_table_deinit(struct sif_dev *sdev, struct sif_eps *es, u16 eq_idx) +{ + struct sif_eq *eq = &es->eqs.eq[eq_idx]; + struct sif_table *tp = &eq->ba; + + sif_unmap_irq(eq); + + if (tp->mem) { + sif_unmap_ctx(sdev, &tp->mmu_ctx); + sif_free_table(tp); + tp->mem = NULL; + } +} + + +/* Interrupt routines for MSI-X */ + +static irqreturn_t sif_intr(int irq, void *d) +{ + u32 nreqs; + struct sif_eq *eq = (struct sif_eq *)d; + struct sif_dev *sdev = 
eq->ba.sdev; + nreqs = dispatch_eq(eq); + sif_log(sdev, SIF_INTR, + "done [irq %d (eq %d) - %d events dispatched]", + irq, eq->index, nreqs); + + if (sif_feature(check_all_eqs_on_intr)) { + int i; + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + + sif_log(sdev, SIF_INTR, "feature check_all_eqs_on_intr - dispatching:"); + for (i = 0; i < es->eqs.cnt; i++) + if (i != eq->index) + dispatch_eq(&es->eqs.eq[i]); + sif_log(sdev, SIF_INTR, "feature check_all_eqs_on_intr - dispatch done."); + /* Note: this feature does not check the EPSA* interrupt queues */ + } + + return IRQ_HANDLED; +} + +/* Interrupt coalescing settings for a single channel */ +static int sif_irq_coalesce(struct sif_eq *eq) +{ + int ret; + struct sif_dev *s = eq->ba.sdev; + struct psif_epsc_csr_req req; /* local epsc wr copy */ + struct psif_epsc_csr_rsp resp; + + if (!eps_version_ge(&s->es[s->mbox_epsc], 0, 36)) + goto opcode_not_available; + + sif_log(s, SIF_INTR, "Set default coalescing settings for the interrupt channel %d\n", + eq->index); + + memset(&req, 0, sizeof(req)); + + req.opcode = EPSC_HOST_INT_CHANNEL_CTRL; + req.uf = 0; + req.u.int_channel.int_channel = eq->index; +#define SET_DEFAULT_HOST_INT_CTRL_SETTING(attr, _value) { \ + int value = ((sif_feature(dis_auto_int_coalesce)) || \ + (eq->index < 2)) ? 0 : _value; \ + req.u.int_channel.attributes.attr = 1; \ + req.u.int_channel.attr = value; \ + eq->irq_ch.attr = value; \ + } + SET_DEFAULT_HOST_INT_CTRL_SETTING(enable_adaptive, 1); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_rx_scale, 1); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_rate_low, 0); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_rate_high, 200000); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_ausec, 0); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_ausec_low, 0); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_ausec_high, 190); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_pusec, 0); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_pusec_low, 0); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_pusec_high, 10); + + ret = sif_epsc_wr_poll(s, &req, &resp); + if (ret) { + sif_log(s, SIF_INFO, + "Failed to initialize the coalescing settings for interrupt channel %d\n", + eq->index); + memset(&eq->irq_ch, 0, sizeof(eq->irq_ch)); + return ret; + } + + return 0; +opcode_not_available: + return -1; +} + +/* Interrupt handling for a single event queue */ +static int sif_map_irq(struct sif_eq *eq) +{ + int irq; + int ret; + int vector_num; + struct sif_dev *s = eq->ba.sdev; + int flags = (s->intr_cnt != s->intr_req) ? 
IRQF_SHARED : 0; + const char *en; + + spin_lock(&s->msix_lock); + vector_num = find_next_zero_bit(s->intr_used, s->msix_entries_sz, 0); + if (vector_num < s->msix_entries_sz) + set_bit(vector_num, s->intr_used); + else + vector_num = -1; + spin_unlock(&s->msix_lock); + + if (vector_num == -1) { + sif_log(s, SIF_INFO, "Failed to allocate an irq for eq %d", eq->index); + return -ENOMEM; + } + + irq = s->msix_entries[vector_num].vector; + en = eps_name(s, eq->eps->eps_num); + + if (eq->index) + snprintf(eq->name, SIF_EQ_NAME_LEN, "sif%d-%d", 0, eq->index); + else + snprintf(eq->name, SIF_EQ_NAME_LEN, "sif%d-EPS%s", 0, en); + + ret = request_irq(irq, &sif_intr, flags, eq->name, eq); + if (ret) + return ret; + sif_log(s, SIF_INFO_V, "Allocated irq %d for EPS%s, eq %d, name %s", irq, en, + eq->index, eq->name); + eq->intr_vec = vector_num; + + ret = irq_set_affinity_hint(irq, eq->affinity_mask); + if (ret) { + sif_log(s, SIF_INFO_V, "set affinity hint for irq %d, failed", irq); + return ret; + } + return 0; +} + +static void sif_unmap_irq(struct sif_eq *eq) +{ + struct sif_dev *s = eq->ba.sdev; + int irq = s->msix_entries[eq->intr_vec].vector; + + free_cpumask_var(eq->affinity_mask); + irq_set_affinity_hint(irq, NULL); + free_irq(irq, eq); + spin_lock(&s->msix_lock); + clear_bit(eq->intr_vec, s->intr_used); + spin_unlock(&s->msix_lock); + eq->intr_vec = -1; + sif_log(s, SIF_INTR, "Freed irq %d for EPS%s", irq, eps_name(s, eq->eps->eps_num)); +} + + +int sif_enable_msix(struct sif_dev *sdev) +{ + int err; + int i = -1; + int cnt = sdev->es[sdev->mbox_epsc].eqs.cnt + 4; + int array_alloc_cnt = cnt; + int bitmap_words = max(1, array_alloc_cnt + 63 / 64); + + sdev->msix_entries = kcalloc(array_alloc_cnt, sizeof(struct msix_entry), GFP_KERNEL); + if (!sdev->msix_entries) + return -ENOMEM; + + sdev->msix_entries_sz = array_alloc_cnt; + sdev->intr_used = kcalloc(bitmap_words, sizeof(ulong), GFP_KERNEL); + if (!sdev->intr_used) { + err = -ENOMEM; + goto iu_failed; + } + + sif_log(sdev, SIF_INIT, + "EPSC offers %ld event queues, need %ld + 4 for the EPSA's = %d vecs, array sz %d", + sdev->es[sdev->mbox_epsc].eqs.max_cnt, sdev->es[sdev->mbox_epsc].eqs.cnt, + cnt, array_alloc_cnt); + spin_lock_init(&sdev->msix_lock); + + for (i = 0; i < cnt; i++) + sdev->msix_entries[i].entry = i; + + err = pci_enable_msix_range(sdev->pdev, sdev->msix_entries, 1, cnt); + if (err < 0) { + sif_log(sdev, SIF_INFO, + "Failed to allocate %d MSI-X vectors", cnt); + goto vector_alloc_failed; + } + + if (err < cnt) + sif_log(sdev, SIF_INFO, + "Unable to allocate more than %d MSI-X vectors", err); + + sdev->intr_req = cnt; + sdev->intr_cnt = err; + return 0; + +vector_alloc_failed: + kfree(sdev->intr_used); +iu_failed: + kfree(sdev->msix_entries); + return err; +} + + +int sif_disable_msix(struct sif_dev *sdev) +{ + pci_disable_msix(sdev->pdev); + kfree(sdev->intr_used); + kfree(sdev->msix_entries); + return 0; +} + + +/* simple allocation of EPSC EQ channels for CQs: Just do round robin for now: */ +u32 sif_get_eq_channel(struct sif_dev *sdev, struct sif_cq *cq) +{ + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + u32 seq = atomic_inc_return(&es->eqs.eq_sel_seq); + + /* This is supposed to be a number between 0 and cnt - 2 as the EPSC EQ and the + * EQ for async events are not counted by hardware, so the first eilgible EQ + * is eq[2] which for hardware has index 0: + */ + u32 eqs_cnt = (u32) (es->eqs.cnt - 2); + + return seq % eqs_cnt; +} + +/* check a valid EQ channel */ +bool sif_check_valid_eq_channel(struct sif_dev *sdev, 
int comp_vector) +{ + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + u32 eqs_cnt = (u32) (es->eqs.cnt - 2); + + return ((comp_vector >= 0) && (comp_vector <= eqs_cnt) ? true : false); +} + +/* @eqe contains little endian copy of event triggering the call + * - called from interrupt level + * Returns the number of events handled + */ +static u32 handle_completion_event(struct sif_eq *eq, struct psif_eq_entry *eqe) +{ + u32 ret = 1; + struct sif_dev *sdev = eq->ba.sdev; + struct sif_cq *cq = safe_get_sif_cq(sdev, eqe->cqd_id); + + if (!cq) { + sif_log(sdev, SIF_INTR, "eq %d: CQ Event seq %d: invalid or out-of-range cqd_id %d", + eq->index, eqe->seq_num, eqe->cqd_id); + return 0; + } + if (atomic_add_unless(&cq->refcnt, 1, 0)) { + u32 ec = atomic_inc_return(&cq->event_cnt); + + sif_log(sdev, SIF_INTR, "eq %d: Processing PSIF_EVENT_COMPLETION event #%d, seq %d - cq %d", + eq->index, ec, eqe->seq_num, eqe->cqd_id); + if (unlikely(!cq->ibcq.comp_handler)) { + /* This should not be possible - hw error? */ + sif_log(sdev, SIF_INFO, + "eq %d: No handler for PSIF_EVENT_COMPLETION event seq %d on cq %d", + eq->index, eqe->seq_num, eqe->cqd_id); + ret = 0; + } else + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + + if (atomic_dec_and_test(&cq->refcnt)) + complete(&cq->cleanup_ok); + + } else { + /* TBD: We end up here also if an event was processed after the cq was destroyed + * but before the cq was reallocated again. We may consequently also + * get "spurious" events on a new CQ that was a delayed event from the previous + * usage but that should be ok. + */ + sif_log(sdev, SIF_INFO, + "eq %d: PSIF_EVENT_COMPLETION event seq %d - cq %d for invalid cq", + eq->index, eqe->seq_num, eqe->cqd_id); + ret = 0; + } + return ret; +} + + +static void handle_event_work(struct work_struct *work) +{ + struct event_work *ew = container_of(work, struct event_work, ws); + struct sif_dev *sdev = to_sdev(ew->ibe.device); + + atomic_inc(&ew->eq->work_cnt); + + if (unlikely(!sdev->registered)) { + sif_log(sdev, SIF_INFO, + "Event of type %s received before verbs framework is up - ignoring", + ib_event2str(ew->ibe.event)); + + if ((ew->ibe.event == IB_EVENT_LID_CHANGE) + && (ew->ibe.element.port_num == 1) + && (PSIF_REVISION(sdev) <= 3)) + sif_r3_recreate_flush_qp(sdev); + goto out; + } + + switch (ew->ibe.event) { + case IB_EVENT_CQ_ERR: { + struct ib_cq *cq = ew->ibe.element.cq; + + if (cq->event_handler) + cq->event_handler(&ew->ibe, cq->cq_context); + else + sif_log(sdev, SIF_INFO, + "Unhandled event of type %s received", + ib_event2str(ew->ibe.event)); + break; + } + case IB_EVENT_SRQ_LIMIT_REACHED: + case IB_EVENT_SRQ_ERR: { + struct ib_srq *srq = ew->ibe.element.srq; + + if (ew->ibe.event == IB_EVENT_SRQ_LIMIT_REACHED) + to_srq(srq)->srq_limit = 0; + + if (srq->event_handler) + srq->event_handler(&ew->ibe, srq->srq_context); + else + sif_log(sdev, SIF_INFO, + "Unhandled event of type %s received, srq %d", + ib_event2str(ew->ibe.event), to_srq(srq)->index); + break; + } + case IB_EVENT_QP_FATAL: + case IB_EVENT_QP_REQ_ERR: + case IB_EVENT_QP_ACCESS_ERR: + case IB_EVENT_PATH_MIG_ERR: + case IB_EVENT_QP_LAST_WQE_REACHED: { + struct ib_qp *ibqp = ew->ibe.element.qp; + struct sif_qp *qp = to_sqp(ibqp); + + if (is_regular_qp(qp)) { + struct sif_rq *rq = get_sif_rq(sdev, qp->rq_idx); + struct sif_rq_sw *rq_sw = get_sif_rq_sw(sdev, rq->index); + + /* WA #3850:if SRQ, generate LAST_WQE event */ + if (rq->is_srq && ibqp->event_handler) { + struct ib_event ibe = { + .device = &sdev->ib_dev, + .event = 
IB_EVENT_QP_LAST_WQE_REACHED,
+ .element.qp = &qp->ibqp
+ };
+ ibqp->event_handler(&ibe, ibqp->qp_context);
+ } else {
+ /* WA #622: if regular RQ, flush */
+ if (sif_flush_rq(sdev, rq, qp, atomic_read(&rq_sw->length)))
+ sif_log(sdev, SIF_INFO, "failed to flush RQ %d",
+ rq->index);
+ }
+ }
+ if (!ibqp->event_handler)
+ sif_log(sdev, SIF_INFO,
+ "Unhandled event of type %s received, qp %d",
+ ib_event2str(ew->ibe.event), qp->qp_idx);
+ /* fall through */
+ }
+ case IB_EVENT_PATH_MIG:
+ case IB_EVENT_COMM_EST: {
+ struct ib_qp *ibqp = ew->ibe.element.qp;
+ struct sif_qp *qp = to_sqp(ibqp);
+
+ if (ibqp->event_handler)
+ ibqp->event_handler(&ew->ibe, ibqp->qp_context);
+
+ if (atomic_dec_and_test(&qp->refcnt))
+ complete(&qp->can_destroy);
+
+ if (!ibqp->event_handler)
+ sif_log(sdev, SIF_INFO,
+ "Unhandled event of type %s received, qp %d",
+ ib_event2str(ew->ibe.event), qp->qp_idx);
+ break;
+ }
+ case IB_EVENT_LID_CHANGE:
+ if (ew->ibe.element.port_num == 1 && PSIF_REVISION(sdev) <= 3)
+ sif_r3_recreate_flush_qp(sdev);
+ /* fall through */
+ case IB_EVENT_PORT_ERR:
+ case IB_EVENT_CLIENT_REREGISTER:
+ case IB_EVENT_PORT_ACTIVE:
+ case IB_EVENT_DEVICE_FATAL:
+ case IB_EVENT_PKEY_CHANGE:
+ case IB_EVENT_GID_CHANGE:
+ case IB_EVENT_SM_CHANGE:
+ ib_dispatch_event(&ew->ibe);
+ break;
+ default:
+ sif_log(sdev, SIF_INFO, "Unhandled event type %d", ew->ibe.event);
+ break;
+ }
+out:
+ kfree(ew);
+ }
+
+/* Generic event handler - @eqe contains little endian copy of event triggering the call
+ * ib_dispatch_event dispatches directly so we have to defer the actual dispatch
+ * to a better priority level via sdev->wq:
+ */
+
+static u32 handle_event(struct sif_eq *eq, void *element, enum ib_event_type ev_type)
+{
+ struct sif_dev *sdev = eq->ba.sdev;
+ struct event_work *ew = kmalloc(sizeof(struct event_work), GFP_ATOMIC);
+
+ if (!ew) {
+ /* TBD: kmem_cache_alloc or fallback static necessary? 
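+ * The allocation is GFP_ATOMIC since we are called from interrupt context;
+ * on failure the event is dropped and 0 (no events handled) is returned.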
*/ + sif_log(sdev, SIF_INFO, "FATAL: Failed to allocate work struct"); + return 0; + } + memset(&ew->ibe, 0, sizeof(struct ib_event)); + ew->ibe.device = &sdev->ib_dev; + ew->ibe.event = ev_type; + ew->eq = eq; + + /* Assume ibe.element is a union and that our caller has + * set up the right value for us (port, cq, qp or srq): + */ + ew->ibe.element.cq = element; + INIT_WORK(&ew->ws, handle_event_work); + + sif_log(sdev, SIF_INTR, "Processing IB event type %s", + ib_event2str(ew->ibe.event)); + queue_work(sdev->wq, &ew->ws); + return 1; +} + +static u32 handle_psif_event(struct sif_eq *eq, struct psif_eq_entry *eqe, + const char *type_str) +{ + struct sif_dev *sdev = eq->ba.sdev; + + sif_log(sdev, SIF_INFO, "Received (unhandled) psif event of type %s, port flags %s", + type_str, + string_enum_psif_event(eqe->port_flags)); + return 1; +} + +static u32 handle_epsc_event(struct sif_eq *eq, struct psif_eq_entry *eqe) +{ + struct sif_dev *sdev = eq->ba.sdev; + struct sif_eps *es = &sdev->es[eq->eps->eps_num]; + u32 ret = 1; + enum psif_event event_type; + + if (eqe->port_flags == PSIF_EVENT_EXTENSION) + event_type = eqe->extension_type; + else + event_type = eqe->port_flags; + + switch (event_type) { + case PSIF_EVENT_MAILBOX: + sif_log(sdev, SIF_INTR, "epsc completion event for seq.%d eps_num %d", + eqe->cq_sequence_number, eq->eps->eps_num); + epsc_complete(sdev, eq->eps->eps_num, eqe->cq_sequence_number & es->mask); + break; + case PSIF_EVENT_LOG: + sif_log(sdev, SIF_INTR, "epsc log event"); + sif_elog_intr(sdev, sdev->mbox_epsc); + break; + case PSIF_EVENT_EPSC_KEEP_ALIVE: + sif_log(sdev, SIF_INTR, "epsc keep-alive event"); + sif_eps_send_keep_alive(sdev, eq->eps->eps_num, true); + break; + default: + { + enum ib_event_type ibe = epsc2ib_event(eqe); + + if (ibe != (enum ib_event_type)-1) { + void *element = (void *)((u64) eqe->port + 1); + + return handle_event(eq, element, ibe); + } + sif_log(sdev, SIF_INFO, "Unhandled epsc event of type %s::%s (%d::%u)", + string_enum_psif_event(eqe->port_flags), + string_enum_psif_event(eqe->extension_type), + eqe->port_flags, eqe->extension_type); + if (eqe->extension_type == PSIF_EVENT_DEGRADED_MODE) { + sdev->degraded = true; + epsc_report_degraded(sdev, eqe->event_data); + } + ret = 0; + break; + } + } + return ret; +} + + +static u32 handle_epsa_event(struct sif_eq *eq, struct psif_eq_entry *eqe) +{ + struct sif_dev *sdev = eq->ba.sdev; + + sif_log(sdev, SIF_INFO, "Received (unhandled) epsa event of type %s", + string_enum_psif_event(eqe->port_flags)); + return 1; +} + +#define check_for_psif_event(__event__)\ + if (leqe.__event__)\ + nevents += handle_psif_event(eq, &leqe, #__event__) + +/* Bug #3952 - WA for HW bug #3523 (leqe.rqd_id is not valid) + * If QP transport is different from XRC + * and the QP is not already destroyed + * then retrieve the rq_idx from the QP + * Note: For SRQ_LIM event due to modify_srq, QP points to pQP. + */ +static u32 handle_srq_event(struct sif_eq *eq, void *element, enum ib_event_type ev_type) +{ + if (element != NULL) { + struct sif_dev *sdev = eq->ba.sdev; + struct sif_qp *qp = to_sqp(element); + enum psif_qp_trans type = qp->type; + struct sif_rq *rq = (ev_type == IB_EVENT_SRQ_LIMIT_REACHED && + type == PSIF_QP_TRANSPORT_MANSP1) ? 
+ get_sif_rq(sdev, qp->srq_idx) : get_sif_rq(sdev, qp->rq_idx); + + /* release the qp lock */ + if (atomic_dec_and_test(&qp->refcnt)) + complete(&qp->can_destroy); + + return handle_event(eq, (void *)&rq->ibsrq, ev_type); + } + sif_log(eq->ba.sdev, SIF_INFO, "eq %d: Discarding %s event: QP destroyed", eq->index, + ev_type == IB_EVENT_SRQ_ERR ? "IB_EVENT_SRQ_ERR" : "IB_EVENT_SRQ_LIMIT_REACHED"); + return 1; +} + + +#define dump_eq_entry(level, _s, _eqe) \ + sif_logs(level, printk("%s: ", _s); \ + write_struct_psif_eq_entry(NULL, 0, &leqe); printk("\n")) + + +/* Called from interrupt threads */ +static int dispatch_eq(struct sif_eq *eq) +{ + volatile struct psif_eq_entry *eqe; + struct psif_eq_entry leqe; + struct psif_epsc_csr_req req; + struct sif_dev *sdev = eq->ba.sdev; + + u32 seqno; + u32 nreqs = 0; + ulong flags; + void *port_elem; + void *qp_elem = NULL; + + /* Serialize event queue processing: */ + spin_lock_irqsave(&eq->ba.lock, flags); + seqno = eq->next_seq; + eqe = (struct psif_eq_entry *)get_eq_entry(eq, seqno); + sif_log(sdev, SIF_INTR, "eqe at %p next seq.no %x", eqe, seqno); + while (get_psif_eq_entry__seq_num(eqe) == seqno) { + u32 nevents = 0; + + eq->next_seq++; + + /* Update eq_sw::index if necessary */ + if (eq->next_seq == eq->sw_index_next_update) { + u32 old_nu = eq->sw_index_next_update; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_EVENT_INDEX; + req.addr = eq->index; + req.u.single.data = eq->next_seq; + eq->sw_index_next_update += eq->sw_index_interval; + + spin_unlock_irqrestore(&eq->ba.lock, flags); + + sif_log(eq->ba.sdev, SIF_INFO_V, + "Updating EQ_SW_INDEX for eq %d to %x. Interval %x, lim %x, next lim %x", + eq->index, eq->next_seq, eq->sw_index_interval, old_nu, + eq->sw_index_next_update); + + /* We ignore the response by providing NULL for seq_num and lcqe */ + sif_post_eps_wr(eq->ba.sdev, eq->eps->eps_num, &req, NULL, NULL, false); + } else { + /* Avoid callbacks while interrupts off */ + spin_unlock_irqrestore(&eq->ba.lock, flags); + } + + copy_conv_to_sw(&leqe, eqe, sizeof(leqe)); + + port_elem = (void *)((u64) leqe.port + 1); + + if (likely(leqe.event_status_cmpl_notify)) { + nevents += handle_completion_event(eq, &leqe); + + /* No other event type bits will be set on a CNE */ + goto only_cne; + } + + dump_eq_entry(SIF_DUMP, " ", &leqe); + + /* TBD: Handle this check with a mask... */ + if (unlikely(leqe.event_status_local_work_queue_catastrophic_error || + leqe.event_status_path_migration_request_error || + leqe.event_status_invalid_request_local_wq_error || + leqe.event_status_local_access_violation_wq_error || + leqe.event_status_last_wqe_reached || + leqe.event_status_communication_established || + leqe.event_status_path_migrated || + leqe.event_status_srq_limit_reached || + leqe.event_status_srq_catastrophic_error)) { + struct sif_qp *sif_qp_elem = safe_get_sif_qp(sdev, leqe.qp); + bool is_srq_event = (leqe.event_status_srq_limit_reached || + leqe.event_status_srq_catastrophic_error); + + /* silently drop the event if qp is no longer there. */ + if (!sif_qp_elem) { + sif_log(eq->ba.sdev, SIF_INFO, "QP context is NULL!"); + goto only_cne; + } + + /* silently drop the event if it is a PQP. 
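+ * (i.e. a QP with transport PSIF_QP_TRANSPORT_MANSP1) - except for SRQ limit
+ * events, which may be reported via the PQP and are handled below (see handle_srq_event).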
*/ + if (unlikely(sif_qp_elem->type == PSIF_QP_TRANSPORT_MANSP1) && + !leqe.event_status_srq_limit_reached) { + sif_log(eq->ba.sdev, SIF_INFO, "Received async event on PQP!"); + goto only_cne; + } + + if (unlikely(sif_qp_elem->type == PSIF_QP_TRANSPORT_XRC) && is_srq_event) { + sif_log(sdev, SIF_INTR, + "eq %d: Discarding %s event: QP transport XRC", + eq->index, leqe.event_status_srq_catastrophic_error ? + "IB_EVENT_SRQ_ERR" : "IB_EVENT_SRQ_LIMIT_REACHED"); + goto only_cne; + } + + /* check whether a qp context is required */ + if (PSIF_REVISION(sdev) <= 3 || !is_srq_event) { + /* silently drop the event if qp has been destroyed at this point. */ + if (!atomic_add_unless(&sif_qp_elem->refcnt, 1, 0)) { + sif_log(sdev, SIF_INTR, + "eq %d: qp %d has been destroyed for event seq %d", + eq->index, sif_qp_elem->qp_idx, eqe->seq_num); + goto only_cne; + } + qp_elem = (void *) &sif_qp_elem->ibqp; + } + } + + if (leqe.event_status_eps_c) + nevents += handle_epsc_event(eq, &leqe); + if (leqe.event_status_eps_a) + nevents += handle_epsa_event(eq, &leqe); + if (leqe.event_status_port_error) + nevents += handle_event(eq, port_elem, IB_EVENT_PORT_ERR); + if (leqe.event_status_client_registration) + nevents += handle_event(eq, port_elem, IB_EVENT_CLIENT_REREGISTER); + if (leqe.event_status_port_active) + nevents += handle_event(eq, port_elem, IB_EVENT_PORT_ACTIVE); + if (leqe.event_status_local_work_queue_catastrophic_error) { + nevents += handle_event(eq, qp_elem, IB_EVENT_QP_FATAL); + dump_eq_entry(SIF_INFO, "Got Fatal error", &leqe); + } + if (leqe.event_status_srq_catastrophic_error) + nevents += PSIF_REVISION(sdev) <= 3 ? + handle_srq_event(eq, qp_elem, IB_EVENT_SRQ_ERR) : + handle_event(eq, &get_sif_rq(sdev, leqe.rqd_id)->ibsrq, IB_EVENT_SRQ_ERR); + if (leqe.event_status_path_migration_request_error) + nevents += handle_event(eq, qp_elem, IB_EVENT_PATH_MIG_ERR); + if (leqe.event_status_local_access_violation_wq_error) + nevents += handle_event(eq, qp_elem, IB_EVENT_QP_ACCESS_ERR); + if (leqe.event_status_invalid_request_local_wq_error) + nevents += handle_event(eq, qp_elem, IB_EVENT_QP_REQ_ERR); + if (leqe.event_status_last_wqe_reached) + nevents += handle_event(eq, qp_elem, + IB_EVENT_QP_LAST_WQE_REACHED); + if (leqe.event_status_srq_limit_reached) + nevents += PSIF_REVISION(sdev) <= 3 ? 
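+ /* on rev <= 3 leqe.rqd_id is not valid (HW bug #3523), so the
+ * SRQ must be derived from the QP context via handle_srq_event():
+ */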
+ handle_srq_event(eq, qp_elem, IB_EVENT_SRQ_LIMIT_REACHED) : + handle_event(eq, &get_sif_rq(sdev, leqe.rqd_id)->ibsrq, + IB_EVENT_SRQ_LIMIT_REACHED); + if (leqe.event_status_communication_established) + nevents += handle_event(eq, qp_elem, IB_EVENT_COMM_EST); + if (leqe.event_status_path_migrated) + nevents += handle_event(eq, qp_elem, IB_EVENT_PATH_MIG); + if (leqe.event_status_cq_error) { + nevents += handle_event(eq, &get_sif_cq(sdev, leqe.cqd_id)->ibcq, + IB_EVENT_CQ_ERR); + dump_eq_entry(SIF_INFO, "Got cq_error", &leqe); + } + if (leqe.event_status_local_catastrophic_error) + nevents += handle_event(eq, port_elem, IB_EVENT_DEVICE_FATAL); + + + /* TBD: These are the ones that do not map directly to IB errors */ + check_for_psif_event(event_status_port_changed); + check_for_psif_event(event_status_invalid_xrceth); + check_for_psif_event(event_status_xrc_domain_violation); + + if (!nevents) { + sif_log(eq->ba.sdev, SIF_INTR, "eq %d: Warning: No events found for seq 0x%x", + eq->index, seqno); + dump_eq_entry(SIF_INFO, "(no event processed)", &leqe); + } else + sif_log(eq->ba.sdev, SIF_INTR, "Handled %d set event bits", nevents); + +only_cne: + spin_lock_irqsave(&eq->ba.lock, flags); + seqno = eq->next_seq; + eqe = (struct psif_eq_entry *)get_eq_entry(eq, seqno); + nreqs++; + } + spin_unlock_irqrestore(&eq->ba.lock, flags); + atomic_add(nreqs, &eq->intr_cnt); + return nreqs; +} + + +static enum ib_event_type epsc2ib_event(struct psif_eq_entry *eqe) +{ + switch (eqe->port_flags) { + case PSIF_EVENT_SGID_TABLE_CHANGED: + return IB_EVENT_GID_CHANGE; + case PSIF_EVENT_PKEY_TABLE_CHANGED: + return IB_EVENT_PKEY_CHANGE; + case PSIF_EVENT_MASTER_SM_LID_CHANGED: + case PSIF_EVENT_MASTER_SM_SL_CHANGED: + case PSIF_EVENT_IS_SM_DISABLED_CHANGED: + return IB_EVENT_SM_CHANGE; + case PSIF_EVENT_LID_TABLE_CHANGED: + return IB_EVENT_LID_CHANGE; + case PSIF_EVENT_SUBNET_TIMEOUT_CHANGED: + case PSIF_EVENT_CLIENT_REREGISTER: + return IB_EVENT_CLIENT_REREGISTER; + case PSIF_EVENT_PORT_ACTIVE: + return IB_EVENT_PORT_ACTIVE; + case PSIF_EVENT_PORT_ERR: + return IB_EVENT_PORT_ERR; + default: + return (enum ib_event_type)-1; + } +} + + +void sif_dfs_print_eq(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + struct sif_eq *eq; + + if (unlikely(pos < 0)) { + seq_printf(s, "# sii = software index update interval\n" + "# niu = (index of) next software index update\n#\n" + "# ni = Number of events seen\n" + "# wi = Number of events handled in work queue\n" + "# Name\tindex\tentries\textent\tn.seq\tvector#\tIRQ#\t" + "#ni\t#wi\tsii\tniu\n"); + return; + } + + eq = &sdev->es[sdev->mbox_epsc].eqs.eq[pos]; + + seq_printf(s, "%-12s%u\t%u\t%u\t%u\t%d\t%d\t%u\t%u\t%u\t%u\n", + eq->name, eq->index, eq->entries, eq->extent, eq->next_seq, eq->intr_vec, + sdev->msix_entries[eq->intr_vec].vector, + atomic_read(&eq->intr_cnt), atomic_read(&eq->work_cnt), + eq->sw_index_interval, eq->sw_index_next_update); +} + +void sif_dfs_print_irq_ch(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + struct sif_eq *eq; + + if (unlikely(pos < 0)) { + seq_printf(s, "# Interrupt channel coalescing settings\n#\n" + "# echo \"channel=1;adaptive=0;rx_scale=0;rate_low=0;" + "rate_high=0;ausec=0;ausec_low=0;ausec_high=0;pusec=0;" + "pusec_low=0;pusec_high=0\" > irq_ch\n#\n\n" + "# Channel adaptive rx_scale rate_low rate_high ausec ausec_low ausec_high pusec pusec_low pusec_high\n"); + return; + } + + eq = &sdev->es[sdev->mbox_epsc].eqs.eq[pos]; + seq_printf(s, "%-11s%-10u%-10u%-10u%-11u%-7d%-11d%-12u%-7u%-11u%-12u\n", + eq->name, 
eq->irq_ch.enable_adaptive, eq->irq_ch.channel_rx_scale, + eq->irq_ch.channel_rate_low, eq->irq_ch.channel_rate_high, + eq->irq_ch.channel_ausec, eq->irq_ch.channel_ausec_low, + eq->irq_ch.channel_ausec_high, eq->irq_ch.channel_pusec, + eq->irq_ch.channel_pusec_low, eq->irq_ch.channel_pusec_high); +} diff --git a/drivers/infiniband/hw/sif/sif_eq.h b/drivers/infiniband/hw/sif/sif_eq.h new file mode 100644 index 0000000000000..0b7c114a63577 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_eq.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_eq.h: Event queues and interrupt handling + */ + +#ifndef _SIF_EQ_H +#define _SIF_EQ_H +#include "psif_hw_csr.h" + +extern uint sif_cq_eq_max; + +struct sif_dev; +struct psif_epsc_csr_rsp; +struct sif_eq; +struct sif_cq; +struct sif_eps; + +struct sif_eq_base { + size_t max_cnt; /* Number of available event queues in hw */ + size_t min_sw_entry_cnt; /* Number of required event queue entries per port for EPSC EQ */ + size_t cnt; /* Number of configured hardware event queues */ + u16 irq_moderation; /* Interrupt total moderation */ + atomic_t eq_sel_seq; /* A "sequence number" used to select EQ for CQs (EPSC only) */ + struct sif_eq *eq; /* Dyn.alloc'ed array of sz cnt of eq.desc setup */ +}; + + +/* Set up the event queues for an EPS using info about #of queues from the @cqe + * which contains a host byte order copy of the successful response + * to the configuration request to the EPS in question + */ +int sif_eq_init(struct sif_dev *sdev, struct sif_eps *es, struct psif_epsc_csr_rsp *cqe); + +void sif_eq_deinit(struct sif_dev *sdev, struct sif_eps *es); + +int sif_enable_msix(struct sif_dev *s); +int sif_disable_msix(struct sif_dev *sdev); + +/* Printer for debugfs eq file */ +void sif_dfs_print_eq(struct seq_file *s, struct sif_dev *sdev, loff_t pos); + +/* Printer for debugfs int channel file */ +void sif_dfs_print_irq_ch(struct seq_file *s, struct sif_dev *sdev, loff_t pos); + +/* simple allocation of EQ channel for CQs: */ +u32 sif_get_eq_channel(struct sif_dev *sdev, struct sif_cq *cq); +bool sif_check_valid_eq_channel(struct sif_dev *sdev, int comp_vector); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_fmr.c b/drivers/infiniband/hw/sif/sif_fmr.c new file mode 100644 index 0000000000000..e2fc65229b4d2 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_fmr.c @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_fmr.c: Implementation of fast memory registration for SIF + */ + +#include +#include +#include "sif_fmr.h" +#include "sif_dev.h" +#include "sif_defs.h" +#include "sif_mr.h" +#include "sif_base.h" +#include "psif_hw_setget.h" + +struct ib_fmr *sif_alloc_fmr(struct ib_pd *ibpd, + int mr_access_flags, struct ib_fmr_attr *fmr_attr) +{ + struct sif_dev *sdev = to_sdev(ibpd->device); + struct sif_pd *pd = to_spd(ibpd); + struct sif_fmr *fmr = kmalloc(sizeof(struct sif_fmr), GFP_KERNEL); + struct sif_mem *mem; + struct ib_fmr *ibfmr; + void *ret; + + if (!fmr) { + sif_log(sdev, SIF_INFO, "Unable to allocate memory for the fmr"); + return ERR_PTR(-ENOMEM); + } + + mem = sif_mem_create_fmr(sdev, fmr_attr->max_pages, fmr_attr->page_shift, GFP_KERNEL); + if (!mem) { + ret = ERR_PTR(-ENOMEM); + goto mem_create_failed; + } + + memset(fmr, 0, sizeof(struct sif_fmr)); + fmr->mr = alloc_mr(sdev, pd, mem, 0, + IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_ATOMIC); + if (IS_ERR(fmr->mr)) { + ret = fmr->mr; + goto mr_alloc_failed; + } + + ibfmr = &fmr->ibfmr; + ibfmr->lkey = fmr->mr->index; + ibfmr->rkey = fmr->mr->index; + + sif_log(sdev, SIF_FMR, "max_pages %d, page_shift %d, max_maps %d", + fmr_attr->max_pages, fmr_attr->page_shift, fmr_attr->max_maps); + return &fmr->ibfmr; + +mr_alloc_failed: + sif_mem_free(mem); +mem_create_failed: + kfree(fmr); + return ret; +} + + +int sif_map_phys_fmr(struct ib_fmr *ibfmr, + u64 *page_list, int list_len, u64 iova) +{ + struct sif_dev *sdev = to_sdev(ibfmr->device); + struct sif_fmr *fmr = to_sfmr(ibfmr); + struct sif_mem *mem = fmr->mr->mem; + int ret = 0; + + if (mem->mem_type != SIFMT_PTONLY) { + sif_log(sdev, SIF_FMR, "Attempt to map an already mapped fmr - must unmap first"); + ret = sif_unmap_phys_fmr(ibfmr); + if (ret) + return ret; + } + + ret = sif_mem_map_fmr(mem, iova, page_list, list_len); + if (ret) + return ret; + + ret = sif_map_fmr_ctx(sdev, &fmr->mr->mmu_ctx, mem); + return ret; +} + + +int sif_unmap_phys_fmr(struct ib_fmr *ibfmr) +{ + struct sif_fmr *fmr = to_sfmr(ibfmr); + struct sif_dev *sdev = to_sdev(ibfmr->device); + struct sif_mmu_ctx *ctx = &fmr->mr->mmu_ctx; + int index = fmr->mr->index; + struct psif_key *key = get_key(sdev, index); + + /* See sif_mr.c for details on invalidation of DMA validation keys */ + + /* First set key to a state where memory accesses are invalid: */ + set_psif_key__lkey_state(key, PSIF_DMA_KEY_MMU_VALID); + set_psif_key__rkey_state(key, PSIF_DMA_KEY_MMU_VALID); + sif_invalidate_key(sdev, index, PCM_WAIT); + + /* Synchronous TLB invalidation to avoid invalidating the key too early: */ + sif_unmap_fmr_ctx(sdev, ctx, PCM_WAIT); + + /* Invalidate the keys */ + set_psif_key__lkey_state(key, PSIF_DMA_KEY_INVALID); + set_psif_key__rkey_state(key, PSIF_DMA_KEY_INVALID); + sif_invalidate_key(sdev, index, PCM_WAIT); + + /* TBD: We could add code here to nil the ptes + * for debugging purposes, for now they are left behind.. 
+ * (can leave stale PTE data behind, but never for pages we allow access to) + */ + + /* Reset the memory object - remove stale refs to pages + * (for sanity checking purposes, could be eliminated) + */ + sif_mem_unmap_fmr(fmr->mr->mem); + return 0; +} + + +static int invalidate_fmr_key(struct sif_st_pqp *spqp, struct ib_fmr *ibfmr, + enum psif_dma_vt_key_states state, enum wr_mode mode) +{ + struct sif_fmr *fmr = to_sfmr(ibfmr); + struct sif_dev *sdev = to_sdev(ibfmr->device); + int index = fmr->mr->index; + struct psif_key *key = get_key(sdev, index); + + set_psif_key__lkey_state(key, state); + set_psif_key__rkey_state(key, state); + if (spqp) + return sif_inv_key_update_st(spqp, index, mode); + else + return sif_invalidate_key(sdev, index, mode); +} + + +int sif_unmap_phys_fmr_list(struct list_head *fmr_list) +{ + struct ib_fmr *ib_fmr; + struct sif_dev *sdev = NULL; + enum wr_mode mode; + int ret; + int cnt = 0; + bool flush_all = false; + struct sif_st_pqp *spqp = NULL; + u16 ms = 0; + ulong start_time = jiffies; + + if (!list_empty(fmr_list)) { + ib_fmr = list_first_entry(fmr_list, struct ib_fmr, list); + sdev = to_sdev(ib_fmr->device); + } else + return 0; + + if (!sif_feature(disable_stencil_invalidate)) { + spqp = sif_alloc_ki_spqp(sdev); + if (!spqp) + sif_log(sdev, SIF_PQPT, + "All %u configured stencil pqps busy, consider increasing ki_spqp_size", + sdev->ki_spqp.pool_sz); + } + + if (!sdev->is_vf && sdev->num_vfs == 0) { + /* Check if we should do a brute force whole MMU caches flush (PF only) */ + list_for_each_entry(ib_fmr, fmr_list, list) { + cnt++; + if (cnt >= sif_fmr_cache_flush_threshold) { + ret = sif_post_flush_tlb(sdev, false); + flush_all = true; + goto key_to_invalid; + } + } + } + + cnt = 0; + list_for_each_entry(ib_fmr, fmr_list, list) { + mode = list_is_last(&ib_fmr->list, fmr_list) ? PCM_WAIT + : (!(cnt & 0x1f) ? PCM_POST_COMPL : PCM_POST); + ret = invalidate_fmr_key(spqp, ib_fmr, PSIF_DMA_KEY_MMU_VALID, mode); + if (ret) + goto out; + cnt++; + } + sif_log(sdev, SIF_INFO_V, "done with %d invalidates to MMU_VALID", cnt); + + cnt = 0; + list_for_each_entry(ib_fmr, fmr_list, list) { + mode = list_is_last(&ib_fmr->list, fmr_list) ? PCM_WAIT + : (!(cnt & 0x1f) ? PCM_POST_COMPL : PCM_POST); + sif_unmap_fmr_ctx(to_sdev(ib_fmr->device), + &(to_sfmr(ib_fmr))->mr->mmu_ctx, mode); + cnt++; + } + sif_log(sdev, SIF_INFO_V, "done with %d unmap_fmr_ctxs", cnt); +key_to_invalid: + cnt = 0; + + list_for_each_entry(ib_fmr, fmr_list, list) { + mode = list_is_last(&ib_fmr->list, fmr_list) ? PCM_WAIT + : (!(cnt & 0x1f) ? PCM_POST_COMPL : PCM_POST); + ret = invalidate_fmr_key(spqp, ib_fmr, PSIF_DMA_KEY_INVALID, mode); + if (ret) + goto out; + cnt++; + } + sif_log(sdev, SIF_INFO_V, "done invalidating %d fmr keys%s", + cnt, (spqp ? 
" (stencil)" : "")); + + if (flush_all) { + ret = sif_complete_flush_tlb(sdev); + if (ret) + goto out; + } + + cnt = 0; + list_for_each_entry(ib_fmr, fmr_list, list) { + sif_mem_unmap_fmr((to_sfmr(ib_fmr))->mr->mem); + cnt++; + } + ms = jiffies_to_msecs(jiffies - start_time); + sif_log_perf(sdev, SIF_PERF_V, "done unmapping %d fmrs in %u ms", cnt, ms); +out: + if (spqp) + sif_release_ki_spqp(spqp); + + return ret; +} + + +int sif_dealloc_fmr(struct ib_fmr *ibfmr) +{ + struct sif_dev *sdev = to_sdev(ibfmr->device); + struct sif_fmr *fmr = to_sfmr(ibfmr); + + if (fmr->mr->mem->mem_type != SIFMT_PTONLY) { + sif_log(sdev, SIF_FMR, "Attempt to deallocate a mapped fmr (key %d) - must unmap first", + fmr->mr->index); + return -EBUSY; + } + sif_dealloc_mr(sdev, fmr->mr); + kfree(fmr); + return 0; +} diff --git a/drivers/infiniband/hw/sif/sif_fmr.h b/drivers/infiniband/hw/sif/sif_fmr.h new file mode 100644 index 0000000000000..15625631f1c91 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_fmr.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_fmr.h: Interface to internal IB Fast Memory Registration (FMR) + * logic for SIF + */ + +#ifndef __SIF_FMR_H +#define __SIF_FMR_H + +struct sif_fmr { + struct ib_fmr ibfmr; + struct sif_mr *mr; +}; + +static inline struct sif_fmr *to_sfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct sif_fmr, ibfmr); +} + +struct ib_fmr *sif_alloc_fmr(struct ib_pd *ibpd, + int mr_access_flags, struct ib_fmr_attr *fmr_attr); +int sif_map_phys_fmr(struct ib_fmr *ibfmr, + u64 *page_list, int list_len, u64 iova); + +int sif_unmap_phys_fmr(struct ib_fmr *ibfmr); +int sif_unmap_phys_fmr_list(struct list_head *fmr_list); + +int sif_dealloc_fmr(struct ib_fmr *ibfmr); + +#endif