--- /dev/null
+/*
+ * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_fwa.c: Firmware access API (netlink-based out-of-band communication)
+ *
+ */
+#include "sif_dev.h"
+#include "sif_fwa.h"
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include "sif_enl.h"
+#include "sif_defs.h"
+#include "sif_query.h"
+#include "sif_base.h"
+#include "sif_qp.h"
+#include "psif_hw_csr.h"
+#include "sif_drvapi.h"
+
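+/* Netlink protocol summary (as implemented below):
+ * - Requests carry the target PCI address in the SIF_ENL_A_COMPLEX,
+ * SIF_ENL_A_BUS and SIF_ENL_A_DEVFN attributes, the request struct in
+ * SIF_ENL_A_PAYLOAD, an optional EPS selection in SIF_ENL_A_INDEX and
+ * optional auxiliary data in SIF_ENL_A_DATA.
+ * - SIF_ENL_CMD_REQ proxies a psif_epsc_csr_req to the selected EPS and
+ * is answered with SIF_ENL_CMD_RSP; SIF_ENL_CMD_REQ_DRV carries driver
+ * level requests (struct sif_drv_req) answered with SIF_ENL_CMD_RSP_DRV.
+ */
+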
+/* Generic netlink protocol family definition */
+static struct genl_family sif_enl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = "sif_enl",
+ .version = 1,
+ .maxattr = 16
+};
+
+/* Netlink request handlers */
+static int sif_fwa_req(struct sk_buff *skb, struct genl_info *info);
+static int sif_fwa_drv_req(struct sk_buff *skb, struct genl_info *info);
+
+/* Netlink req operation definition */
+static struct genl_ops sif_enl_ops[] = {
+ {
+ .cmd = SIF_ENL_CMD_REQ,
+ .flags = 0,
+ .policy = sif_enl_policy,
+ .doit = sif_fwa_req,
+ .dumpit = NULL,
+ },
+
+ {
+ .cmd = SIF_ENL_CMD_REQ_DRV,
+ .flags = 0,
+ .policy = sif_enl_policy,
+ .doit = sif_fwa_drv_req,
+ .dumpit = NULL,
+ }
+};
+
+
+/* Global data structure to keep track of instances and number of active
+ * processes:
+ */
+
+struct fwa_data {
+ struct list_head sdev_list; /* Access to devices */
+ spinlock_t lock; /* Protects device list */
+};
+
+static struct fwa_data fwa;
+
+
+/* Called from sif_init/exit to set up/clean up global data structures
+ * such as netlink communication and device registry:
+ */
+int sif_fwa_init(void)
+{
+ int stat;
+
+ INIT_LIST_HEAD(&fwa.sdev_list);
+ spin_lock_init(&fwa.lock);
+
+ stat = genl_register_family_with_ops(&sif_enl_family, sif_enl_ops);
+ if (stat)
+ goto fail;
+
+ sif_log0(SIF_INIT, "Enabled firmware access API");
+ return 0;
+fail:
+ sif_log0(SIF_INIT, "ERROR: Failed to enable firmware access API - error %d", stat);
+ return stat;
+}
+
+void sif_fwa_exit(void)
+{
+ sif_log0(SIF_INIT, "Disabling firmware access API");
+ genl_unregister_family(&sif_enl_family);
+}
+
+
+/* Called from probe to register a new device */
+int sif_fwa_register(struct sif_dev *sdev)
+{
+ struct pci_dev *pdev = sdev->pdev;
+
+ sif_log(sdev, SIF_INIT, "register device %02x:%02x.%d",
+ pdev->bus->number, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+ spin_lock(&fwa.lock);
+ list_add_tail(&sdev->fwa.list, &fwa.sdev_list);
+ spin_unlock(&fwa.lock);
+ return 0;
+}
+
+/* Called from remove to unregister a device */
+void sif_fwa_unregister(struct sif_dev *sdev)
+{
+ spin_lock(&fwa.lock);
+ list_del(&sdev->fwa.list);
+ spin_unlock(&fwa.lock);
+}
+
+
+static struct sif_dev *fwa_find_dev(struct genl_info *info)
+{
+ struct sif_dev *sdev = NULL;
+ struct sif_dev *s;
+
+ u16 domain = nla_get_u16(info->attrs[SIF_ENL_A_COMPLEX]);
+ u16 bus = nla_get_u16(info->attrs[SIF_ENL_A_BUS]);
+ u16 devfn = nla_get_u16(info->attrs[SIF_ENL_A_DEVFN]);
+
+ /* TBD: Ref.count access to sdev */
+ sif_log0(SIF_FWA, "bus %x devfn %x",
+ bus, devfn);
+
+ spin_lock(&fwa.lock);
+ list_for_each_entry(s, &fwa.sdev_list, fwa.list) {
+ if (domain == pci_domain_nr(s->pdev->bus) &&
+ bus == s->pdev->bus->number &&
+ devfn == s->pdev->devfn) {
+ sdev = s;
+ break;
+ }
+ sif_log(s, SIF_FWA, "bus %x devfn %x", s->pdev->bus->number, s->pdev->devfn);
+ }
+ spin_unlock(&fwa.lock);
+ return sdev;
+}
+
+
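+/* Check which EPSC opcodes may be issued from user space: a few setup
+ * opcodes are kernel only, most management opcodes are only valid towards
+ * the EPSC (not the EPSAs), and unknown opcodes are rejected:
+ */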
+static int fwa_valid_opcode(struct sif_dev *sdev, struct psif_epsc_csr_req *req,
+ enum psif_mbox_type eps_num)
+{
+ switch (req->opcode) {
+ case EPSC_SETUP:
+ case EPSC_SETUP_BASEADDR:
+ case EPSC_SET_BASEADDR:
+ case EPSC_SET_BASEADDR_EQ:
+ case EPSC_SET_ONE_CSR:
+ /* These are kernel only */
+ return -EPERM;
+ case EPSC_HOST_INT_CHANNEL_CTRL:
+ case EPSC_HOST_INT_COMMON_CTRL:
+ case EPSC_SET_LID:
+ case EPSC_SET_EOIB_MAC:
+ case EPSC_UF_RESET:
+ case EPSC_MODIFY_QP:
+ case EPSC_GET_SINGLE:
+ case EPSC_GET_ONE_CSR:
+ case EPSC_QUERY:
+ case EPSC_SET:
+ case EPSC_QUERY_QP:
+ case EPSC_QUERY_DEVICE:
+ case EPSC_QUERY_PORT_1:
+ case EPSC_QUERY_PORT_2:
+ case EPSC_QUERY_PKEY:
+ case EPSC_QUERY_GID:
+ case EPSC_MODIFY_DEVICE:
+ case EPSC_MODIFY_PORT_1:
+ case EPSC_MODIFY_PORT_2:
+ case EPSC_MC_ATTACH:
+ case EPSC_MC_DETACH:
+ case EPSC_MC_QUERY:
+ case EPSC_FLASH_START:
+ case EPSC_FLASH_ERASE_SECTOR:
+ case EPSC_FLASH_RD:
+ case EPSC_FLASH_WR:
+ case EPSC_FLASH_STOP:
+ case EPSC_A_CONTROL:
+ case EPSC_LINK_CNTRL:
+ case EPSC_UPDATE:
+ case EPSC_UF_CTRL:
+ case EPSC_VIMMA_CTRL:
+ /* These are not meaningful for the EPSAs for now */
+ if (eps_num == sdev->mbox_epsc)
+ return 0;
+ else
+ return -EPERM;
+ case EPSC_NOOP:
+ case EPSC_MAILBOX_PING:
+ case EPSC_KEEP_ALIVE:
+ case EPSC_EVENT_ACK:
+ case EPSC_EVENT_INDEX:
+ case EPSC_TEST_HOST_RD:
+ case EPSC_TEST_HOST_WR:
+ case EPSC_FW_VERSION:
+ case EPSC_LOG_CTRL:
+ case EPSC_LOG_REQ_NOTIFY:
+ case EPSC_A_COMMAND:
+ case EPSC_EXERCISE_MMU:
+ case EPSC_CLI_ACCESS:
+ break;
+ case EPSC_LAST_OP:
+ default:
+ /* Fail on all unknown operations: */
+ sif_log(sdev, SIF_FWA, "Unknown operation %d", req->opcode);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+
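+/* Validate the netlink attributes common to all requests (PCI address and
+ * a payload of at least payload_len bytes) and look up the corresponding
+ * device:
+ */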
+static int sif_fwa_verify_find_dev(struct genl_info *info, struct sif_dev **sdev_p, int payload_len)
+{
+ struct sif_dev *sdev;
+ int len;
+
+ if (!info->attrs[SIF_ENL_A_COMPLEX]) {
+ sif_log0(SIF_FWA, "PCI complex no. not set!");
+ return -EINVAL;
+ }
+
+ if (!info->attrs[SIF_ENL_A_BUS]) {
+ sif_log0(SIF_FWA, "PCI bus no. not set!");
+ return -EINVAL;
+ }
+
+ if (!info->attrs[SIF_ENL_A_DEVFN]) {
+ sif_log0(SIF_FWA, "PCI device/function not set!");
+ return -EINVAL;
+ }
+
+ if (!info->attrs[SIF_ENL_A_PAYLOAD]) {
+ sif_log0(SIF_FWA, "Received empty request!");
+ return -EINVAL;
+ }
+ len = nla_len(info->attrs[SIF_ENL_A_PAYLOAD]);
+ if (len < payload_len) {
+ sif_log0(SIF_FWA, "Request too short!");
+ return -EFAULT;
+ }
+
+ /* TBD: Better input checking... */
+
+ sdev = fwa_find_dev(info);
+ if (!sdev) {
+ sif_log0(SIF_FWA, "No such device found!");
+ return -ENODEV;
+ }
+ *sdev_p = sdev;
+ return 0;
+}
+
+
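+/* Handler for SIF_ENL_CMD_REQ_DRV: driver level out-of-band requests
+ * (currently EPS-A setup/teardown). Requires CAP_NET_ADMIN:
+ */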
+static int sif_fwa_drv_req(struct sk_buff *skb, struct genl_info *info)
+{
+ int msg_sz;
+ int stat;
+ size_t data_sz = 0;
+ struct sif_dev *sdev;
+ struct sif_drv_req *req = NULL;
+ struct sif_drv_rsp rsp;
+ enum psif_mbox_type eps_num;
+ struct sk_buff *resp_skb;
+ void *data;
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN)) {
+ sif_log0(SIF_FWA, "Request from client without the CAP_NET_ADMIN privilege");
+ return -EPERM;
+ }
+
+ ret = sif_fwa_verify_find_dev(info, &sdev, sizeof(struct sif_drv_req));
+ if (ret)
+ return ret;
+
+ req = nla_data(info->attrs[SIF_ENL_A_PAYLOAD]);
+
+ sif_log(sdev, SIF_FWA, "op %d", req->opcode);
+
+ if (IS_SIBS(sdev)) {
+ sif_log(sdev, SIF_FWA, "Device does not have any EPS-A modules");
+ return -EINVAL;
+ }
+
+ eps_num = epsa_to_mbox(req->u.epsa.epsa);
+ if (eps_num == (enum psif_mbox_type)-1) {
+ sif_log(sdev, SIF_FWA, "Unknown EPS-A %d", req->u.epsa.epsa);
+ return -EINVAL;
+ }
+
+ memset(&rsp, 0, sizeof(rsp));
+ switch (req->opcode) {
+ case SIF_DRV_CMD_EPSA_SETUP:
+ ret = sif_activate_epsa(sdev, eps_num);
+ rsp.opcode = SIF_DRV_CMD_EPSA_SETUP;
+ rsp.eps_rsp.status = ret;
+ break;
+ case SIF_DRV_CMD_EPSA_TEARDOWN:
+ rsp.opcode = SIF_DRV_CMD_EPSA_TEARDOWN;
+ break;
+ default:
+ sif_log(sdev, SIF_FWA, "Unsupported driver request %d", req->opcode);
+ return -EINVAL;
+ }
+
+ if (ret)
+ return ret;
+
+ /* Start building a response */
+ msg_sz = NLMSG_DEFAULT_SIZE + data_sz;
+ resp_skb = nlmsg_new(msg_sz, GFP_KERNEL);
+ if (!resp_skb)
+ return -ENOMEM;
+
+ data = genlmsg_put_reply(resp_skb, info, &sif_enl_family,
+ 0, SIF_ENL_CMD_RSP_DRV);
+ if (data == NULL) {
+ stat = -ENOMEM;
+ goto put_fail;
+ }
+
+ stat = nla_put(resp_skb, SIF_ENL_A_PAYLOAD, sizeof(struct sif_drv_rsp), &rsp);
+ if (stat) {
+ sif_log(sdev, SIF_FWA, "failed to append response to netlink packet");
+ goto put_fail;
+ }
+
+ /* Recompute message header */
+ genlmsg_end(resp_skb, data);
+
+ stat = genlmsg_reply(resp_skb, info);
+ if (stat) {
+ sif_log(sdev, SIF_FWA, "failed to send reply - status %d", stat);
+ goto put_fail;
+ }
+
+ sif_log(sdev, SIF_FWA, "Sent response with drv opcode %d msg sz %d",
+ rsp.opcode, msg_sz);
+ return 0;
+put_fail:
+ nlmsg_free(resp_skb);
+ return stat;
+}
+
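+/* Handler for SIF_ENL_CMD_REQ: forward an EPS mailbox request from user
+ * space to the selected EPS and return the response (and any associated
+ * data) in a new netlink message. Requires CAP_NET_ADMIN:
+ */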
+static int sif_fwa_req(struct sk_buff *skb, struct genl_info *info)
+{
+ int len;
+ int stat;
+ int msg_sz;
+ struct sif_dev *sdev;
+ enum psif_mbox_type eps_num;
+ struct sif_eps *es;
+ void *data;
+ size_t data_sz = 0;
+ struct psif_epsc_csr_req *req = NULL;
+ struct psif_epsc_csr_rsp rsp;
+ struct psif_query_qp *qqp;
+ struct sk_buff *resp_skb;
+ void *kaddr = NULL;
+
+ if (!capable(CAP_NET_ADMIN)) {
+ sif_log0(SIF_FWA, "Request from client without the CAP_NET_ADMIN privilege");
+ return -EPERM;
+ }
+
+ stat = sif_fwa_verify_find_dev(info, &sdev, sizeof(struct psif_epsc_csr_req));
+ if (stat)
+ return stat;
+
+ req = nla_data(info->attrs[SIF_ENL_A_PAYLOAD]);
+
+ if (info->attrs[SIF_ENL_A_INDEX]) {
+ eps_num = nla_get_u32(info->attrs[SIF_ENL_A_INDEX]);
+ if (IS_SIBS(sdev)) {
+ if (eps_num == MBOX_EPSC)
+ eps_num = SIBS_MBOX_EPSC;
+ else {
+ sif_log(sdev, SIF_FWA, "Invalid EPS selection (%d)", eps_num);
+ return -EINVAL;
+ }
+ }
+ if (eps_num >= sdev->eps_cnt) {
+ sif_log(sdev, SIF_FWA, "Invalid EPS selection (%d)", eps_num);
+ return -EINVAL;
+ }
+ } else {
+ /* Default to the EPSC (for backward compatibility) */
+ eps_num = sdev->mbox_epsc;
+ }
+
+ sif_log(sdev, SIF_FWA, "%s to %s",
+ string_enum_psif_epsc_csr_opcode(req->opcode),
+ string_enum_psif_mbox_type(eps_num));
+
+ es = &sdev->es[eps_num];
+ if (es->state != ES_ACTIVE) {
+ sif_log0(SIF_FWA, "Communication with EPS%s has not been set up (state = %d)!",
+ eps_name(sdev, eps_num), es->state);
+ return -ENODEV;
+ }
+
+ /* Check that this opcode is valid in this context */
+ stat = fwa_valid_opcode(sdev, req, eps_num);
+ if (stat) {
+ if (stat == -EPERM)
+ sif_log(sdev, SIF_FWA,
+ "Operation %s not permitted for EPS%s from user space",
+ string_enum_psif_epsc_csr_opcode(req->opcode),
+ eps_name(sdev, eps_num));
+ return stat;
+ }
+
+
+ /* The opcodes below pick up additional data from (fixed) buffers */
+ switch (req->opcode) {
+ case EPSC_QUERY_DEVICE:
+ req->u.query_hw.address =
+ (u64)es->data_dma_hdl +
+ offsetof(struct sif_epsc_data, dev);
+ kaddr = &es->data->dev;
+ data_sz = sizeof(struct psif_epsc_device_attr);
+ break;
+ case EPSC_QUERY_PORT_1:
+ req->u.query_hw.address =
+ (u64)es->data_dma_hdl +
+ offsetof(struct sif_epsc_data, port[0]);
+ kaddr = &es->data->port[0];
+ data_sz = sizeof(struct psif_epsc_port_attr);
+ break;
+ case EPSC_QUERY_PORT_2:
+ req->u.query_hw.address =
+ (u64)es->data_dma_hdl +
+ offsetof(struct sif_epsc_data, port[1]);
+ kaddr = &es->data->port[1];
+ data_sz = sizeof(struct psif_epsc_port_attr);
+ break;
+ case EPSC_QUERY_QP:
+ {
+ struct sif_qp *qps;
+ u32 qp_idx = req->u.query_qp.ctrl.qp_num;
+
+ if (qp_idx >= sdev->ba[qp].entry_cnt)
+ return -ENOENT;
+ qps = get_sif_qp(sdev, qp_idx);
+ kaddr = qqp = &qps->qqp;
+ req->u.query_qp.address = sif_qqp_dma_addr(sdev, qps);
+ data_sz = sizeof(struct psif_query_qp);
+ break;
+ }
+ case EPSC_FLASH_RD:
+ case EPSC_FLASH_WR:
+ data_sz = req->u.flash.length;
+ if (data_sz)
+ kaddr = &es->data->flash;
+
+ /* Use the reserved 'flash' buffer allocated with the EPSC's resp. queue: */
+ req->u.flash.host_addr = es->data_dma_hdl +
+ offsetof(struct sif_epsc_data, flash);
+ req->u.flash.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx;
+ break;
+ case EPSC_CLI_ACCESS:
+ data_sz = MAX_FWA_NL_PAYLOAD;
+ kaddr = &es->data->epsc_cli;
+
+ /* Use the reserved 'epsc_cli' buffer allocated with the EPSC's resp. queue: */
+ req->u.cli.host_addr = es->data_dma_hdl +
+ offsetof(struct sif_epsc_data, epsc_cli);
+ req->u.cli.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx;
+ break;
+ case EPSC_VIMMA_CTRL:
+ data_sz = MAX_FWA_NL_PAYLOAD;
+ kaddr = &es->data->vimm_agent;
+
+ /* Use the reserved 'vimm_agent' buffer allocated with the EPSC's resp. queue: */
+ req->u.vimma_ctrl.host_addr = es->data_dma_hdl +
+ offsetof(struct sif_epsc_data, vimm_agent);
+ req->u.vimma_ctrl.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx;
+ break;
+ case EPSC_UPDATE:
+ switch (req->u.update.opcode) {
+ case EPSC_UPDATE_OP_READ:
+ case EPSC_UPDATE_OP_WRITE:
+ /* Use the reserved 'flash' buffer allocated with the EPSC's resp. queue: */
+ req->u.update.host_addr = es->data_dma_hdl +
+ offsetof(struct sif_epsc_data, flash);
+ req->u.update.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx;
+ /* fall through */
+ case EPSC_UPDATE_OP_POLL:
+ data_sz = req->u.update.length;
+ kaddr = &es->data->flash;
+ break;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+
+ /* Copy any extra input data to the kernel buffer: */
+ if (info->attrs[SIF_ENL_A_DATA]) {
+ len = nla_len(info->attrs[SIF_ENL_A_DATA]);
+ data = nla_data(info->attrs[SIF_ENL_A_DATA]);
+ switch (req->opcode) {
+ case EPSC_UPDATE:
+ if (req->u.update.opcode != EPSC_UPDATE_OP_WRITE)
+ break;
+ /* fall through */
+ case EPSC_FLASH_WR:
+ case EPSC_CLI_ACCESS:
+ case EPSC_VIMMA_CTRL:
+ if (kaddr) {
+ memcpy(kaddr, data, len);
+ sif_log(sdev, SIF_FWA, "dma kaddr %p data %p len %x",
+ kaddr, data, len);
+ mb();
+ } else
+ sif_log(sdev, SIF_FWA, "Found aux.data input but no data area");
+ break;
+ default:
+ sif_log(sdev, SIF_FWA, "Found aux.data input in unexpected op %s",
+ string_enum_psif_epsc_csr_opcode(req->opcode));
+ break;
+ }
+ }
+
+ stat = sif_eps_wr(sdev, eps_num, req, &rsp);
+ switch (stat) {
+ case -ETIMEDOUT:
+ return stat;
+ default:
+ break;
+ }
+
+ if (data_sz > MAX_FWA_NL_PAYLOAD)
+ return -EMSGSIZE;
+
+ /* Start building a response */
+ msg_sz = NLMSG_DEFAULT_SIZE + data_sz;
+ resp_skb = nlmsg_new(msg_sz, GFP_KERNEL);
+ if (!resp_skb) {
+ sif_log(sdev, SIF_FWA, "failed to allocate netlink packet");
+ return -ENOMEM;
+ }
+
+ data = genlmsg_put_reply(resp_skb, info, &sif_enl_family,
+ 0, SIF_ENL_CMD_RSP);
+ if (data == NULL) {
+ sif_log(sdev, SIF_FWA, "failed to add generic netlink header");
+ stat = -ENOMEM;
+ goto put_fail;
+ }
+
+ stat = nla_put(resp_skb, SIF_ENL_A_PAYLOAD, sizeof(struct psif_epsc_csr_rsp), &rsp);
+ if (stat) {
+ sif_log(sdev, SIF_FWA, "failed to append response to netlink packet");
+ goto put_fail;
+ }
+
+ if (kaddr && req->opcode != EPSC_FLASH_WR &&
+ !(req->opcode == EPSC_UPDATE && req->u.update.opcode == EPSC_UPDATE_OP_WRITE)) {
+ stat = nla_put(resp_skb, SIF_ENL_A_DATA, data_sz, kaddr);
+ if (stat) {
+ sif_log(sdev, SIF_FWA, "failed to append %ld bytes of data", data_sz);
+ goto put_fail;
+ }
+ }
+
+ /* Recompute message header */
+ genlmsg_end(resp_skb, data);
+
+ stat = genlmsg_reply(resp_skb, info);
+ if (stat) {
+ sif_log(sdev, SIF_FWA, "failed to send reply - status %d", stat);
+ goto put_fail;
+ }
+
+ sif_log(sdev, SIF_FWA, "Sent response with opcode %s msg sz %d",
+ string_enum_psif_epsc_csr_opcode(rsp.opcode), msg_sz);
+ return 0;
+put_fail:
+ nlmsg_free(resp_skb);
+ return stat;
+}
--- /dev/null
+/*
+ * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_fwa.h: Low level access to a SIF device
+ *
+ * Allows access to low level functions such as (re)programming the EPSC flash
+ * via direct access to the EPSC protocol proxied via Netlink.
+ * Requires the CAP_NET_ADMIN capability.
+ */
+
+#ifndef __SIF_FWA_H
+#define __SIF_FWA_H
+#include <linux/list.h>
+
+struct sif_dev;
+
+/* The max size we support sending/receiving from user space
+ * in a single netlink message.
+ * Limited by a 4k max netlink message size:
+ */
+#define MAX_FWA_NL_PAYLOAD 0x800
+
+/* Per instance data structure */
+struct sif_fwa {
+ struct list_head list; /* Linkage for the global list */
+};
+
+/* Called from sif_init/exit to set up/clean up global data structures
+ * such as netlink communication and device registry:
+ */
+int sif_fwa_init(void);
+void sif_fwa_exit(void);
+
+/* Called from probe to register a new device */
+int sif_fwa_register(struct sif_dev *sdev);
+
+/* Called from remove to unregister a device */
+void sif_fwa_unregister(struct sif_dev *sdev);
+
+/* Value definition for the fwa module parameter: */
+#define SIF_FWA_MR_ENABLE 0x1 /* Enable FWA mode */
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_hwi.c: Hardware init for SIF - combines the various init steps for psif
+ */
+
+#include "sif_dev.h"
+#include "sif_hwi.h"
+#include "sif_base.h"
+#include "sif_cq.h"
+#include "sif_pqp.h"
+#include "sif_qp.h"
+#include "sif_ibqp.h"
+#include "sif_pd.h"
+#include "sif_eq.h"
+#include "sif_xrc.h"
+#include "sif_defs.h"
+#include "sif_query.h"
+#include "psif_hw_data.h"
+#include "psif_hw_setget.h"
+#include <net/checksum.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+/* Create the special SIF privileged QPs (PQPs), used for
+ * SIF-specific work requests such as requesting completion
+ * event notification on a CQ.
+ */
+
+static void sif_pqp_fini(struct sif_dev *sdev);
+
+
+static int sif_chip_init(struct sif_dev *sdev);
+static void sif_chip_deinit(struct sif_dev *sdev);
+
+
+static int sif_pqp_init(struct sif_dev *sdev)
+{
+ struct sif_pqp *pqp;
+ struct sif_eps *es = &sdev->es[sdev->mbox_epsc];
+ int i;
+ int ret = 0;
+ uint n_pqps = es->eqs.cnt - 2;
+
+ sdev->pqp = sif_kmalloc(sdev, sizeof(struct sif_pqp *) * n_pqps, GFP_KERNEL | __GFP_ZERO);
+ if (!sdev->pqp)
+ return -ENOMEM;
+
+ for (i = 0; i < n_pqps; i++) {
+ pqp = sif_create_pqp(sdev, i);
+ if (IS_ERR(pqp)) {
+ if ((i > 0) &&
+ !(eps_version_ge(es, 0, 42))) {
+ sif_log(sdev, SIF_INFO,
+ "SIF device has an old FW version that only supports one pqp");
+ break;
+ }
+ ret = PTR_ERR(pqp);
+ goto failed;
+ }
+ sdev->pqp[i] = pqp;
+ }
+ sdev->pqp_cnt = i;
+ atomic_set(&sdev->next_pqp, 0);
+ return 0;
+
+failed:
+ sdev->pqp_cnt = i;
+ sif_pqp_fini(sdev);
+ return ret;
+}
+
+
+static void sif_pqp_fini(struct sif_dev *sdev)
+{
+ /* We must maintain a consistent state of the PQP array
+ * during takedown, as these operations themselves
+ * generate PQP requests.
+ */
+ while (sdev->pqp_cnt > 0) {
+ int i = sdev->pqp_cnt - 1;
+ struct sif_pqp *pqp = sdev->pqp[i];
+
+ if (i > 0) {
+ /* Remove ourselves first, except the final PQP */
+ sdev->pqp[i] = NULL;
+ sdev->pqp_cnt--;
+ }
+ sif_destroy_pqp(sdev, pqp);
+ if (i == 0)
+ sdev->pqp_cnt--;
+ }
+ kfree(sdev->pqp);
+ sdev->pqp = NULL;
+}
+
+
+static void sif_ki_spqp_fini(struct sif_dev *sdev);
+
+static int sif_ki_spqp_init(struct sif_dev *sdev)
+{
+ int i;
+ int ret = 0;
+ int n = max(sif_ki_spqp_size, 0U);
+ int bm_len = max(1, n/8);
+
+ mutex_init(&sdev->ki_spqp.lock);
+ sdev->ki_spqp.spqp =
+#ifdef CONFIG_NUMA
+ kmalloc_node(sizeof(struct sif_st_pqp *) * n, GFP_KERNEL | __GFP_ZERO,
+ sdev->pdev->dev.numa_node);
+#else
+ kmalloc(sizeof(struct sif_st_pqp *) * n, GFP_KERNEL | __GFP_ZERO);
+#endif
+ if (!sdev->ki_spqp.spqp)
+ return -ENOMEM;
+
+ sdev->ki_spqp.bitmap =
+#ifdef CONFIG_NUMA
+ kmalloc_node(sizeof(ulong) * bm_len, GFP_KERNEL | __GFP_ZERO,
+ sdev->pdev->dev.numa_node);
+#else
+ kmalloc(sizeof(ulong) * bm_len, GFP_KERNEL | __GFP_ZERO);
+#endif
+ if (!sdev->ki_spqp.bitmap) {
+ ret = -ENOMEM;
+ goto bm_failed;
+ }
+
+ for (i = 0; i < n; i++) {
+ struct sif_st_pqp *spqp = sif_create_inv_key_st_pqp(sdev);
+
+ if (IS_ERR(spqp)) {
+ ret = PTR_ERR(spqp);
+ break;
+ }
+ sdev->ki_spqp.spqp[i] = spqp;
+ spqp->index = i;
+ }
+ sdev->ki_spqp.pool_sz = i;
+ if (ret && i) {
+ sif_log(sdev, SIF_INFO, "Failed to create %d INVALIDATE_KEY stencil QPs", i);
+ sif_ki_spqp_fini(sdev);
+ }
+
+ if (i)
+ sif_log(sdev, SIF_INFO, "Created %d INVALIDATE_KEY stencil QPs", i);
+bm_failed:
+ if (ret) {
+ kfree(sdev->ki_spqp.spqp);
+ sdev->ki_spqp.spqp = NULL;
+ }
+ return 0; /* Never fail on stencil PQP allocation */
+}
+
+
+static void sif_ki_spqp_fini(struct sif_dev *sdev)
+{
+ int i;
+
+ if (!sdev->ki_spqp.spqp)
+ return;
+ for (i = sdev->ki_spqp.pool_sz - 1; i >= 0; i--)
+ sif_destroy_st_pqp(sdev, sdev->ki_spqp.spqp[i]);
+ kfree(sdev->ki_spqp.bitmap);
+ kfree(sdev->ki_spqp.spqp);
+ sdev->ki_spqp.spqp = NULL;
+}
+
+
+static void sif_hw_kernel_cb_fini(struct sif_dev *sdev)
+{
+ int i;
+
+ while (sdev->kernel_cb_cnt > 0) {
+ int j = sdev->kernel_cb_cnt - 1;
+
+ for (i = 0; i < 2; i++)
+ if (sdev->kernel_cb[i][j])
+ release_cb(sdev, sdev->kernel_cb[i][j]);
+ sdev->kernel_cb_cnt--;
+ }
+ for (i = 0; i < 2; i++)
+ kfree(sdev->kernel_cb[i]);
+}
+
+
+
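+/* Allocate the two arrays of kernel collect buffers - one CB of each kind
+ * per entry - sized by min(sif_cb_max, num_present_cpus()), but at least 1:
+ */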
+static int sif_hw_kernel_cb_init(struct sif_dev *sdev)
+{
+ int i;
+ uint n_cbs = min(sif_cb_max, num_present_cpus());
+
+ if (!n_cbs)
+ n_cbs = 1;
+
+ for (i = 0; i < 2; i++) {
+ sdev->kernel_cb[i] = kcalloc(n_cbs, sizeof(struct sif_cb *), GFP_KERNEL);
+ if (!sdev->kernel_cb[i])
+ goto alloc_failed;
+ }
+
+ for (i = 0; i < n_cbs; i++) {
+ sdev->kernel_cb[0][i] = alloc_cb(sdev, false);
+ if (!sdev->kernel_cb[0][i])
+ goto alloc_failed;
+ sdev->kernel_cb[1][i] = alloc_cb(sdev, true);
+ if (!sdev->kernel_cb[1][i])
+ goto alloc_failed;
+ }
+ sdev->kernel_cb_cnt = i;
+ return 0;
+
+alloc_failed:
+ sdev->kernel_cb_cnt = i;
+ sif_hw_kernel_cb_fini(sdev);
+ return -ENOMEM;
+}
+
+
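+/* Query the EPSC for one of the SL/PQP/QP0 to TSL mapping tables;
+ * opcode selects the table and port the IB port (1 or 2):
+ */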
+static int get_tsl_map(struct sif_dev *sdev,
+ int opcode,
+ int port,
+ struct psif_tsl_map *map)
+{
+ struct psif_epsc_csr_req req;
+ struct psif_epsc_csr_rsp rsp;
+
+ /* EPSC supports the new requests starting from v.0.56 */
+ if (eps_fw_version_ge(&sdev->es[sdev->mbox_epsc], 0, 56)) {
+ int ret = 0;
+
+ memset(&req, 0, sizeof(req));
+ req.opcode = EPSC_QUERY;
+ req.u.query.data.op = opcode;
+ req.u.query.data.index = port;
+
+ ret = sif_epsc_wr(sdev, &req, &rsp);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "Failed to query sl to tsl map, opcode %s, port %d",
+ string_enum_psif_epsc_query_op(opcode) + strlen("EPSC_QUERY_"),
+ port);
+ return ret;
+ }
+ *map = *((struct psif_tsl_map *)&rsp.data);
+ return 0;
+ }
+
+ sif_log(sdev, SIF_INFO, "PSIF API %s has fw version less than %s. Cannot retrieve SL2TSL map",
+ "0.98", "0.56");
+ return -EOPNOTSUPP;
+}
+
+
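+/* Initialize the SL to TSL mapping tables: static defaults for old
+ * firmware, a per-port mapping for fw >= 0.50, and tables retrieved
+ * from the EPSC for fw >= 0.56:
+ */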
+static void setup_sl2tsl_map(struct sif_dev *sdev)
+{
+ int port;
+ int sl;
+ int qosl;
+
+
+ /* TBD: separate bulk and rcv pqp vcb/tsl */
+ for (port = 0; port < 2; ++port) {
+ sdev->pqp_rcn_tsl[port] = TSL_PRIV;
+ sdev->pqp_bulk_tsl[port] = TSL_PRIV;
+ sdev->pqp_qosl_rcn_hint[port] = QOSL_LOW_LATENCY;
+ sdev->pqp_qosl_bulk_hint[port] = QOSL_LOW_LATENCY;
+ }
+
+ /* Default or least aggressive common denominator */
+ memset(sdev->sl2tsl + 0, TSL_DATA, sizeof(sdev->sl2tsl));
+ memset(sdev->qp0_tsl + 0, TSL_DATA, sizeof(sdev->qp0_tsl));
+
+ if (eps_fw_version_lt(&sdev->es[sdev->mbox_epsc], 0, 50)) {
+ sif_log(sdev, SIF_INFO, "Using a single TSL for regular QPs (fw < 0.50)");
+ return;
+ }
+
+ /* See BZ 3883 and https://cod.no.oracle.com/gerrit/r/#/c/6587/ */
+ for (sl = 0; sl < 16; ++sl)
+ for (port = 0; port < 2; ++port)
+ for (qosl = QOSL_HIGH_BANDWIDTH; qosl <= QOSL_LOW_LATENCY; ++qosl)
+ sdev->sl2tsl[sl][port][qosl] = port ? TSL_DATA_1 : TSL_DATA;
+
+ if (eps_fw_version_lt(&sdev->es[sdev->mbox_epsc], 0, 56)) {
+ sif_log(sdev, SIF_INFO, "Setting up TSL per port (0.50 <= fw <= 0.56)");
+ return;
+ }
+
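+/* Accessors for entry <i> of struct psif_tsl_map (fields m<i>_tsl and m<i>_tqos) */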
+#define GET_TSL(i) map.m ## i ## _tsl
+#define GET_QOS(i) map.m ## i ## _tqos
+
+ {
+ struct psif_tsl_map map;
+ int opc;
+
+ sif_log(sdev, SIF_TSL, "Retrieving SL to TSL map from epsc (fw >= 0.56)");
+
+ for (port = 0; port < 2; ++port) {
+ if (get_tsl_map(sdev, EPSC_QUERY_MAP_PQP_TO_TSL, port + 1, &map))
+ return;
+ /* RCN pqp info in first entry, bulk in second */
+ sdev->pqp_rcn_tsl[port] = GET_TSL(0);
+ sdev->pqp_bulk_tsl[port] = GET_TSL(1);
+ sdev->pqp_qosl_rcn_hint[port] = GET_QOS(0);
+ sdev->pqp_qosl_bulk_hint[port] = GET_QOS(1);
+ }
+
+ for (opc = EPSC_QUERY_MAP_SL_TO_TSL_LO; opc <= EPSC_QUERY_MAP_SL_TO_TSL_HI; ++opc) {
+ bool last8 = opc == EPSC_QUERY_MAP_SL_TO_TSL_HI;
+
+ for (port = 0; port < 2; ++port) {
+ if (get_tsl_map(sdev, opc, port + 1, &map))
+ return;
+ for (qosl = QOSL_HIGH_BANDWIDTH; qosl <= QOSL_LOW_LATENCY; ++qosl) {
+ sdev->sl2tsl[8*last8 + 0][port][qosl] = GET_TSL(0);
+ sdev->sl2tsl[8*last8 + 1][port][qosl] = GET_TSL(1);
+ sdev->sl2tsl[8*last8 + 2][port][qosl] = GET_TSL(2);
+ sdev->sl2tsl[8*last8 + 3][port][qosl] = GET_TSL(3);
+ sdev->sl2tsl[8*last8 + 4][port][qosl] = GET_TSL(4);
+ sdev->sl2tsl[8*last8 + 5][port][qosl] = GET_TSL(5);
+ sdev->sl2tsl[8*last8 + 6][port][qosl] = GET_TSL(6);
+ sdev->sl2tsl[8*last8 + 7][port][qosl] = GET_TSL(7);
+
+ sdev->qp_qosl_hint[8*last8 + 0][port] = GET_QOS(0);
+ sdev->qp_qosl_hint[8*last8 + 1][port] = GET_QOS(1);
+ sdev->qp_qosl_hint[8*last8 + 2][port] = GET_QOS(2);
+ sdev->qp_qosl_hint[8*last8 + 3][port] = GET_QOS(3);
+ sdev->qp_qosl_hint[8*last8 + 4][port] = GET_QOS(4);
+ sdev->qp_qosl_hint[8*last8 + 5][port] = GET_QOS(5);
+ sdev->qp_qosl_hint[8*last8 + 6][port] = GET_QOS(6);
+ sdev->qp_qosl_hint[8*last8 + 7][port] = GET_QOS(7);
+ }
+ }
+ }
+
+ if (!eps_version_ge(&sdev->es[sdev->mbox_epsc], 1, 6)) {
+ sif_log(sdev, SIF_INFO, "FW version does not not support special QP0 TSL");
+ return;
+ }
+ for (port = 0; port < 2; ++port) {
+ if (get_tsl_map(sdev, EPSC_QUERY_MAP_QP0_TO_TSL, port + 1, &map))
+ return;
+ sdev->qp0_tsl[port] = GET_TSL(0);
+ sdev->qp0_qosl_hint[port] = GET_QOS(0);
+ }
+ }
+#undef GET_TSL
+#undef GET_QOS
+}
+
+
+static void dump_sl2tsl_map(struct sif_dev *sdev)
+{
+ int sl;
+ int port;
+ int qosl;
+
+ for (port = 0; port < 2; ++port) {
+ sif_log(sdev, SIF_TSL, "rcn pqp port:%d tsl:%2d fw_hint:%s",
+ port + 1, sdev->pqp_rcn_tsl[port],
+ string_enum_psif_tsu_qos(sdev->pqp_qosl_rcn_hint[port]) + strlen("QOSL_"));
+ sif_log(sdev, SIF_TSL, "bulk pqp port:%d tsl:%2d fw_hint:%s",
+ port + 1, sdev->pqp_bulk_tsl[port],
+ string_enum_psif_tsu_qos(sdev->pqp_qosl_bulk_hint[port]) + strlen("QOSL_"));
+ }
+
+ for (port = 0; port < 2; ++port)
+ for (sl = 0; sl < 16; ++sl)
+ for (qosl = QOSL_HIGH_BANDWIDTH; qosl <= QOSL_LOW_LATENCY; ++qosl)
+ sif_log(sdev, SIF_TSL,
+ "plain qp port:%d sl:%2d qosl:%-14s tsl:%2d fw_hint:%s",
+ port + 1, sl, string_enum_psif_tsu_qos(qosl) + strlen("QOSL_"),
+ sdev->sl2tsl[sl][port][qosl],
+ string_enum_psif_tsu_qos(sdev->qp_qosl_hint[sl][port]) +
+ strlen("QOSL_"));
+
+ for (port = 0; port < 2; ++port) {
+ sif_log(sdev, SIF_TSL, "qp0 port:%d tsl:%2d fw_hint:%s",
+ port + 1, sdev->qp0_tsl[port],
+ string_enum_psif_tsu_qos(sdev->qp0_qosl_hint[port]) + strlen("QOSL_"));
+ }
+}
+
+/* Device is degraded; set limited mode and report cause */
+static int sif_handle_degraded(struct sif_dev *sdev)
+{
+ int ret = 0;
+
+ sdev->limited_mode = true;
+ if (eps_version_ge(&sdev->es[sdev->mbox_epsc], 1, 0)) {
+ struct psif_epsc_csr_req req;
+ struct psif_epsc_csr_rsp rsp;
+
+ /* Ask the EPSC if it's running in degraded mode */
+ memset(&req, 0, sizeof(req));
+ req.opcode = EPSC_QUERY;
+ req.u.query.data.op = EPSC_QUERY_DEGRADED_CAUSE;
+ ret = sif_epsc_wr(sdev, &req, &rsp);
+ if (ret) {
+ sif_log(sdev, SIF_INFO,
+ "Request to the EPSC for degraded cause failed with %d", ret);
+ return ret;
+ }
+ if (rsp.data != 0)
+ epsc_report_degraded(sdev, rsp.data);
+ sif_log(sdev, SIF_EPS, "Device reports degraded mode, mask 0x%llx", rsp.data);
+ }
+ return ret;
+}
+
+
+int sif_hw_init(struct sif_dev *sdev)
+{
+ int i;
+ int ret = -ENOMEM;
+ struct sif_pd *pd = NULL;
+
+ /* PSIF 2.x requires MRRS to be at least 512, ref BZ #3301 */
+ if (pcie_get_readrq(sdev->pdev) < 512) {
+ sif_log(sdev, SIF_INFO, "PSIF 2.x requires MRRS to be at least 512 bytes");
+ ret = -EINVAL;
+ goto chip_init_failed;
+ }
+
+ sif_mem_init(sdev);
+
+ /* Misc. PSIF chip version specific
+ * configuration (must be before base_init):
+ */
+ ret = sif_chip_init(sdev);
+ if (ret)
+ goto chip_init_failed;
+
+ ret = sif_base_init(sdev);
+ if (ret)
+ goto base_failed;
+
+ /* Allocate collect buffers for kernel usage */
+ ret = sif_hw_kernel_cb_init(sdev);
+ if (ret)
+ goto cb_alloc_failed;
+
+ ret = sif_init_pd(sdev);
+ if (ret)
+ goto pd_init_failed;
+
+ /* We need a kernel protection domain for resource allocation */
+ pd = alloc_pd(sdev);
+ if (!pd)
+ goto pd_alloc_failed;
+ pd->ibpd.device = &sdev->ib_dev;
+ sdev->pd = pd;
+ if (sdev->degraded)
+ sif_handle_degraded(sdev);
+ if (sdev->limited_mode) {
+ sif_log(sdev, SIF_INFO, "Running in limited mode\n");
+ return 0;
+ }
+
+ /* Initialize the SL to TSL map, before any QPs are created */
+ setup_sl2tsl_map(sdev);
+ dump_sl2tsl_map(sdev);
+
+ /* Reserve indices for qp 0 and 1, ports 1 and 2 */
+ for (i = 0; i <= 3; i++)
+ sif_alloc_qp_idx(pd);
+
+ ret = sif_pqp_init(sdev);
+ if (ret)
+ goto pqp_failed;
+
+ ret = sif_ki_spqp_init(sdev);
+ if (ret)
+ goto ki_spqp_failed;
+
+ ret = sif_init_xrcd(sdev);
+ if (ret)
+ goto xrcd_failed;
+
+ return 0;
+
+xrcd_failed:
+ sif_ki_spqp_fini(sdev);
+ki_spqp_failed:
+ sif_pqp_fini(sdev);
+pqp_failed:
+ /* Release indices for qp 0 and 1 */
+ for (i = 3; i >= 0; i--)
+ sif_free_qp_idx(pd, i);
+ dealloc_pd(pd);
+
+pd_alloc_failed:
+ sif_deinit_pd(sdev);
+pd_init_failed:
+ sif_hw_kernel_cb_fini(sdev);
+cb_alloc_failed:
+ sif_base_deinit(sdev);
+base_failed:
+ sif_chip_deinit(sdev);
+chip_init_failed:
+ return ret;
+}
+
+void sif_hw_deinit(struct sif_dev *sdev)
+{
+ int i;
+
+ if (!sdev->limited_mode) {
+ sif_log(sdev, SIF_PQP, "enter");
+ sif_ki_spqp_fini(sdev);
+ sif_pqp_fini(sdev);
+
+ /* Release indices for qp 0 and 1 */
+ for (i = 3; i >= 0; i--)
+ sif_free_qp_idx(sdev->pd, i);
+ }
+
+ dealloc_pd(sdev->pd);
+ sif_deinit_pd(sdev);
+ sif_hw_kernel_cb_fini(sdev);
+ sif_base_deinit(sdev);
+ sif_chip_deinit(sdev);
+}
+
+
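+/* Trigger a PCIe link retrain by setting the Retrain Link bit in the
+ * Link Control register of the upstream (parent) port:
+ */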
+int force_pcie_link_retrain(struct sif_dev *sdev)
+{
+ int err, parent_pcie_cap;
+ u16 parent_lnkctl;
+
+ parent_pcie_cap = pci_find_capability(sdev->pdev->bus->self, PCI_CAP_ID_EXP);
+ err = pci_read_config_word(sdev->pdev->bus->self, parent_pcie_cap + PCI_EXP_LNKCTL,
+ &parent_lnkctl);
+ if (err)
+ return err;
+ parent_lnkctl |= PCI_EXP_LNKCTL_RL;
+ err = pci_write_config_word(sdev->pdev->bus->self, parent_pcie_cap + PCI_EXP_LNKCTL,
+ parent_lnkctl);
+ return err;
+}
+
+
+static int sif_chip_init(struct sif_dev *sdev)
+{
+ u16 devid;
+
+ /* Chip version specific config */
+ devid = sdev->pdev->device;
+ switch (devid) {
+ case PCI_DEVICE_ID_PSIF_VF:
+ sdev->is_vf = true;
+ sdev->num_vfs = 0;
+ sdev->mbox_epsc = MBOX_EPSC;
+ sdev->eps_cnt = MBOX_EPSC + 1;
+ break;
+
+ case PCI_DEVICE_ID_PSIF_PF:
+ sdev->is_vf = false;
+ sdev->mbox_epsc = MBOX_EPSC;
+ sdev->eps_cnt = MBOX_EPSC + 1;
+ break;
+
+ case PCI_DEVICE_ID_SN1_VF:
+ sdev->is_vf = true;
+ sdev->num_vfs = 0;
+ sdev->mbox_epsc = SIBS_MBOX_EPSC;
+ sdev->eps_cnt = SIBS_MBOX_EPSC + 1;
+ break;
+
+ case PCI_DEVICE_ID_SN1_PF:
+ sdev->is_vf = false;
+ sdev->mbox_epsc = SIBS_MBOX_EPSC;
+ sdev->eps_cnt = SIBS_MBOX_EPSC + 1;
+ break;
+
+ default:
+ sif_log(sdev, SIF_INFO, "Unknown device id %x", devid);
+ return -ENODEV;
+ }
+
+ if (!sif_vf_en && sdev->is_vf) {
+ sif_log(sdev, SIF_INFO, "Parameter vf_en=0: VF driver load disabled");
+ return -EINVAL;
+ }
+
+
+ sdev->es = kcalloc(sdev->eps_cnt, sizeof(struct sif_eps), GFP_KERNEL);
+ if (!sdev->es)
+ return -ENOMEM;
+
+ return 0;
+}
+
+
+static void sif_chip_deinit(struct sif_dev *sdev)
+{
+ kfree(sdev->es);
+ sdev->es = NULL;
+}
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_hwi.h: Hardware init for SIF
+ */
+
+#ifndef _SIF_HWI_H
+#define _SIF_HWI_H
+#include <rdma/ib_verbs.h>
+#include "sif_cq.h"
+#include "sif_r3.h"
+
+struct sif_dev;
+struct sif_pqp;
+struct sif_qp;
+struct sif_compl;
+struct sif_cqe;
+struct psif_wr;
+struct psif_cq_entry;
+enum psif_wr_type;
+
+/* Main calls for hardware specific initialization/deinitialization */
+
+int force_pcie_link_retrain(struct sif_dev *sdev);
+int sif_hw_init(struct sif_dev *sdev);
+void sif_hw_deinit(struct sif_dev *sdev);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_ibcq.h: External interface to IB completion queue logic for SIF
+ */
+
+#ifndef __SIF_IBCQ_H
+#define __SIF_IBCQ_H
+
+struct ib_cq *sif_create_cq(struct ib_device *ibdev, int cqe,
+ int comp_vector, struct ib_ucontext *context,
+ struct ib_udata *udata,
+ enum sif_proxy_type proxy);
+
+int sif_modify_cq(struct ib_cq *ibcq, u16 cq_count, u16 cq_period);
+int sif_destroy_cq(struct ib_cq *ibcq);
+int sif_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
+int sif_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int sif_peek_cq(struct ib_cq *ibcq, int wc_cnt);
+
+int sif_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
+int sif_req_ncomp_notif(struct ib_cq *ibcq, int wc_cnt);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_ibpd.h: External interface to (IB) protection domains for SIF
+ */
+
+#ifndef __SIF_IBPD_H
+#define __SIF_IBPD_H
+
+struct ib_pd *sif_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *ibucontext,
+ struct ib_udata *udata);
+
+int sif_dealloc_pd(struct ib_pd *ibpd);
+
+struct ib_shpd *sif_alloc_shpd(struct ib_device *ibdev,
+ struct ib_pd *ibpd,
+ struct ib_udata *udata);
+
+struct ib_pd *sif_share_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata,
+ struct ib_shpd *shpd);
+
+int sif_remove_shpd(struct ib_device *ibdev,
+ struct ib_shpd *shpd,
+ int atinit);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_ibqp.h: External interface to IB queue pair logic for sif
+ */
+
+#ifndef __SIF_IBQP_H
+#define __SIF_IBQP_H
+
+struct ib_qp *sif_create_qp(struct ib_pd *ibpd,
+ struct ib_qp_init_attr *qp_init_attr,
+ struct ib_udata *udata);
+int sif_modify_qp(struct ib_qp *ibqp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_udata *udata);
+
+int sif_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
+int sif_destroy_qp(struct ib_qp *ibqp);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_idr.c: Synchronized ID ref allocation
+ */
+
+#include "sif_idr.h"
+
+int sif_idr_init(struct sif_idr *sidr, int id_min, int id_max)
+{
+ int ret = 0;
+ idr_init(&sidr->idr);
+ mutex_init(&sidr->lock);
+ sidr->id_min = id_min;
+ sidr->id_max = id_max;
+ return ret;
+}
+
+
+void sif_idr_deinit(struct sif_idr *sidr)
+{
+ idr_destroy(&sidr->idr);
+}
+
+
+int sif_idr_alloc(struct sif_idr *sidr, void *ref, gfp_t gfp_mask)
+{
+ int index;
+
+ mutex_lock(&sidr->lock);
+ index = idr_alloc(&sidr->idr, ref, sidr->id_min, sidr->id_max, gfp_mask);
+ mutex_unlock(&sidr->lock);
+ return index;
+}
+
+void sif_idr_remove(struct sif_idr *sidr, int index)
+{
+ mutex_lock(&sidr->lock);
+ idr_remove(&sidr->idr, index);
+ mutex_unlock(&sidr->lock);
+}
--- /dev/null
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_idr.h: simple id allocation and deallocation for SIF
+ */
+
+#ifndef _SIF_IDR_H
+#define _SIF_IDR_H
+#include <linux/version.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+
+/* Synchronized ID ref allocation */
+
+struct sif_idr {
+ struct idr idr;
+ struct mutex lock;
+ int id_min;
+ int id_max;
+};
+
+int sif_idr_init(struct sif_idr *sidr, int id_min, int id_max);
+void sif_idr_deinit(struct sif_idr *sidr);
+
+int sif_idr_alloc(struct sif_idr *sidr, void *ref, gfp_t gfp_mask);
+void sif_idr_remove(struct sif_idr *sidr, int index);
+
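+/* Illustrative usage sketch (hypothetical caller, not part of the driver):
+ *
+ *	struct sif_idr table;
+ *	int id;
+ *
+ *	sif_idr_init(&table, 1, 128);
+ *	id = sif_idr_alloc(&table, obj, GFP_KERNEL);
+ *	if (id >= 0)
+ *		sif_idr_remove(&table, id);
+ *	sif_idr_deinit(&table);
+ */
+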
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_int_user.h: This file defines special internal data structures used
+ * to communicate between libsif and the sif driver.
+ * This file is included both from user space and kernel space so
+ * it must not contain any kernel/user specific header file includes.
+ * This file is internal to libsif/sif driver since it relies on HW specific
+ * include files.
+ */
+
+#ifndef _SIF_INT_USER_H
+#define _SIF_INT_USER_H
+
+
+#include "psif_hw_data.h"
+
+/* Do this the brute force way, since structs are used in user-space */
+#if defined(__x86_64__) || defined(__sparc__)
+#define SIF_CACHE_BYTES 64
+#else
+#define SIF_CACHE_BYTES 64
+#endif
+
+/* We use the extension here to communicate with the driver
+ * (for correct debugfs reporting)
+ */
+
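+/* Common layout note: each *_sw struct below places the hardware visible
+ * descriptor first and pads it to SIF_CACHE_BYTES so that the software
+ * only fields that follow never share a cache line with the descriptor.
+ */
+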
+/* sif_sq_sw flags definition
+ */
+enum sq_sw_state {
+ FLUSH_SQ_IN_PROGRESS = 0,
+ FLUSH_SQ_IN_FLIGHT = 1,
+};
+
+struct sif_sq_sw {
+ struct psif_sq_sw d; /* Hardware visible descriptor */
+ __u8 fill[SIF_CACHE_BYTES - sizeof(struct psif_sq_sw)]; /* separate the cache lines */
+ __u16 last_seq; /* Last used sq seq.num (req. sq->lock) */
+ __u16 head_seq; /* Last sq seq.number seen in a compl (req. cq->lock) */
+ __u16 trusted_seq; /* Last next_seq that was either generated or exists in the cq */
+ __u8 tsl; /* Valid after transition to RTR */
+ unsigned long flags; /* Flags, using unsigned long due to test_set/test_and_set_bit */
+};
+
+/* sif_rq_sw flags definition
+ */
+enum rq_sw_state {
+ FLUSH_RQ_IN_PROGRESS = 0,
+ FLUSH_RQ_IN_FLIGHT = 1,
+ FLUSH_RQ_FIRST_TIME = 2,
+ RQ_IS_INVALIDATED = 3,
+};
+
+struct sif_rq_sw {
+ struct psif_rq_sw d; /* Hardware visible descriptor */
+ __u8 fill[SIF_CACHE_BYTES - sizeof(struct psif_rq_sw)]; /* separate the cache lines */
+ atomic_t length; /* current length of queue as #posted - #completed */
+ __u32 next_seq; /* First unused sequence number */
+ unsigned long flags; /* Flags, using unsigned long due to test_set/test_and_set_bit */
+};
+
+enum cq_sw_state {
+ CQ_POLLING_NOT_ALLOWED = 0,
+ CQ_POLLING_IGNORED_SEQ = 1,
+ FLUSH_SQ_FIRST_TIME = 2,
+};
+
+struct sif_cq_sw {
+ struct psif_cq_sw d; /* Hardware visible descriptor */
+ __u8 fill[SIF_CACHE_BYTES - sizeof(struct psif_cq_sw)]; /* separate the cache lines */
+ __u32 next_seq; /* First unused sequence number */
+ __u32 cached_head; /* Local copy kept in sync w/hw visible head_indx */
+ __u32 last_hw_seq; /* Last next_seq reported in completion for req_notify_cq */
+ __u32 armed; /* Set if req_notify_cq has been called but event not processed */
+ __u32 miss_cnt; /* Number of in-flight completions observed by poll_cq */
+ __u32 miss_occ; /* Number of times one or more in-flight completions were seen */
+ unsigned long flags; /* Flags, using unsigned long due to test_set/test_and_set_bit */
+};
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_ireg.c: Utilities and entry points needed for Infiniband registration
+ */
+
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include "sif_dev.h"
+#include "sif_ireg.h"
+#include "sif_user.h"
+#include "sif_dma.h"
+#include "sif_ibpd.h"
+#include "sif_ibcq.h"
+#include "sif_ibqp.h"
+#include "sif_mr.h"
+#include "sif_mw.h"
+#include "sif_fmr.h"
+#include "sif_ah.h"
+#include "sif_srq.h"
+#include "sif_xrc.h"
+#include "sif_sndrcv.h"
+#include "sif_hwi.h"
+#include "sif_query.h"
+#include "sif_pd.h"
+#include "sif_base.h"
+#include "version.h"
+
+
+static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct sif_dev *sdev = dev_get_drvdata(dev);
+
+ return sprintf(buf, "%x\n", PSIF_REVISION(sdev));
+}
+
+static ssize_t show_fw_ver(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct sif_dev *sdev = dev_get_drvdata(device);
+ struct sif_eps *es = &sdev->es[sdev->mbox_epsc];
+
+ return sprintf(buf, "%hu.%hu.0\n", es->ver.fw_major, es->ver.fw_minor);
+}
+
+static ssize_t show_eps_api_ver(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct sif_dev *sdev = dev_get_drvdata(device);
+ struct sif_eps *es = &sdev->es[sdev->mbox_epsc];
+
+ return sprintf(buf, "%hu.%hu\n", es->ver.epsc_major, es->ver.epsc_minor);
+}
+
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct sif_dev *sdev = dev_get_drvdata(device);
+
+ return sprintf(buf, "ORCL%d\n", PSIF_DEVICE(sdev));
+}
+
+static ssize_t show_board(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct sif_dev *sdev = dev_get_drvdata(device);
+ const char *prod_str = get_product_str(sdev);
+ /*
+ * Paranoia level: avoid dumping the whole kernel to
+ * user-space if the zero termination character in the product
+ * string has been compromised
+ */
+ const int n = min_t(int, 64, (int)strlen(prod_str));
+
+ return sprintf(buf, "%.*s\n", n, prod_str);
+}
+
+static ssize_t show_stats(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct sif_dev *sdev = dev_get_drvdata(device);
+ /* TBD: device specific counters, stats registers */
+ sif_log(sdev, SIF_VERBS, "Not implemented");
+ return -EOPNOTSUPP;
+}
+
+
+/* PSIF specific extensions */
+
+/* Version information details (git revision of driver and firmware etc) */
+static ssize_t show_versioninfo(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct sif_dev *sdev = dev_get_drvdata(device);
+ char **fwv = sdev->es[sdev->mbox_epsc].ver.fw_version;
+
+ return snprintf(buf, PAGE_SIZE, "%s - build user %s at %s\n"
+ "sifdrv git tag:\n%s\n%s\n"
+ "EPSC firmware: build user %s at %s\nimage revision string %s\n"
+ "version tag:\n%s\n%s",
+ sif_version.git_repo,
+ sif_version.build_user, sif_version.build_git_time,
+ sif_version.last_commit,
+ (sif_version.git_status[0] != '\0' ? sif_version.git_psifapi_status : ""),
+ fwv[FWV_EPS_BUILD_USER], fwv[FWV_EPS_BUILD_GIT_TIME],
+ fwv[FWV_EPS_REV_STRING], fwv[FWV_EPS_GIT_LAST_COMMIT],
+ (fwv[FWV_EPS_GIT_STATUS][0] != '\0' ? fwv[FWV_EPS_GIT_STATUS] : ""));
+}
+
+
+static ssize_t show_resp_ms(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct sif_dev *sdev = dev_get_drvdata(device);
+
+ return sprintf(buf, "%d\n", jiffies_to_msecs(sdev->min_resp_ticks));
+}
+
+
+static ssize_t set_resp_ms(struct device *device,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct sif_dev *sdev = dev_get_drvdata(device);
+ size_t old_val = jiffies_to_msecs(sdev->min_resp_ticks);
+ size_t new_val;
+ int ret = kstrtoul(buf, 0, &new_val);
+
+ if (ret || !new_val)
+ new_val = 1;
+ sif_log(sdev, SIF_INFO, "%ld ms -> %ld ms", old_val, new_val);
+ sdev->min_resp_ticks = msecs_to_jiffies(new_val);
+ return strlen(buf);
+}
+
+static ssize_t show_irq_moderation(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct sif_dev *sdev = dev_get_drvdata(device);
+
+ return sprintf(buf, "%hu\n", sdev->es[sdev->mbox_epsc].eqs.irq_moderation);
+}
+
+static ssize_t set_irq_moderation(struct device *device,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct sif_dev *sdev = dev_get_drvdata(device);
+ struct sif_eps *es = &sdev->es[sdev->mbox_epsc];
+ u16 old_val = es->eqs.irq_moderation;
+ u16 new_val;
+
+ int ret = kstrtou16(buf, 0, &new_val);
+ struct psif_epsc_csr_req req; /* local epsc wr copy */
+ struct psif_epsc_csr_rsp resp;
+
+ if (ret || !new_val)
+ new_val = 0;
+
+ if (eps_version_ge(es, 0, 36)) {
+ memset(&req, 0, sizeof(req));
+ req.opcode = EPSC_HOST_INT_COMMON_CTRL;
+ req.uf = 0;
+ req.u.int_common.total_usec = (uintptr_t)new_val;
+ ret = sif_epsc_wr_poll(sdev, &req, &resp);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "Failed to configure device interrupt total moderation\n");
+ return ret;
+ }
+ es->eqs.irq_moderation = new_val;
+ sif_log(sdev, SIF_INFO, "Interrupt total moderation: %d usecs -> %d usecs",
+ old_val, new_val);
+ return strlen(buf);
+ } else
+ return -EOPNOTSUPP;
+}
+
+static ssize_t show_mt_override(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct sif_dev *sdev = dev_get_drvdata(device);
+
+ switch (sdev->mt_override) {
+ case SIFMT_BYPASS:
+ sprintf(buf, "bypass\n");
+ break;
+ case SIFMT_UMEM:
+ sprintf(buf, "umem (no override)\n");
+ break;
+ case SIFMT_UMEM_SPT:
+ sprintf(buf, "spt\n");
+ break;
+ case SIFMT_ZERO:
+ sprintf(buf, "zero\n");
+ break;
+ default:
+ /* Sanity check for debugging the driver only */
+ sprintf(buf, "***undefined***\n");
+ break;
+ }
+ return strlen(buf);
+}
+
+
+static ssize_t set_mt_override(struct device *device,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct sif_dev *sdev = dev_get_drvdata(device);
+
+ if (strcmp(buf, "bypass\n") == 0)
+ sdev->mt_override = SIFMT_BYPASS;
+ else if (strcmp(buf, "umem\n") == 0 || strcmp(buf, "none\n") == 0)
+ sdev->mt_override = SIFMT_UMEM;
+ else if (strcmp(buf, "spt\n") == 0)
+ sdev->mt_override = SIFMT_UMEM_SPT;
+ else if (strcmp(buf, "zero\n") == 0)
+ sdev->mt_override = SIFMT_ZERO;
+ else
+ return -EINVAL;
+ return strlen(buf);
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(eps_api_ver, S_IRUGO, show_eps_api_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+static DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL);
+static DEVICE_ATTR(versioninfo, S_IRUGO, show_versioninfo, NULL);
+static DEVICE_ATTR(min_resp_ms, S_IWUSR | S_IRUGO, show_resp_ms, set_resp_ms);
+static DEVICE_ATTR(mt_override, S_IWUSR | S_IRUGO, show_mt_override, set_mt_override);
+static DEVICE_ATTR(irq_moderation, S_IWUSR | S_IRUGO, show_irq_moderation, set_irq_moderation);
+
+static struct device_attribute *sif_class_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_fw_ver,
+ &dev_attr_eps_api_ver,
+ &dev_attr_hca_type,
+ &dev_attr_board_id,
+ &dev_attr_stats,
+ &dev_attr_versioninfo,
+ &dev_attr_min_resp_ms,
+ &dev_attr_mt_override,
+ &dev_attr_irq_moderation,
+};
+
+static u64 dev_show(const struct device *device,
+ struct device_attribute *attr,
+ char *buf,
+ int opcode)
+{
+ struct sif_dev *sdev = dev_get_drvdata(device);
+ struct psif_epsc_csr_req req;
+ struct psif_epsc_csr_rsp rsp;
+
+ /* Make sure the strlen() at the end is well defined even if no query is made */
+ buf[0] = '\0';
+
+ /* EPSC supports the new requests starting from v.0.43 */
+ if (eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 43)) {
+ int ret = 0;
+
+ memset(&req, 0, sizeof(req));
+ req.opcode = EPSC_QUERY;
+ req.u.query.data.op = opcode;
+ ret = sif_epsc_wr(sdev, &req, &rsp);
+ if (ret)
+ sif_log(sdev, SIF_INFO, "Failed to query tsu error counter\n");
+ else
+ sprintf(buf, "%llu\n", rsp.data);
+ }
+ return strlen(buf);
+}
+
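+/* Generate a sysfs show function for the EPSC_QUERY_<field> diag counter */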
+#define DEVICE_SHOW(field) \
+static ssize_t show_##field(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+{ \
+ return dev_show(dev, attr, buf, EPSC_QUERY_##field); \
+}
+
+DEVICE_SHOW(SQ_NUM_BRE);
+DEVICE_SHOW(NUM_CQOVF);
+DEVICE_SHOW(SQ_NUM_WRFE);
+DEVICE_SHOW(RQ_NUM_WRFE);
+DEVICE_SHOW(RQ_NUM_LAE);
+DEVICE_SHOW(RQ_NUM_LPE);
+DEVICE_SHOW(SQ_NUM_LLE);
+DEVICE_SHOW(RQ_NUM_LLE);
+DEVICE_SHOW(SQ_NUM_LQPOE);
+DEVICE_SHOW(RQ_NUM_LQPOE);
+DEVICE_SHOW(SQ_NUM_OOS);
+DEVICE_SHOW(RQ_NUM_OOS);
+DEVICE_SHOW(SQ_NUM_RREE);
+DEVICE_SHOW(SQ_NUM_TREE);
+DEVICE_SHOW(SQ_NUM_ROE);
+DEVICE_SHOW(RQ_NUM_ROE);
+DEVICE_SHOW(SQ_NUM_RAE);
+DEVICE_SHOW(RQ_NUM_RAE);
+DEVICE_SHOW(RQ_NUM_UDSDPRD);
+DEVICE_SHOW(RQ_NUM_UCSDPRD);
+DEVICE_SHOW(SQ_NUM_RIRE);
+DEVICE_SHOW(RQ_NUM_RIRE);
+DEVICE_SHOW(SQ_NUM_RNR);
+DEVICE_SHOW(RQ_NUM_RNR);
+
+static ssize_t clear_diag(struct device *device,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+
+ struct sif_dev *sdev = dev_get_drvdata(device);
+ int ret;
+ struct psif_epsc_csr_req req;
+ struct psif_epsc_csr_rsp resp;
+
+ if (strcmp(buf, "1\n") == 0) {
+
+ memset(&req, 0, sizeof(req));
+ memset(&resp, 0, sizeof(resp));
+
+ req.opcode = EPSC_SET;
+ req.u.set.data.op = EPSC_QUERY_RESET_CBLD_DIAG_COUNTERS;
+ req.u.set.data.value = 0xffffff;
+ ret = sif_epsc_wr_poll(sdev, &req, &resp);
+ if (ret)
+ sif_log(sdev, SIF_INFO, "Failed to clear psif diag counters\n");
+ } else
+ return -EINVAL;
+
+ return strlen(buf);
+}
+
+static DEVICE_ATTR(clear_diag, S_IWUSR, NULL, clear_diag);
+static DEVICE_ATTR(sq_num_bre, S_IRUGO, show_SQ_NUM_BRE, NULL);
+static DEVICE_ATTR(num_cqovf, S_IRUGO, show_NUM_CQOVF, NULL);
+static DEVICE_ATTR(sq_num_wrfe, S_IRUGO, show_SQ_NUM_WRFE, NULL);
+static DEVICE_ATTR(rq_num_wrfe, S_IRUGO, show_RQ_NUM_WRFE, NULL);
+static DEVICE_ATTR(rq_num_lae, S_IRUGO, show_RQ_NUM_LAE, NULL);
+static DEVICE_ATTR(rq_num_lpe, S_IRUGO, show_RQ_NUM_LPE, NULL);
+static DEVICE_ATTR(sq_num_lle, S_IRUGO, show_SQ_NUM_LLE, NULL);
+static DEVICE_ATTR(rq_num_lle, S_IRUGO, show_RQ_NUM_LLE, NULL);
+static DEVICE_ATTR(sq_num_lqpoe, S_IRUGO, show_SQ_NUM_LQPOE, NULL);
+static DEVICE_ATTR(rq_num_lqpoe, S_IRUGO, show_RQ_NUM_LQPOE, NULL);
+static DEVICE_ATTR(sq_num_oos, S_IRUGO, show_SQ_NUM_OOS, NULL);
+static DEVICE_ATTR(rq_num_oos, S_IRUGO, show_RQ_NUM_OOS, NULL);
+static DEVICE_ATTR(sq_num_rree, S_IRUGO, show_SQ_NUM_RREE, NULL);
+static DEVICE_ATTR(sq_num_tree, S_IRUGO, show_SQ_NUM_TREE, NULL);
+static DEVICE_ATTR(sq_num_roe, S_IRUGO, show_SQ_NUM_ROE, NULL);
+static DEVICE_ATTR(rq_num_roe, S_IRUGO, show_RQ_NUM_ROE, NULL);
+static DEVICE_ATTR(sq_num_rae, S_IRUGO, show_SQ_NUM_RAE, NULL);
+static DEVICE_ATTR(rq_num_rae, S_IRUGO, show_RQ_NUM_RAE, NULL);
+static DEVICE_ATTR(rq_num_udsdprd, S_IRUGO, show_RQ_NUM_UDSDPRD, NULL);
+static DEVICE_ATTR(rq_num_ucsdprd, S_IRUGO, show_RQ_NUM_UCSDPRD, NULL);
+static DEVICE_ATTR(sq_num_rire, S_IRUGO, show_SQ_NUM_RIRE, NULL);
+static DEVICE_ATTR(rq_num_rire, S_IRUGO, show_RQ_NUM_RIRE, NULL);
+static DEVICE_ATTR(sq_num_rnr, S_IRUGO, show_SQ_NUM_RNR, NULL);
+static DEVICE_ATTR(rq_num_rnr, S_IRUGO, show_RQ_NUM_RNR, NULL);
+
+static struct attribute *sif_diag_counters_class_attributes[] = {
+ &dev_attr_clear_diag.attr,
+ &dev_attr_sq_num_bre.attr,
+ &dev_attr_num_cqovf.attr,
+ &dev_attr_sq_num_wrfe.attr,
+ &dev_attr_rq_num_wrfe.attr,
+ &dev_attr_rq_num_lae.attr,
+ &dev_attr_rq_num_lpe.attr,
+ &dev_attr_sq_num_lle.attr,
+ &dev_attr_rq_num_lle.attr,
+ &dev_attr_sq_num_lqpoe.attr,
+ &dev_attr_rq_num_lqpoe.attr,
+ &dev_attr_sq_num_oos.attr,
+ &dev_attr_rq_num_oos.attr,
+ &dev_attr_sq_num_rree.attr,
+ &dev_attr_sq_num_tree.attr,
+ &dev_attr_sq_num_roe.attr,
+ &dev_attr_rq_num_roe.attr,
+ &dev_attr_sq_num_rae.attr,
+ &dev_attr_rq_num_rae.attr,
+ &dev_attr_rq_num_udsdprd.attr,
+ &dev_attr_rq_num_ucsdprd.attr,
+ &dev_attr_sq_num_rire.attr,
+ &dev_attr_rq_num_rire.attr,
+ &dev_attr_sq_num_rnr.attr,
+ &dev_attr_rq_num_rnr.attr,
+ NULL,
+};
+
+static struct attribute_group diag_counters_attr_group = {
+ .attrs = sif_diag_counters_class_attributes,
+ .name = "diag_counters",
+};
+
+static struct ib_ucontext *sif_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ int ret;
+ struct sif_dev *sdev = to_sdev(ibdev);
+ struct sif_ucontext *s_uc;
+
+ s_uc = kzalloc(sizeof(*s_uc), GFP_KERNEL);
+ if (!s_uc)
+ return ERR_PTR(-ENOMEM);
+
+ s_uc->pd = alloc_pd(sdev);
+ if (!s_uc->pd) {
+ ret = -ENOMEM;
+ goto alloc_pd_failed;
+ }
+ s_uc->pd->ibpd.device = ibdev;
+
+ s_uc->cb = alloc_cb(sdev, false);
+ if (!s_uc->cb) {
+ ret = -ENOMEM;
+ goto alloc_cb_failed;
+ }
+
+ if (udata) {
+ struct sif_get_context_ext cmd;
+ struct sif_get_context_resp_ext resp;
+ u16 major_ver, minor_ver;
+
+ memset(&cmd, 0, sizeof(cmd));
+ ib_copy_from_udata(&cmd, udata, sizeof(cmd));
+
+ s_uc->abi_version = cmd.abi_version;
+ major_ver = s_uc->abi_version >> 8;
+ minor_ver = s_uc->abi_version & 0xff;
+ if (major_ver != SIF_UVERBS_ABI_MAJOR_VERSION) {
+ if (major_ver < 10 && major_ver > 0) {
+ sif_log(sdev, SIF_INFO,
+ "User verbs abi version mismatch - driver has v.%d.%d - libsif has v.%d.%d",
+ SIF_UVERBS_ABI_MAJOR_VERSION, SIF_UVERBS_ABI_MINOR_VERSION,
+ major_ver, minor_ver);
+ ret = -EINVAL;
+ goto udata_copy_failed;
+ } else {
+ static bool printed;
+ /* TBD: remove - bw comp - in this case probably not set */
+ /* Set to final version that does not report to us */
+ if (!printed) {
+ sif_log(sdev, SIF_INFO,
+ "Invalid version info - upgrade libsif!");
+ printed = true;
+ }
+ s_uc->abi_version = SIF_UVERBS_VERSION(3, 1);
+ }
+ }
+ memset(&resp, 0, sizeof(resp));
+ resp.sq_sw_ext_sz = sdev->ba[sq_sw].ext_sz;
+ resp.sq_hw_ext_sz = sdev->ba[sq_hw].ext_sz;
+ resp.rq_ext_sz = sdev->ba[rq_sw].ext_sz;
+ resp.cq_ext_sz = sdev->ba[cq_sw].ext_sz;
+ resp.sq_entry_per_block = sdev->ba[sq_sw].entry_per_block;
+ resp.rq_entry_per_block = sdev->ba[rq_sw].entry_per_block;
+ resp.cq_entry_per_block = sdev->ba[cq_sw].entry_per_block;
+ ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
+ if (ret)
+ goto udata_copy_failed;
+ }
+
+ sif_log(sdev, SIF_VERBS_V, " at %p with pd %d used for CQs libsif abi v.%d.%d",
+ s_uc, s_uc->pd->idx, s_uc->abi_version >> 8, s_uc->abi_version & 0xff);
+ return &s_uc->ib_uc;
+
+udata_copy_failed:
+ release_cb(sdev, s_uc->cb);
+alloc_cb_failed:
+ dealloc_pd(s_uc->pd);
+alloc_pd_failed:
+ kfree(s_uc);
+ return ERR_PTR(ret);
+}
+
+static int sif_dealloc_ucontext(struct ib_ucontext *ib_uc)
+{
+ int ret;
+ u32 pd_idx = 0;
+ struct sif_dev *sdev = to_sdev(ib_uc->device);
+ struct sif_ucontext *s_uc =
+ container_of(ib_uc, struct sif_ucontext, ib_uc);
+
+ sif_logs(SIF_VERBS_V, pd_idx = s_uc->pd->idx);
+
+ ret = dealloc_pd(s_uc->pd);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "Failed (status %d) to deallocate pd %d", ret, s_uc->pd->idx);
+ return ret;
+ }
+
+ release_cb(sdev, s_uc->cb);
+ kfree(s_uc);
+ sif_log(sdev, SIF_VERBS_V, "at %p done (cq pd index %d)", s_uc, pd_idx);
+ return 0;
+}
+
+
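+/* Map a whole block of a descriptor table into user space, after checking
+ * that the block is allocated, user owned and belongs to the calling
+ * context (the context's own CQ pd is also accepted):
+ */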
+static int sif_mmap_block(struct sif_ucontext *uc, struct vm_area_struct *vma,
+ enum sif_tab_type type, u32 index, int vm_flags)
+{
+ struct sif_dev *sdev = to_sdev(uc->ib_uc.device);
+ struct sif_table *tp = &sdev->ba[type];
+ struct sif_table_block *b;
+ struct sif_pd *pd;
+ u64 start, block_sz;
+ off_t len;
+ off_t offset;
+ int ret;
+
+ if (tp->entry_per_block <= 1) {
+ sif_log(sdev, SIF_INFO,
+ "Failed to map %s block index %d: direct user access not available with flat_alloc scheme",
+ sif_table_name(type), index);
+ return -EPERM;
+ }
+ if (tp->block_cnt <= index) {
+ sif_log(sdev, SIF_INFO, "Failed to map %s block index %d: out of range - block_cnt %d",
+ sif_table_name(type), index, tp->block_cnt);
+ return -EINVAL;
+ }
+
+ b = sif_get_block(tp, index);
+ pd = b->pd;
+ if (!pd) {
+ sif_log(sdev, SIF_INFO, "Failed to map %s block index %d: not allocated",
+ sif_table_name(type), index);
+ return -ENODEV;
+ }
+ if (pd == uc->pd)
+ goto pd_ok; /* CQ case */
+
+ if (!sif_is_user_pd(pd)) {
+ sif_log(sdev, SIF_INFO, "Failed to map %s block index %d, pd %d - owned by kernel space",
+ sif_table_name(type), index, pd->idx);
+ return -EACCES;
+ }
+
+ /* TBD: Security aspects of XRC domain access
+ * (in the xrc case, we don't have a user context at the moment)
+ */
+ if (pd->ibpd.uobject && pd->ibpd.uobject->context != &uc->ib_uc) {
+ sif_log(sdev, SIF_INFO, "Failed to map %s block index %d: belongs to another user context",
+ sif_table_name(type), index);
+ return -EACCES;
+ }
+pd_ok:
+ block_sz = tp->ext_sz * tp->entry_per_block;
+ len = vma->vm_end - vma->vm_start;
+ if (block_sz != len) {
+ sif_log(sdev, SIF_INFO, "Failed to map %s block index %d: Expected map len %lld, got %ld",
+ sif_table_name(type), index,
+ block_sz, len);
+ return -EINVAL;
+ }
+
+ vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+ vma->vm_flags |= vm_flags;
+ start = vma->vm_start;
+
+ offset = block_sz * index;
+
+ ret = sif_mem_vma_map_part(tp->mem, vma, offset, len);
+ if (ret)
+ return ret;
+
+ /* TBD: ehca uses a vm_operations_struct and vma->private_data to refcount,
+ * but MLX does not - is it necessary?
+ * Also remap_pfn_range requires the mm sema to be held, but other drivers don't take it
+ * - is it already held by the caller here?
+ */
+ return 0;
+}
+
+
+static int sif_mmap_cb(struct sif_ucontext *uc, struct vm_area_struct *vma, u32 index)
+{
+ struct sif_dev *sdev = to_sdev(uc->ib_uc.device);
+ struct sif_cb *cb = sif_cb_from_uc(uc, index);
+ off_t len;
+ dma_addr_t cb_start;
+ int ret;
+
+ if (!cb) {
+ sif_log(sdev, SIF_INFO, "Failed to associate cb %d with context", index);
+ return -EINVAL;
+ }
+
+ len = vma->vm_end - vma->vm_start;
+ if (len != PAGE_SIZE) {
+ sif_log(sdev, SIF_INFO, "Failed to map cb index %d: Expected map len %ld, got %ld",
+ index, PAGE_SIZE, len);
+ return -EINVAL;
+ }
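+ /* Each collect buffer occupies one page of the CBU BAR, selected by the cb index */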
+ cb_start = pci_resource_start(sdev->pdev, SIF_CBU_BAR) + index * PAGE_SIZE;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ vma->vm_flags |= VM_WRITE;
+ ret = io_remap_pfn_range(vma, vma->vm_start, cb_start >> PAGE_SHIFT,
+ PAGE_SIZE, vma->vm_page_prot);
+ if (ret)
+ sif_log(sdev, SIF_INFO, "io_remap_pfn_range failed with %d", ret);
+ return ret;
+}
+
+
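+/* Template generating the whole-queue mmap handlers sif_mmap_sq/rq/cq below */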
+#define def_map_queue(type) \
+static int sif_mmap_##type(struct sif_ucontext *uc, struct vm_area_struct *vma, u32 index)\
+{\
+ struct sif_dev *sdev = to_sdev(uc->ib_uc.device);\
+ struct sif_##type *type;\
+ u64 q_sz;\
+ off_t len;\
+ \
+ type = safe_get_sif_##type(sdev, index);\
+ if (!type) {\
+ sif_log(sdev, SIF_INFO, "Failed to map " #type \
+ " index %d out of range", index);\
+ sif_log(sdev, SIF_INFO, "%p : %p", sdev->ba[type##_hw].bitmap, sdev->ba[qp].bitmap);\
+ return -EINVAL;\
+ } \
+ \
+ q_sz = type->mem->size;\
+ len = vma->vm_end - vma->vm_start;\
+ if (q_sz < len) {\
+ sif_log(sdev, SIF_INFO, "Failed to map " #type " index %d: "\
+ "Expected map req for <= %lld bytes, got %ld", index, q_sz, len);\
+ return -EINVAL;\
+ } \
+ \
+ vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;\
+ vma->vm_flags |= VM_READ|VM_WRITE;\
+ \
+ return sif_mem_vma_map_part(type->mem, vma, 0, len);\
+}
+
+def_map_queue(sq)
+def_map_queue(rq)
+def_map_queue(cq)
+
+static int sif_mmap(struct ib_ucontext *ib_uc, struct vm_area_struct *vma)
+{
+ enum sif_mmap_cmd cmd;
+ u32 index;
+ struct sif_dev *sdev = to_sdev(ib_uc->device);
+ struct sif_ucontext *s_uc = to_sctx(ib_uc);
+
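+ /* The map command and the target object index are encoded in the mmap offset by user space */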
+ mmap_get_cmd(vma->vm_pgoff << PAGE_SHIFT, &cmd, &index);
+
+ sif_log(sdev, SIF_MMAP,
+ "pg offset 0x%lx start 0x%lx, end 0x%lx len 0x%lx, flags 0x%lx index %d",
+ vma->vm_pgoff, vma->vm_start, vma->vm_end, vma->vm_end - vma->vm_start,
+ vma->vm_flags, index);
+
+ switch (cmd) {
+ case SIF_MAP_SQ_SW:
+ return sif_mmap_block(s_uc, vma, sq_sw, index, VM_READ|VM_WRITE);
+ case SIF_MAP_RQ_SW:
+ return sif_mmap_block(s_uc, vma, rq_sw, index, VM_READ|VM_WRITE);
+ case SIF_MAP_CQ_SW:
+ return sif_mmap_block(s_uc, vma, cq_sw, index, VM_READ|VM_WRITE);
+ case SIF_MAP_SQ_HW:
+ return sif_mmap_block(s_uc, vma, sq_hw, index, VM_READ);
+ case SIF_MAP_RQ_HW:
+ return sif_mmap_block(s_uc, vma, rq_hw, index, VM_READ);
+ case SIF_MAP_CQ_HW:
+ return sif_mmap_block(s_uc, vma, cq_hw, index, VM_READ);
+ case SIF_MAP_CB:
+ return sif_mmap_cb(s_uc, vma, index);
+ case SIF_MAP_SQ:
+ return sif_mmap_sq(s_uc, vma, index);
+ case SIF_MAP_RQ:
+ return sif_mmap_rq(s_uc, vma, index);
+ case SIF_MAP_CQ:
+ return sif_mmap_cq(s_uc, vma, index);
+ default:
+ break;
+ }
+ sif_log(sdev, SIF_MMAP, "cmd %d not implemented", cmd);
+ return -EOPNOTSUPP;
+}
+
+static int sif_get_protocol_stats(struct ib_device *ibdev,
+ union rdma_protocol_stats *stats)
+{
+ struct sif_dev *sdev = to_sdev(ibdev);
+
+ sif_log(sdev, SIF_VERBS, "Not implemented");
+ return -EOPNOTSUPP;
+}
+
+
+static enum rdma_link_layer sif_get_link_layer(struct ib_device *ibdev, u8 port_num)
+{
+ struct sif_dev *sdev = to_sdev(ibdev);
+
+ sif_log(sdev, SIF_VERBS, "returns IB_LINK_LAYER_INFINIBAND for port %d", port_num);
+ return IB_LINK_LAYER_INFINIBAND;
+}
+
+static int sif_port_callback(struct ib_device *ibdev, u8 portno, struct kobject *obj)
+{
+ struct sif_dev *sdev = to_sdev(ibdev);
+
+ sif_log(sdev, SIF_VERBS, "port %d", portno);
+ return 0;
+}
+
+static inline struct ib_cq *sif_ib_create_cq(struct ib_device *ibdev, int cqe,
+ int comp_vector, struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ return sif_create_cq(ibdev, cqe, comp_vector, context, udata, SIFPX_OFF);
+}
+
+/* putting this function here to avoid sif_epsc.h from being rdma/ib_verbs.h dependent */
+static int sif_eps_wr_ex(struct ib_device *ibdev, enum psif_mbox_type eps_num,
+ struct psif_epsc_csr_req *req, struct psif_epsc_csr_rsp *cqe)
+{
+ struct sif_dev *sdev = to_sdev(ibdev);
+
+ return sif_eps_wr(sdev, eps_num, req, cqe);
+
+}
+
+int sif_register_ib_device(struct sif_dev *sdev)
+{
+ int ret = 0;
+ int i;
+ struct ib_device *dev = &sdev->ib_dev;
+ struct psif_epsc_device_attr epsdev;
+
+ /* We need to do a query_device to get the node_guid */
+ ret = epsc_query_device(sdev, &epsdev);
+ if (ret)
+ return ret;
+
+ strlcpy(dev->name, "sif%d", IB_DEVICE_NAME_MAX);
+
+ dev->owner = THIS_MODULE;
+ dev->uverbs_abi_ver = SIF_UVERBS_ABI_VERSION;
+
+ /* SIF supported user verbs */
+ dev->uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_AH) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_AH) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_AH) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_AH) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_REG_SMR) |
+ (1ull << IB_USER_VERBS_CMD_REREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_MW) |
+ (1ull << IB_USER_VERBS_CMD_BIND_MW) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_MW) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+ (1ull << IB_USER_VERBS_CMD_PEEK_CQ) |
+ (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+ (1ull << IB_USER_VERBS_CMD_POST_RECV) |
+ (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV) |
+ (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
+ (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) |
+ (1ull << IB_USER_VERBS_CMD_OPEN_QP) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_SHPD) |
+ (1ull << IB_USER_VERBS_CMD_SHARE_PD);
+
+ dev->get_protocol_stats = sif_get_protocol_stats;
+
+ dev->query_device = sif_query_device;
+ dev->modify_device = sif_modify_device;
+
+ dev->query_port = sif_query_port;
+ dev->modify_port = sif_modify_port;
+
+ dev->get_link_layer = sif_get_link_layer;
+ dev->query_gid = sif_query_gid;
+ dev->query_pkey = sif_query_pkey;
+
+ dev->alloc_ucontext = sif_alloc_ucontext;
+ dev->dealloc_ucontext = sif_dealloc_ucontext;
+ dev->mmap = sif_mmap;
+
+ dev->alloc_pd = sif_alloc_pd;
+ dev->dealloc_pd = sif_dealloc_pd;
+ dev->create_ah = sif_create_ah;
+ dev->destroy_ah = sif_destroy_ah;
+ dev->query_ah = sif_query_ah;
+
+ dev->create_srq = sif_create_srq;
+ dev->modify_srq = sif_modify_srq;
+ dev->query_srq = sif_query_srq;
+ dev->destroy_srq = sif_destroy_srq;
+
+ dev->create_qp = sif_create_qp;
+ dev->modify_qp = sif_modify_qp;
+ dev->query_qp = sif_query_qp;
+ dev->destroy_qp = sif_destroy_qp;
+
+ dev->post_send = sif_post_send;
+ dev->post_recv = sif_post_recv;
+ dev->post_srq_recv = sif_post_srq_recv;
+
+ dev->create_cq = sif_ib_create_cq;
+ dev->destroy_cq = sif_destroy_cq;
+ dev->resize_cq = sif_resize_cq;
+ dev->poll_cq = sif_poll_cq;
+ dev->peek_cq = sif_peek_cq;
+ dev->req_notify_cq = sif_req_notify_cq;
+ dev->req_ncomp_notif = sif_req_ncomp_notif;
+
+ dev->get_dma_mr = sif_get_dma_mr;
+ dev->reg_phys_mr = sif_reg_phys_mr;
+ dev->rereg_phys_mr = sif_rereg_phys_mr;
+ dev->reg_user_mr = sif_reg_user_mr;
+ dev->dereg_mr = sif_dereg_mr;
+ dev->query_mr = sif_query_mr;
+
+ dev->alloc_fmr = sif_alloc_fmr;
+ dev->map_phys_fmr = sif_map_phys_fmr;
+ dev->unmap_fmr = sif_unmap_phys_fmr_list;
+ dev->dealloc_fmr = sif_dealloc_fmr;
+
+ dev->attach_mcast = sif_multicast_attach;
+ dev->detach_mcast = sif_multicast_detach;
+
+ /* All our MAD handling happens via the normal QP0 paths;
+ * this function is for devices which implement the SMA
+ * in software:
+ */
+ dev->process_mad = NULL;
+
+ dev->alloc_xrcd = sif_alloc_xrcd;
+ dev->dealloc_xrcd = sif_dealloc_xrcd;
+ dev->alloc_shpd = sif_alloc_shpd;
+ dev->share_pd = sif_share_pd;
+ dev->remove_shpd = sif_remove_shpd;
+
+ dev->node_guid = cpu_to_be64(epsdev.node_guid);
+
+ snprintf(dev->node_desc, sizeof(dev->node_desc), "sif_%s",
+ init_utsname()->nodename);
+
+ dev->node_type = RDMA_NODE_IB_CA;
+ dev->phys_port_cnt = sdev->limited_mode ? 0 : epsdev.phys_port_cnt;
+ dev->num_comp_vectors = sdev->es[sdev->mbox_epsc].eqs.cnt - 2;
+
+ ret = ib_register_device(dev, sif_port_callback);
+ if (ret) {
+ sif_log(sdev, SIF_VERBS, "Fail to register IB device: error %d",
+ -ret);
+ goto err_ibreg;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(sif_class_attributes); ++i) {
+ ret = device_create_file(&dev->dev, sif_class_attributes[i]);
+ if (ret) {
+ sif_log(sdev, SIF_VERBS,
+ "Fail to register with sysfs: error %d!", -ret);
+ goto err_sysfsreg;
+ }
+ }
+
+ /* Diag_counters */
+ ret = sysfs_create_group(&dev->dev.kobj, &diag_counters_attr_group);
+ if (ret) {
+ sif_log(sdev, SIF_VERBS,
+ "Fail to register diag_counters with sysfs: error %d!", -ret);
+ goto err_sysfsreg;
+ }
+
+ /* Populate the external kernel API (see sif_verbs.h): */
+ sdev->sv.eps_wr = sif_eps_wr_ex;
+ sdev->sv.create_cq = sif_create_cq;
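+ /* Use the whole-address-space DMA MR created in sif_probe() as the local DMA lkey */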
+ sdev->ib_dev.local_dma_lkey = sdev->dma_mr->index;
+
+ sdev->registered = true;
+ sif_log(sdev, SIF_VERBS_V, "%s registered with IB", sdev->ib_dev.name);
+ return 0;
+
+err_sysfsreg:
+ ib_unregister_device(dev);
+err_ibreg:
+ sif_log(sdev, SIF_INFO, "Exit - error %d", -ret);
+ return ret;
+}
+
+void sif_unregister_ib_device(struct sif_dev *sdev)
+{
+ struct ib_device *ibdev = &sdev->ib_dev;
+
+ sdev->registered = false;
+ ib_unregister_device(ibdev);
+ sif_logi(ibdev, SIF_VERBS, "done unregistering device");
+}
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_ireg.h: support functions used in setup of sif as an IB HCA
+ */
+
+#ifndef __SIF_IREG_H
+#define __SIF_IREG_H
+
+/* User context of a user level ib call */
+struct sif_ucontext {
+ struct ib_ucontext ib_uc;
+ struct sif_pd *pd; /* A protection domain for completion queues */
+ struct sif_cb *cb; /* The collect buffer for the user process */
+ u32 abi_version; /* User level library's abi version */
+};
+
+static inline struct sif_ucontext *to_sctx(struct ib_ucontext *context)
+{
+ return container_of(context, struct sif_ucontext, ib_uc);
+}
+
+int sif_register_ib_device(struct sif_dev *sdev);
+void sif_unregister_ib_device(struct sif_dev *sdev);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_main.c: main entry points and initialization
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#ifdef CONFIG_X86
+#include <asm/mtrr.h>
+#endif
+#include <linux/pci.h>
+#include <linux/aer.h>
+#include "sif_dev.h"
+#include "sif_fwa.h"
+#include "sif_mmu.h"
+#include "sif_mr.h"
+#include "sif_hwi.h"
+#include "sif_r3.h"
+#include "sif_vf.h"
+#include "sif_pt.h"
+#include "sif_ireg.h"
+#include "sif_debug.h"
+#include "psif_hw_csr.h"
+#include "version.h"
+#include <xen/xen.h>
+
+
+#define PSIF_VERSION_STR "0.1.0.6+"
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Oracle SIF Infiniband HCA driver");
+MODULE_VERSION(PSIF_VERSION_STR);
+MODULE_AUTHOR("Knut Omang");
+
+/* The device(s) we support */
+
+static const struct pci_device_id pci_table[] = {
+ {PCI_DEVICE(PCI_VENDOR_ID_SUN, PCI_DEVICE_ID_PSIF_PF)},
+ {PCI_DEVICE(PCI_VENDOR_ID_SUN, PCI_DEVICE_ID_PSIF_VF)},
+ {PCI_DEVICE(PCI_VENDOR_ID_SUN, PCI_DEVICE_ID_SN1_PF)},
+ {PCI_DEVICE(PCI_VENDOR_ID_SUN, PCI_DEVICE_ID_SN1_VF)},
+ {0,}
+};
+
+MODULE_DEVICE_TABLE(pci, pci_table);
+
+/* module entry points */
+static int __init sif_init(void);
+static void __exit sif_exit(void);
+
+/* device entry points */
+static int sif_probe(struct pci_dev *pdev,
+ const struct pci_device_id *id);
+static void sif_remove(struct pci_dev *dev);
+
+static int sif_suspend(struct pci_dev *dev, pm_message_t state)
+{
+ struct sif_dev *sdev = pci_get_drvdata(dev);
+
+ sif_log(sdev, SIF_INFO, " ");
+ return 0;
+}
+
+static int sif_resume(struct pci_dev *dev)
+{
+ struct sif_dev *sdev = pci_get_drvdata(dev);
+
+ sif_log(sdev, SIF_INFO, " ");
+ return 0;
+}
+
+static void sif_shutdown(struct pci_dev *dev)
+{
+ struct sif_dev *sdev = pci_get_drvdata(dev);
+
+ sif_log(sdev, SIF_INFO, " ");
+}
+
+static struct pci_driver sif_driver = {
+ .name = "sif",
+ .id_table = pci_table,
+ .probe = sif_probe,
+ .remove = sif_remove,
+ .suspend = sif_suspend,
+ .resume = sif_resume,
+ .shutdown = sif_shutdown,
+ .sriov_configure = sif_vf_enable,
+};
+
+/* Driver parameters: */
+
+ulong sif_debug_mask = 0x3;
+module_param_named(debug_mask, sif_debug_mask, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(debug_mask, "Selective enabling of debugging output to the system log");
+
+#ifdef SIF_TRACE_MASK
+ulong sif_trace_mask = 0x0;
+module_param_named(trace_mask, sif_trace_mask, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(trace_mask, "Selective enabling of debugging output to the ftrace buffer");
+#endif
+
+ulong sif_feature_mask = 0;
+module_param_named(feature_mask, sif_feature_mask, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(feature_mask, "Selective enabling of sif driver features");
+
+ulong sif_vendor_flags = 0;
+module_param_named(vendor_flags, sif_vendor_flags, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(vendor_flags, "Selective enabling of sif driver vendor specific mode flags");
+
+uint sif_max_pqp_wr = SIF_SW_MAX_SQE;
+module_param_named(max_pqp_wr, sif_max_pqp_wr, uint, S_IRUGO);
+MODULE_PARM_DESC(max_pqp_wr, "Maximum number of outstanding privileged QP requests supported");
+
+uint sif_ki_spqp_size = 1;
+module_param_named(ki_spqp_size, sif_ki_spqp_size, uint, S_IRUGO);
+MODULE_PARM_DESC(ki_spqp_size, "Number of privileged QPs for key invalidate stencils to set up");
+
+/* pqp_size == cq_eq_max */
+uint sif_cq_eq_max = 12;
+module_param_named(cq_eq_max, sif_cq_eq_max, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(cq_eq_max, "Upper limit on no. of EQs to distribute completion events among");
+
+uint sif_cb_max = 100;
+module_param_named(cb_max, sif_cb_max, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(cb_max, "Upper limit on no. of CBs.");
+
+/* TBD - This is a debug feature to evaluate performance. */
+ushort sif_perf_sampling_threshold = 100;
+module_param_named(perf_sampling_threshold, sif_perf_sampling_threshold, ushort, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(perf_sampling_threshold, "The performance measurement based on every N samples");
+
+uint sif_fmr_cache_flush_threshold = 512;
+module_param_named(fmr_cache_flush_threshold, sif_fmr_cache_flush_threshold, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(fmr_cache_flush_threshold, "PF limit for when to use fast-path full MMU flush for FMR unmap");
+
+
+/* In principle, SIF can allow any max inline size but at the cost of more memory
+ * allocated per QP. This variable sets the upper limit for any QP by defining
+ * the max extent of the sq entries, which means that the real max size is slightly
+ * less, depending on the max number of sges requested:
+ */
+uint sif_max_inline = 0x400;
+module_param_named(max_inline, sif_max_inline, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(max_inline, "Max configurable inline data per QP");
+
+uint sif_vf_en = 1;
+module_param_named(vf_en, sif_vf_en, uint, S_IRUGO);
+MODULE_PARM_DESC(vf_en, "If set to 0, refuse to load VF drivers");
+
+ulong sif_eps_log_size = 0;
+module_param_named(eps_log_size, sif_eps_log_size, ulong, S_IRUGO);
+MODULE_PARM_DESC(eps_log_size, "Enable log redirection - value is size of log buffer to allocate");
+
+ushort sif_eps_log_level = EPS_LOG_INFO;
+module_param_named(eps_log_level, sif_eps_log_level, ushort, S_IRUGO);
+MODULE_PARM_DESC(eps_log_level, "Level of logging to set for EPS redirect at load");
+
+static int sif_bar_init(struct pci_dev *pdev);
+static void sif_bar_deinit(struct pci_dev *pdev);
+
+
+static int sif_set_check_max_payload(struct sif_dev *sdev)
+{
+ struct pci_dev *parent;
+ u16 devctl, devcap, pdevctl, pdevcap;
+ int pcie_cap, pcie_parent_cap, min_cap_mps, err;
+
+ u8 payload_sz, payload_sz_cap;
+ u8 parent_payload_sz, parent_payload_sz_cap;
+
+ pcie_cap = pci_find_capability(sdev->pdev, PCI_CAP_ID_EXP);
+
+ /* read PSIF max payload size capability and setting */
+ err = pci_read_config_word(sdev->pdev, pcie_cap + PCI_EXP_DEVCTL, &devctl);
+ if (err)
+ return err;
+
+ payload_sz = (devctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5;
+
+ err = pci_read_config_word(sdev->pdev, pcie_cap + PCI_EXP_DEVCAP, &devcap);
+ if (err)
+ return err;
+
+ payload_sz_cap = (devcap & PCI_EXP_DEVCAP_PAYLOAD);
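+ /* PCIe encodes max payload size as 128 << field in both DEVCAP and DEVCTL */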
+
+ if (sif_feature(max_supported_payload)) {
+ parent = pci_upstream_bridge(sdev->pdev);
+ if (!parent) {
+ sif_log(sdev, SIF_INFO,
+ "No parent bridge device - unable to determine max payload size, assuming it is ok");
+ return 0;
+ }
+
+ pcie_parent_cap = pci_find_capability(parent, PCI_CAP_ID_EXP);
+ if (!pcie_parent_cap) {
+ sif_log(sdev, SIF_INFO,
+ "Unable to find any PCIe capability in parent device - assuming payload size is ok");
+ return 0;
+ }
+
+ /* read root complex (port) max payload size */
+ err = pci_read_config_word(parent, pcie_parent_cap + PCI_EXP_DEVCTL, &pdevctl);
+ if (err)
+ return err;
+
+ err = pci_read_config_word(parent, pcie_parent_cap + PCI_EXP_DEVCAP, &pdevcap);
+ if (err)
+ return err;
+
+ parent_payload_sz = (pdevctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5;
+ parent_payload_sz_cap = (pdevcap & PCI_EXP_DEVCAP_PAYLOAD);
+
+ min_cap_mps = min(parent_payload_sz_cap, payload_sz_cap);
+
+ /* adjusting the RC max payload size to the supported max payload size */
+ if (parent_payload_sz != min_cap_mps) {
+ sif_log(sdev, SIF_INFO,
+ "Adjusting RC max payload sz to %d\n", 128 << parent_payload_sz_cap);
+ err = pci_write_config_word(parent,
+ pcie_parent_cap + PCI_EXP_DEVCTL,
+ (pdevctl & ~PCI_EXP_DEVCTL_PAYLOAD) + (min_cap_mps << 5));
+ }
+
+ /* Adjusting the max payload size to the supported max payload size */
+ if (payload_sz != min_cap_mps) {
+ sif_log(sdev, SIF_INFO,
+ "Adjusting max payload sz to %d\n", 128 << parent_payload_sz_cap);
+ err = pci_write_config_word(sdev->pdev,
+ pcie_cap + PCI_EXP_DEVCTL,
+ (devctl & ~PCI_EXP_DEVCTL_PAYLOAD) + (min_cap_mps << 5));
+ }
+
+ if (min_cap_mps == 0) {
+ sif_log(sdev, SIF_INFO,
+ "PCI express max payload size is set to 128 which triggers a rev1 bug");
+ }
+ }
+ return err;
+}
+
+/* Entry of new instance */
+static int sif_probe(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ int err = 0;
+
+ /* TBD: Zeroed memory from ib_alloc_device? */
+ struct sif_dev *sdev =
+ (struct sif_dev *)ib_alloc_device(sizeof(struct sif_dev));
+ if (!sdev) {
+ err = -ENOMEM;
+ goto pfail_ib_alloc;
+ }
+
+ sdev->pdev = pdev;
+ sdev->dfs = NULL;
+ sdev->fw_vfs = -1; /* #of VFS enabled in firmware not known yet */
+ sdev->ib_dev.dma_device = &pdev->dev;
+ sdev->limited_mode = sif_feature(force_limited_mode) ? true : false;
+
+ strlcpy(sdev->ib_dev.name, "sif%d", IB_DEVICE_NAME_MAX);
+
+ pci_set_drvdata(pdev, sdev);
+ sif_log(sdev, SIF_INFO,
+ "%s found, device id 0x%x, subsystem id 0x%x, revision %d, at 0x%p",
+ get_product_str(sdev), PSIF_DEVICE(sdev),
+ PSIF_SUBSYSTEM(sdev), PSIF_REVISION(sdev), sdev);
+
+ sdev->wq = create_singlethread_workqueue(sdev->ib_dev.name);
+ if (!sdev->wq) {
+ sif_log(sdev, SIF_INFO, "Failed to allocate kernel work queue");
+ err = -ENOMEM;
+ goto wq_fail;
+ }
+
+ err = sif_set_check_max_payload(sdev);
+ if (err)
+ goto wq_fail;
+
+ /* Ask PCI drivers to enable the device and set up BARs etc */
+ err = pci_enable_device_mem(pdev);
+ if (err)
+ goto pfail_enable;
+
+ /* Check if 64 bits DMA is supported */
+ err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (!err) {
+ sif_log(sdev, SIF_INIT, "64 bit DMA supported");
+ pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+ } else {
+ err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (!err) {
+ sif_log(sdev, SIF_INIT, "32 bit DMA supported");
+ pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+ } else {
+ sif_log(sdev, SIF_INIT, "No DMA support!?");
+ goto pfail_dma;
+ }
+ }
+
+ pci_enable_pcie_error_reporting(pdev);
+
+ /* Set up BAR access */
+ err = sif_bar_init(pdev);
+ if (err)
+ goto pfail_bar;
+
+ if (xen_pv_domain()) {
+ /* The Xen PV domain may return huge pages that are misaligned
+ * in DMA space, see Orabug: 21690736.
+ * Also we have to turn off the inline sge optimization, as it assumes
+ * that (guest) physical and DMA addresses are equal, which is not
+ * the case for the PV domain - see Orabug: 23012335.
+ */
+ sif_log(sdev, SIF_INFO, "xen pv domain: Restricting resource allocation..");
+ sif_feature_mask |= SIFF_no_huge_pages | SIFF_disable_inline_first_sge;
+ sif_qp_size = min(sif_qp_size, 0x1000U);
+ sif_mr_size = min(sif_mr_size, 0x1000U);
+ sif_ah_size = min(sif_ah_size, 0x1000U);
+ sif_cq_size = min(sif_cq_size, 0x1000U);
+ sif_rq_size = min(sif_rq_size, 0x1000U);
+ sif_max_pqp_wr = min(sif_max_pqp_wr, 0x1000U);
+ }
+
+ /* Timeout scaling factor:
+ * This value is used as a factor to calculate sensible
+ * timeout values throughout the driver:
+ */
+ sdev->min_resp_ticks = SIF_HW_TIMEOUT;
+ /* Type UMEM means no override - initialize */
+ sdev->mt_override = SIFMT_UMEM;
+
+ err = sif_dfs_register(sdev);
+ if (err)
+ goto pfail_dfs;
+
+ /* PSIF initialization */
+ err = sif_hw_init(sdev);
+ if (err)
+ goto pfail_psif_base;
+
+ err = sif_fwa_register(sdev);
+ if (err)
+ goto fwa_reg_failed;
+
+ /* Reserve key 0 as an invalid key for sanity checking
+ * See #3323 for details
+ */
+ sdev->dma_inv_mr = sif_alloc_invalid_mr(sdev->pd);
+ if (IS_ERR(sdev->dma_inv_mr)) {
+ err = PTR_ERR(sdev->dma_inv_mr);
+ goto pfail_dma_inv_mr;
+ }
+
+ /* Create a DMA MR (mapping the whole address space)
+ * for use with the local_dma_lkey
+ */
+ sdev->dma_mr = create_dma_mr(sdev->pd,
+ IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE);
+
+ if (IS_ERR(sdev->dma_mr)) {
+ err = PTR_ERR(sdev->dma_mr);
+ goto pfail_dma_mr;
+ }
+
+ if (PSIF_REVISION(sdev) <= 3) {
+ err = sif_r3_init(sdev);
+ if (err)
+ goto pfail_r3_init;
+ }
+
+ /* Successful device init */
+
+ err = sif_register_ib_device(sdev);
+ if (err)
+ goto pfail_ibreg;
+
+ /* Now that an IB device name exists, create a symlink in debugfs */
+ sif_dfs_link_to_ibdev(sdev);
+
+
+ sif_log(sdev, SIF_INFO, "Successfully probed and set up device");
+ return 0;
+pfail_ibreg:
+ sif_r3_deinit(sdev);
+pfail_r3_init:
+ sif_dealloc_mr(sdev, sdev->dma_mr);
+pfail_dma_mr:
+ sif_dealloc_mr(sdev, sdev->dma_inv_mr);
+pfail_dma_inv_mr:
+ sif_fwa_unregister(sdev);
+fwa_reg_failed:
+ sif_hw_deinit(sdev);
+pfail_psif_base:
+ sif_dfs_unregister(sdev);
+pfail_dfs:
+ sif_bar_deinit(pdev);
+pfail_bar:
+ pci_disable_pcie_error_reporting(pdev);
+pfail_dma:
+ pci_disable_device(pdev);
+pfail_enable:
+ destroy_workqueue(sdev->wq);
+wq_fail:
+ ib_dealloc_device(&sdev->ib_dev);
+pfail_ib_alloc:
+ sif_log0(SIF_INIT, "sif_probe failed with status %d\n", err);
+ return err;
+}
+
+/* Exit of instance */
+static void sif_remove(struct pci_dev *dev)
+{
+ struct sif_dev *sdev = pci_get_drvdata(dev);
+
+ sif_log0(SIF_INIT, "Enter: sif_remove");
+
+ sif_vf_disable(sdev);
+
+ sif_unregister_ib_device(sdev);
+ sif_r3_deinit(sdev);
+ sif_dealloc_mr(sdev, sdev->dma_mr);
+ sif_dealloc_mr(sdev, sdev->dma_inv_mr);
+ sif_fwa_unregister(sdev);
+ sif_hw_deinit(sdev);
+ sif_dfs_unregister(sdev);
+ sif_bar_deinit(dev);
+ pci_clear_master(dev);
+ pci_disable_device(dev);
+ flush_workqueue(sdev->wq);
+ destroy_workqueue(sdev->wq);
+ ib_dealloc_device(&sdev->ib_dev);
+ sif_log0(SIF_INIT, "exit sif_remove");
+}
+
+static int sif_bar_init(struct pci_dev *pdev)
+{
+ struct sif_dev *sdev = pci_get_drvdata(pdev);
+ int err;
+ phys_addr_t start;
+ size_t length;
+
+ /* Request access to the device space in BAR0 for this driver */
+ err = pci_request_region(pdev, SIF_CBU_BAR, "sif_cb");
+ if (err) {
+ sif_log(sdev, SIF_INIT, "Failed to request cb region");
+ goto pfail_bar0;
+ }
+
+ /* Then map all of it to allow access */
+ start = pci_resource_start(pdev, SIF_CBU_BAR);
+
+ /* This should not happen - kernel or BIOS bug?
+ * TBD: Check this from the CPU ID? (M bit?)
+ */
+ if (start > (1ULL << 52)) {
+ sif_log(sdev, SIF_INIT,
+ "pci_resource_start returned a physical address beyond CPU max phys.addr (%llx)",
+ start);
+ err = -ENOMEM;
+ goto pfail_ioremap0;
+ }
+
+ length = pci_resource_len(pdev, SIF_CBU_BAR);
+
+ sdev->cbu_mtrr = -1; /* Avoid attempt to free mtrr 0 */
+
+ /*
+ * Need ioremap_wc() in order to get write-combining to work,
+ * even when using explicit write-combining instructions.
+ */
+ sdev->cb_base = ioremap_wc(start, length);
+ if (!sdev->cb_base) {
+ sif_log(sdev, SIF_INIT,
+ "ioremap_wc - failed to map cb BAR (start %llx len %lx)",
+ start, length);
+ err = -ENOMEM;
+ goto pfail_ioremap0;
+ }
+ sdev->cb_sz = length;
+
+ sif_log(sdev, SIF_INIT, "BAR%d (cb) mapped at kva %p start %llx len %lx",
+ SIF_CBU_BAR, sdev->cb_base, start, length);
+
+ err = pci_request_region(pdev, SIF_MSIX_BAR, "sif_msix");
+ if (err) {
+ sif_log(sdev, SIF_INIT, "Failed to request msix region");
+ goto pfail_bar2;
+ }
+
+ start = pci_resource_start(pdev, SIF_MSIX_BAR);
+ length = pci_resource_len(pdev, SIF_MSIX_BAR);
+ sdev->msi_base = ioremap_nocache(start, length);
+ if (!sdev->msi_base) {
+ sif_log(sdev, SIF_INIT,
+ "ioremap_nocache - failed to map msix BAR%d (start %llx len %lx)",
+ SIF_MSIX_BAR, start, length);
+ err = -ENOMEM;
+ goto pfail_ioremap2;
+ }
+ sdev->msi_sz = length;
+ sif_log(sdev, SIF_INIT, "BAR%d (msix) mapped at kva %p start %llx len %lx",
+ SIF_MSIX_BAR, sdev->msi_base, start, length);
+
+ err = pci_request_region(pdev, SIF_EPS_BAR, "sif_csr");
+ if (err) {
+ sif_log(sdev, SIF_INIT, "Failed to request eps region");
+ goto pfail_bar4;
+ }
+
+ start = pci_resource_start(pdev, SIF_EPS_BAR);
+ length = pci_resource_len(pdev, SIF_EPS_BAR);
+ sdev->eps_base = ioremap_nocache(start, length);
+ if (!sdev->eps_base) {
+ sif_log(sdev, SIF_INIT, "Failed to map eps BAR%d (start %llx len %lx)",
+ SIF_EPS_BAR, start, length);
+ err = -ENOMEM;
+ goto pfail_ioremap4;
+ }
+ sdev->eps = (struct psif_pcie_mbox __iomem *)sdev->eps_base;
+ sdev->eps_sz = length;
+
+ sif_log(sdev, SIF_INIT, "BAR%d (eps) mapped at kva %p start %llx len %lx",
+ SIF_EPS_BAR, sdev->eps, start, length);
+ return 0;
+
+pfail_ioremap4:
+ pci_release_region(pdev, SIF_EPS_BAR);
+pfail_bar4:
+ iounmap(sdev->msi_base);
+pfail_ioremap2:
+ pci_release_region(pdev, SIF_MSIX_BAR);
+pfail_bar2:
+ iounmap(sdev->cb_base);
+pfail_ioremap0:
+#ifdef CONFIG_X86
+ if (sdev->cbu_mtrr >= 0)
+ mtrr_del(sdev->cbu_mtrr,
+ pci_resource_start(pdev, SIF_CBU_BAR),
+ pci_resource_len(pdev, SIF_CBU_BAR));
+#endif
+ pci_release_region(pdev, SIF_CBU_BAR);
+pfail_bar0:
+ return err;
+}
+
+static void sif_bar_deinit(struct pci_dev *pdev)
+{
+ struct sif_dev *sdev = pci_get_drvdata(pdev);
+
+ iounmap(sdev->eps);
+ pci_release_region(pdev, SIF_EPS_BAR);
+ iounmap(sdev->msi_base);
+ pci_release_region(pdev, SIF_MSIX_BAR);
+ iounmap(sdev->cb_base);
+#ifdef CONFIG_X86
+ if (sdev->cbu_mtrr >= 0)
+ mtrr_del(sdev->cbu_mtrr,
+ pci_resource_start(pdev, SIF_CBU_BAR),
+ pci_resource_len(pdev, SIF_CBU_BAR));
+#endif
+ pci_release_region(pdev, SIF_CBU_BAR);
+}
+
+
+
+/* Statically register this driver with the kernel */
+
+static int __init sif_init(void)
+{
+ int stat = 0;
+
+ sif_log0(SIF_INFO, "**** Oracle development driver - internal use only! ****");
+ sif_log0(SIF_INFO, "%s - build user %s at %s", sif_version.git_repo,
+ sif_version.build_user, sif_version.build_git_time);
+ sif_log0(SIF_INFO, "sifdrv git tag:\n%s", sif_version.last_commit);
+ if (sif_version.git_status[0] != '\0')
+ sif_log0(SIF_INFO, " *** sifdrv git status at build time: ***\n%s", sif_version.git_status);
+ sif_log0(SIF_INFO, "psifapi git tag:\n%s", sif_version.last_psifapi_commit);
+ if (sif_version.git_psifapi_status[0] != '\0')
+ sif_log0(SIF_INFO, " *** psifapi git status at build time ***\n%s",
+ sif_version.git_psifapi_status);
+
+ sif_log0(SIF_INIT, "hw header release \"%s\"", PSIF_RELEASE_STR);
+ sif_log0(SIF_INIT, "built for PSIF version %d.%d, EPSC API version %d.%d",
+ PSIF_MAJOR_VERSION, PSIF_MINOR_VERSION, EPSC_MAJOR_VERSION, EPSC_MINOR_VERSION);
+ sif_log0(SIF_INIT, "sif debug mask 0x%lx", sif_debug_mask);
+ if (sif_feature_mask) {
+ u64 undef = sif_feature_mask & ~SIFF_all_features;
+
+ if (undef) {
+ sif_log0(SIF_INFO,
+ "***** Invalid feature mask - undefined bits %llx - get rid of legacy bits!",
+ undef);
+ return -EINVAL;
+ }
+ sif_log0(SIF_INFO, "sif feature mask 0x%lx", sif_feature_mask);
+ }
+
+ stat = sif_pt_init();
+ if (stat)
+ goto pt_init_failed;
+
+ stat = sif_fwa_init();
+ if (stat)
+ goto fwa_init_failed;
+
+ return pci_register_driver(&sif_driver);
+
+fwa_init_failed:
+ sif_pt_exit();
+pt_init_failed:
+ return stat;
+}
+
+static void __exit sif_exit(void)
+{
+ sif_fwa_exit();
+ pci_unregister_driver(&sif_driver);
+ sif_pt_exit();
+ sif_log0(SIF_INIT, "done unregistering");
+}
+
+module_init(sif_init);
+module_exit(sif_exit);
--- /dev/null
+/*
+ * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_mem.c: SIF table memory and page table management
+ */
+
+#include <linux/scatterlist.h>
+#include <linux/kref.h>
+#include <linux/slab.h>
+#include <rdma/ib_umem.h>
+#include "sif_dev.h"
+#include "sif_mem.h"
+#include "sif_dma.h"
+#include "sif_pt.h"
+
+/* Defined below */
+static int sif_mem_fixup_dma(struct scatterlist *sg);
+
+/* Initialization of global per device info */
+void sif_mem_init(struct sif_dev *sdev)
+{
+ struct sif_mem_info *mi = &sdev->mi;
+
+ if (sif_feature(toggle_page_size)) {
+ mi->page_shift = PAGE_SHIFT == 12 ? 13 : 12;
+ mi->page_size = PAGE_SIZE == 0x1000 ? 0x2000 : 0x1000;
+ } else {
+ mi->page_shift = PAGE_SHIFT;
+ mi->page_size = PAGE_SIZE;
+ }
+ mi->level_shift = 9;
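+ /* Largest mapping representable by PT_LEVELS levels of page tables with 2^level_shift entries each */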
+ mi->max_shift = mi->page_shift + mi->level_shift * PT_LEVELS;
+ mi->ptes_per_page = 1 << mi->level_shift;
+ mi->page_mask = ~(mi->page_size - 1);
+}
+
+/* Some utilities */
+
+inline size_t mem_type_to_page_shift(struct sif_dev *sdev, enum sif_mem_type mem_type)
+{
+ switch (mem_type) {
+ case SIFMT_2M:
+ return sdev->mi.page_shift + sdev->mi.level_shift;
+ default:
+ return sdev->mi.page_shift;
+ }
+}
+
+
+static u32 sif_mem_fmr_max_page_shift(struct sif_mem *mem)
+{
+ struct sif_dev *sdev = mem->sdev;
+ u32 max_shift = sdev->mi.max_shift;
+ u64 end = 0;
+ u32 bits = sizeof(dma_addr_t) << 3;
+ int i;
+ u64 incr = 1 << mem->m.fmr.page_shift;
+
+ BUG_ON(mem->mem_type != SIFMT_FMR);
+
+ for (i = 0; i < mem->m.fmr.page_list_len; i++) {
+ u64 next_addr = mem->m.fmr.page_list[i];
+
+ if (end && end != next_addr) {
+ unsigned long border = end | next_addr;
+ u32 shift = find_first_bit(&border, bits);
+
+ if (shift < max_shift) {
+ sif_log(sdev, SIF_MEM_V,
+ "%4d: start 0x%llx, sz 0x%llx, prev.end 0x%llx shift %d -> %d",
+ i, next_addr, incr, end, max_shift, shift);
+ max_shift = shift;
+ if (max_shift == mem->m.fmr.page_shift) /* No point in continuing */
+ break;
+ }
+ }
+ end = next_addr + incr;
+ }
+ sif_log(sdev, SIF_MEM_SG, "found max shift %d from inspecting %d sges", max_shift, i);
+ return max_shift;
+}
+
+
+/* Calculate the max possible page_shift for this memory
+ * based on the alignment and contiguity of its DMA segments
+ */
+static u32 sif_mem_max_page_shift(struct sif_mem *mem)
+{
+ struct sif_dev *sdev = mem->sdev;
+ u32 max_shift = sdev->mi.max_shift;
+ u64 end = 0;
+ u32 bits = sizeof(dma_addr_t) << 3;
+ u32 sg_cnt = 0;
+
+ struct scatterlist *sg = sif_mem_get_sgl(mem);
+
+ if (!sg)
+ return sdev->mi.page_shift;
+ for (; sg; sg = sg_next(sg)) {
+ u64 dma_start = sg_dma_address(sg);
+
+ sg_cnt++;
+#ifdef __sparc__
+ /* TBD: Fix bug in umem:
+ * SG lists are not always properly terminated
+ */
+ if (!sg_dma_len(sg))
+ break;
+#endif
+ if (end && end != dma_start) {
+ unsigned long border = end | dma_start;
+ u32 shift = find_first_bit(&border, bits);
+
+ if (shift < max_shift) {
+ sif_log(sdev, SIF_MEM_V,
+ "%4d: start 0x%llx, sz %x, prev.end 0x%llx shift %d -> %d",
+ sg_cnt, dma_start, sg_dma_len(sg), end, max_shift, shift);
+ max_shift = shift;
+ if (max_shift == sdev->mi.page_shift) /* No point in continuing */
+ break;
+ /* BUG_ON(max_shift < sdev->mi.page_shift); */
+ if (max_shift < sdev->mi.page_shift) {
+ sif_log(sdev, SIF_INFO,
+ "Failed to find a valid page shift: max_shift %d sdev->mi.page_shift %d",
+ max_shift, sdev->mi.page_shift);
+ return max_shift;
+ }
+ }
+ }
+ end = sg_dma_address(sg) + sg_dma_len(sg);
+ }
+ sif_log(sdev, SIF_MEM_SG, "found max shift %d from inspecting %d sges", max_shift, sg_cnt);
+ return max_shift;
+}
+
+/* External observer:
+ * Return the largest page size (represented by page shift bits) usable for this memory
+ */
+u32 sif_mem_page_shift(struct sif_mem *mem)
+{
+ /* If a maximum has been calculated, use it: */
+ if (mem->max_page_shift)
+ return mem->max_page_shift;
+ return mem_type_to_page_shift(mem->sdev, mem->mem_type);
+}
+
+static struct scatterlist *sg_alloc_list(struct sif_dev *sdev, unsigned int nelems, gfp_t flag)
+{
+ struct scatterlist *sg = sif_kmalloc(sdev, sizeof(struct scatterlist) * nelems, flag);
+
+ if (sg) {
+ sif_log0(SIF_MMU, "start at %p, %d elems allocated", sg, nelems);
+ sg_init_table(sg, nelems);
+ }
+ return sg;
+}
+
+
+/* API for managing a sif_kmem object */
+
+/** Allocate a set of pages of size (1 << page_shift).
+ * Prepare for scatterlist(s) of fixed length @sg_size (in number of elements)
+ * and allocate an initial @sz bytes (must be multiple of 1 << page_shift)
+ * @sz must be less than what fits within the initial scatterlist.
+ * If sg_size is 0, figure out the optimal sg_size.
+ */
+int sif_kmem_init(struct sif_dev *sdev, struct sif_kmem *kmem, size_t sg_size, size_t sz,
+ u32 page_shift, gfp_t flag, enum dma_data_direction dir)
+{
+ int ret;
+
+ memset(kmem, 0, sizeof(*kmem));
+ kmem->page_shift = page_shift;
+
+ if (!sg_size)
+ sg_size = sz >> page_shift;
+ kmem->sg_size = sg_size;
+ kmem->dir = dir;
+ kmem->sg_max = 0; /* Indicates an empty list with no end mark set yet */
+
+ if (sz == 0)
+ return 0;
+
+ ret = sif_kmem_extend(sdev, kmem, sz, flag);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+
+static void sif_kmem_free_pages(struct sif_kmem *kmem, struct scatterlist *sg, u32 nelems)
+{
+ int i;
+ int order = kmem->page_shift - PAGE_SHIFT;
+
+ for (i = 0; i < nelems; i++) {
+ __free_pages(sg_page(sg), order);
+ sg = sg_next(sg);
+ }
+}
+
+
+static void sif_kmem_free_sgls(struct sif_kmem *kmem, struct scatterlist *sgl, u32 nlists)
+{
+ for (; nlists > 0; nlists--) {
+ struct scatterlist *nsgl = sg_chain_ptr(&sgl[kmem->sg_size]);
+
+ kfree(sgl);
+ sgl = nsgl;
+ }
+}
+
+/* Find the @n'th scatterlist array within kmem */
+static struct scatterlist *sif_kmem_find_sg_head_idx(struct sif_kmem *kmem, u32 n)
+{
+ int i = 0;
+ struct scatterlist *sgl = kmem->sg;
+
+ for (; n > i; i++)
+ sgl = sg_chain_ptr(&sgl[kmem->sg_size]);
+ return sgl;
+}
+
+
+/* Find the scatterlist element with index idx within kmem */
+struct scatterlist *sif_kmem_find_sg_idx(struct sif_kmem *kmem, u32 idx)
+{
+ struct scatterlist *sgl;
+ int n = idx / kmem->sg_size;
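+ /* Entry idx lives in chained array n; each chained array holds sg_size entries */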
+
+ sgl = sif_kmem_find_sg_head_idx(kmem, n);
+ return &sgl[idx % kmem->sg_size];
+}
+
+
+void sif_kmem_free(struct sif_dev *sdev, struct sif_kmem *kmem)
+{
+ int npages = kmem->sg_max - kmem->sg_start;
+ struct scatterlist *sg = sif_kmem_find_sg_idx(kmem, kmem->sg_start);
+
+ ib_dma_unmap_sg(&sdev->ib_dev, sg, npages, kmem->dir);
+
+ sif_kmem_free_pages(kmem, sg, npages);
+ sif_kmem_free_sgls(kmem, sg, kmem->nlists);
+ kmem->sg = NULL;
+}
+
+
+/* Extend a kmem object by allocating more sg entries if necessary, then
+ * allocate pages and dma map them. The invariant upon exit is that
+ * all allocated pages are dma mapped, which means that we must
+ * clean up pages that did not get mapped, if mapping fails midway:
+ */
+
+int sif_kmem_extend(struct sif_dev *sdev, struct sif_kmem *kmem, size_t sz, gfp_t flag)
+{
+ u32 i;
+ int ret;
+ int order;
+ struct page *page;
+ struct scatterlist *sg;
+ struct scatterlist *sg_prev = NULL;
+ struct scatterlist *sg_start = NULL;
+ size_t page_size = 1UL << kmem->page_shift;
+ u64 page_mask = page_size - 1;
+ u32 sg_size = (sz + page_mask) >> kmem->page_shift;
+
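+ /* Compute how many sg entries are still unused in the scatterlist arrays allocated so far: */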
+ u32 nl = kmem->nlists;
+ long free_sg = nl * kmem->sg_size - kmem->sg_max;
+
+ sif_log(sdev, SIF_MEM, "enter, kmem at %p, sz 0x%lx", kmem, sz);
+
+ /* Make room in sg list */
+ for (; free_sg < sg_size; free_sg += kmem->sg_size) {
+ sg = sg_alloc_list(sdev, kmem->sg_size + 1, flag);
+ if (!sg) {
+ ret = -ENOMEM;
+ goto failed;
+ }
+ if (kmem->last_sg)
+ sg_chain(kmem->last_sg, kmem->sg_size + 1, sg);
+ else
+ kmem->sg = sg;
+ kmem->last_sg = sg;
+ kmem->nlists++;
+ }
+
+ /* The end mark is always in the last used element, not the first available one
+ * which sg_max points to:
+ */
+ if (kmem->sg_max) {
+ sg_prev = sif_kmem_find_sg_idx(kmem, kmem->sg_max - 1);
+ sg_unmark_end(sg_prev);
+ sg = sg_next(sg_prev);
+ } else
+ sg = sif_kmem_find_sg_idx(kmem, 0);
+
+ sg_start = sg;
+ order = kmem->page_shift - PAGE_SHIFT;
+
+ /* Allocate the new memory */
+ for (i = 0; i < sg_size; i++) {
+ sif_log(sdev, SIF_MEM_V, "i = %d, sg %p", i, sg);
+ page = sif_alloc_pages(sdev, flag | __GFP_ZERO, order);
+ if (!page) {
+ ret = -ENOMEM;
+ sg_size = i;
+ sg_mark_end(sg);
+ goto map_failed;
+ }
+ BUG_ON(!sg);
+ sg_set_page(sg, page, page_size, 0);
+ sg_prev = sg;
+ sg = sg_next(sg);
+ }
+ sg_mark_end(sg_prev);
+
+ ret = ib_dma_map_sg(&sdev->ib_dev, sg_start, sg_size, kmem->dir);
+ if (ret < 0) {
+ sif_log(sdev, SIF_INFO, "ib_dma_map_sg failed with %d", ret);
+ ret = -EFAULT;
+ goto map_failed;
+ }
+
+ sif_logs(SIF_PT_VV, sif_dump_sg(sg_start));
+
+ /* TBD: Remove this when issues with wrong alignments of DMA addresses
+ * have been resolved (both Sparc and OVM, see Orabug: 21690736).
+ * For 2M seg_size, check that all DMA addresses are 2M aligned:
+ */
+ if (page_size >= PMD_SIZE) {
+ for (sg = sg_start, i = 0; sg != NULL; sg = sg_next(sg), i++) {
+ if (sg_dma_address(sg) & ~PMD_MASK) {
+ sif_log(sdev, SIF_INFO,
+ "**** Orabug: 21690736 - aligned PA maps to unaligned IOVA: i = %d, pa %llx dma %pad",
+ i,
+ (u64)sg_phys(sg), &sg_dma_address(sg));
+ ret = -EIO;
+ goto map_failed;
+ }
+ sif_log(sdev, SIF_MEM_V, "i = %d, pa %llx dma %pad", i,
+ (u64)sg_phys(sg), &sg_dma_address(sg));
+ }
+ }
+
+ /* To enable direct lookup, we rely on the s/g list not being
+ * collapsed by dma mapping. This holds on x86 but eg. on sparc we see
+ * collapsed lists where the IOMMU delivers the whole DMA range in a single entry
+ * at the start. Handle this case too by rewriting the DMA list
+ * to comply with our needs, otherwise fail (and dump the sg list to the trace buffer
+ * for analysis):
+ */
+ if (sg_size != ret) {
+ if (ret == 1) {
+ sif_log(sdev, SIF_MEM, "Fixing up collapsed sg list (%d/%d)",
+ ret, sg_size);
+ ret = sif_mem_fixup_dma(sg_start);
+ if (ret)
+ goto map_failed;
+ sif_logs(SIF_PT_VV, sif_dump_sg(sg_start));
+ } else {
+ /* This should not happen, but sanity check it anyway */
+ sif_log(sdev, SIF_INFO,
+ "** Detected unhandled layout of s/g list (%d/%d) **",
+ ret, sg_size);
+ ret = -EPROTOTYPE;
+ goto map_failed;
+ }
+ }
+ i = kmem->sg_max;
+ kmem->sg_max += ret;
+ kmem->size += sz;
+ return i;
+map_failed:
+ sif_dump_sg(sg_start);
+ if (sg_size)
+ sif_kmem_free_pages(kmem, sg_start, sg_size);
+failed:
+ return ret;
+}
+
+
+/* Map a part of the @kmem object given by @offset, @size to the user space
+ * vm context given in @vma. The part must be page aligned and page sized:
+ */
+
+static int sif_kmem_vma_map_part(struct sif_dev *sdev, struct sif_kmem *kmem, struct vm_area_struct *vma,
+ off_t start_off, size_t size)
+{
+ off_t sg_index = start_off >> kmem->page_shift;
+ u64 page_size = 1 << kmem->page_shift;
+ u64 page_mask = (page_size - 1);
+ off_t off = start_off & page_mask; /* start offset within mem page */
+ off_t sz = min_t(off_t, size, page_size - off);
+ struct scatterlist *sg;
+ dma_addr_t pfn, sg_phy;
+ u64 start = vma->vm_start;
+ u64 rem = size;
+ int ret;
+
+ BUG_ON(off & ~PAGE_MASK);
+
+ sg = sif_kmem_find_sg_idx(kmem, sg_index);
+
+ sif_log(sdev, SIF_MMAP, "size %lx, off %lx start sg idx: %ld",
+ size, off, sg_index);
+
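+ /* Map one kmem page (1 << page_shift bytes, possibly spanning several CPU pages) per iteration */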
+ for (; rem > 0; sg = sg_next(sg)) {
+ sg_phy = sg_phys(sg);
+ pfn = (sg_phy + off) >> PAGE_SHIFT;
+ sif_log(sdev, SIF_MMAP, "pfn %pad, sz %lx sg_phys %pad off %lx",
+ &pfn, sz, &sg_phy, off);
+ ret = remap_pfn_range(vma, start, pfn, sz, vma->vm_page_prot);
+ if (ret)
+ return ret;
+ rem -= sz;
+ start += sz;
+ sz = min(rem, page_size);
+ off = 0;
+ }
+ return 0;
+}
+
+
+static int sif_vma_map_sg_part(struct sif_dev *sdev, struct scatterlist *sg,
+ struct vm_area_struct *vma, off_t start_off, size_t size)
+{
+ u64 start = vma->vm_start;
+ off_t off = start_off;
+ dma_addr_t pfn, sg_phy;
+ off_t rem = size;
+ off_t sz;
+ int ret;
+
+ BUG_ON(off & ~PAGE_MASK);
+
+ sif_log(sdev, SIF_MMAP, "size %lx, off %lx",
+ size, start_off);
+
+ while (off > sg->length) {
+ off -= sg->length;
+ sg = sg_next(sg);
+ }
+ sz = min_t(off_t, rem, sg->length - off);
+
+ for (;;) {
+ sg_phy = sg_phys(sg);
+ pfn = (sg_phy + off) >> PAGE_SHIFT;
+ sif_log(sdev, SIF_MMAP, "pfn %pad, sz %lx sg_phys %pad off %lx",
+ &pfn, sz, &sg_phy, off);
+ ret = remap_pfn_range(vma, start, pfn, sz, vma->vm_page_prot);
+ if (ret)
+ return ret;
+ rem -= sz;
+ start += sz;
+ off = 0;
+ if (rem <= 0)
+ break;
+ sg = sg_next(sg);
+ sz = min_t(off_t, rem, sg->length);
+ }
+ return 0;
+}
+
+
+/* Remove a set of sg entries from the list starting at page index sg_idx
+ * and unlink from the linked list.
+ *
+ * To keep index lookups consistent, scatterlist vectors cannot be deleted
+ * from the middle of the list: only head and tail removal is allowed,
+ * and if scatterlists are removed from the head of the list, the offset must be updated.
+ */
+
+int sif_kmem_shrink(struct sif_dev *sdev, struct sif_kmem *kmem, int sg_idx, size_t size)
+{
+ /* TBD: Implement this! */
+ return -EOPNOTSUPP;
+}
+
+
+/************************************
+ * API for managing different higher level (scatter) memory segment abstractions
+ * used by SIF:
+ */
+
+/* Set up a sif_mem structure for handling a memory
+ * segment of initial size @size.
+ */
+struct sif_mem *sif_mem_create(struct sif_dev *sdev, size_t sg_size,
+ size_t size, enum sif_mem_type mem_type,
+ gfp_t flag, enum dma_data_direction dir)
+{
+ int ret;
+ u32 page_shift = mem_type_to_page_shift(sdev, mem_type);
+ struct sif_mem *mem = kzalloc(sizeof(*mem), flag);
+
+ if (!mem)
+ return NULL;
+
+ BUG_ON(mem_type != SIFMT_2M && mem_type != SIFMT_4K);
+
+
+ ret = sif_kmem_init(sdev, &mem->m.km, sg_size,
+ size, page_shift, flag, dir);
+ if (ret)
+ goto failed;
+
+ mem->sdev = sdev;
+ mem->size = size;
+ mem->mem_type = mem_type;
+ mem->max_page_shift = 0;
+ return mem;
+failed:
+ kfree(mem);
+ return NULL;
+}
+
+/* Create a sif_mem object from an umem object (User level memory)
+ * The sif_mem object assumes ownership of the umem:
+ */
+struct sif_mem *sif_mem_create_umem(struct sif_dev *sdev,
+ struct ib_umem *umem,
+ enum sif_mem_type mem_type,
+ gfp_t flag, enum dma_data_direction dir)
+{
+ struct sif_mem *mem;
+ u64 dma_addr;
+
+ if (mem_type != SIFMT_BYPASS && !umem) {
+ sif_log(sdev, SIF_INFO, "Invalid umem setup");
+ return NULL;
+ }
+ mem = kzalloc(sizeof(*mem), flag);
+ if (!mem)
+ return NULL;
+
+ BUG_ON(!umem);
+ BUG_ON(mem_type != SIFMT_UMEM &&
+ mem_type != SIFMT_UMEM_RO &&
+ mem_type != SIFMT_BYPASS);
+
+ mem->sdev = sdev;
+ mem->m.u.umem = umem;
+ mem->size = umem->length;
+ mem->mem_type = mem_type;
+
+ /* See commit eeb8461e - sg chain safe impl of umem in 3.15 */
+ mem->m.u.sg = umem->sg_head.sgl;
+ mem->m.u.start_offset = umem->address & ~PAGE_MASK;
+ mem->vmap_base = (void *)umem->address;
+ mem->max_page_shift = sif_mem_max_page_shift(mem);
+ dma_addr = sg_dma_address(mem->m.u.sg);
+ sif_log(sdev, SIF_MEM, "vaddr %p, sg dma start 0x%llx, umem start_offset %llx",
+ mem->vmap_base, dma_addr, mem->m.u.start_offset);
+ if (umem->nmap < umem->npages) {
+ int ret;
+
+ sif_log(sdev, SIF_MEM, "Fixing up collapsed sg list (%d/%d)",
+ umem->nmap, umem->npages);
+ sif_logs(SIF_MEM, sif_dump_sg(mem->m.u.sg));
+ ret = sif_mem_fixup_dma(mem->m.u.sg);
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "sg list fixup failed");
+ sif_dump_sg(mem->m.u.sg);
+ kfree(mem);
+ return NULL;
+ }
+ }
+ sif_logs(SIF_PT_VV, sif_dump_sg(mem->m.u.sg));
+ return mem;
+}
+
+/* Create a sif_mem object from a phys array of length @num_phys
+ * The phys array is owned by caller:
+ */
+struct sif_mem *sif_mem_create_phys(struct sif_dev *sdev, void *kvaddr,
+ struct ib_phys_buf *phys_buf, int num_phys,
+ gfp_t flag)
+{
+ int i;
+ u64 size = 0;
+ struct sif_mem *mem = kzalloc(sizeof(*mem), flag);
+
+ if (!mem)
+ return NULL;
+
+ mem->sdev = sdev;
+ mem->m.phys.phys_buf = phys_buf;
+ mem->m.phys.phys_buf_len = num_phys;
+ for (i = 0; i < num_phys; i++) {
+ sif_log(sdev, SIF_MMU_V, "phys_buf addr 0x%llx size 0x%llx",
+ phys_buf[i].addr, phys_buf[i].size);
+ size += phys_buf[i].size;
+ }
+ /* TBD: We could calculate this above but phys_mr is scheduled to be removed */
+ mem->max_page_shift = 0;
+ mem->vmap_base = kvaddr;
+ mem->size = size;
+ mem->mem_type = SIFMT_PHYS;
+ return mem;
+}
+
+struct sif_mem *sif_mem_create_fmr(struct sif_dev *sdev, size_t max_pages, u32 page_shift,
+ gfp_t flag)
+{
+ size_t size = max_pages << page_shift;
+ struct sif_mem *mem = sif_mem_create_ref(sdev, SIFMT_PTONLY, 0, size, flag);
+
+ if (mem)
+ mem->m.fmr.page_shift = page_shift;
+ sif_log(sdev, SIF_FMR, "page_shift %d, size 0x%lx", page_shift, size);
+ return mem;
+}
+
+/* Map a page address array of length @num_pages into an FMR sif_mem object
+ * created with sif_mem_create_fmr. The page address array is owned by the caller:
+ */
+int sif_mem_map_fmr(struct sif_mem *mem, u64 iova,
+ u64 *page_list, int num_pages)
+{
+ u64 actual_size = num_pages << mem->m.fmr.page_shift;
+
+ if (iova & ((1ULL << mem->m.fmr.page_shift) - 1)) {
+ sif_log(mem->sdev, SIF_INFO, "Misaligned FMR start - iova 0x%llx", iova);
+ return -EINVAL;
+ }
+ if (actual_size > mem->size) {
+ /* This is really now an artificial limit for us, except for performance */
+ sif_log(mem->sdev, SIF_INFO, "Attempt to map 0x%llx bytes, max for this FMR is 0x%llx",
+ actual_size, mem->size);
+ return -ENOMEM;
+ }
+ mem->vmap_base = (void *)iova;
+ mem->m.fmr.page_list = page_list;
+ mem->m.fmr.page_list_len = num_pages;
+ mem->mem_type = SIFMT_FMR;
+
+ /* We save the max mem size to be able to restore it later */
+ mem->m.fmr.max_size = mem->size;
+ mem->size = actual_size;
+ mem->max_page_shift = sif_mem_fmr_max_page_shift(mem);
+ return 0;
+}
+
+void sif_mem_unmap_fmr(struct sif_mem *mem)
+{
+ mem->vmap_base = NULL;
+ mem->size = mem->m.fmr.max_size;
+ mem->m.fmr.page_list = NULL;
+ mem->m.fmr.page_list_len = 0;
+ mem->mem_type = SIFMT_PTONLY;
+}
+
+/* Create a sif_mem object mapped dma contiguous, suitable for
+ * BYPASS mapping (size constraints..)
+ */
+struct sif_mem *sif_mem_create_dmacont(struct sif_dev *sdev, size_t size,
+ gfp_t flag, enum dma_data_direction dir)
+{
+ struct sif_mem *mem = kzalloc(sizeof(*mem), flag);
+ dma_addr_t dma_handle;
+ struct scatterlist *sg;
+
+ if (!mem)
+ return NULL;
+
+ /* The __GFP_DMA32 bit is not supported by page_alloc in all kernels */
+ if (unlikely(flag & __GFP_DMA32)) {
+ u64 dma_addr;
+
+ mem->vmap_base = ib_dma_alloc_coherent(&sdev->ib_dev, size,
+ &dma_addr, flag);
+ dma_handle = dma_addr;
+ mem->m.u.flags = SMF_DMA32;
+ } else
+ mem->vmap_base = sif_dma_alloc_aligned(&sdev->ib_dev, size, &dma_handle,
+ flag, dir);
+ if (!mem->vmap_base)
+ goto dma_alloc_failed;
+ mem->sdev = sdev;
+ mem->mem_type = SIFMT_BYPASS;
+ mem->max_page_shift = sdev->mi.max_shift;
+ mem->size = size;
+ mem->m.u.dir = dir;
+ mem->m.u.umem = NULL;
+ sg = mem->m.u.sg = &mem->m.u.sg0;
+ sg_init_one(sg, mem->vmap_base, mem->size);
+ sg->dma_address = dma_handle;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+ sg->dma_length = mem->size;
+#endif
+ return mem;
+dma_alloc_failed:
+ kfree(mem);
+ return NULL;
+}
+
+
+/* Create a sif_mem object with no memory backing of its own - used for CB, SQ_CMPL and
+ * kernel full passthrough cases that need a "shallow" mem object:
+ */
+struct sif_mem *sif_mem_create_ref(struct sif_dev *sdev, enum sif_mem_type mem_type,
+ u64 sif_vaddr, size_t size, gfp_t flag)
+{
+ struct sif_mem *mem = kzalloc(sizeof(*mem), flag);
+
+ if (!mem)
+ return NULL;
+
+ BUG_ON(mem_type != SIFMT_PTONLY && mem_type != SIFMT_NOMEM && mem_type != SIFMT_CS);
+
+ mem->sdev = sdev;
+ mem->mem_type = mem_type;
+ mem->vmap_base = (void *)sif_vaddr;
+ mem->size = size;
+ mem->max_page_shift = 0;
+ return mem;
+}
+
+
+/* Free a sif_mem previously created with sif_mem_create */
+int sif_mem_free(struct sif_mem *mem)
+{
+ switch (mem->mem_type) {
+ case SIFMT_2M:
+ case SIFMT_4K:
+ sif_kmem_free(mem->sdev, &mem->m.km);
+ break;
+ case SIFMT_BYPASS:
+ /* BYPASS mode can be used from kernel or user space
+ * If umem is set, it is a user space mapping:
+ */
+ if (!mem->m.u.umem) {
+ if (mem->m.u.flags & SMF_DMA32)
+ ib_dma_free_coherent(&mem->sdev->ib_dev, mem->size,
+ mem->vmap_base, sif_mem_dma(mem, 0));
+ else
+ sif_dma_free_aligned(&mem->sdev->ib_dev, mem->size,
+ mem->vmap_base, sif_mem_dma(mem, 0), mem->m.u.dir);
+ }
+ /* Deliberate fall-through */
+ case SIFMT_UMEM:
+ case SIFMT_UMEM_RO:
+ if (mem->m.u.umem)
+ ib_umem_release(mem->m.u.umem);
+ break;
+ default:
+ break; /* Nothing extra to do */
+ }
+ kfree(mem);
+ return 0;
+}
+
+
+/* Allocate some (more) memory for this sif_mem
+ * Returns the sg index of the start of the new range on success, or a negative errno
+ */
+int sif_mem_extend(struct sif_mem *mem, size_t size, gfp_t flag)
+{
+ int sg_idx;
+
+ if (mem->mem_type != SIFMT_2M && mem->mem_type != SIFMT_4K)
+ return -EINVAL;
+
+ sg_idx = sif_kmem_extend(mem->sdev, &mem->m.km, size, flag);
+ mem->size = mem->m.km.size;
+ return sg_idx;
+}
+
+/* Free a subrange of this memory object starting at @sg_idx and dereference the
+ * sif_mem object. Assumes there are no other references to this subrange:
+ */
+int sif_mem_shrink(struct sif_mem *mem, int sg_idx, size_t size)
+{
+ int ret;
+
+ if (mem->mem_type != SIFMT_2M && mem->mem_type != SIFMT_4K)
+ return -EINVAL;
+
+ ret = sif_kmem_shrink(mem->sdev, &mem->m.km, sg_idx, size);
+ mem->size = mem->m.km.size;
+ return ret;
+}
+
+
+bool sif_mem_has_umem(struct sif_mem *mem)
+{
+ switch (mem->mem_type) {
+ case SIFMT_UMEM:
+ case SIFMT_UMEM_RO:
+ case SIFMT_BYPASS:
+ return mem->m.u.umem != NULL;
+ default:
+ break;
+ }
+ return false;
+}
+
+
+/* Find kernel virtual address at @offset within map */
+void *sif_mem_kaddr(struct sif_mem *mem, off_t offset)
+{
+ switch (mem->mem_type) {
+ case SIFMT_2M:
+ case SIFMT_4K:
+ {
+ off_t off = offset & ((1 << mem->m.km.page_shift) - 1);
+ u32 i = offset >> mem->m.km.page_shift;
+ struct scatterlist *sg = sif_kmem_find_sg_idx(&mem->m.km, i);
+
+ return sg_virt(sg) + off;
+ }
+ case SIFMT_BYPASS:
+ case SIFMT_UMEM:
+ case SIFMT_UMEM_RO:
+ case SIFMT_NOMEM:
+ case SIFMT_PHYS:
+ case SIFMT_FMR:
+ return mem->vmap_base + offset;
+ default:
+ break;
+ }
+
+ sif_log(mem->sdev, SIF_INFO, "Not implemented for type %d",
+ mem->mem_type);
+ return NULL;
+}
+
+/* Find DMA address at @offset within map */
+dma_addr_t sif_mem_dma(struct sif_mem *mem, off_t offset)
+{
+ switch (mem->mem_type) {
+ case SIFMT_PTONLY:
+ return offset;
+ case SIFMT_2M:
+ case SIFMT_4K:
+ {
+ off_t off = offset & ((1 << mem->m.km.page_shift) - 1);
+ u32 i = offset >> mem->m.km.page_shift;
+ struct scatterlist *sg = sif_kmem_find_sg_idx(&mem->m.km, i);
+
+ return sg_dma_address(sg) + off;
+ }
+ case SIFMT_BYPASS:
+ return sg_dma_address(mem->m.u.sg) + offset;
+ case SIFMT_UMEM:
+ case SIFMT_UMEM_RO:
+ {
+ struct scatterlist *sg = mem->m.u.sg;
+ /* umem objects have page aligned sg lists but may start at an offset */
+ offset += mem->m.u.start_offset;
+ while (sg && offset >= sg->length) {
+ offset -= sg->length;
+ sg = sg_next(sg);
+ }
+ return sg_dma_address(sg) + offset;
+ }
+ case SIFMT_PHYS:
+ {
+ struct ib_phys_buf *pb = mem->m.phys.phys_buf;
+
+ while (offset >= pb->size) {
+ offset -= pb->size;
+ pb++;
+ }
+ return pb->addr + offset;
+ }
+ case SIFMT_FMR:
+ {
+ u32 pageno = offset >> mem->m.fmr.page_shift;
+ off_t off = offset & ((1 << mem->m.fmr.page_shift) - 1);
+
+ return mem->m.fmr.page_list[pageno] + off;
+ }
+ default:
+ break;
+ }
+
+ sif_log(mem->sdev, SIF_INFO, "Not implemented for type %d",
+ mem->mem_type);
+ BUG();
+ return 0ull;
+}
+
+
+struct scatterlist *sif_mem_get_sgl(struct sif_mem *mem)
+{
+ switch (mem->mem_type) {
+ case SIFMT_UMEM:
+ case SIFMT_UMEM_RO:
+ case SIFMT_BYPASS:
+ return mem->m.u.sg;
+ case SIFMT_2M:
+ case SIFMT_4K:
+ return mem->m.km.sg;
+ default:
+ sif_log(mem->sdev, SIF_INFO, "unsupported memory type %d", mem->mem_type);
+ break;
+ }
+ return NULL;
+}
+
+
+/* If the mapping is DMA contiguous, return the start of the dma mapping,
+ * otherwise return an error pointer:
+ */
+dma_addr_t sif_mem_dma_if_cont(struct sif_mem *mem)
+{
+ struct scatterlist *sg;
+ size_t sz = 1UL << sif_mem_max_page_shift(mem);
+
+ if (sz < mem->size) {
+ sif_log(mem->sdev, SIF_INFO,
+ "size: %lld - max possible page sz %ld: mmu bypass not possible",
+ mem->size, sz);
+ return (u64)ERR_PTR(-EPERM);
+ }
+ sg = sif_mem_get_sgl(mem);
+ if (unlikely(!sg))
+ return (u64)ERR_PTR(-EINVAL);
+ return sg_dma_address(sg);
+}
+
+
+int sif_mem_vma_map_part(struct sif_mem *mem, struct vm_area_struct *vma,
+ off_t start_off, size_t size)
+{
+ switch (mem->mem_type) {
+ case SIFMT_2M:
+ case SIFMT_4K:
+ return sif_kmem_vma_map_part(mem->sdev, &mem->m.km, vma, start_off, size);
+ case SIFMT_BYPASS:
+ case SIFMT_BYPASS_RO:
+ return sif_vma_map_sg_part(mem->sdev, mem->m.u.sg, vma, start_off, size);
+ default:
+ sif_log(mem->sdev, SIF_INFO, "not implemented for mem.type %d", mem->mem_type);
+ return -EOPNOTSUPP;
+ }
+}
+
+
+/* Map the memory referenced by @mem to the user space vma */
+int sif_mem_vma_map(struct sif_mem *mem, struct vm_area_struct *vma)
+{
+ return sif_mem_vma_map_part(mem, vma, 0, mem->size);
+}
+
+/* sif_mem iterator support (mainly for the types that do not expose a scatterlist) */
+
+int sif_mem_iter_init(struct sif_mem *mem, struct sif_mem_iter *it)
+{
+ it->mem = mem;
+ switch (mem->mem_type) {
+ case SIFMT_PHYS:
+ case SIFMT_FMR:
+ case SIFMT_PTONLY:
+ it->phys.i = 0;
+ break;
+ default:
+ it->sg = sif_mem_get_sgl(mem);
+ if (!it->sg)
+ return -EINVAL;
+ }
+ it->offset = 0;
+ return 0;
+}
+
+
+int sif_mem_iter_advance(struct sif_mem_iter *it, u64 incr)
+{
+ switch (it->mem->mem_type) {
+ case SIFMT_PHYS:
+ {
+ long left = it->mem->m.phys.phys_buf[it->phys.i].size - it->offset;
+
+ if (left > incr)
+ it->offset += incr;
+ else {
+ it->offset = incr - left;
+ it->phys.i++;
+ }
+ if (it->phys.i >= it->mem->m.phys.phys_buf_len)
+ return -ENOMEM;
+ return 0;
+ }
+ case SIFMT_FMR:
+ {
+ long page_size = 1 << it->mem->m.fmr.page_shift;
+ long left = page_size - it->offset;
+
+ if (left > incr)
+ it->offset += incr;
+ else {
+ it->offset = incr - left;
+ it->phys.i++;
+ }
+ if (it->phys.i >= it->mem->m.fmr.page_list_len)
+ return -ENOMEM;
+ return 0;
+ }
+ case SIFMT_PTONLY:
+ it->offset += incr;
+ if (it->offset >= it->mem->size)
+ return -ENOMEM;
+ return 0;
+ default:
+ it->offset += incr;
+ while (it->offset >= it->sg->length) {
+ it->offset = it->offset - it->sg->length;
+ it->sg = sg_next(it->sg);
+ }
+ if (it->sg)
+ return 0;
+ else
+ return -ENOMEM;
+ }
+}
+
+dma_addr_t sif_mem_iter_dma(struct sif_mem_iter *it)
+{
+ switch (it->mem->mem_type) {
+ case SIFMT_PHYS:
+ return it->mem->m.phys.phys_buf[it->phys.i].addr + it->offset;
+ case SIFMT_FMR:
+ return it->mem->m.fmr.page_list[it->phys.i] + it->offset;
+ case SIFMT_PTONLY:
+ return 0; /* For future fmr use: populate with empty ptes to be filled later */
+ default:
+ return sg_dma_address(it->sg) + it->offset;
+ }
+}
+
+
+/* DMA is mapped contiguously and the mapping is reflected in a "collapsed" sg list for DMA.
+ * The rest of the list is still valid for the pa/va part - we need to loop through and
+ * make it consistent for our usage:
+ */
+static int sif_mem_fixup_dma(struct scatterlist *sg)
+{
+ struct scatterlist *from_sg = sg;
+ struct scatterlist *last_sg = sg;
+ dma_addr_t dma_addr = sg_dma_address(from_sg);
+ size_t dma_size = sg_dma_len(sg);
+ size_t sg_len = sg->length; /* Save the "homogeneous" length */
+
+ while (sg) {
+ if (dma_size < sg->length)
+ return -EINVAL; /* should not happen */
+
+ if (sg->dma_address && sg->dma_address != (dma_addr_t)-1) {
+ /* This entry is part of the collapsed list
+ * must keep address and dma_length until we have "consumed" it,
+ * Since all lengths are homogeneous in the resulting list we
+ * can temporarily "misuse" the length field in this entry to
+ * store the new dma_address, and just leave the dma_length
+ * for later consumption:
+ */
+ sg->length = sg->dma_address;
+ } else
+ sg->dma_length = sg_len;
+
+ sg->dma_address = dma_addr;
+ dma_addr += sg_len;
+ dma_size -= sg_len;
+ last_sg = sg;
+ sg = sg_next(sg);
+
+ if (!dma_size) {
+ /* Clean up our "temporary store" (see comment above) */
+ from_sg->length = from_sg->dma_length = sg_len;
+ from_sg = sg_next(from_sg);
+ dma_addr = from_sg->length; /* from temp store */
+ dma_size = sg_dma_len(from_sg);
+ }
+ }
+ return 0;
+}
+
+/* A utility for dumping an sg list to the trace buffer */
+void sif_dump_sg(struct scatterlist *sgl)
+{
+ struct scatterlist *sg = sgl;
+ int cnt = 0;
+
+ trace_printk(" **** sg dump - start at %p ****\n", sg);
+ trace_printk("%16s: %16s %8s %16s %16s %8s %8s %4s\n",
+ "sg", "dma", "dmalen", "pa", "kva", "length", "offset", "end mark");
+ while (sg) {
+ u64 dma_addr = sg_dma_address(sg);
+ u64 pa = sg_phys(sg);
+
+ trace_printk("%p: %#16llx %#8x %#16llx %p %#8x %#8x %4s\n",
+ sg, dma_addr, sg_dma_len(sg), pa,
+ sg_virt(sg), sg->length, sg->offset,
+ (sg_is_last(sg) ? "[last]" : ""));
+ sg = sg_next(sg);
+ cnt++;
+ }
+ trace_printk(" **** tot.%d elements ****\n", cnt);
+}
--- /dev/null
+/*
+ * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_mem.h: A common interface for all memory used by
+ * SIF for queue, table and page table management
+ */
+
+#ifndef _SIF_MEM_H
+#define _SIF_MEM_H
+#include <rdma/ib_verbs.h>
+#include "sif_user.h"
+
+/* We need to support 4 interfaces to memory, abbreviated umem, fmr,
+ * phys and kmem below, to be compatible with the different ways we are called.
+ * This is due to be cleaned up in the core IB stack,
+ * by allowing the use of scatterlists for all types of s/g memory
+ * provided to rdma devices.
+ */
+
+/* Allocation of table and queue memory:
+ * The Linux buddy allocator should guarantee us lots of up to 4M I/O contiguous
+ * memory segments through alloc_pages provided the system has enough memory.
+ * Assume that we get at least 4M standalone and any number of (aligned) 2M entries after that
+ *
+ * This way we allocate contiguous memory and use bypass/passthrough mapping if
+ * alloc_sz <= 4M, and revert to GVA2GPA if needs are larger, but allocate in 2M blocks
+ * and use PSIF 2M pages for this.
+ */
+
+struct ib_umem;
+struct sif_dev;
+
+/* Per device memory configuration info
+ * embedded in sif_dev:
+ */
+struct sif_mem_info {
+ u8 page_shift; /* number of bits within the smallest SIF level 0 page (depends on config) */
+ u8 level_shift; /* number of bits to shift to the next level in the page table */
+ u8 max_shift; /* Highest number of bits within the highest level page */
+ u32 ptes_per_page; /* Page table entries per page table page */
+ u64 page_size; /* size of a SIF level 0 page (as configured) */
+ u64 page_mask; /* All bits beyond page_shift set */
+};
+
+/* Valid for SIFMT_2M, SIFMT_4K and SIFMT_BYPASS_RO:
+ * Represented as a pool of equally sized pages, which allows direct
+ * page offset lookup from the kernel side.
+ * To maintain offset indexes, interior pages cannot be removed.
+ * sg_start will be > 0 if there are empty entries at the start, allowing
+ * indexes to remain valid if entries are deleted from the head.
+ */
+struct sif_kmem {
+ u64 size; /* Size of the mapped memory of this kmem */
+ u32 page_shift; /* Represents page size of each scatter element */
+ u32 sg_size; /* Allocated number of (usable!) elements in (each) scatter list */
+ u32 sg_start; /* Current start offset into the sg list */
+ u32 sg_max; /* Last entry in use + 1 (<= sg_size * nlists) */
+ u32 nlists; /* Number of sg lists (each allocated with sg_size + 1 entries, the extra one for chaining) linked through sg */
+ enum dma_data_direction dir; /* DMA direction used for dma mapping */
+ struct scatterlist *sg; /* Pointer to start of scatterlist array */
+ struct scatterlist *last_sg; /* The start of the last list array in the sg list linkage */
+};
+
+/* Valid for SIFMT_FMR (when called from ib_map_phys_fmr) */
+struct sif_mem_fmr {
+ u64 *page_list; /* Array of dma addresses of buffers */
+ u32 page_list_len; /* length of page_list array */
+ u32 page_shift; /* Represents page size of each scatter element */
+ u64 max_size; /* Saved maximal size of the FMR as supplied during creation */
+};
+
+/* Valid for SIFMT_PHYS (when called from ib_reg_phys_mr).
+ * It is called "phys" but should have been called "dma", as it is used
+ * with dma addresses in at least 1 of the 2 use cases in the kernel.
+ * Not important to support this API, but kept for completeness:
+ */
+struct sif_mem_phys {
+ struct ib_phys_buf *phys_buf; /* Array of dma address/size pairs of buffers */
+ u64 phys_buf_len; /* length of phys_buf array */
+};
+
+/* Flag values so far only used by 'flags' in sif_mem_umem: */
+enum sif_mem_flags {
+ SMF_DMA32 = 0x1 /* Set if this memory is allocated from the DMA32 space */
+};
+
+/* Memory types mapped from user space:
+ * Valid for SIFMT_UMEM, SIFMT_UMEM_RO, SIFMT_BYPASS:
+ */
+struct sif_mem_umem {
+ struct ib_umem *umem; /* User memory, NULL if this is a kernel bypass mapping */
+ struct scatterlist *sg; /* A pointer to a valid scatterlist (user and kernel) */
+ u64 start_offset; /* Stored misalignment according to the scatter element size */
+ enum dma_data_direction dir; /* DMA direction used for dma mapping */
+ u32 flags;
+ struct scatterlist sg0; /* Inline storage for bypass mode */
+};
+
+
+/* The generic sif s/g memory representation
+ *
+ */
+struct sif_mem {
+ struct sif_dev *sdev;
+ enum sif_mem_type mem_type; /* Logical type of mapping */
+ u16 max_page_shift; /* 0: unknown, > 0: largest page shift that can be mapped contiguously */
+ u64 size; /* Size of mapping */
+ void *vmap_base; /* Kernel address of the start of a vmap cont.mapping, if any */
+ union {
+ struct sif_mem_umem u; /* SIFMT_{UMEM*,BYPASS} */
+ struct sif_kmem km; /* SIFMT_{2M,CS,4K} */
+ struct sif_mem_fmr fmr; /* SIFMT_FMR */
+ struct sif_mem_phys phys; /* SIFMT_PHYS */
+ } m;
+};
+
+
+/* Initialization of global per device info - called from sif_hwi.c */
+void sif_mem_init(struct sif_dev *sdev);
+
+/* API for managing a sif_kmem object */
+
+/* Allocate a memory object of size @size and populate an sg list
+ * with it:
+ */
+int sif_kmem_init(struct sif_dev *sdev, struct sif_kmem *kmem, size_t sg_size, size_t size,
+ u32 page_shift, gfp_t flag, enum dma_data_direction dir);
+
+/* sg unmap and free the memory referenced by mem */
+void sif_kmem_free(struct sif_dev *sdev, struct sif_kmem *mem);
+
+/* Extend the kmem object with a total size of @size - return sg_index of the first
+ * allocated element:
+ */
+int sif_kmem_extend(struct sif_dev *sdev, struct sif_kmem *kmem,
+ size_t size, gfp_t flag);
+int sif_kmem_shrink(struct sif_dev *sdev, struct sif_kmem *mem, int sg_idx, size_t size);
+
+/* Find the scatterlist element with index idx within kmem */
+struct scatterlist *sif_kmem_find_sg_idx(struct sif_kmem *kmem, u32 idx);
+
+/************************************
+ * API for managing different higher level (scatter) memory segment abstractions
+ * used by SIF:
+ */
+
+/* Set up a sif_mem structure for handling a memory
+ * segment of initial size @size.
+ */
+struct sif_mem *sif_mem_create(struct sif_dev *sdev, size_t sg_size, size_t size,
+ enum sif_mem_type mem_type,
+ gfp_t flag,
+ enum dma_data_direction dir);
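+
+/* Illustrative sketch (not part of the driver) of creating and releasing a
+ * kmem backed object via the call above; the sg_size and size values are
+ * assumptions for the example only:
+ *
+ *	struct sif_mem *mem = sif_mem_create(sdev, 4, 8 * 1024 * 1024,
+ *					     SIFMT_2M, GFP_KERNEL,
+ *					     DMA_BIDIRECTIONAL);
+ *	if (!mem)
+ *		return -ENOMEM;
+ *	...
+ *	sif_mem_free(mem);
+ */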
+
+/* Create a sif_mem object from an umem object (User level memory)
+ * The sif_mem object assumes ownership of the umem:
+ */
+struct sif_mem *sif_mem_create_umem(struct sif_dev *sdev,
+ struct ib_umem *umem,
+ enum sif_mem_type mem_type,
+ gfp_t flag, enum dma_data_direction dir);
+
+/* Create a sif_mem object from a phys array of length @num_phys
+ * The phys array is owned by caller:
+ */
+struct sif_mem *sif_mem_create_phys(struct sif_dev *sdev, void *iova_start,
+ struct ib_phys_buf *phys, int num_phys,
+ gfp_t flag);
+
+/* Create a sif_mem object prepared for FMR use with size @size and page shift
+ * @page_shift. The page list is supplied later via sif_mem_map_fmr:
+ */
+struct sif_mem *sif_mem_create_fmr(struct sif_dev *sdev, size_t size, u32 page_shift,
+ gfp_t flag);
+
+/* Create a sif_mem object with no memory backing of its own - used for CB, SQ_CMPL and
+ * kernel full passthrough cases that need a "shallow" mem object:
+ */
+struct sif_mem *sif_mem_create_ref(struct sif_dev *sdev, enum sif_mem_type mem_type,
+ u64 sif_vaddr, size_t size, gfp_t flag);
+
+/* Create an aligned sif_mem object mapped as coherent, DMA contiguous memory,
+ * suitable for BYPASS mapping (subject to size constraints):
+ */
+struct sif_mem *sif_mem_create_dmacont(struct sif_dev *sdev, size_t size, gfp_t flag,
+ enum dma_data_direction dir);
+
+/* Free a sif_mem previously created with sif_mem_create */
+int sif_mem_free(struct sif_mem *mem);
+
+/* Map a previously created sif_mem ref object from a memory pointer array of length @num_pages
+ * The memory pointer array is owned by caller:
+ * Returns -ENOMEM if the sif_mem ref object does not have a sufficiently large size.
+ */
+int sif_mem_map_fmr(struct sif_mem *mem, u64 iova,
+ u64 *page_list, int num_pages);
+
+/* Unmap and reset a mem object previously set up with sif_mem_map_fmr */
+void sif_mem_unmap_fmr(struct sif_mem *mem);
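+
+/* Illustrative sketch (not part of the driver) of the intended FMR call
+ * sequence using the functions above; @page_list, @npages and @iova are
+ * placeholders for caller supplied values:
+ *
+ *	struct sif_mem *mem = sif_mem_create_fmr(sdev, size, page_shift,
+ *						 GFP_KERNEL);
+ *	...
+ *	ret = sif_mem_map_fmr(mem, iova, page_list, npages);
+ *	...
+ *	sif_mem_unmap_fmr(mem);
+ *	sif_mem_free(mem);
+ */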
+
+/* Allocate some (more) memory for this sif_mem
+ * Return an s/g index (page offset to the start of that memory
+ * or -errval if an error.
+ */
+int sif_mem_extend(struct sif_mem *mem, size_t size, gfp_t flag);
+
+/* Free a subrange of this memory object starting at @sg_idx and dereference the
+ * sif_mem object. Assumes there are no other references to this subrange, and that
+ * the subrange corresponds exactly to a prior allocation with either create or extend above.
+ * Returns 0 upon success or a negative errno on failure:
+ */
+int sif_mem_shrink(struct sif_mem *mem, int sg_idx, size_t size);
+
+/* Returns true if this memory is represented internally by an umem object */
+bool sif_mem_has_umem(struct sif_mem *mem);
+
+/* Return the largest page size (represented by page shift bits) usable for this memory */
+u32 sif_mem_page_shift(struct sif_mem *mem);
+
+/* Find kernel virtual address at @offset within map */
+void *sif_mem_kaddr(struct sif_mem *mem, off_t offset);
+
+/* Find dma address at @offset within map */
+dma_addr_t sif_mem_dma(struct sif_mem *mem, off_t offset);
+
+/* If the map is DMA contiguous, get the start of the dma mapping,
+ * otherwise return an error pointer:
+ */
+dma_addr_t sif_mem_dma_if_cont(struct sif_mem *mem);
+
+/* Return the start of the s/g list for this mem object */
+struct scatterlist *sif_mem_get_sgl(struct sif_mem *mem);
+
+/* Map the part of the @mem object given by @start_off, @size to the user space
+ * vm context given in @vma. The part must be page aligned and page sized:
+ */
+int sif_mem_vma_map_part(struct sif_mem *mem, struct vm_area_struct *vma,
+ off_t start_off, size_t size);
+
+/* Map the memory referenced by @mem to the user space vma */
+int sif_mem_vma_map(struct sif_mem *mem, struct vm_area_struct *vma);
+
+
+/* sif_mem iterator (mainly for the types that do not expose a scatterlist) */
+struct sif_mem_iter {
+ struct sif_mem *mem;
+ union {
+ struct {
+ int i; /* Index used by SIFMT_PHYS and SIFMT_FMR */
+ } phys;
+ struct scatterlist *sg; /* Used by scatterlist based types */
+ };
+ size_t offset; /* Current offset within element */
+};
+
+int sif_mem_iter_init(struct sif_mem *mem, struct sif_mem_iter *it);
+int sif_mem_iter_advance(struct sif_mem_iter *it, u64 incr);
+dma_addr_t sif_mem_iter_dma(struct sif_mem_iter *mi);
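+
+/* Illustrative sketch (not part of the driver): walking a sif_mem in fixed
+ * increments with the iterator API above; @mem and @step are placeholders for
+ * a caller supplied memory object and stride (e.g. the pte page size):
+ *
+ *	struct sif_mem_iter it;
+ *	u64 off = 0;
+ *
+ *	if (sif_mem_iter_init(mem, &it))
+ *		return -EINVAL;
+ *	do {
+ *		dma_addr_t dma = sif_mem_iter_dma(&it);
+ *		// populate one pte with dma here
+ *		off += step;
+ *	} while (off < mem->size && !sif_mem_iter_advance(&it, step));
+ */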
+
+/* A utility for dumping an sg list to the trace buffer */
+void sif_dump_sg(struct scatterlist *sgl);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_mmu.c: main entry points and initialization
+ */
+
+#include "sif_mmu.h"
+#include "sif_dev.h"
+#include "sif_base.h"
+#include "sif_dma.h"
+#include "sif_hwi.h"
+#include "sif_mem.h"
+#include "sif_spt.h"
+#include "sif_xmmu.h"
+#include "sif_pt.h"
+#include "sif_mr.h"
+#include "sif_query.h"
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/highmem.h>
+#include <linux/kref.h>
+#include <linux/version.h>
+#include <rdma/ib_umem.h>
+#include "psif_hw_setget.h"
+#include "sif_defs.h"
+
+static int sif_map_gva_ctx(struct sif_dev *sdev,
+ struct sif_mmu_ctx *ctx,
+ struct sif_mem *mem,
+ bool write);
+
+static int sif_map_bypass_ctx(struct sif_dev *sdev,
+ struct sif_mmu_ctx *ctx,
+ struct sif_mem *mem,
+ bool write);
+
+static int sif_map_cs_ctx(struct sif_dev *sdev,
+ struct sif_mmu_ctx *ctx,
+ bool write);
+
+#ifndef __sparc__
+/* Special handling for PHYS memory types which don't have any sg list: */
+static int sif_map_special_ctx(struct sif_dev *sdev,
+ struct sif_mmu_ctx *ctx,
+ struct sif_mem *mem,
+ bool write);
+#endif
+
+static int sif_mmu_invalidate_tlb(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, enum wr_mode mode);
+
+void set_ctx(struct sif_dev *sdev,
+ struct sif_mmu_ctx *ctx,
+ enum psif_table_level level,
+ u64 val)
+{
+ struct psif_mmu_cntx *hw_ctx = &ctx->mctx;
+
+ val &= ~PSIF_TABLE_PTR_MASK;
+ hw_ctx->table_ptr = ((val) >> PT_PAGE_SHIFT);
+ hw_ctx->table_level = level;
+ sif_log(sdev, SIF_MMU, "%p ptr 0x%08llx level %d", hw_ctx, val, level);
+}
+
+
+
+int sif_map_ctx(struct sif_dev *sdev,
+ struct sif_mmu_ctx *ctx,
+ struct sif_mem *mem,
+ u64 virt_base, u64 size, bool write)
+{
+ /* hw_ctx entry assumed to be set up in pass through
+ * prior to the call (all null bytes)
+ */
+ ctx->type = MMU_GVA2GPA_MODE;
+ ctx->base = virt_base;
+ ctx->size = size;
+ ctx->mt = mem->mem_type;
+
+ switch (mem->mem_type) {
+ case SIFMT_BYPASS:
+ case SIFMT_BYPASS_RO:
+ case SIFMT_NOMEM:
+ return sif_map_bypass_ctx(sdev, ctx, mem, write);
+ case SIFMT_UMEM:
+ case SIFMT_UMEM_RO:
+ case SIFMT_2M:
+ case SIFMT_4K:
+ return sif_map_gva_ctx(sdev, ctx, mem, write);
+ case SIFMT_CS:
+ return sif_map_cs_ctx(sdev, ctx, write);
+ case SIFMT_ZERO:
+ return sif_zero_map_gva_ctx(sdev, ctx, mem, write);
+ case SIFMT_PTONLY:
+ return 0; /* Nothing to map yet */
+#ifndef __sparc__
+ case SIFMT_PHYS:
+ return sif_map_special_ctx(sdev, ctx, mem, write);
+ case SIFMT_UMEM_SPT:
+ return sif_spt_map_gva_ctx(sdev, ctx, mem, write);
+#endif
+ default:
+ sif_log(sdev, SIF_INFO, "Unimplemented mem_type %d %s",
+ mem->mem_type, sif_mem_type_str(mem->mem_type));
+ return -EOPNOTSUPP;
+ }
+ return -EINVAL;
+}
+
+void sif_unmap_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx)
+{
+ switch (ctx->mt) {
+ case SIFMT_BYPASS:
+ case SIFMT_BYPASS_RO:
+ case SIFMT_NOMEM:
+ break;
+ case SIFMT_UMEM:
+ case SIFMT_UMEM_RO:
+ case SIFMT_PHYS:
+ case SIFMT_FMR:
+ case SIFMT_2M:
+ case SIFMT_4K:
+ case SIFMT_CS:
+ case SIFMT_PTONLY:
+ sif_unmap_gva_ctx(sdev, ctx);
+ break;
+#ifndef __sparc__
+ case SIFMT_ZERO:
+ sif_zero_unmap_gva_ctx(sdev, ctx);
+ break;
+ case SIFMT_UMEM_SPT:
+ sif_spt_unmap_gva_ctx(sdev, ctx);
+ break;
+#endif
+ default:
+ sif_log(sdev, SIF_INFO, "Unimplemented mem type %d, ctx at %p", ctx->mt, ctx);
+ BUG(); /* Should not happen - throwing the cards */
+ }
+}
+
+static size_t num_pages(u64 base, u64 size, u32 page_shift)
+{
+ size_t pg_sz = 1 << page_shift;
+
+ return aligned_size(base, size, pg_sz) >> page_shift;
+}
+
+/* May return -1 or a valid enum value for psif_page_size */
+static int hw_leaf_page_sz(struct sif_dev *sdev, u32 page_shift)
+{
+ /* Page size not supported by device configuration */
+ if (sdev->mi.page_shift > page_shift) {
+ sif_log(sdev, SIF_INFO,
+ "Cannot support page shift %d - min.page shift supported in this configuration is %d",
+ page_shift, sdev->mi.page_shift);
+ return -1;
+ }
+
+ switch (sdev->mi.page_shift) {
+ case 12: /* Device configured for Intel page sizes */
+ if (page_shift < 21)
+ return PAGE_SIZE_IA32E_4KB;
+ if (page_shift < 30)
+ return PAGE_SIZE_IA32E_2MB;
+ return PAGE_SIZE_IA32E_1GB;
+ case 13: /* Device configured for Sparc page sizes */
+ if (page_shift < 16)
+ return PAGE_SIZE_S64_8KB;
+ if (page_shift < 19)
+ return PAGE_SIZE_S64_64KB;
+ if (page_shift < 22)
+ return PAGE_SIZE_S64_512KB;
+ if (page_shift < 25)
+ return PAGE_SIZE_S64_4MB;
+ if (page_shift < 28)
+ return PAGE_SIZE_S64_32MB;
+ if (page_shift < 34)
+ return PAGE_SIZE_S64_2GB;
+ return PAGE_SIZE_S64_16GB;
+ }
+ sif_log(sdev, SIF_INFO, "Cannot support page shift %d", page_shift);
+ return -1;
+}
+
+
+static inline enum psif_table_level hw_leaf_level(enum psif_page_size pg_sz)
+{
+ switch (pg_sz) {
+ case PAGE_SIZE_IA32E_2MB:
+ case PAGE_SIZE_S64_4MB:
+ return PAGE_LEVEL1;
+ case PAGE_SIZE_IA32E_1GB:
+ case PAGE_SIZE_S64_2GB:
+ return PAGE_LEVEL2;
+ default:
+ return PAGE_LEVEL0;
+ }
+}
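+
+/* Worked example (illustrative only) of the two helpers above on an x86
+ * configuration (sdev->mi.page_shift == 12): a region mappable with 2M pages
+ * (page_shift 21) yields PAGE_SIZE_IA32E_2MB from hw_leaf_page_sz, which
+ * hw_leaf_level in turn maps to PAGE_LEVEL1, i.e. the leaf entries sit one
+ * level above the 4K level in the page table.
+ */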
+
+
+static int sif_map_bypass_ctx(struct sif_dev *sdev,
+ struct sif_mmu_ctx *ctx,
+ struct sif_mem *mem,
+ bool write)
+{
+ u64 addr = 0;
+ int ret = 0;
+
+ ctx->type = MMU_PASS_THROUGH0;
+
+ if (mem->mem_type == SIFMT_NOMEM)
+ ctx->mt = SIFMT_BYPASS;
+ if (write)
+ ctx->mctx.wr_access = 1;
+
+ if (mem->m.u.umem) {
+ addr = sif_mem_dma_if_cont(mem);
+ if (IS_ERR((void *)addr))
+ return PTR_ERR((void *)addr);
+ } else if (mem->mem_type != SIFMT_NOMEM)
+ addr = sif_mem_dma(mem, 0);
+
+ if (mem->mem_type == SIFMT_BYPASS || mem->mem_type == SIFMT_BYPASS_RO)
+ ctx->uv2dma = addr - ctx->base;
+ ctx->base = addr;
+ return ret;
+}
+
+
+static int sif_map_gva_ctx(struct sif_dev *sdev,
+ struct sif_mmu_ctx *ctx,
+ struct sif_mem *mem,
+ bool write)
+{
+ struct psif_mmu_cntx *hw_ctx = &ctx->mctx;
+ bool multipage;
+ u64 page_size;
+ u64 page_mask;
+ enum psif_table_level leaf_level;
+ u64 aligned_base;
+ u64 aligned_sz;
+ u32 page_shift = sif_mem_page_shift(mem);
+ u8 pt_leaf_level = 0;
+ u8 pt_pte_extent = 1;
+ u64 dma_addr;
+
+ /* Adjust to a supported page shift */
+ int ret = find_optimal_leaf_level(sdev, page_shift,
+ ctx->base, sif_mem_dma(mem, 0), ctx->size,
+ &pt_leaf_level, &pt_pte_extent);
+ if (ret)
+ return ret;
+
+ page_shift = sdev->mi.page_shift + pt_leaf_level * sdev->mi.level_shift;
+ page_size = 1ULL << page_shift;
+ page_mask = ~(page_size - 1);
+
+ hw_ctx->wr_access = write;
+ hw_ctx->translation_type = MMU_GVA2GPA_MODE;
+ hw_ctx->page_size = hw_leaf_page_sz(sdev, page_shift);
+
+ aligned_base = ctx->base & page_mask;
+ aligned_sz = aligned_size(ctx->base, ctx->size, page_size);
+ multipage = sdev->single_pte_pt || aligned_sz > page_size;
+ leaf_level = hw_leaf_level(hw_ctx->page_size);
+ dma_addr = sif_mem_dma(mem, 0);
+
+ sif_log(sdev, SIF_MMU_V, "base 0x%llx dma base 0x%llx size 0x%llx page shift %d size %s",
+ ctx->base, dma_addr, ctx->size, page_shift,
+ string_enum_psif_page_size(hw_ctx->page_size));
+
+ if (multipage) {
+ ctx->pt = sif_pt_create(sdev, sif_mem_get_sgl(mem),
+ ctx->base, ctx->size, page_shift, false, false);
+ if (!ctx->pt)
+ return -ENOMEM;
+ set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt));
+ } else {
+ dma_addr_t aligned_dma_addr = dma_addr & page_mask;
+
+ set_ctx(sdev, ctx, leaf_level, aligned_dma_addr);
+ }
+ return 0;
+}
+
+
+static int sif_map_cs_ctx(struct sif_dev *sdev,
+ struct sif_mmu_ctx *ctx,
+ bool write)
+{
+ struct psif_mmu_cntx *hw_ctx = &ctx->mctx;
+
+ hw_ctx->wr_access = write;
+ hw_ctx->translation_type = MMU_GVA2GPA_MODE;
+ hw_ctx->page_size = PAGE_SIZE_IA32E_4KB;
+
+ /* Just create a page table with an empty top level page */
+ ctx->pt = sif_pt_create_empty(sdev, ctx->base, SIFMT_CS);
+ if (!ctx->pt)
+ return -ENOMEM;
+ set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt));
+ return 0;
+}
+
+#ifndef __sparc__
+static int sif_map_special_ctx(struct sif_dev *sdev,
+ struct sif_mmu_ctx *ctx,
+ struct sif_mem *mem,
+ bool write)
+{
+ struct psif_mmu_cntx *hw_ctx = &ctx->mctx;
+ bool multipage = aligned_size(ctx->base, ctx->size, PAGE_SIZE) > PAGE_SIZE;
+
+ sif_log(sdev, SIF_MMU_V, "base 0x%llx size 0x%llx", ctx->base, ctx->size);
+
+ hw_ctx->page_size = PAGE_SIZE_IA32E_4KB;
+ hw_ctx->wr_access = write;
+ hw_ctx->translation_type = MMU_GVA2GPA_MODE;
+
+ if (multipage) {
+ ctx->pt = sif_pt_create_for_mem(mem, ctx->base, 12, true, true);
+ if (!ctx->pt)
+ return -ENOMEM;
+ set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt));
+ } else
+ set_ctx(sdev, ctx, PAGE_LEVEL0, sif_mem_dma(mem, 0));
+ return 0;
+}
+#endif
+
+/* map an existing context to a new memory object
+ * Reuse key, page table and mmu context if possible
+ */
+int sif_map_fmr_ctx(struct sif_dev *sdev,
+ struct sif_mmu_ctx *ctx,
+ struct sif_mem *mem)
+{
+ struct psif_mmu_cntx *hw_ctx = &ctx->mctx;
+ struct psif_key *key = get_key(sdev, ctx->lkey);
+ bool multipage;
+ u64 vstart = (u64)mem->vmap_base;
+ u64 page_size;
+ u64 page_mask;
+ enum psif_table_level leaf_level;
+ u64 aligned_base;
+ u64 aligned_sz;
+ u32 page_shift = sif_mem_page_shift(mem);
+ u8 pt_leaf_level = 0;
+ u8 pt_pte_extent = 1;
+ int ret;
+
+ /* Adjust to a supported page shift */
+ ret = find_optimal_leaf_level(sdev, page_shift,
+ vstart, sif_mem_dma(mem, 0), mem->size,
+ &pt_leaf_level, &pt_pte_extent);
+ if (ret)
+ return ret;
+
+ page_shift = sdev->mi.page_shift + pt_leaf_level * sdev->mi.level_shift;
+ page_size = 1ULL << page_shift;
+ page_mask = ~(page_size - 1);
+
+ hw_ctx->wr_access = true;
+ hw_ctx->translation_type = MMU_GVA2GPA_MODE;
+ hw_ctx->page_size = hw_leaf_page_sz(sdev, page_shift);
+
+ aligned_base = ctx->base & page_mask;
+ aligned_sz = aligned_size(vstart, mem->size, page_size);
+ multipage = sdev->single_pte_pt || aligned_sz > page_size;
+ leaf_level = hw_leaf_level(hw_ctx->page_size);
+
+ /* The page size may have changed too; if so we cannot reuse the page table - delete it: */
+ if (ctx->pt && page_shift > ctx->pt->page_shift) {
+ sif_pt_free(ctx->pt);
+ ctx->pt = NULL;
+ }
+
+ /* For FMRs we reuse the mmu context and modify the existing key */
+ ctx->base = (u64)mem->vmap_base;
+ ctx->size = mem->size;
+
+ set_psif_key__base_addr(key, ctx->base);
+ set_psif_key__lkey_state(key, PSIF_DMA_KEY_VALID);
+ set_psif_key__rkey_state(key, PSIF_DMA_KEY_VALID);
+ set_psif_key__length(key, mem->size);
+
+ sif_log(sdev, SIF_FMR, "key %d: base now at %llx (sz %llx - mem sz %llx)",
+ ctx->lkey, ctx->base, ctx->size, mem->size);
+
+ /* We have two cases:
+ * 1) a single page pointer: Pointer must be set to new address - keep page size and everything
+ * 2) a page table of any depth:
+ * appropriate ptes must be set to refer to new pages
+ */
+ if (!multipage) {
+ dma_addr_t dma_addr = sif_mem_dma(mem, 0);
+ dma_addr_t aligned_dma_addr = dma_addr & page_mask;
+
+ set_ctx(sdev, ctx, leaf_level, aligned_dma_addr);
+ } else if (!ctx->pt) {
+ ctx->pt = sif_pt_create_for_mem(mem, ctx->base, page_shift, true, false);
+ if (!ctx->pt)
+ return -ENOMEM;
+ set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt));
+ } else {
+ sif_pt_remap_for_mem(ctx->pt, mem, page_shift, ctx->base);
+ /* Only the level of the top node may have changed, the page is
+ * guaranteed to be the same, but the previous use could
+ * have been a single page - just set it every time for now:
+ */
+ set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt));
+ }
+ /* Update the used network endian context */
+ set_psif_key__mmu_context(key, *((u64 *)&ctx->mctx));
+ return 0;
+}
+
+void sif_unmap_gva_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx)
+{
+ /* TLB invalidate is not available at teardown; instead we
+ * invalidate the whole MMU as a final operation before taking down the
+ * communication with the EPSC.
+ */
+ if (likely(sdev->registered) && ctx->pt && !sif_feature(disable_invalidate_tlb))
+ sif_mmu_invalidate_tlb(sdev, ctx, PCM_WAIT);
+ if (ctx->pt)
+ sif_pt_free(ctx->pt);
+}
+
+
+void sif_unmap_fmr_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, enum wr_mode mode)
+{
+ sif_log(sdev, SIF_FMR, "key %d", ctx->lkey);
+ if (!sif_feature(disable_invalidate_tlb))
+ sif_mmu_invalidate_tlb(sdev, ctx, mode);
+}
+
+
+static int sif_mmu_invalidate_tlb_partial(struct sif_dev *sdev, struct sif_mmu_ctx *ctx,
+ u64 start, u64 len, enum wr_mode mode)
+{
+ struct psif_wr wr;
+ int ncompleted;
+ int ret = 0;
+ u32 lkey = ctx->lkey;
+ u32 npages;
+ u32 shift;
+ u32 sq_entry_idx;
+ int pqp_sq_idx;
+ struct sif_sq *sq;
+ struct sif_pqp *pqp;
+ struct psif_cq_entry *cqe;
+ DECLARE_SIF_CQE_POLL(sdev, lcqe);
+
+ pqp = lcqe.pqp;
+
+ if (!lkey) {
+ lkey = allocate_invalidate_key(ctx);
+ if (!lkey) {
+ sif_log(sdev, SIF_INFO,
+ "Failed to allocate a TLB invalidation key!");
+ return -ENOMEM;
+ }
+ }
+
+ /* Do not invalidate the TLB if the page table is NULL.
+ * However, if mode == PCM_WAIT, we need to generate
+ * a completion to ourselves to ensure that all the
+ * previously posted invalidate TLB pqp operations
+ * have completed.
+ *
+ * This is mainly to cater for invalidating the TLB of a
+ * list of fmr ctx. It is done here within the function because
+ * the generated completion needs to know the selected
+ * pqp, and the caller sif_unmap_phys_fmr_list doesn't
+ * know the pqp until DECLARE_SIF_CQE_POLL.
+ * In the scenario of invalidating the TLB for a single ctx,
+ * ctx->pt is checked before calling this function
+ * so that no additional completion is generated,
+ * e.g. in sif_unmap_gva_ctx.
+ */
+ if (unlikely(!ctx->pt)) {
+ if (mode == PCM_WAIT) {
+ ret = gen_pqp_cqe(&lcqe);
+ if (ret < 0) {
+ sif_log(sdev, SIF_INFO,
+ "cqe %p gen_pqp_cqe returned %d",
+ &lcqe, ret);
+ return ret;
+ }
+ ret = poll_cq_waitfor(&lcqe);
+ if (ret < 0) {
+ sif_log(sdev, SIF_INFO,
+ "cqe %p poll_cq_waitfor returned %d",
+ &lcqe, ret);
+ }
+ }
+ return ret;
+ }
+
+ memset(&wr, 0, sizeof(struct psif_wr));
+ wr.op = PSIF_WR_INVALIDATE_TLB;
+ wr.details.su.key = lkey;
+
+ shift = sif_pt_page_shift(ctx->pt);
+ npages = num_pages(ctx->base, len, shift);
+
+ while (npages) {
+ /* TLB invalidate only uses the lower 16 bits of the length field */
+ u32 n = min_t(u32, npages, 0xffff);
+
+ wr.details.su.addr = start;
+ wr.details.su.length = n;
+ npages -= n;
+ if (npages > 0) {
+ int sts = sif_pqp_post_send(sdev, &wr, NULL);
+
+ if (sts) {
+ sif_log(sdev, SIF_INFO,
+ "Partial invalidate TLB for key %d, base %llx, length %x failed, sts %d",
+ lkey, start, n << shift, sts);
+ return sts;
+ }
+ } else
+ break;
+ /* reset checksum for the next calculation */
+ wr.checksum = 0;
+ start += n << shift;
+ }
+
+ /* We can allow async post only if we do not depend on deleting the key after
+ * the request has completed:
+ */
+ if (mode != PCM_WAIT && ctx->lkey) {
+ wr.completion = (mode == PCM_POST) ? 0 : 1;
+ return sif_pqp_post_send(sdev, &wr, NULL);
+ }
+
+ wr.completion = 1;
+
+ sif_log(sdev, SIF_PQP, "Invalidate TLB for key %d, base %llx, length %x",
+ lkey, start, wr.details.su.length << shift);
+
+ ncompleted = sif_pqp_poll_wr(sdev, &wr, &lcqe);
+
+ if (ncompleted < 0) {
+ sif_log(sdev, SIF_INFO, "%s completion for pqp request",
+ (ncompleted ? "Error" : "No"));
+ ret = ncompleted;
+ goto out;
+ }
+
+ /* Note that we operate on 3 different indices here! */
+ cqe = &lcqe.cqe;
+ pqp_sq_idx = pqp->qp->qp_idx;
+ sq = get_sif_sq(sdev, pqp_sq_idx);
+
+ /* sq_id.sq_seq_num contains the send queue sequence number for this completion
+ * and by this driver's definition the index into the send queue will
+ * be this number modulo the length of the send queue:
+ */
+ sq_entry_idx = cqe->wc_id.sq_id.sq_seq_num & sq->mask;
+
+ if (cqe->status != PSIF_WC_STATUS_SUCCESS) {
+ sif_log(sdev, SIF_INFO,
+ "base %llx, length %x: failed with status %s(%d) for cq_seq %d",
+ start, wr.details.su.length << shift,
+ string_enum_psif_wc_status(cqe->status), cqe->status, cqe->seq_num);
+ sif_logs(SIF_INFO, write_struct_psif_cq_entry(NULL, 0, cqe));
+ ret = -EIO;
+ atomic_inc(&pqp->cq->error_cnt);
+ goto out;
+ }
+
+ sif_log(sdev, SIF_PQP, "cq_seq %d sq_seq %d, sq_entry_idx %d",
+ cqe->seq_num, cqe->wc_id.sq_id.sq_seq_num, sq_entry_idx);
+out:
+ if (!ctx->lkey)
+ release_invalidate_key(sdev, lkey);
+ return ret;
+}
+
+
+static int sif_mmu_invalidate_tlb(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, enum wr_mode mode)
+{
+ return sif_mmu_invalidate_tlb_partial(sdev, ctx, ctx->base, ctx->size, mode);
+}
+
+
+/* Extend an mmu context with DMA addresses from @mem.
+ * Only GVA2GPA memory types support this:
+ */
+int sif_map_ctx_part(struct sif_dev *sdev,
+ struct sif_mmu_ctx *ctx,
+ struct sif_mem *mem,
+ u64 virt_base, u64 size)
+{
+ int ret;
+
+ if (ctx->type != MMU_GVA2GPA_MODE)
+ return -EINVAL;
+
+ ret = sif_pt_extend(ctx->pt, sif_mem_get_sgl(mem), virt_base, size);
+ if (ret >= 0 && ctx->mt == SIFMT_CS && ctx->pt->vsize == size)
+ set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt));
+ return ret;
+}
+
+
+/* Invalidate a pte range in an already existing context's page table.
+ * Only GVA2GPA memory types support this:
+ */
+
+int sif_unmap_gva_ctx_part(struct sif_dev *sdev, struct sif_mmu_ctx *ctx,
+ u64 virt_base, u64 size)
+{
+ int ret = sif_pt_free_part(ctx->pt, virt_base, size);
+
+ if (ret < 0)
+ return ret;
+
+ if (unlikely(!sdev->registered)) {
+ /* TLB invalidate is not available at teardown */
+ return 0;
+ }
+
+ /* Invalidate this range of the page table with PSIF - assume async call is ok */
+ return sif_mmu_invalidate_tlb_partial(sdev, ctx, virt_base, size, PCM_POST);
+}
+
+
+
+const char *sif_mem_type_str(enum sif_mem_type mem_type)
+{
+ switch (mem_type) {
+ case SIFMT_BYPASS:
+ return "SIFMT_BYPASS";
+ case SIFMT_UMEM:
+ return "SIFMT_UMEM";
+ case SIFMT_UMEM_RO:
+ return "SIFMT_UMEM_RO";
+ case SIFMT_BYPASS_RO:
+ return "SIFMT_BYPASS_RO";
+ case SIFMT_UMEM_SPT:
+ return "SIFMT_UMEM_SPT";
+ case SIFMT_2M:
+ return "SIFMT_2M";
+ case SIFMT_4K:
+ return "SIFMT_4K";
+ case SIFMT_CS:
+ return "SIFMT_CS";
+ case SIFMT_ZERO:
+ return "SIFMT_ZERO";
+ case SIFMT_PHYS:
+ return "SIFMT_PHYS";
+ case SIFMT_FMR:
+ return "SIFMT_FMR";
+ case SIFMT_NOMEM:
+ return "SIFMT_NOMEM";
+ case SIFMT_PTONLY:
+ return "SIFMT_PTONLY";
+ case SIFMT_MAX:
+ return "SIFMT_MAX";
+ default:
+ break;
+ }
+ return "(undefined sif_mem_type)";
+}
+
+
+struct psif_mmu_cntx sif_mmu_ctx_passthrough(bool write)
+{
+ struct psif_mmu_cntx ctx = { .wr_access = 1 };
+ return ctx;
+}
+
+
+#define TSU_MMU_FLUSH_CACHES_ADDR 0x00200003L
+
+/* Post a command to flush the TLB's PTE cache.
+ * If @ptw_cache is set, also flush the PTW cache.
+ */
+int sif_post_flush_tlb(struct sif_dev *sdev, bool ptw_cache)
+{
+ int ret;
+
+ if (eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 100)) {
+ struct psif_epsc_csr_rsp resp;
+ struct psif_epsc_csr_req req;
+
+ memset(&req, 0, sizeof(req));
+ req.opcode = EPSC_FLUSH_CACHES;
+ req.u.flush_caches.flush_mmu_caches.flush_mmu_cache = 1;
+ if (ptw_cache)
+ req.u.flush_caches.flush_mmu_caches.flush_ptw_cache = 1;
+ ret = sif_epsc_wr_poll(sdev, &req, &resp);
+ } else {
+ int bits = (ptw_cache ? 0x3 : 0x1);
+
+ ret = sif_write_global_csr(sdev, TSU_MMU_FLUSH_CACHES_ADDR, bits);
+ }
+ if (ret) {
+ sif_log(sdev, SIF_INFO,
+ "clearing MMU cache failed with error %d ", ret);
+ }
+ return ret;
+}
+
+
+/* Wait for a previously posted flush_tlb to complete */
+int sif_complete_flush_tlb(struct sif_dev *sdev)
+{
+ ulong start_time = jiffies;
+ ulong timeout = sdev->min_resp_ticks * 4;
+ ulong timeout_time = start_time + timeout;
+ u64 val;
+ int cnt = 0;
+ int ret;
+ int ms;
+
+ if (eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 100)) {
+ /* For API ver. >= 100, we already wait for completion in mailbox operation */
+ return 0;
+ }
+ do {
+ val = sif_read_global_csr(sdev, TSU_MMU_FLUSH_CACHES_ADDR);
+ cnt++;
+ } while (val != -1LL && (val & 0x4) != 0x4 && time_is_after_jiffies(timeout_time));
+ if (val == -1LL)
+ sif_log(sdev, SIF_INFO, "CSR error waiting for mmu cache flush to finish");
+ if (time_is_before_jiffies(timeout_time)) {
+ sif_log(sdev, SIF_INFO, "timeout waiting for mmu cache flush to finish, val = %lld",
+ val);
+ return -ETIMEDOUT;
+ }
+ ret = sif_write_global_csr(sdev, TSU_MMU_FLUSH_CACHES_ADDR, 0x0);
+ ms = jiffies_to_msecs(jiffies - start_time);
+ if (ret)
+ sif_log(sdev, SIF_INFO, "failed to turn off mmu cache flush mode in %d ms", ms);
+ else
+ sif_log(sdev, SIF_INFO_V, "flushing completed in %d ms, cnt %d",
+ ms, cnt);
+ return ret;
+}
--- /dev/null
+/*
+ * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_mmu.h: API for management of sif's on-chip mmu.
+ */
+
+#ifndef _SIF_MMU_H
+#define _SIF_MMU_H
+
+#include <rdma/ib_verbs.h>
+#include "psif_hw_data.h"
+#include "sif_user.h"
+
+struct sif_mem;
+struct psif_mmu_cntx;
+struct sif_dev;
+
+enum wr_mode {
+ PCM_POST, /* Post WR without requesting send completion */
+ PCM_POST_COMPL, /* Post WR requesting send completion but do not wait(poll) for it */
+ PCM_WAIT /* Post WR requesting send completion and wait(poll) for it to arrive */
+};
+
+enum post_mode {
+ PM_WRITE, /* Write the WR into the SQ but don't trigger any posting */
+ PM_DOORBELL, /* Post request and trigger doorbell (send queue mode) */
+ PM_CB, /* "Normal" collect buffer mode */
+};
+
+/* The driver's representation of an MMU context:
+ * The key is the only means of referring to the MMU context wrt invalidation
+ * (TLB_INVALIDATE), but this is only necessary for GVA2GPA contexts
+ * [TBD: with level > 0 (?)]
+ */
+
+struct sif_mmu_ctx {
+ u64 base; /* Start of mapping (byte resolution) */
+ u64 size; /* Size of mapping (byte resolution) */
+ u32 lkey; /* Key to use for invalidation - only valid if nonzero */
+ enum sif_mem_type mt; /* Logical type of mapping */
+ enum psif_mmu_translation type; /* Defined in psif_hw_data */
+ struct psif_mmu_cntx mctx; /* host order version of MMU context populated by sif_map_ctx */
+ struct sif_pt *pt; /* sif page table this mmu context points into (only GVA2GPA types) */
+ off_t uv2dma; /* For bypass: user_va + uv2dma = actual dma_addr */
+ u64 phys_sz; /* Only used by SIFMT_ZERO mappings */
+};
+
+
+/* Prepare a new mmu context
+ * ctx points to storage for this mmu context
+ * mem points to a DMA mapped memory object to map
+ *
+ * - prepare any page tables needed for dma
+ * and/or allocate private structures
+ * - fill in information for hw in ctx->hw_ctx
+ *
+ * NB! hw_ctx is assumed to be set to values for
+ * MMU_PASS_THROUGH (all null bytes) by default
+ *
+ * Return 0 upon success or -errno
+ */
+int sif_map_ctx(struct sif_dev *sdev,
+ struct sif_mmu_ctx *ctx,
+ struct sif_mem *mem,
+ u64 virt_base, u64 size,
+ bool write);
+
+/* Release any resources associated with
+ * the mmu context c. This will typically be
+ * any driver managed page tables and any I/O mappings
+ * (pinning) of page table memory
+ */
+void sif_unmap_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *c);
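+
+/* Illustrative sketch (not part of the driver) of the expected call pairing
+ * for the two functions above; @ctx is caller provided, zero initialized
+ * storage, @mem a DMA mapped sif_mem object, and @vaddr/@size are placeholders:
+ *
+ *	int ret = sif_map_ctx(sdev, ctx, mem, vaddr, size, true);
+ *
+ *	if (ret)
+ *		return ret;
+ *	...
+ *	sif_unmap_ctx(sdev, ctx);
+ */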
+
+/* Populate/invalidate a pte range in an already existing context's page table.
+ * Only GVA2GPA memory types support this;
+ * @mem supplies the corresponding DMA addresses to map:
+ */
+int sif_map_ctx_part(struct sif_dev *sdev,
+ struct sif_mmu_ctx *c,
+ struct sif_mem *mem,
+ u64 virt_base, u64 size);
+
+int sif_unmap_gva_ctx_part(struct sif_dev *sdev, struct sif_mmu_ctx *c,
+ u64 virt_base, u64 size);
+
+/* Remap an existing context to a new memory object
+ * (of the same size)
+ */
+int sif_map_fmr_ctx(struct sif_dev *sdev,
+ struct sif_mmu_ctx *c,
+ struct sif_mem *mem);
+
+void sif_unmap_fmr_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, enum wr_mode mode);
+
+/*** internal mmu code - used by sif_xmmu.h ***/
+
+void sif_unmap_gva_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx);
+
+const char *sif_mem_type_str(enum sif_mem_type mem_type);
+
+void set_ctx(struct sif_dev *sdev,
+ struct sif_mmu_ctx *ctx,
+ enum psif_table_level level,
+ u64 val);
+
+/* Return an mmu context in passthrough mode */
+struct psif_mmu_cntx sif_mmu_ctx_passthrough(bool write);
+
+/* The I/O side virtual address as seen from sif */
+static inline u64 sif_mmu_vaddr(struct sif_mmu_ctx *ctx, off_t offset)
+{
+ return ctx->base + offset;
+}
+
+/* Post a command to flush the TLB's PTE cache.
+ * If @ptw_cache is set, also flush the PTW cache.
+ */
+int sif_post_flush_tlb(struct sif_dev *sdev, bool ptw_cache);
+
+/* Wait for a previously posted flush_tlb to complete */
+int sif_complete_flush_tlb(struct sif_dev *sdev);
+
+/* Flush the TLB and wait for the flush to complete */
+static inline int sif_flush_tlb(struct sif_dev *sdev)
+{
+ int ret = sif_post_flush_tlb(sdev, true);
+
+ if (ret)
+ return ret;
+ return sif_complete_flush_tlb(sdev);
+}
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_mr.c: Implementation of memory regions support for SIF
+ */
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+
+#include "sif_dev.h"
+#include "psif_hw_data.h"
+#include "psif_hw_setget.h"
+#include "sif_defs.h"
+#include "sif_base.h"
+#include "sif_mr.h"
+#include "sif_pd.h"
+#include "sif_mmu.h"
+#include "sif_pt.h"
+#include "sif_user.h"
+#include <linux/seq_file.h>
+#include "sif_user.h"
+
+struct sif_mr *sif_alloc_invalid_mr(struct sif_pd *pd)
+{
+ struct sif_dev *sdev = to_sdev(pd->ibpd.device);
+ u64 bad_addr = (~0ull) ^ (PAGE_SIZE-1);
+ struct sif_mem *mem =
+ sif_mem_create_ref(sdev, SIFMT_NOMEM, bad_addr, 0, GFP_KERNEL);
+ if (!mem)
+ return ERR_PTR(-ENOMEM);
+
+ return alloc_mr(sdev, pd, mem, 0, 0);
+}
+
+struct sif_mr *create_dma_mr(struct sif_pd *pd, int acc_fl)
+{
+ /* Use a common MR (in bypass mode)
+ * covering the whole memory space (for each pd which needs it)
+ */
+ struct sif_dev *sdev = to_sdev(pd->ibpd.device);
+ struct sif_mr *mr;
+ struct sif_mem *mem =
+ sif_mem_create_ref(sdev, SIFMT_NOMEM, 0ull, (~0ull) ^ (PAGE_SIZE-1), GFP_KERNEL);
+ if (!mem)
+ return ERR_PTR(-ENOMEM);
+
+ mr = alloc_mr(sdev, pd, mem, 0, acc_fl);
+ if (IS_ERR(mr))
+ goto alloc_mr_failed;
+ return mr;
+
+alloc_mr_failed:
+ sif_mem_free(mem);
+ return mr;
+}
+
+
+struct ib_mr *sif_get_dma_mr(struct ib_pd *ibpd, int acc_fl)
+{
+ struct sif_mr *mr = create_dma_mr(to_spd(ibpd), acc_fl);
+
+ return IS_ERR(mr) ? ERR_CAST(mr) : &mr->ibmr;
+}
+
+
+struct ib_mr *sif_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
+ u64 virt_addr, int acc_fl,
+ struct ib_udata *udata)
+{
+ enum sif_mem_type mem_type = SIFMT_UMEM;
+ struct sif_dev *sdev = to_sdev(ibpd->device);
+ struct sif_mr *mr;
+ void *ret;
+ struct ib_umem *umem;
+ struct sif_mem *mem;
+ ulong user_flags = 0;
+ u64 map_length = 0;
+ u64 phys_length = 0;
+ u64 umem_length = length;
+ enum dma_data_direction dma_dir = DMA_BIDIRECTIONAL;
+ DEFINE_DMA_ATTRS(attrs);
+
+ if (udata) {
+ struct sif_reg_mr_ext cmd;
+ int rv;
+
+ rv = ib_copy_from_udata(&cmd, udata, sizeof(cmd));
+ if (rv)
+ return ERR_PTR(-EFAULT);
+ user_flags = cmd.flags;
+ if (sif_vendor_enable(MMU_special, user_flags)) {
+ mem_type =
+ sdev->mt_override == SIFMT_UMEM ? cmd.mem_type : sdev->mt_override;
+ map_length = cmd.map_length;
+ phys_length = cmd.phys_length;
+ if (mem_type == SIFMT_BYPASS_RO || mem_type == SIFMT_UMEM_RO)
+ dma_dir = DMA_TO_DEVICE;
+ if (mem_type == SIFMT_CS)
+ umem_length = phys_length;
+ }
+ }
+
+ sif_log(sdev, SIF_MR, "start 0x%llx len 0x%llx virt_addr 0x%llx flags 0x%lx",
+ start, length, virt_addr, user_flags);
+
+ /* Pin user memory */
+ umem = ib_umem_get_attrs(ibpd->uobject->context, start, umem_length, acc_fl,
+ dma_dir, &attrs);
+
+ if (IS_ERR(umem)) {
+ int ev = PTR_ERR(umem);
+
+ ret = (void *)umem;
+ sif_log(sdev, SIF_MR,
+ "#### Failed to get umem [err %d] (start %llx length %llx vaddr %llx, udata at %p)",
+ ev, start, length, virt_addr, udata);
+ return ret;
+ }
+
+ if (map_length) {
+ if (map_length < length) {
+ sif_log(sdev, SIF_INFO, "illegal map_length 0x%llx - must be > length 0x%llx",
+ map_length, length);
+ return ERR_PTR(-EINVAL);
+ }
+ length = map_length;
+ }
+
+ mem = sif_mem_create_umem(sdev, umem, mem_type, GFP_KERNEL, dma_dir);
+ if (!mem) {
+ mr = (void *)ERR_PTR(-ENOMEM);
+ goto err_create_mem;
+ }
+
+ mr = alloc_mr(sdev, to_spd(ibpd), mem, start, acc_fl);
+ if (IS_ERR(mr))
+ goto err_mmu_ctx;
+
+ if (udata) {
+ struct sif_reg_mr_resp_ext resp;
+ int rv;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.uv2dma = mr->mmu_ctx.uv2dma;
+ rv = ib_copy_to_udata(udata, &resp, sizeof(resp));
+ if (rv) {
+ /* Exit here as ib_umem_release is implicit via dealloc_mr */
+ dealloc_mr(sdev, mr);
+ return ERR_PTR(-EFAULT);
+ }
+ }
+
+ sif_log(sdev, SIF_MR, "Exit: ibmr 0x%p - uv2dma %lx", &mr->ibmr, mr->mmu_ctx.uv2dma);
+ return &mr->ibmr;
+
+err_mmu_ctx:
+ sif_mem_free(mem); /* owns and frees the umem as well */
+ return (void *)mr;
+err_create_mem:
+ ib_umem_release(umem);
+ return (void *)mr;
+}
+
+
+struct ib_mr *sif_reg_phys_mr(struct ib_pd *ibpd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int acc_fl, u64 *iova_start)
+{
+ struct sif_dev *sdev = to_sdev(ibpd->device);
+ struct sif_mr *mr;
+ struct sif_mem *mem;
+
+ if ((num_phys_buf <= 0) || !phys_buf_array) {
+ sif_log(sdev, SIF_INFO, "input error: num_phys_buf 0%x phys_buf_array %p",
+ num_phys_buf, phys_buf_array);
+ mr = ERR_PTR(-EINVAL);
+ goto param_err;
+ }
+
+ sif_log(sdev, SIF_MR, " num_phys_buf %d, flags 0x%x, iova_start %p",
+ num_phys_buf, acc_fl, iova_start);
+
+ mem = sif_mem_create_phys(sdev, iova_start, phys_buf_array, num_phys_buf,
+ GFP_KERNEL);
+ if (!mem) {
+ sif_log(sdev, SIF_INFO, "Failed to create mem object (ENOMEM)");
+ mr = ERR_PTR(-ENOMEM);
+ goto param_err;
+ }
+
+ mr = alloc_mr(sdev, to_spd(ibpd), mem, (u64)iova_start, acc_fl);
+ if (IS_ERR(mr))
+ goto alloc_mr_failed;
+
+ return &mr->ibmr;
+alloc_mr_failed:
+ sif_mem_free(mem);
+param_err:
+ return (void *)mr;
+}
+
+
+int sif_rereg_phys_mr(struct ib_mr *ibmr, int mr_rereg_mask,
+ struct ib_pd *ibpd,
+ struct ib_phys_buf *phys_buf_array, int num_phys_buf,
+ int mr_access_flags, u64 *iova_start)
+{
+ struct sif_dev *sdev = to_sdev(ibpd->device);
+
+ sif_log(sdev, SIF_INFO, "Not implemented");
+ return -EOPNOTSUPP;
+}
+
+
+
+struct sif_mr *alloc_mr(struct sif_dev *sdev, struct sif_pd *pd,
+ struct sif_mem *mem, u64 map_start, int acc_fl)
+{
+ struct sif_mr *mr;
+ volatile struct psif_key *key;
+ struct psif_key lkey;
+ bool write;
+ int index;
+ int ret = 0;
+ u64 length = mem ? mem->size : ((~0ull) ^ (PAGE_SIZE-1));
+
+ index = sif_alloc_key_idx(sdev);
+ if (index < 0) {
+ sif_log(sdev, SIF_MR, "Failed to allocate key idx");
+ ret = -ENOMEM;
+ goto err_reg_mr;
+ }
+
+ mr = kzalloc(sizeof(struct sif_mr), GFP_KERNEL);
+ if (!mr) {
+ sif_log(sdev, SIF_MR, "Failed to allocate memory for sif_mr");
+ ret = -ENOMEM;
+ goto err_mr_alloc;
+ }
+
+ memset(&lkey, 0, sizeof(struct psif_key));
+ mr->index = index;
+ mr->mem = mem;
+ set_sif_mr(sdev, index, mr);
+ key = get_key(sdev, index);
+
+ if (length) {
+ /* MR will always have L/R keys associated with them.*/
+ lkey.lkey_state = PSIF_DMA_KEY_VALID;
+ lkey.rkey_state = PSIF_DMA_KEY_VALID;
+ } else {
+ /* Allocation is for a special invalid key */
+ lkey.lkey_state = PSIF_DMA_KEY_INVALID;
+ lkey.rkey_state = PSIF_DMA_KEY_INVALID;
+ }
+
+ /* Access flags */
+ lkey.local_access_rd = 1;
+ if (acc_fl & IB_ACCESS_LOCAL_WRITE)
+ lkey.local_access_wr = 1;
+ if (acc_fl & IB_ACCESS_REMOTE_READ)
+ lkey.remote_access_rd = 1;
+ if (acc_fl & IB_ACCESS_REMOTE_WRITE)
+ lkey.remote_access_wr = 1;
+ if (acc_fl & IB_ACCESS_REMOTE_ATOMIC)
+ lkey.remote_access_atomic = 1;
+ /* TBD: IB_ACCESS_MW_BIND (what to do with that?)
+ * and also conditonal_wr
+ */
+
+ write = lkey.local_access_wr || lkey.remote_access_wr;
+
+ lkey.pd = pd->idx;
+
+ ret = sif_map_ctx(sdev, &mr->mmu_ctx, mem, map_start, length, write);
+ if (ret)
+ goto err_map_ctx;
+
+ mr->mmu_ctx.lkey = index;
+ if (length)
+ lkey.base_addr = mr->mmu_ctx.base;
+ else
+ lkey.base_addr = (u64)-1LL;
+ lkey.length = mr->mmu_ctx.size;
+ lkey.mmu_context = mr->mmu_ctx.mctx;
+
+ sif_logs(SIF_DUMP, write_struct_psif_key(NULL, 0, &lkey));
+
+ /* Write to HW descriptor */
+ copy_conv_to_hw(key, &lkey, sizeof(lkey));
+
+ mr->ibmr.lkey = mr->ibmr.rkey = mr->index;
+
+ sif_log(sdev, SIF_MR, "type %s - key %d (pd %d) - success",
+ sif_mem_type_str(mem->mem_type),
+ mr->index, pd->idx);
+ return mr;
+err_map_ctx:
+ kfree(mr);
+ set_sif_mr(sdev, index, NULL);
+err_mr_alloc:
+ sif_clear_key(sdev, index);
+ sif_free_key_idx(sdev, index);
+err_reg_mr:
+ sif_log(sdev, SIF_MR, "Exit: failed with status %d", ret);
+ return ERR_PTR(ret);
+}
+
+int sif_query_mr(struct ib_mr *ibmr, struct ib_mr_attr *mr_attr)
+{
+ sif_logi(ibmr->device, SIF_MR, "Not implemented");
+ return -EOPNOTSUPP;
+}
+
+
+/* If the MMU is involved (not pass-through mode),
+ * PSIF MR deregistration is asynchronous and five-step (see #2002):
+ * 1) Invalidate associated dma validation entry but first
+ * make sure it is in the special MMU_VALID state which does not
+ * allow uses of it from IB but allows it to be used for invalidation
+ * operations. The invalidate req causes a flush of the entry in
+ * VAL's cache.
+ * 2) Invalidate MMU context (TLB_INVALIDATE)
+ * This will lead to a fetch of the key again, this time with
+ * state == MMU_VALID.
+ * 3) Issue another key invalidate
+ * 4) NIL validation entry - make valid = 0
+ * 5) Unpin/release memory associated with it
+ */
+
+void dealloc_mr(struct sif_dev *sdev, struct sif_mr *mr)
+{
+ int index = mr->index;
+ int sts;
+ struct psif_key *key = get_key(sdev, index);
+ bool need_5_step = mr->mmu_ctx.type == MMU_GVA2GPA_MODE;
+
+ /* We do not invalidate the invalid key at index 0 */
+ bool do_invalidate_key = index != 0 && !sif_feature(disable_invalidate_key);
+
+ if (do_invalidate_key) {
+ if (need_5_step) {
+ set_psif_key__lkey_state(key, PSIF_DMA_KEY_MMU_VALID);
+ set_psif_key__rkey_state(key, PSIF_DMA_KEY_MMU_VALID);
+ } else {
+ set_psif_key__lkey_state(key, PSIF_DMA_KEY_INVALID);
+ set_psif_key__rkey_state(key, PSIF_DMA_KEY_INVALID);
+ }
+
+ /* Flush this DMA validation entry */
+ sts = sif_invalidate_key(sdev, index, PCM_WAIT);
+ if (sts) {
+ sif_log(sdev, SIF_INFO,
+ "Invalidate key failed");
+ }
+ }
+
+ /* Invalidate and unmap MMU context */
+ sif_unmap_ctx(sdev, &mr->mmu_ctx);
+
+ if (need_5_step && do_invalidate_key) {
+ set_psif_key__lkey_state(key, PSIF_DMA_KEY_INVALID);
+ set_psif_key__rkey_state(key, PSIF_DMA_KEY_INVALID);
+
+ /* Flush this DMA validation entry - the final operation, must be synchronous: */
+ sts = sif_invalidate_key(sdev, index, PCM_WAIT);
+ if (sts) {
+ sif_log(sdev, SIF_INFO,
+ "Invalidate key failed");
+ }
+ }
+
+ kfree(mr);
+ set_sif_mr(sdev, index, NULL);
+
+ if (!sif_feature(disable_invalidate_key)) {
+ /* Release memory associated with this key */
+ sif_clear_key(sdev, index);
+ sif_free_key_idx(sdev, index);
+ }
+}
+
+
+void sif_dealloc_mr(struct sif_dev *sdev, struct sif_mr *mr)
+{
+ struct sif_mem *mem = mr->mem;
+
+ dealloc_mr(sdev, mr);
+ sif_mem_free(mem);
+}
+
+
+int sif_dereg_mr(struct ib_mr *ibmr)
+{
+ struct sif_mr *mr = to_smr(ibmr);
+ struct sif_dev *sdev = to_sdev(ibmr->device);
+ int index = mr->ibmr.lkey;
+
+ sif_logi(ibmr->device, SIF_MR, "Enter: mr 0x%p key 0x%x", mr,
+ index);
+
+ sif_dealloc_mr(sdev, mr);
+ sif_log(sdev, SIF_MR, "Exit: success");
+ return 0;
+}
+
+struct ib_mr *sif_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list_len)
+{
+ sif_logi(ibpd->device, SIF_FMR, "Not implemented");
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+struct ib_fast_reg_page_list *sif_alloc_fast_reg_page_list(struct ib_device
+ *ibdev,
+ int page_list_len)
+{
+ sif_logi(ibdev, SIF_FMR, "Not implemented");
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+void sif_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl)
+{
+ sif_logi(pl->device, SIF_FMR, "Not implemented");
+}
+
+
+/* Line printer for debugfs file */
+void sif_dfs_print_key(struct seq_file *s, struct sif_dev *sdev, loff_t pos)
+{
+ struct psif_key *key;
+ struct psif_key lkey;
+ const char *typestr;
+ char l_state, r_state;
+
+ if (unlikely(pos < 0)) {
+ seq_printf(s, "# %61s State %s\n", "", "Page table info");
+ seq_printf(s, "# Index %18s %18s %16s LR %s\n",
+ "Base address(hex)", "Length(hex)", "MMU ctx type", " top leaf pages");
+ return;
+ }
+
+ key = get_key(sdev, pos);
+ copy_conv_to_sw(&lkey, key, sizeof(struct psif_key));
+ typestr = string_enum_psif_mmu_translation(lkey.mmu_context.translation_type) + 4;
+ l_state = string_enum_psif_dma_vt_key_states(lkey.lkey_state)[13];
+ r_state = string_enum_psif_dma_vt_key_states(lkey.rkey_state)[13];
+
+ seq_printf(s, "%7lld %18llx %18llx %16s %c%c ", pos, lkey.base_addr, lkey.length,
+ typestr, l_state, r_state);
+ sif_pt_dfs_print(s, sdev, pos);
+}
+
+
+/* API to allocate/release a key for TLB invalidation only
+ * Note that 0 is considered an invalid key!
+ */
+u32 allocate_invalidate_key(struct sif_mmu_ctx *ctx)
+{
+ /* This call is only meaningful for contexts with a valid page table: */
+ struct sif_dev *sdev = ctx->pt->sdev;
+ int index;
+ struct psif_key lkey;
+ volatile struct psif_key *key;
+
+ index = sif_alloc_key_idx(sdev);
+ if (index < 0)
+ return 0;
+
+ key = get_key(sdev, index);
+ memset(&lkey, 0, sizeof(struct psif_key));
+ lkey.lkey_state = PSIF_DMA_KEY_MMU_VALID;
+ lkey.rkey_state = PSIF_DMA_KEY_MMU_VALID;
+ lkey.base_addr = ctx->base;
+ lkey.length = ctx->size;
+ lkey.mmu_context = ctx->mctx;
+
+ /* Write to HW descriptor */
+ copy_conv_to_hw(key, &lkey, sizeof(lkey));
+ return (u32)index;
+}
+
+/* Release and invalidate a previously allocated TLB invalidation key */
+void release_invalidate_key(struct sif_dev *sdev, u32 index)
+{
+ int sts;
+ struct psif_key *key = get_key(sdev, index);
+
+ set_psif_key__lkey_state(key, PSIF_DMA_KEY_INVALID);
+ set_psif_key__rkey_state(key, PSIF_DMA_KEY_INVALID);
+
+ /* Flush this DMA validation entry - we do not really depend on the result
+ * so safe to make it asynchronous:
+ */
+ sts = sif_invalidate_key(sdev, index, PCM_POST);
+ if (sts)
+ sif_log(sdev, SIF_INFO,
+ "Invalidate key failed");
+
+ /* Release memory associated with this key */
+ sif_clear_key(sdev, index);
+ sif_free_key_idx(sdev, index);
+}
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_mr.h: Interface to internal IB memory registration logic for SIF
+ */
+
+#ifndef __SIF_MR_H
+#define __SIF_MR_H
+#include "sif_mmu.h"
+
+struct ib_umem;
+struct sif_mem;
+
+struct sif_mr {
+ struct ib_mr ibmr;
+ int index;
+ struct sif_mem *mem;
+ struct sif_mmu_ctx mmu_ctx;
+};
+
+static inline struct sif_mr *to_smr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct sif_mr, ibmr);
+}
+
+struct ib_mr *sif_get_dma_mr(struct ib_pd *ibpd, int mr_access_flags);
+struct sif_mr *sif_alloc_invalid_mr(struct sif_pd *pd);
+struct ib_mr *sif_reg_phys_mr(struct ib_pd *ibpd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf, int mr_access_flags,
+ u64 *iova_start);
+
+struct ib_mr *sif_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
+ u64 virt_addr, int mr_access_flags,
+ struct ib_udata *udata);
+
+int sif_query_mr(struct ib_mr *ibmr, struct ib_mr_attr *mr_attr);
+int sif_dereg_mr(struct ib_mr *ibmr);
+
+struct ib_mr *sif_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list_len);
+struct ib_fast_reg_page_list *sif_alloc_fast_reg_page_list(struct ib_device
+ *ibdev,
+ int page_list_len);
+
+void sif_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+int sif_rereg_phys_mr(struct ib_mr *ibmr,
+ int mr_rereg_mask,
+ struct ib_pd *ibpd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf, int mr_access_flags, u64 *iova_start);
+
+/* Deallocate MR - assumes ownership of mr->mem and deletes that as well.
+ * To be used with high level mr allocation operations that create their own
+ * sif_mem object:
+ */
+void sif_dealloc_mr(struct sif_dev *sdev, struct sif_mr *mr);
+
+struct sif_dev;
+struct seq_file;
+struct sif_pd;
+enum psif_mmu_translation;
+
+/* Line printer for debugfs file */
+void sif_dfs_print_key(struct seq_file *s, struct sif_dev *sdev, loff_t pos);
+
+/* Internal mr allocation/deallocation functions:
+ * Allocate an IB MR for the memory object @mem
+ * If length == 0, allocate an invalid map.
+ * The mr does not own the @mem object
+ */
+struct sif_mr *alloc_mr(struct sif_dev *sdev, struct sif_pd *pd,
+ struct sif_mem *mem, u64 map_start, int acc_fl);
+struct sif_mr *create_dma_mr(struct sif_pd *pd, int acc_fl);
+
+void dealloc_mr(struct sif_dev *sdev, struct sif_mr *mr);
+
+
+/* API to allocate/release a key for TLB invalidation only
+ * Note that 0 is considered an invalid key!
+ */
+u32 allocate_invalidate_key(struct sif_mmu_ctx *ctx);
+
+/* Release and invalidate a previously allocated TLB invalidation key */
+void release_invalidate_key(struct sif_dev *sdev, u32 lkey);
+
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_mw.c: Implementation of memory windows for SIF
+ */
+
+#include <rdma/ib_verbs.h>
+#include "sif_mw.h"
+#include "sif_dev.h"
+
+struct ib_mw *sif_alloc_mw(struct ib_pd *ibpd)
+{
+ sif_logi(ibpd->device, SIF_INFO, "Not implemented");
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+int sif_bind_mw(struct ib_qp *ibqp,
+ struct ib_mw *ibmw, struct ib_mw_bind *mw_bind)
+{
+ sif_logi(ibqp->device, SIF_INFO, "Not implemented");
+ return -EOPNOTSUPP;
+}
+
+int sif_dealloc_mw(struct ib_mw *ibmw)
+{
+ sif_logi(ibmw->device, SIF_INFO, "Not implemented");
+ return -EOPNOTSUPP;
+}
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_mw.h: Interface to internal IB memory window logic for SIF
+ */
+
+#ifndef __SIF_MW_H
+#define __SIF_MW_H
+
+struct ib_mw *sif_alloc_mw(struct ib_pd *ibpd);
+int sif_bind_mw(struct ib_qp *ibqp, struct ib_mw *ibmw,
+ struct ib_mw_bind *mw_bind);
+int sif_dealloc_mw(struct ib_mw *ibmw);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_pd.c: Implementation of IB protection domains for SIF
+ */
+
+#include <rdma/ib_verbs.h>
+#include "sif_dev.h"
+#include "sif_ibpd.h"
+#include "sif_pd.h"
+#include "sif_defs.h"
+#include "sif_base.h"
+#include "sif_mmu.h"
+#include "sif_mr.h"
+#include "sif_xrc.h"
+#include "sif_query.h"
+
+
+int sif_init_pd(struct sif_dev *sdev)
+{
+ /* Avoid handing out pd index 0 so that HW will trap use of blank AHs: */
+ return sif_idr_init(&sdev->pd_refs, 1, SIF_MAX_PD_INDEX);
+}
+
+
+void sif_deinit_pd(struct sif_dev *sdev)
+{
+ sif_idr_deinit(&sdev->pd_refs);
+}
+
+
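+/* Cancel any pending contents of a collect buffer by writing the cancel
+ * command (big endian) to the quadword at offset 0xff8 of the CB space:
+ */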
+inline void cancel_cb(struct psif_cb __iomem *cb)
+{
+ u64 __iomem *c_adr = (u64 __iomem *)((u8 __iomem *)cb + 0xff8);
+ u64 c_val = PSIF_WR_CANCEL_CMD_BE;
+
+ __raw_writeq(cpu_to_be64(c_val), c_adr);
+}
+
+
+struct sif_pd *alloc_pd(struct sif_dev *sdev)
+{
+ struct sif_pd *pd = kzalloc(sizeof(struct sif_pd), GFP_KERNEL);
+
+ if (!pd)
+ return NULL;
+
+ pd->idx = sif_idr_alloc(&sdev->pd_refs, pd, GFP_KERNEL);
+ spin_lock_init(&pd->lock);
+ INIT_LIST_HEAD(&pd->qp_list);
+ INIT_LIST_HEAD(&pd->cq_list);
+ INIT_LIST_HEAD(&pd->rq_list);
+
+ sif_log(sdev, SIF_PD, "pd idx %d", pd->idx);
+ return pd;
+}
+
+
+int dealloc_pd(struct sif_pd *pd)
+{
+ struct sif_dev *sdev = to_sdev(pd->ibpd.device);
+
+ sif_log(sdev, SIF_PD, "pd idx %d", pd->idx);
+
+ if (!list_empty(&pd->qp_list)) {
+ sif_log(sdev, SIF_INFO, "pd idx %d: failed - still active qp blocks", pd->idx);
+ return -EBUSY;
+ }
+ if (!list_empty(&pd->cq_list)) {
+ sif_log(sdev, SIF_INFO, "pd idx %d: failed - still active cq blocks", pd->idx);
+ return -EBUSY;
+ }
+ if (!list_empty(&pd->rq_list)) {
+ sif_log(sdev, SIF_INFO, "pd idx %d: failed - still active rq blocks", pd->idx);
+ return -EBUSY;
+ }
+
+ sif_idr_remove(&sdev->pd_refs, pd->idx);
+ kfree(pd);
+ return 0;
+}
+
+
+/* IB Verbs level interfaces (sif_ibpd.h) */
+
+
+struct ib_pd *sif_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context, struct ib_udata *udata)
+{
+ struct sif_dev *sdev = to_sdev(ibdev);
+ struct sif_pd *pd;
+ int ret;
+
+ pd = alloc_pd(sdev);
+ if (!pd)
+ return ERR_PTR(-ENOMEM);
+
+ /* For backward compatibility with libsif */
+ if (udata) {
+ struct sif_ucontext *uc = to_sctx(context);
+ struct sif_alloc_pd_resp_ext resp;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.cb_idx = uc->cb->idx;
+ ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
+ if (ret) {
+ dealloc_pd(pd);
+ return ERR_PTR(-EFAULT);
+ }
+ }
+ return &pd->ibpd;
+}
+
+int sif_dealloc_pd(struct ib_pd *ibpd)
+{
+ return ibpd->shpd ? 0 : dealloc_pd(to_spd(ibpd));
+}
+
+struct ib_shpd *sif_alloc_shpd(struct ib_device *ibdev,
+ struct ib_pd *ibpd,
+ struct ib_udata *udata)
+{
+ struct sif_dev *sdev = to_sdev(ibdev);
+ struct sif_pd *pd = to_spd(ibpd);
+ struct sif_shpd *shpd;
+
+ shpd = kzalloc(sizeof(struct sif_shpd), GFP_KERNEL);
+ if (!shpd)
+ return ERR_PTR(-ENOMEM);
+
+ shpd->ibshpd.device = &sdev->ib_dev;
+ shpd->pd = pd;
+
+ return &shpd->ibshpd;
+}
+
+struct ib_pd *sif_share_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata,
+ struct ib_shpd *ibshpd)
+{
+ struct sif_shpd *shpd = to_sshpd(ibshpd);
+ struct sif_pd *pd = shpd->pd;
+ int ret;
+
+ if (udata) {
+ struct sif_ucontext *uc = to_sctx(context);
+ struct sif_share_pd_resp_ext resp;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.cb_idx = uc->cb->idx;
+ ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
+ if (ret)
+ return ERR_PTR(-EFAULT);
+ }
+
+ return &pd->ibpd;
+}
+
+int sif_remove_shpd(struct ib_device *ibdev,
+ struct ib_shpd *ibshpd,
+ int atinit)
+{
+ struct sif_shpd *shpd = to_sshpd(ibshpd);
+
+ if (!atinit && shpd->pd)
+ dealloc_pd(shpd->pd);
+
+ kfree(ibshpd);
+
+ return 0;
+}
+
+/* Collect buffer management */
+
+
+/* Obtain information about lat_cb and bw_cb resources
+ * We cannot use the ba structs yet as they are not initialized at this point:
+ */
+static void sif_cb_init(struct sif_dev *sdev)
+{
+ struct psif_epsc_csr_req req;
+ struct psif_epsc_csr_rsp rsp;
+ struct sif_eps *es = &sdev->es[sdev->mbox_epsc];
+
+ /* EPSC supports the new requests starting from v.0.37 */
+ if (eps_version_ge(es, 0, 37)) {
+ int ret = 0;
+
+ memset(&req, 0, sizeof(req));
+ req.opcode = EPSC_QUERY;
+ req.u.query.data.op = EPSC_QUERY_CAP_VCB_LO;
+ req.u.query.info.op = EPSC_QUERY_CAP_VCB_HI;
+ ret = sif_epsc_wr(sdev, &req, &rsp);
+ if (ret)
+ sif_log(sdev, SIF_INFO, "Request for VCB info failed with %d", ret);
+ else {
+ sdev->bw_cb_cnt = rsp.data;
+ sdev->lat_cb_cnt = rsp.info;
+ sif_log(sdev, SIF_INFO, "Got %ld bw_cbs and %ld lat_cbs",
+ sdev->bw_cb_cnt, sdev->lat_cb_cnt);
+ }
+ }
+}
+
+
+/* Called from sif_base.c to initialize each of the cb tables */
+void sif_cb_table_init(struct sif_dev *sdev, enum sif_tab_type type)
+{
+ struct sif_table *tp;
+
+ BUG_ON(!is_cb_table(type));
+ tp = &sdev->ba[type];
+
+ /* Update table values with EPSC data: */
+ if (type == bw_cb) {
+ sif_cb_init(sdev);
+ if (sdev->bw_cb_cnt) {
+ tp->entry_cnt = sdev->bw_cb_cnt;
+ tp->table_sz = tp->ext_sz * tp->entry_cnt;
+ }
+ tp->sif_off = sdev->cb_base;
+ } else {
+ /* lat_cb */
+ if (sdev->lat_cb_cnt) {
+ tp->entry_cnt = sdev->lat_cb_cnt;
+ tp->table_sz = tp->ext_sz * tp->entry_cnt;
+ tp->sif_off = sdev->cb_base + sdev->ba[bw_cb].table_sz;
+ } else
+ tp->entry_cnt = 0;
+ }
+
+ tp->mem = sif_mem_create_ref(sdev, SIFMT_NOMEM, tp->sif_base,
+ tp->table_sz, GFP_KERNEL);
+}
+
+
+struct sif_cb *alloc_cb(struct sif_dev *sdev, bool lat_cb)
+{
+ int idx;
+ struct sif_cb *cb = kzalloc(sizeof(struct sif_cb), GFP_KERNEL);
+
+ if (!cb)
+ return NULL;
+
+ if (unlikely(lat_cb)) {
+ idx = sif_alloc_lat_cb_idx(sdev);
+ if (idx < 0) {
+ sif_log(sdev, SIF_INFO, "Unable to allocate lat_cb - trying bw_cb instead");
+ lat_cb = false;
+ } else
+ cb->cb = get_lat_cb(sdev, idx);
+ }
+
+ if (likely(!lat_cb)) {
+ idx = sif_alloc_bw_cb_idx(sdev);
+ if (idx < 0)
+ goto err_index;
+ cb->cb = get_bw_cb(sdev, idx);
+ }
+
+ /* Reset Collect buffer */
+ cb->idx = idx;
+ cb->is_lat_cb = lat_cb;
+
+ cancel_cb(cb->cb);
+
+ spin_lock_init(&cb->lock);
+ return cb;
+err_index:
+ kfree(cb);
+ return NULL;
+}
+
+
+void release_cb(struct sif_dev *sdev, struct sif_cb *cb)
+{
+ cancel_cb(cb->cb);
+ if (unlikely(cb->is_lat_cb))
+ sif_free_lat_cb_idx(sdev, cb->idx);
+ else
+ sif_free_bw_cb_idx(sdev, cb->idx);
+ kfree(cb);
+}
+
+
+/* Find the driver struct for a collect buffer index, if associated with @uc
+ */
+struct sif_cb *sif_cb_from_uc(struct sif_ucontext *uc, u32 index)
+{
+ if (uc->cb->idx == index)
+ return uc->cb;
+ return NULL;
+}
+
+
+/*
+ * Write a prepared work request (in wqe) to the associated collect buffer:
+ * Return 0 on success, or -EBUSY if the collect buffer lock is already held
+ */
+int sif_cb_write(struct sif_qp *qp, struct psif_wr *wqe, int cp_len)
+{
+ unsigned long flags;
+ struct sif_cb *cb = get_cb(qp);
+
+ if (!spin_trylock_irqsave(&cb->lock, flags))
+ return -EBUSY;
+
+ wmb(); /* Previous memory writes must be ordered wrt the I/O writes */
+ copy_conv_to_mmio(cb->cb, wqe, cp_len);
+ wc_wmb(); /* I/O writes must be completed before we let go of the lock! */
+ spin_unlock_irqrestore(&cb->lock, flags);
+
+ return 0;
+}
+
+
+#define SQS_START_DOORBELL 0xfc0
+#define SQS_STOP_DOORBELL 0xf80
+
+/*
+ * Notify about a work request to the cb doorbell - triggering SQ mode:
+ */
+void sif_doorbell_write(struct sif_qp *qp, struct psif_wr *wqe, bool start)
+{
+ unsigned long flags;
+ u16 doorbell_offset = start ? SQS_START_DOORBELL : SQS_STOP_DOORBELL;
+ struct sif_cb *cb = get_cb(qp);
+ struct sif_dev *sdev = to_sdev(qp->ibqp.pd->device);
+
+ sif_log(sdev, SIF_QP, "%s sqs for qp %d sq_seq %d", (start ? "start" : "stop"),
+ qp->qp_idx, wqe->sq_seq);
+ spin_lock_irqsave(&cb->lock, flags);
+ wmb();
+ copy_conv_to_mmio((u8 __iomem *)cb->cb + doorbell_offset, wqe, 8);
+
+ /* Flush write combining */
+ wc_wmb();
+ spin_unlock_irqrestore(&cb->lock, flags);
+}
+
+
+/*
+ * Force the SQS to process an already posted WR:
+ */
+
+void sif_doorbell_from_sqe(struct sif_qp *qp, u16 seq, bool start)
+{
+ u16 doorbell_offset = start ? SQS_START_DOORBELL : SQS_STOP_DOORBELL;
+ struct sif_cb *cb = get_cb(qp);
+ struct sif_dev *sdev = to_sdev(qp->ibqp.pd->device);
+ struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx);
+ u64 *wqe = (u64 *)get_sq_entry(sq, seq);
+
+ /* Pick the 1st 8 bytes directly from the sq entry: */
+ wmb();
+ __raw_writeq(*wqe, ((u8 __iomem *)cb->cb + doorbell_offset));
+
+ /* Flush write combining */
+ wc_wmb();
+ sif_log(sdev, SIF_QP, "%s sqs for qp %d sq_seq %d", (start ? "start" : "stop"),
+ qp->qp_idx, seq);
+}
+
+
+static struct list_head *type_to_list(struct sif_pd *pd, enum sif_tab_type type)
+{
+ switch (type) {
+ case cq_hw:
+ return &pd->cq_list;
+ case rq_hw:
+ return &pd->rq_list;
+ case qp:
+ return &pd->qp_list;
+ default:
+ BUG();
+ }
+ return NULL;
+}
+
+
+/* Allocate a free index from a block:
+ * The index is a global index
+ */
+static int alloc_from_block(struct sif_table_block *b, enum sif_tab_type type)
+{
+ int next = 0;
+ int index;
+ int loc_idx;
+
+ struct sif_table *table = b->table;
+
+ if (table->alloc_rr)
+ next = (b->last_used + 1) & (table->entry_per_block - 1);
+ loc_idx = find_next_zero_bit(b->bitmap, table->entry_per_block, next);
+ if (table->alloc_rr && loc_idx >= table->entry_per_block)
+ loc_idx = find_next_zero_bit(b->bitmap, table->entry_per_block, 0);
+ if (loc_idx < table->entry_per_block) {
+ set_bit(loc_idx, b->bitmap);
+ if (table->alloc_rr)
+ b->last_used = loc_idx;
+ index = loc_idx + b->offset;
+ sif_log(table->sdev, SIF_IDX2,
+ "%s[%d:%d] -> %d ", sif_table_name(type),
+ b->offset / table->entry_per_block, loc_idx, index);
+ return index;
+ }
+ return -1;
+}
+
+
+/* Free a used index back to a block:
+ * The index is a global index
+ */
+static void free_to_block(struct sif_table_block *b, enum sif_tab_type type, int index)
+{
+ struct sif_table *table = b->table;
+ size_t ext_sz = table->ext_sz;
+ char *desc = sif_mem_kaddr(table->mem, index * ext_sz);
+
+ /* Get from global index to block index */
+ index -= b->offset;
+
+ /* Clean descriptor entry for reuse:
+ * note that we clean the whole extent here which
+ * includes all of sif_##type for inlined types:
+ */
+ if (type == rq_hw) /* only zero out driver data structure */
+ memset(desc + sizeof(struct psif_rq_hw), 0, ext_sz - sizeof(struct psif_rq_hw));
+ else if (!is_cb_table(type) && type != qp && type != cq_hw)
+ memset(desc, 0, ext_sz);
+
+ sif_log(table->sdev, SIF_IDX2,
+ "%s[%d:%d] ", sif_table_name(type),
+ b->offset / table->entry_per_block, index);
+ clear_bit(index, b->bitmap);
+}
+
+
+/* Support for per protection domain table index allocations (2nd level allocation):
+ * Invariants:
+ * - sif_table_block entries are 0-initialized, and initialized to real values on demand.
+ * - We keep a list of blocks and try to allocate starting from the first in the list
+ * assuming that the last added block has the most free entries.
+ */
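+
+/* Worked example (illustrative): with entry_per_block == 64, a 1st level
+ * allocation of blk_idx 2 covers global indices 128..191. A 2nd level
+ * allocation finding free local bit 5 in that block yields global index
+ * 2 * 64 + 5 == 133, and freeing index 133 maps back to blk_idx 133 / 64 == 2,
+ * local bit 133 - 128 == 5:
+ */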
+
+int sif_pd_alloc_index(struct sif_pd *pd, enum sif_tab_type type)
+{
+ struct sif_dev *sdev = to_sdev(pd->ibpd.device);
+ struct sif_table *tp = &sdev->ba[type];
+ struct list_head *list = type_to_list(pd, type);
+ struct sif_table_block *b;
+ int idx = -1;
+
+ if (tp->entry_per_block == 1) /* Handle 1-level alloc case */
+ return sif_alloc_index(sdev, type);
+
+ spin_lock(&pd->lock);
+ list_for_each_entry(b, list, pd_list) {
+ idx = alloc_from_block(b, type);
+ if (idx >= 0)
+ break;
+ }
+ if (idx < 0) {
+ /* Allocate a new block */
+ int blk_idx = sif_alloc_index(sdev, type);
+
+ if (blk_idx >= 0) {
+ b = sif_get_block(tp, blk_idx);
+ sif_log(sdev, SIF_IDX2, "%s blk_idx %d: %p [%ld/%d]",
+ sif_table_name(type), blk_idx, b,
+ sizeof(struct sif_table_block), tp->block_ext);
+ b->table = tp;
+ b->pd = pd;
+ b->offset = blk_idx * tp->entry_per_block;
+ /* Don't modify last_used as we want it to survive (de)allocations */
+ list_add(&b->pd_list, list);
+ idx = alloc_from_block(b, type);
+ }
+ }
+ spin_unlock(&pd->lock);
+ return idx;
+}
+
+
+void sif_pd_free_index(struct sif_pd *pd, enum sif_tab_type type, int index)
+{
+ struct sif_dev *sdev = to_sdev(pd->ibpd.device);
+ struct sif_table *tp = &sdev->ba[type];
+ struct sif_table_block *b;
+ int bits_used;
+ int blk_idx = index / tp->entry_per_block;
+
+ if (tp->entry_per_block == 1) /* Handle 1-level alloc case */
+ return sif_free_index(sdev, type, index);
+
+ b = sif_get_block(tp, blk_idx);
+ if (!b->table) {
+ /* BUG */
+ sif_log(sdev, SIF_INFO, "index %d: block table ptr NULL - blk_idx %d table %s",
+ index, blk_idx, sif_table_name(type));
+ return;
+ }
+ spin_lock(&pd->lock);
+ free_to_block(b, type, index);
+ bits_used = bitmap_weight(b->bitmap, tp->entry_per_block);
+ if (!bits_used) {
+ list_del(&b->pd_list);
+ sif_free_index(sdev, type, blk_idx);
+ }
+ spin_unlock(&pd->lock);
+}
+
+
+bool sif_pd_index_used(struct sif_table *tp, int idx)
+{
+ struct sif_table_block *b;
+ int blk_idx = idx / tp->entry_per_block;
+
+ if (!test_bit(blk_idx, tp->bitmap))
+ return false;
+ b = sif_get_block(tp, blk_idx);
+ return test_bit(idx % tp->entry_per_block, b->bitmap);
+}
+
+
+bool sif_is_user_pd(struct sif_pd *pd)
+{
+ if (pd->ibpd.uobject)
+ return true;
+ /* TBD: We don't know if an XRC domain originates from user space,
+ * as it does not get any uobject
+ */
+ if (pd->xrcd) /* TBD: && pd->xrcd->ib_xrcd.uobject) */
+ return true;
+ return false;
+}
--- /dev/null
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_pd.h: Internal interface to protection domains
+ * and collect buffer management for SIF
+ */
+
+#ifndef __SIF_PD_H
+#define __SIF_PD_H
+
+struct sif_dev;
+struct sif_pd;
+struct sif_cb;
+struct sif_qp;
+struct sif_ucontext;
+
+/**** Protection domains ****/
+
+/* SIF supports a 24 bit PD index: */
+#define SIF_MAX_PD_INDEX ((1 << 24) - 1)
+
+struct sif_pd {
+ struct ib_pd ibpd;
+ int idx; /* index of this pd */
+ struct sif_xrcd *xrcd; /* If set, this pd is owned by an xrcd */
+ spinlock_t lock; /* Protects lists and their bitmaps while owned by us */
+ /* List of blocks of descriptor entries owned by this pd */
+ struct list_head qp_list;
+ struct list_head cq_list;
+ struct list_head rq_list;
+};
+
+struct sif_shpd {
+ struct ib_shpd ibshpd;
+ struct sif_pd *pd;
+};
+
+/* Initialize/deinitialize the pd subsystem */
+int sif_init_pd(struct sif_dev *sdev);
+void sif_deinit_pd(struct sif_dev *sdev);
+
+struct sif_pd *alloc_pd(struct sif_dev *sdev);
+int dealloc_pd(struct sif_pd *pd);
+
+
+/* Per protection domain table index allocations (2nd level allocation) */
+int sif_pd_alloc_index(struct sif_pd *pd, enum sif_tab_type type);
+void sif_pd_free_index(struct sif_pd *pd, enum sif_tab_type type, int index);
+
+/* 2-level and 1-level safe index usage check:
+ * idx is the entry index (not block index)
+ * and is assumed to be within bounds:
+ */
+bool sif_pd_index_used(struct sif_table *tp, int idx);
+
+bool sif_is_user_pd(struct sif_pd *pd);
+
+
+/**** Collect buffers ****/
+
+static inline bool is_cb_table(enum sif_tab_type type)
+{
+ return type == bw_cb || type == lat_cb;
+}
+
+
+/* Called from sif_base.c to initialize the cb tables */
+void sif_cb_table_init(struct sif_dev *sdev, enum sif_tab_type type);
+
+
+/* per collect buffer struct */
+struct sif_cb {
+ int idx; /* index of this cb */
+ bool is_lat_cb; /* High bandwidth or low latency cb */
+ spinlock_t lock; /* Serializes access to this cb */
+ u64 reqs; /* Number of requests on this cb */
+ struct psif_cb __iomem *cb; /* Pointer to the actual collect buffer space */
+};
+
+/* Allocation and deallocation of collect buffers
+ * If @lat_cb is set, allocate low latency CB instead of high bandwidth one:
+ */
+struct sif_cb *alloc_cb(struct sif_dev *sdev, bool lat_cb);
+void release_cb(struct sif_dev *sdev, struct sif_cb *cb);
+
+/* Find the driver struct for a collect buffer index, if associated with @uc
+ */
+struct sif_cb *sif_cb_from_uc(struct sif_ucontext *uc, u32 index);
+
+
+/*
+ * Write a prepared work request (in wqe) to the associated collect buffer:
+ * Return 0 on success, or -EBUSY if the collect buffer lock is already held
+ */
+int sif_cb_write(struct sif_qp *qp, struct psif_wr *wqe, int cp_len);
+
+
+/*
+ * Notify about a work request to the cb doorbell - triggering SQ mode:
+ */
+void sif_doorbell_write(struct sif_qp *qp, struct psif_wr *wqe, bool start);
+
+
+/*
+ * Force the SQS to process an already posted WR:
+ */
+void sif_doorbell_from_sqe(struct sif_qp *qp, u16 seq, bool start);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_pqp.c: Privileged QP handling
+ * The privileged QPs are SIFs internal send only QPs for management operations
+ */
+
+#include "sif_dev.h"
+#include "sif_cq.h"
+#include "sif_sq.h"
+#include "sif_base.h"
+#include "psif_hw_data.h"
+#include "psif_hw_setget.h"
+#include "sif_pqp.h"
+#include "sif_qp.h"
+#include "sif_hwi.h"
+#include "sif_ibqp.h"
+#include "sif_checksum.h"
+#include "sif_defs.h"
+
+static inline struct sif_qp *__create_init_qp(struct sif_dev *sdev, struct sif_cq *cq)
+{
+ struct sif_qp *qp;
+ struct ib_qp_init_attr init_attr = {
+ .event_handler = NULL,
+ .send_cq = &cq->ibcq,
+ .recv_cq = NULL, /* receive side not used */
+ .srq = NULL,
+ .cap = {
+ .max_send_wr = sif_max_pqp_wr,
+ .max_recv_wr = 0,
+ .max_send_sge = 0,
+ .max_recv_sge = 0,
+ .max_inline_data = 0
+ },
+ .qp_type = IB_QPT_UD,
+ };
+ struct sif_qp_init_attr sif_attr = {
+ .pd = sdev->pd,
+ .qp_type = PSIF_QP_TRANSPORT_MANSP1,
+ .qosl = QOSL_LOW_LATENCY,
+ .sq_hdl_sz = sizeof(struct sif_sq_hdl),
+ };
+
+ qp = create_qp(sdev, &init_attr, &sif_attr);
+ if (!IS_ERR(qp))
+ qp->ibqp.pd = &sdev->pd->ibpd;
+ return qp;
+}
+
+
+
+static struct sif_pqp *_sif_create_pqp(struct sif_dev *sdev, size_t alloc_sz, int comp_vector)
+{
+ struct sif_pqp *pqp;
+ struct sif_cq *cq;
+ struct sif_qp *qp;
+ struct sif_sq *sq = NULL;
+ int ret = 0;
+
+ /* The privileged QP only supports state in modify_qp */
+ struct ib_qp_attr mod_attr = {
+ .qp_state = IB_QPS_INIT
+ };
+
+ pqp = kzalloc(alloc_sz, GFP_KERNEL);
+ if (!pqp) {
+ sif_log(sdev, SIF_INFO, "Failed to allocate memory for priv.qp");
+ return NULL;
+ }
+
+ cq = create_cq(sdev->pd, sif_max_pqp_wr, comp_vector, SIFPX_OFF, false);
+ if (IS_ERR(cq)) {
+ ret = PTR_ERR(cq);
+ goto cq_alloc_failed;
+ }
+ cq->ibcq.device = &sdev->ib_dev;
+ pqp->cq = cq;
+ cq->pqp = pqp;
+ init_completion(&pqp->nonfull);
+
+ /* Now create a queue pair.
+ * TBD: Use a separate pqp for req_notify_cq and use low latency..
+ */
+ qp = __create_init_qp(sdev, cq);
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
+ goto qp_alloc_failed;
+ }
+
+ pqp->qp = qp;
+ sq = get_sif_sq(sdev, qp->qp_idx);
+ /* Reserve up to 2 entries (at most half the queue) for pqp requests that generate a completion on the PQP itself */
+ pqp->lowpri_lim = sq->entries - min_t(int, sq->entries/2, 2);
+
+ /* Run the required qp modify sequence */
+ ret = sif_modify_qp(&qp->ibqp, &mod_attr,
+ IB_QP_STATE, NULL);
+ if (ret)
+ goto qp_alloc_failed;
+
+ mod_attr.qp_state = IB_QPS_RTR;
+ ret = sif_modify_qp(&qp->ibqp, &mod_attr,
+ IB_QP_STATE, NULL);
+ if (ret)
+ goto qp_alloc_failed;
+
+ mod_attr.qp_state = IB_QPS_RTS;
+ mod_attr.sq_psn = 0;
+ ret = sif_modify_qp(&qp->ibqp, &mod_attr,
+ IB_QP_STATE, NULL);
+ if (ret)
+ goto qp_alloc_failed;
+
+ atomic64_set(&pqp->qp->arm_srq_holdoff_time, 0);
+
+ sif_log(sdev, SIF_QP, "success");
+ return pqp;
+
+qp_alloc_failed:
+ /* Special destruction order, see below: */
+ destroy_cq(cq);
+ if (sq)
+ sq->cq_idx = -1;
+
+ if (pqp->qp)
+ destroy_qp(sdev, qp);
+cq_alloc_failed:
+ kfree(pqp);
+ sif_log(sdev, SIF_QP, "failed with %d", ret);
+ return ERR_PTR(ret);
+}
+
+
+int sif_destroy_pqp(struct sif_dev *sdev, struct sif_pqp *pqp)
+{
+ struct sif_sq *sq = get_sif_sq(sdev, pqp->qp->qp_idx);
+ bool self_destruct = get_pqp(sdev) == pqp;
+ /* For the last pqp we make an exception from the IB standard requirements:
+ * we keep the PQP itself up and use it to send the invalidate for its own CQ
+ * **before** we take down the QP itself.
+ * The hardware makes sure that in this special case
+ * the completion is delivered before the CQ entry is invalidated.
+ */
+ int ret;
+
+ if (self_destruct) {
+ sif_log(sdev, SIF_PQP, "self destruct CQ %d", pqp->cq->index);
+ ret = destroy_cq(pqp->cq);
+ if (ret < 0)
+ return ret;
+
+ if (sq)
+ sq->cq_idx = -1;
+ }
+
+ ret = destroy_qp(sdev, pqp->qp);
+ if (ret < 0)
+ return ret;
+
+ /* Support the normal destruction order as long as we have
+ * other PQPs in the system:
+ */
+ if (!self_destruct) {
+ ret = destroy_cq(pqp->cq);
+ if (ret < 0)
+ return ret;
+
+ if (sq)
+ sq->cq_idx = -1;
+ }
+ kfree(pqp);
+ return 0;
+}
+
+
+struct sif_pqp *sif_create_pqp(struct sif_dev *sdev, int comp_vector)
+{
+ return _sif_create_pqp(sdev, sizeof(struct sif_pqp), comp_vector);
+}
+
+
+static void pqp_complete_nonfull(struct sif_pqp *pqp)
+{
+ int ql;
+ unsigned long flags;
+ struct sif_dev *sdev = to_sdev(pqp->cq->ibcq.device);
+ struct sif_sq *sq = get_sif_sq(sdev, pqp->qp->qp_idx);
+ struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, pqp->qp->qp_idx);
+ /* NB: early return - the waiter wakeup logic below is currently disabled */
+ return;
+ spin_lock_irqsave(&sq->lock, flags);
+ ql = sq_length(sq, sq_sw->head_seq, sq_sw->last_seq);
+ if (ql <= sq->mask && atomic_read(&pqp->waiters))
+ complete(&pqp->nonfull);
+ spin_unlock_irqrestore(&sq->lock, flags);
+}
+
+
+static inline void __pqp_complete_sq(struct sif_sq *sq, u32 sq_seq)
+{
+ /* TBD: Allow pqp posters to wait for completions */
+}
+
+
+
+static void pqp_reset_cmpl(struct sif_cqe *lcqe)
+{
+ struct sif_pqp *pqp = lcqe->pqp;
+ struct sif_cq *cq = pqp->cq;
+ struct sif_dev *sdev = to_sdev(cq->ibcq.device);
+ struct sif_sq *sq = get_sif_sq(sdev, pqp->qp->qp_idx);
+ struct sif_sq_hdl *wh = get_sq_hdl(sq, lcqe->sq_seq);
+ unsigned long flags;
+
+ spin_lock_irqsave(&cq->lock, flags);
+ wh->wr_id = 0;
+ wh->used = false;
+ spin_unlock_irqrestore(&cq->lock, flags);
+}
+
+
+
+/* Process all received completions on @cq - must be only PQP completions!
+ * Return the number processed, or -errno upon errors:
+ * Assumes the cq lock is held.
+ * If @first_err is set, check for error completions and copy the first one seen into @first_err:
+ */
+
+/* TBD: Clean up memory barriers in this function */
+static int __pqp_process_cqe(struct sif_pqp *pqp, struct sif_cqe *first_err)
+{
+ struct sif_cq *cq = pqp->cq;
+ struct sif_dev *sdev = to_sdev(cq->ibcq.device);
+ struct sif_sq_sw *sq_sw;
+ volatile struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index);
+ struct sif_sq *sq;
+ u32 seqno = cq_sw->next_seq;
+ volatile struct psif_cq_entry *cqe_be = get_cq_entry(cq, seqno);
+ int npolled = 0;
+ int cqe_cnt = 0;
+ u64 wci;
+ struct psif_send_completion_id *wc_id = (struct psif_send_completion_id *)&wci;
+ int sq_seq;
+ struct sif_cqe *lcqe;
+ struct sif_sq_hdl *wh;
+ int ql = 0;
+ u64 dbg_mask;
+ bool err_seen = false;
+
+ for (; seqno == get_psif_cq_entry__seq_num(cqe_be); npolled++) {
+ enum psif_wc_status status = get_psif_cq_entry__status(cqe_be);
+ int sq_idx = get_psif_cq_entry__qp(cqe_be);
+ bool dump_it = false;
+
+ sq = get_sif_sq(sdev, sq_idx);
+ sq_sw = get_sif_sq_sw(sdev, sq_idx);
+ wci = get_psif_cq_entry__wc_id(cqe_be);
+ sq_seq = wc_id->sq_seq_num;
+ wh = get_sq_hdl(sq, sq_seq);
+
+ if (unlikely(status != PSIF_WC_STATUS_SUCCESS)) {
+ sif_log(sdev, SIF_INFO, "error completion polled");
+ dump_it = true;
+ }
+
+ if (pqp->qp->flags & SIF_QPF_KI_STENCIL)
+ goto cont_check_first_err;
+
+ if (unlikely(!wh)) {
+ sif_log(sdev, SIF_INFO,
+ "cqe %d for cq %d refers sq(qp) %d which has not been initialized",
+ seqno, cq->index, sq_idx);
+ dump_it = true;
+ goto cont_no_wh;
+ }
+ if (unlikely(!wh->used)) {
+ sif_log(sdev, SIF_INFO,
+ "ignoring unused cqe %d for cq %d, sq %d, sq_seq %d",
+ seqno, cq->index, sq_idx, sq_seq);
+ dump_it = true;
+ goto cont;
+ }
+ if (unlikely(wh->sq_seq != sq_seq)) {
+ sif_log(sdev, SIF_INFO,
+ "wrong cqe %d for cq %d: got sq_seq %d, expected %d",
+ seqno, cq->index, sq_seq, wh->sq_seq);
+ dump_it = true;
+ goto cont;
+ }
+
+ lcqe = (struct sif_cqe *)wh->wr_id;
+ if (lcqe) {
+ wh->wr_id = 0;
+ cqe_cnt++;
+ mb();
+ sif_log(sdev, SIF_PQP, "copying to caller cqe at %p", &lcqe->cqe);
+ copy_conv_to_sw(&lcqe->cqe, cqe_be, sizeof(struct psif_cq_entry));
+ wmb();
+ lcqe->written = true;
+ if (lcqe->need_complete)
+ complete(&lcqe->cmpl);
+ }
+cont_check_first_err:
+ if (unlikely(first_err && (status != PSIF_WC_STATUS_SUCCESS))) {
+ sif_log(sdev, SIF_PQP, "error completion received - aborting");
+ copy_conv_to_sw(&first_err->cqe, cqe_be, sizeof(struct psif_cq_entry));
+ err_seen = true;
+ first_err->written = true;
+ npolled++;
+ }
+cont:
+ wh->used = 0;
+cont_no_wh:
+ if (dump_it) {
+ sif_logs(SIF_INFO,
+ write_struct_psif_cq_entry(NULL, 1,
+ (const struct psif_cq_entry *)cqe_be);
+ printk("\n"));
+ }
+
+ mb();
+ sq_sw->head_seq = sq_seq;
+ seqno = ++cq_sw->next_seq;
+
+ if (cq_length(cq, cq_sw->cached_head, seqno) >= cq->high_watermark) {
+ /* Update CQ hardware pointer */
+ set_psif_cq_sw__head_indx(&cq_sw->d, seqno);
+ cq_sw->cached_head = seqno;
+ }
+
+ ql = sq_length(sq, sq_seq, sq_sw->last_seq);
+ if (ql <= sq->mask)
+ pqp_complete_nonfull(pqp);
+ mb();
+ if (unlikely(err_seen))
+ break;
+ cqe_be = get_cq_entry(cq, seqno);
+ }
+
+ dbg_mask = npolled ? SIF_PQP : SIF_IPOLL;
+ sif_log(sdev, dbg_mask, "processed %d (%d with waiters) requests - seqno 0x%x, ql %d",
+ npolled, atomic_read(&pqp->waiters),
+ seqno, ql);
+
+ if (npolled > 0) {
+ /* reset timeout each time we see a new completion: */
+ pqp->timeout = jiffies + sdev->min_resp_ticks * 4;
+ }
+ return npolled;
+}
+
+
+static int pqp_process_cqe(struct sif_pqp *pqp, struct sif_cqe *first_err)
+{
+ unsigned long flags;
+ int npolled;
+ struct sif_cq *cq = pqp->cq;
+
+ /* If someone else holds the lock, the CQEs are handled */
+ if (!spin_trylock_irqsave(&cq->lock, flags))
+ return -EBUSY;
+ npolled = __pqp_process_cqe(pqp, first_err);
+ spin_unlock_irqrestore(&cq->lock, flags);
+ return npolled;
+}
+
+
+static struct sif_pqp *find_any_pqp(struct sif_dev *sdev)
+{
+ int cpu;
+
+ for (cpu = 0; cpu < sdev->pqp_cnt; cpu++)
+ if (sdev->pqp[cpu])
+ return sdev->pqp[cpu];
+ return NULL;
+}
+
+/* Get the right PQP for the same EQ */
+struct sif_pqp *get_pqp_same_eq(struct sif_dev *sdev, int comp_vector)
+{
+ unsigned int pqp_index = comp_vector - 2;
+ struct sif_pqp *pqp = sdev->pqp_cnt ? sdev->pqp[pqp_index % sdev->pqp_cnt] : NULL;
+
+ if (unlikely(!pqp)) {
+ /* Typically during take down */
+ return find_any_pqp(sdev);
+ }
+ return pqp;
+}
+
+
+/* Get the right PQP for the current CPU */
+struct sif_pqp *get_pqp(struct sif_dev *sdev)
+{
+ unsigned int cpu = smp_processor_id();
+ struct sif_pqp *pqp = sdev->pqp_cnt ? sdev->pqp[cpu % sdev->pqp_cnt] : NULL;
+
+ if (unlikely(!pqp)) {
+ /* Typically during take down */
+ return find_any_pqp(sdev);
+ }
+ return pqp;
+}
+
+/* Get the next PQP in a round robin fashion */
+struct sif_pqp *get_next_pqp(struct sif_dev *sdev)
+{
+ struct sif_pqp *pqp;
+ int next = atomic_inc_return(&sdev->next_pqp) % sdev->pqp_cnt;
+
+ pqp = sdev->pqp[next];
+ if (unlikely(!pqp)) {
+ /* Typically during take down */
+ return find_any_pqp(sdev);
+ }
+ return pqp;
+}
+
+struct sif_cb *get_cb(struct sif_qp *qp)
+{
+ struct sif_dev *sdev = to_sdev(qp->ibqp.pd->device);
+ unsigned int cpu = smp_processor_id();
+
+ return sdev->kernel_cb[qp->qosl][cpu % sdev->kernel_cb_cnt];
+}
+
+
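+/* Predicate used to pick the SQ fill limit in sif_pqp_write_send():
+ * requests posted in PM_WRITE mode, ordinary WRs with the completion bit set,
+ * and GENERATE_COMPLETION requests targeting the PQP's own CQ may use the
+ * full SQ, while everything else is limited to lowpri_lim:
+ */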
+inline bool pqp_req_gets_completion(struct sif_pqp *pqp, struct psif_wr *wr, enum post_mode mode)
+{
+ return mode == PM_WRITE || (wr->op != PSIF_WR_GENERATE_COMPLETION && wr->completion) ||
+ wr->cq_desc_vlan_pri_union.cqd_id == pqp->cq->index;
+}
+
+/* Fill in common parts and post a work request to the management QP for the current CPU
+ * If @cqe is non-null, a completion will be requested and the result put there in
+ * host order when it is found (by __pqp_process_cqe())
+ */
+int sif_pqp_write_send(struct sif_pqp *pqp, struct psif_wr *wr, struct sif_cqe *cqe,
+ enum post_mode mode)
+{
+ struct sif_qp *qp = pqp->qp;
+ u32 qp_idx = qp->qp_idx;
+ struct sif_dev *sdev = to_sdev(pqp->qp->ibqp.device);
+ struct sif_pd *pd = sdev->pd;
+ struct sif_sq *sq = get_sif_sq(sdev, qp_idx);
+ struct psif_sq_entry *sqe;
+ struct sif_sq_hdl *wh;
+ unsigned long flags;
+ bool ring_doorbell;
+ int q_sz;
+ int ret = 0;
+ u16 head, sq_seq;
+ struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp_idx);
+ unsigned long timeout = sdev->min_resp_ticks * 4;
+ u16 limit = pqp_req_gets_completion(pqp, wr, mode) ? sq->entries : pqp->lowpri_lim;
+ /* Per IBTA 11.4.1.1, error is only returned
+ * when the QP is in the RESET, INIT or RTR states.
+ */
+ if (qp->last_set_state < IB_QPS_RTS)
+ return -EINVAL; /* The pqp is not ready */
+
+ pqp->timeout = jiffies + timeout;
+
+ wr->local_qp = qp_idx;
+ wr->tsu_qosl = qp->qosl;
+ wr->tsu_sl = qp->tsl;
+
+restart:
+ /* Make sure emptying the queue takes preference over filling it up: */
+ if (mode != PM_WRITE)
+ ret = pqp_process_cqe(pqp, NULL);
+ if (ret > 0 || ret == -EBUSY)
+ ret = 0; /* Got some reqs */
+ else if (ret < 0)
+ return ret;
+
+ spin_lock_irqsave(&sq->lock, flags);
+ sq_seq = sq_sw->last_seq;
+ head = sq_sw->head_seq;
+ q_sz = sq_length(sq, head, sq_seq);
+
+ if (q_sz >= limit) {
+ if (sq_seq != pqp->last_full_seq) {
+ sif_log(sdev, SIF_PQP,
+ "Privileged qp full - head %d sq_seq %d q_sz %d/%d",
+ head, sq_seq, q_sz, sq->entries);
+ pqp->last_full_seq = sq_seq;
+ }
+ spin_unlock_irqrestore(&sq->lock, flags);
+
+ if (limit < sq->entries && sq_seq != pqp->last_nc_full) {
+ /* Avoid spinning creating more sync completions
+ * - block on next try unless sequence number has changed:
+ */
+ pqp->last_nc_full = sq_seq;
+ return -EAGAIN;
+ }
+
+ /* PQP requests to a full queue should not be generated at interrupt level */
+ BUG_ON(in_interrupt());
+ if (time_is_after_jiffies(pqp->timeout)) {
+ if (sq_seq != pqp->last_full_seq)
+ sif_log(sdev, SIF_PQP, "priv.qp %d: spin waiting for slot in queue",
+ pqp->qp->qp_idx);
+ goto restart;
+ } else {
+ sif_log(sdev, SIF_INFO,
+ "Timeout waiting for previous response (seq %d) to complete",
+ sq_sw->head_seq);
+ return -ETIMEDOUT;
+ }
+ }
+ sq_seq = ++sq_sw->last_seq;
+
+ /* Store longest send queue observed */
+ if (unlikely(q_sz > sq->max_outstanding && mode != PM_WRITE))
+ sq->max_outstanding = q_sz;
+
+ /* For GENERATE_COMPLETION the CQ id to generate in is put here
+ * and no completion is expected on the PQP.
+ */
+ if (wr->op == PSIF_WR_GENERATE_COMPLETION) {
+ /* Are we generating a completion on our own QP? */
+ if (wr->details.su.u2.target_qp == pqp->qp->qp_idx)
+ wr->details.su.wc_id.sq_id.sq_seq_num = sq_seq;
+ } else
+ wr->cq_desc_vlan_pri_union.cqd_id = sq->cq_idx;
+
+ wh = get_sq_hdl(sq, sq_seq);
+ wh->wr_id = (u64)cqe;
+ wh->sq_seq = sq_seq;
+ wh->used = true;
+
+ if (cqe) {
+ if ((wr->op != PSIF_WR_GENERATE_COMPLETION) || (wr->se)) {
+ cqe->sq_seq = sq_seq;
+ wr->completion = 1;
+ }
+ BUG_ON(cqe->written);
+ }
+
+ sqe = get_sq_entry(sq, sq_seq);
+
+ sif_log(sdev, SIF_PQP, "pd %d cq_idx %d sq_idx %d sq.seqn %d op %s",
+ pd->idx, wr->cq_desc_vlan_pri_union.cqd_id, sq->index, sq_seq,
+ string_enum_psif_wr_type(wr->op));
+
+ if (likely(mode != PM_WRITE)) {
+ u64 csum;
+
+ wr->sq_seq = sq_seq;
+
+ /* Collect_length is always 0 for privileged wr's - they have no data */
+ csum = csum32_partial(wr, sizeof(*wr), qp->magic);
+ csum = csum32_fold(csum);
+ wr->checksum = csum;
+
+ sif_log(sdev, SIF_PQP, "PQP checksum %x", wr->checksum);
+ }
+
+ sif_logs(SIF_DUMP, write_struct_psif_wr(NULL, 0, wr));
+
+ /* update send queue */
+ copy_conv_to_hw(sqe, wr, sizeof(struct psif_wr));
+
+ if (likely(mode != PM_WRITE)) {
+ /* Flush writes before updating the sw pointer,
+ * This is necessary to ensure that the sqs do not see
+ * an incomplete entry:
+ */
+ wmb();
+
+ /* Update sw pointer visible to hw */
+ set_psif_sq_sw__tail_indx(&sq_sw->d, sq_seq);
+
+ /* Finally write to collect buffer - implicit barriers before/after I/O writes
+ *
+ * Workaround #3595: ring doorbell if SQS in SQ-mode
+ */
+ ring_doorbell = qp->flags & SIF_QPF_FORCE_SQ_MODE ||
+ !(get_psif_sq_hw__sq_next(&sq->d) & 0x1) ||
+ mode == PM_DOORBELL;
+
+ if (ring_doorbell)
+ sif_doorbell_from_sqe(qp, sq_seq, true);
+ else if (sif_cb_write(qp, wr, sizeof(struct psif_wr))) {
+ /* vcb lock busy, use db mode instead */
+ sif_doorbell_from_sqe(qp, sq_seq, true);
+ }
+ }
+
+ spin_unlock_irqrestore(&sq->lock, flags);
+ return ret;
+}
+
+
+int sif_pqp_post_send(struct sif_dev *sdev, struct psif_wr *wr, struct sif_cqe *cqe)
+{
+ struct sif_pqp *pqp = cqe ? cqe->pqp : get_pqp(sdev);
+ enum post_mode mode = pqp->qp->flags & SIF_QPF_FORCE_SQ_MODE ? PM_DOORBELL : PM_CB;
+
+ return sif_pqp_write_send(pqp, wr, cqe, mode);
+}
+
+int sif_pqp_poll_wr(struct sif_dev *sdev, struct psif_wr *wr, struct sif_cqe *cqe)
+{
+ int ret = sif_pqp_post_send(sdev, wr, cqe);
+
+ if (ret) {
+ sif_log(sdev, SIF_INFO, "PQP wr %d post failed on QP %d, CQ %d",
+ wr->sq_seq, cqe->pqp->qp->qp_idx, cqe->pqp->cq->index);
+ return ret;
+ }
+
+ ret = poll_cq_waitfor(cqe);
+ if (ret < 0)
+ sif_log(sdev, SIF_INFO, "poll_cq_waitfor, pqp QP %d, CQ %d failed with %d",
+ cqe->pqp->qp->qp_idx, cqe->pqp->cq->index, ret);
+ return ret;
+}
+
+
+/* Poll and process incoming (internal) completions
+ * while waiting for this particular completion
+ */
+int poll_cq_waitfor(struct sif_cqe *lcqe)
+{
+ struct sif_pqp *pqp = lcqe->pqp;
+ struct sif_cq *cq = pqp->cq;
+ struct sif_dev *sdev = to_sdev(cq->ibcq.device);
+ int ret = 0;
+ volatile bool *written = &lcqe->written;
+ u64 min_resp_ticks = sdev->min_resp_ticks;
+
+ /* TBD: This timeout is unsafe - we just keep it for now to allow runs to be aborted
+ * without having to reboot. Keep its value a factor larger than the other timeouts:
+ */
+ pqp->timeout = jiffies + min_resp_ticks * 4;
+
+ while (!(*written)) {
+ ret = pqp_process_cqe(pqp, NULL);
+ if (ret == -EBUSY) {
+ ret = 0;
+ continue;
+ } else if (ret < 0)
+ break;
+ else if (ret == 0) {
+ if (time_is_before_jiffies(pqp->timeout)) {
+ if (sif_feature(pcie_trigger))
+ force_pcie_link_retrain(sdev);
+ sif_log(sdev, SIF_INFO,
+ "cq %d: poll for cqe %p timed out", cq->index, lcqe);
+ atomic_inc(&cq->timeout_cnt);
+
+ sif_logs(SIF_PQPT,
+ struct sif_sq *sq = get_sif_sq(sdev, pqp->qp->qp_idx);
+ struct psif_sq_entry *sqe =
+ get_sq_entry(sq, lcqe->sq_seq);
+ write_struct_psif_sq_entry(NULL, 1, sqe));
+ ret = -ETIMEDOUT;
+ break;
+ }
+ if (!in_interrupt()) /* TBD: Fix this as well */
+ cond_resched();
+ else
+ cpu_relax();
+
+ if (sdev->min_resp_ticks != min_resp_ticks) {
+ /* Give us a quick way out by changing min_resp_ticks */
+ pqp->timeout -= (min_resp_ticks - sdev->min_resp_ticks) * 4;
+ min_resp_ticks = sdev->min_resp_ticks;
+ }
+ continue;
+ }
+ }
+
+ if (ret < 0)
+ pqp_reset_cmpl(lcqe);
+ return ret;
+}
+
+
+/* Poll for any pqp completion, return the number of completions polled */
+static int poll_cq_waitfor_any(struct sif_pqp *pqp, struct sif_cqe *first_err)
+{
+ struct sif_cq *cq = pqp->cq;
+ struct sif_dev *sdev = to_sdev(cq->ibcq.device);
+ int ret = 0;
+ u64 min_resp_ticks = sdev->min_resp_ticks;
+
+ pqp->timeout = jiffies + min_resp_ticks * 4;
+
+ while (!ret) {
+ ret = pqp_process_cqe(pqp, first_err);
+ if (ret == -EBUSY) {
+ ret = 0;
+ continue;
+ } else if (ret < 0)
+ break;
+ else if (ret == 0) {
+ if (time_is_before_jiffies(pqp->timeout)) {
+ if (sif_feature(pcie_trigger))
+ force_pcie_link_retrain(sdev);
+ sif_log(sdev, SIF_INFO,
+ "cq %d: poll timed out", cq->index);
+ atomic_inc(&cq->timeout_cnt);
+ ret = -ETIMEDOUT;
+ break;
+ }
+ if (!in_interrupt()) /* TBD: Fix this as well */
+ cond_resched();
+ else
+ cpu_relax();
+
+ if (sdev->min_resp_ticks != min_resp_ticks) {
+ /* Give us a quick way out by changing min_resp_ticks */
+ pqp->timeout -= (min_resp_ticks - sdev->min_resp_ticks) * 4;
+ min_resp_ticks = sdev->min_resp_ticks;
+ }
+ }
+ }
+ sif_log(sdev, SIF_PQP, "ret = %d", ret);
+ return ret;
+}
+
+
+/***** Generic completion generation *****/
+
+static int __gen_cqe(struct sif_dev *sdev, u32 target_cq, u64 wc_id, u32 target_qp,
+ enum psif_wc_opcode opcode, enum psif_wc_status status, struct sif_cqe *cqe,
+ bool event)
+{
+ struct psif_wr wr;
+
+ memset(&wr, 0, sizeof(struct psif_wr));
+ wr.op = PSIF_WR_GENERATE_COMPLETION;
+ wr.cq_desc_vlan_pri_union.cqd_id = target_cq;
+ wr.details.su.completion_status = status;
+ wr.details.su.completion_opcode = opcode;
+
+ if (opcode >= PSIF_WC_OPCODE_RECEIVE_SEND)
+ wr.details.su.wc_id.rq_id = wc_id;
+ else
+ wr.details.su.wc_id.sq_id.sq_seq_num = wc_id;
+
+ wr.details.su.u2.target_qp = target_qp;
+ /* set the IB_CQ_SOLICITED flag because the CQ might be armed
+ * and the consumer might be interested in getting these events.
+ * Setting IB_CQ_SOLICITED is generally safe because it is a
+ * subset of IB_CQ_NEXT_COMP.
+ */
+ if (event)
+ wr.se = 1;
+
+ return sif_pqp_post_send(sdev, &wr, cqe);
+}
+
+
+/* Generate a SUCCESS completion on the PQP itself
+ * We use this to be able to wait until a preceding set of completions generated
+ * towards other CQs has been delivered:
+ */
+int gen_pqp_cqe(struct sif_cqe *cqe)
+{
+ struct sif_pqp *pqp = cqe->pqp;
+ struct sif_dev *sdev = to_sdev(pqp->cq->ibcq.device);
+ struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, pqp->qp->qp_idx);
+
+ cqe->written = false;
+
+ sif_log(sdev, SIF_PQP, " for sq %d, last_nc_full %d, head_seq %d last_seq %d",
+ pqp->qp->qp_idx, pqp->last_nc_full, sq_sw->head_seq, sq_sw->last_seq);
+ return __gen_cqe(sdev, pqp->cq->index, 0, pqp->qp->qp_idx,
+ PSIF_WC_OPCODE_GENERATE_COMPLETION, PSIF_WC_STATUS_SUCCESS,
+ cqe, true);
+}
+
+
+/* Post a request to generate a completion with the given values
+ * on the cq identified by @target_cq.
+ * This request generates no completion on the PQP itself:
+ */
+static int sif_gen_cqe(struct sif_dev *sdev, u32 target_cq, u64 wc_id, u32 target_qp,
+ enum psif_wc_opcode opcode, enum psif_wc_status status, bool event)
+{
+ return __gen_cqe(sdev, target_cq, wc_id, target_qp, opcode, status, NULL, event);
+}
+
+/* Post a request to generate a completion for an outstanding rq entry
+ * on the given qp. This request generates no completion on the PQP itself:
+ */
+
+static int sif_gen_rq_cqe(struct sif_dev *sdev, struct sif_rq *rq, u32 rq_seq,
+ struct sif_qp *target_qp, enum psif_wc_opcode opcode,
+ enum psif_wc_status status)
+{
+ struct psif_rq_entry *rqe = get_rq_entry(rq, rq_seq);
+ u64 wc_id = get_psif_rq_entry__rqe_id(rqe);
+ u32 cq_idx = get_psif_qp_core__rcv_cq_indx(&target_qp->d.state);
+
+ sif_log(sdev, SIF_PQP, "on rq %d, rq_seq %d, wc_id %llx, cq %d (target_qp %d)",
+ rq->index, rq_seq, wc_id, cq_idx, target_qp->qp_idx);
+
+ return sif_gen_cqe(sdev, cq_idx, wc_id, target_qp->qp_idx, opcode, status, true);
+}
+
+
+int sif_gen_rq_flush_cqe(struct sif_dev *sdev, struct sif_rq *rq,
+ u32 rq_seq, struct sif_qp *target_qp)
+{
+ return sif_gen_rq_cqe(sdev, rq, rq_seq, target_qp,
+ PSIF_WC_OPCODE_RECEIVE_SEND, PSIF_WC_STATUS_WR_FLUSH_ERR);
+}
+
+/* Post a request to generate a completion for an outstanding sq entry
+ * on the given qp. This request generates no completion on the PQP itself:
+ */
+
+static int sif_gen_sq_cqe(struct sif_dev *sdev, struct sif_sq *sq, u32 sq_seq, u32 target_qp,
+ enum psif_wc_opcode opcode, enum psif_wc_status status, bool event)
+{
+ struct psif_sq_entry *sqe = get_sq_entry(sq, sq_seq);
+ u64 wc_id = get_psif_wr__sq_seq(&sqe->wr);
+
+ sif_log(sdev, SIF_PQP, "on sq %d, sq_seq %d, wc_id %llx, cq %d (target_qp %d)",
+ sq->index, sq_seq, wc_id, sq->cq_idx, target_qp);
+
+ return sif_gen_cqe(sdev, sq->cq_idx, wc_id, target_qp, opcode, status, event);
+}
+
+
+int sif_gen_sq_flush_cqe(struct sif_dev *sdev, struct sif_sq *sq,
+ u32 sq_seq, u32 target_qp, bool event)
+{
+ return sif_gen_sq_cqe(sdev, sq, sq_seq, target_qp,
+ PSIF_WC_OPCODE_SEND, PSIF_WC_STATUS_WR_FLUSH_ERR, event);
+}
+
+
+/***** Stencil PQP support ****
+ *
+ * A stencil PQP is a PQP set up fully populated with WRs ready
+ * for parallel batch processing (using SQSes) of particularly performance
+ * critical PQP operations.
+ *
+ * The idea is to lay this out to allow the WRs to be reused with minimal
+ * updates:
+ */
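+
+/* Illustrative batch flow for key invalidates via a stencil PQP (assumed usage,
+ * the actual callers live outside this file):
+ *
+ * struct sif_st_pqp *spqp = sif_alloc_ki_spqp(sdev);
+ * if (spqp) {
+ * sif_inv_key_update_st(spqp, index, PCM_POST); (repeat per key index)
+ * sif_inv_key_update_st(spqp, last_index, PCM_WAIT); (flush and wait)
+ * sif_release_ki_spqp(spqp);
+ * }
+ */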
+
+struct sif_st_pqp *sif_create_inv_key_st_pqp(struct sif_dev *sdev)
+{
+ int i;
+ struct sif_st_pqp *spqp = (struct sif_st_pqp *)_sif_create_pqp(sdev, sizeof(*spqp), 0);
+ struct sif_pqp *pqp;
+ int qp_idx;
+ struct sif_sq *sq;
+ struct sif_sq_sw *sq_sw;
+ struct psif_sq_entry *sqe;
+ struct psif_wr lwr;
+ u16 max_db_int;
+
+ if (IS_ERR(spqp))
+ return spqp;
+
+ pqp = &spqp->pqp;
+ qp_idx = pqp->qp->qp_idx;
+ sq = get_sif_sq(sdev, qp_idx);
+ sq_sw = get_sif_sq_sw(sdev, qp_idx);
+ max_db_int = (sq->entries >> 3);
+
+ /* Pre-populate the SQ */
+ for (i = 0; i < sq->entries; i++)
+ sif_write_invalidate(pqp, key, 0, NULL, PCM_POST, PM_WRITE);
+
+ /* Now, to start using the stencil at seq.1 (as normal SQs)
+ * we must reset the sw tail pointer which
+ * was updated by sif_write_invalidate:
+ */
+ sq_sw->last_seq = 0;
+ spqp->doorbell_seq = 1;
+
+ spqp->doorbell_interval = min_t(u16, SPQP_DOORBELL_INTERVAL, max_db_int);
+ spqp->next_doorbell_seq = spqp->doorbell_interval + 1;
+ spqp->req_compl = 0;
+ spqp->next_poll_seq = (sq->entries >> 1);
+ spqp->sq = sq;
+ spqp->sq_sw = sq_sw;
+ spqp->pqp.qp->flags |= SIF_QPF_KI_STENCIL;
+
+ /* Calculate a partial checksum
+ * - they are all the same since the fields we change
+ * are calculated with 0-values to ease checksum mod. later:
+ */
+ sqe = get_sq_entry(sq, 0);
+ copy_conv_to_sw(&lwr, &sqe->wr, sizeof(lwr));
+ spqp->checksum = csum32_partial(&lwr, sizeof(lwr), pqp->qp->magic);
+
+ sif_log(sdev, SIF_PQPT, "done qp %d, sq sz %d, next_poll_seq %d", qp_idx,
+ sq->entries, spqp->next_poll_seq);
+ return spqp;
+}
+
+
+int sif_destroy_st_pqp(struct sif_dev *sdev, struct sif_st_pqp *spqp)
+{
+ return sif_destroy_pqp(sdev, &spqp->pqp);
+}
+
+
+/* Update a new invalidate key request into a preconfigured stencil pqp
+ * Assumes exclusive access to the PQP SQ.
+ */
+int sif_inv_key_update_st(struct sif_st_pqp *spqp, int index, enum wr_mode mode)
+{
+ struct sif_sq *sq = spqp->sq;
+ struct sif_sq_sw *sq_sw = spqp->sq_sw;
+ u16 sq_seq = ++sq_sw->last_seq;
+ struct psif_sq_entry *sqe = get_sq_entry(sq, sq_seq);
+ struct sif_dev *sdev = to_sdev(spqp->pqp.cq->ibcq.device);
+ bool poll_prev = false;
+ int ret = 1;
+ u64 csum_inc = (u64)index + (u64)sq_seq;
+ u64 csum;
+ int q_sz;
+ u16 head;
+ DECLARE_SIF_CQE_POLL(sdev, first_err);
+
+ /* Modify the request to our need */
+ set_psif_wr_su__key(&sqe->wr.details.su, index);
+ set_psif_wr__sq_seq(&sqe->wr, sq_seq);
+
+ head = sq_sw->head_seq;
+ q_sz = sq_length(sq, head, sq_seq);
+
+ if (unlikely(q_sz > (int)sq->entries)) {
+ sif_log(sdev, SIF_INFO, "Error: Stencil pqp (qp %d) is full at seq %d, head %d",
+ sq->index, sq_seq, sq_sw->head_seq);
+ sq_sw->last_seq--;
+ return -ENOMEM;
+ }
+
+ /* Store longest send queue observed */
+ if (unlikely(q_sz > sq->max_outstanding))
+ sq->max_outstanding = q_sz;
+
+ if (unlikely(mode == PCM_WAIT || sq_seq == spqp->next_poll_seq)) {
+ set_psif_wr__completion(&sqe->wr, 1);
+ spqp->req_compl++;
+ sif_log(sdev, SIF_PQPT, "sq %d: requesting completion for seq %d (%d)",
+ sq->index, sq_seq, spqp->req_compl);
+ poll_prev = spqp->req_compl > 1;
+ if (sq_seq == spqp->next_poll_seq)
+ spqp->next_poll_seq += (sq->entries >> 1);
+ csum_inc += 0x80000000;
+ } else {
+ /* Reset the completion bit in case it was set in the previous generation! */
+ set_psif_wr__completion(&sqe->wr, 0);
+ }
+
+ /* Add the changes to the checksum */
+ csum = csum32_partial(&csum_inc, 8, spqp->checksum);
+ csum = csum32_fold(csum);
+ set_psif_wr__checksum(&sqe->wr, csum);
+
+ sif_log(sdev, SIF_PQP, "cq %d, sq %d, sq seq %d%s", spqp->pqp.cq->index,
+ sq->index, sq_seq, (poll_prev ? " (poll prev)" : ""));
+
+ if (unlikely(mode == PCM_WAIT || sq_seq == spqp->next_doorbell_seq)) {
+ sif_log(sdev, SIF_PQPT, "sq %d: writing doorbell at seq %d - tail at %d%s",
+ sq->index, spqp->doorbell_seq, sq_seq, (mode == PCM_WAIT ? " [wait]" : ""));
+ wmb();
+ set_psif_sq_sw__tail_indx(&sq_sw->d, sq_seq);
+ sif_doorbell_from_sqe(spqp->pqp.qp, spqp->doorbell_seq, true);
+ spqp->doorbell_seq = sq_seq + 1;
+ spqp->next_doorbell_seq = sq_seq + spqp->doorbell_interval + 1;
+ }
+
+ if (poll_prev) {
+ sif_log(sdev, SIF_PQPT, "enter wait (poll_prev) (%d)", spqp->req_compl);
+ ret = poll_cq_waitfor_any(&spqp->pqp, &first_err);
+ if (ret < 0)
+ goto out;
+ if (unlikely(first_err.written)) {
+ sif_log(sdev, SIF_INFO, "error completion with status %s",
+ string_enum_psif_wc_status(first_err.cqe.status));
+ goto out;
+ }
+ sif_log(sdev, SIF_PQPT, "polled %d completions", ret);
+ spqp->req_compl -= ret;
+ }
+
+ if (unlikely(mode == PCM_WAIT)) {
+ while (sq_sw->head_seq != sq_seq) {
+ sif_log(sdev, SIF_PQPT, "enter wait (%d) seq %d/%d",
+ spqp->req_compl, sq_sw->head_seq, sq_seq);
+ ret = poll_cq_waitfor_any(&spqp->pqp, &first_err);
+ if (ret < 0)
+ break;
+ spqp->req_compl -= ret;
+ sif_log(sdev, SIF_PQPT, "done wait - head now %d - rem.cmpl %d",
+ sq_sw->head_seq, spqp->req_compl);
+ }
+ }
+
+ if (ret == 0)
+ ret = -ENOMEM;
+ else if (ret > 0)
+ ret = 0;
+
+out:
+ sif_log(sdev, SIF_PQP, "done ret = %d", ret);
+ return ret;
+}
+
+
+/* get exclusive access to a stencil pqp */
+struct sif_st_pqp *sif_alloc_ki_spqp(struct sif_dev *sdev)
+{
+ int index;
+ struct sif_st_pqp *spqp = NULL;
+
+ mutex_lock(&sdev->ki_spqp.lock);
+ index = find_next_zero_bit(sdev->ki_spqp.bitmap, sdev->ki_spqp.pool_sz, 0);
+ if (index < sdev->ki_spqp.pool_sz) {
+ set_bit(index, sdev->ki_spqp.bitmap);
+ spqp = sdev->ki_spqp.spqp[index];
+ }
+ mutex_unlock(&sdev->ki_spqp.lock);
+ sif_log(sdev, SIF_PQPT, "bit index %d", index);
+ return spqp;
+}
+
+void sif_release_ki_spqp(struct sif_st_pqp *spqp)
+{
+ struct sif_dev *sdev = to_sdev(spqp->pqp.cq->ibcq.device);
+
+ mutex_lock(&sdev->ki_spqp.lock);
+ clear_bit(spqp->index, sdev->ki_spqp.bitmap);
+ mutex_unlock(&sdev->ki_spqp.lock);
+ sif_log(sdev, SIF_PQPT, "bit index %d", spqp->index);
+}
--- /dev/null
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang <knut.omang@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters
+ *
+ * sif_pqp.h: Privileged QP handling
+ */
+
+#ifndef __SIF_PQP_H
+#define __SIF_PQP_H
+
+struct sif_qp;
+struct sif_cq;
+struct sif_rq;
+struct sif_sq;
+struct completion;
+enum post_mode;
+
+/* Data structure used by PQP requesters to get the completion information,
+ * and optionally block waiting for it to arrive:
+ */
+struct sif_cqe {
+ struct psif_cq_entry cqe; /* host order copy of hw cqe */
+ struct completion cmpl; /* a completion to wait on for response */
+ struct sif_pqp *pqp; /* Priv.qp to wait on */
+ bool need_complete; /* cmpl is initialized and a waiter is present */
+ bool written; /* Set to true when a completion has been copied here */
+ u16 sq_seq; /* set by post_send to allow us to reset ourselves */
+};
+
+/*
+ * Declare and initialize a data structure to receive a polled completion.
+ * cqe.status is initialized to something != SUCCESS
+ */
+#define DECLARE_SIF_CQE_POLL(d_, c_)\
+ struct sif_cqe c_ = { \
+ .cqe.status = PSIF_WC_STATUS_FIELD_MAX,\
+ .pqp = get_pqp(d_),\
+ .need_complete = false,\
+ .written = false,\
+ }
+
+#define DECLARE_SIF_CQE_WITH_SAME_EQ(d_, c_, e_) \
+ struct sif_cqe c_ = { \
+ .cqe.status = PSIF_WC_STATUS_FIELD_MAX,\
+ .pqp = get_pqp_same_eq(d_, e_), \
+ .need_complete = false,\
+ .written = false,\
+ }
+
+
+#define DECLARE_SIF_CQE_WAIT(d_, c_)\
+ struct sif_cqe c_ = { \
+ .cqe.status = PSIF_WC_STATUS_FIELD_MAX,\
+ .pqp = get_pqp(d_),\
+ .need_complete = true,\
+ .written = false,\
+ };\
+ init_completion(&c_.cmpl)
+
+#define DECLARE_SIF_CQE_POLL_WITH_RR_PQP(d_, c_)\
+ struct sif_cqe c_ = { \
+ .cqe.status = PSIF_WC_STATUS_FIELD_MAX,\
+ .pqp = get_next_pqp(d_),\
+ .need_complete = false,\
+ .written = false,\
+ }
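+
+/* Typical (illustrative) calling pattern for a synchronous privileged request,
+ * mirroring what sif_pqp_poll_wr() does:
+ *
+ * DECLARE_SIF_CQE_POLL(sdev, lcqe);
+ * ret = sif_pqp_post_send(sdev, &wr, &lcqe);
+ * if (!ret)
+ * ret = poll_cq_waitfor(&lcqe);
+ * (on success lcqe.cqe holds the completion in host order)
+ */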
+
+
+struct sif_pqp {
+ struct sif_qp *qp; /* The qp used */
+ struct sif_cq *cq; /* Associated completion queue for this priv.QP */
+ unsigned long timeout; /* rescheduled when new completions observed */
+ struct completion nonfull; /* allow a poster to wait for a cred */
+ atomic_t waiters; /* number of waiters on nonfull */
+ u16 last_full_seq; /* For logging purposes, record when last observed full */
+ u16 last_nc_full; /* Track when to return EAGAIN to flush non-compl.entries */
+ u16 lowpri_lim; /* Max number of outstanding low priority reqs */
+};
+
+struct sif_pqp *sif_create_pqp(struct sif_dev *sdev, int comp_vector);
+int sif_destroy_pqp(struct sif_dev *sdev, struct sif_pqp *pqp);
+
+/* Get the right PQP for the current CPU */
+struct sif_pqp *get_pqp(struct sif_dev *sdev);
+
+/* Get the right PQP with the same EQ */
+struct sif_pqp *get_pqp_same_eq(struct sif_dev *sdev, int comp_vector);
+
+/* Get the next PQP in round robin fashion */
+struct sif_pqp *get_next_pqp(struct sif_dev *sdev);
+
+/* Get the right CB for the current CPU for the given QP */
+struct sif_cb *get_cb(struct sif_qp *qp);
+
+static inline struct sif_cq *pqp_cq(struct sif_dev *sdev)
+{
+ return (get_pqp(sdev))->cq;
+}
+
+static inline struct sif_qp *pqp_qp(struct sif_dev *sdev)
+{
+ return (get_pqp(sdev))->qp;
+}
+
+/* Fill in common parts and post a work request to the management QP for the current CPU
+ * If @cqe is non-null, a completion will be requested and eventually reflected to @cqe
+ * in host order.
+ */
+int sif_pqp_post_send(struct sif_dev *sdev, struct psif_wr *wr, struct sif_cqe *cqe);
+
+/* Same as post send but allow post_mode - sif_pqp_post_send uses PM_CB */
+int sif_pqp_write_send(struct sif_pqp *pqp, struct psif_wr *wr, struct sif_cqe *cqe,
+ enum post_mode mode);
+
+
+/* Poll and process incoming (internal) completions
+ * while waiting for this particular completion
+ */
+int poll_cq_waitfor(struct sif_cqe *lcqe);
+
+int sif_pqp_poll_wr(struct sif_dev *sdev, struct psif_wr *wr, struct sif_cqe *cqe);
+
+
+
+/* Generate a SUCCESS completion on the PQP itself
+ * We use this to be able to wait until a preceding set of completions generated
+ * towards other CQs has been delivered:
+ */
+int gen_pqp_cqe(struct sif_cqe *cqe);
+
+/* Post a request to generate a flushed-in-error completion for an outstanding rq entry
+ * on the given qp. This request generates no completion on the PQP itself:
+ */
+int sif_gen_rq_flush_cqe(struct sif_dev *sdev, struct sif_rq *rq,
+ u32 rq_seq, struct sif_qp *target_qp);
+
+/* Post a request to generate a flushed-in-error completion for an outstanding sq entry
+ * on the given qp. This request generates no completion on the PQP itself:
+ */
+int sif_gen_sq_flush_cqe(struct sif_dev *sdev, struct sif_sq *sq,
+ u32 sq_seq, u32 target_qp, bool notify_ev);
+
+/* Stencil PQP support - pre-populated PQPs for special performance sensitive use cases */
+
+#define SPQP_DOORBELL_INTERVAL 8192
+
+struct sif_st_pqp {
+ struct sif_pqp pqp; /* The PQP to use - must be first */
+ struct sif_sq *sq; /* Short path to sq */
+ struct sif_sq_sw *sq_sw;/* Short path to sq_sw */
+ int index; /* The index of this st_pqp within its pool */
+ u16 doorbell_interval; /* Interval between each doorbell write */
+ u16 doorbell_seq; /* Seq.no to use in next doorbell */
+ u16 next_doorbell_seq; /* Next seqno to ring doorbell */
+ u16 req_compl; /* Number of completions requested */
+ u16 next_poll_seq; /* Next seqno to set completion and wait/poll for one */
+ u64 checksum; /* Host endian partial checksum of stencil WR entries */
+};
+
+
+/* Stencil PQP management */
+struct sif_spqp_pool {
+ struct mutex lock; /* Protects access to this pool */
+ struct sif_st_pqp **spqp; /* Key invalidate stencil PQPs */
+ u32 pool_sz; /* Number of stencil PQPs set up */
+ ulong *bitmap; /* Bitmap for allocation from spqp */
+};
+
+
+struct sif_st_pqp *sif_create_inv_key_st_pqp(struct sif_dev *sdev);
+
+/* get exclusive access to a stencil pqp */
+struct sif_st_pqp *sif_alloc_ki_spqp(struct sif_dev *sdev);
+void sif_release_ki_spqp(struct sif_st_pqp *spqp);
+
+/* Update a new invalidate key request into a preconfigured stencil pqp
+ * Assumes exclusive access to the PQP SQ.
+ */
+int sif_inv_key_update_st(struct sif_st_pqp *spqp, int index, enum wr_mode mode);
+
+
+int sif_destroy_st_pqp(struct sif_dev *sdev, struct sif_st_pqp *spqp);
+
+#endif