From: Knut Omang Date: Wed, 25 May 2016 09:01:11 +0000 (+0200) Subject: sif driver initial commit part 2 X-Git-Tag: v4.1.12-92~148^2~19 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=c9f42a310e20c1d7491b014457fed14ee3559f1e;p=users%2Fjedix%2Flinux-maple.git sif driver initial commit part 2 sif_fwa.c: Firmware access API (netlink based out-of-band comm) sif_fwa.h: Low level access to a SIF device sif_hwi.c: Hardware init for SIF - combines the various init steps for psif sif_hwi.h: Hardware init for SIF sif_ibcq.h: External interface to IB completion queue logic for SIF sif_ibpd.h: External interface to (IB) protection domains for SIF sif_ibqp.h: External interface to IB queue pair logic for sif sif_idr.c: Synchronized ID ref allocation sif_idr.h: simple id allocation and deallocation for SIF sif_int_user.h: This file defines special internal data structures used sif_ireg.c: Utilities and entry points needed for Infiniband registration sif_ireg.h: support functions used in setup of sif as an IB HCA sif_main.c: main entry points and initialization sif_mem.c: SIF table memory and page table management sif_mem.h: A common interface for all memory used by sif_mmu.c: main entry points and initialization sif_mmu.h: API for management of sif's on-chip mmu. sif_mr.c: Implementation of memory regions support for SIF sif_mr.h: Interface to internal IB memory registration logic for SIF sif_mw.c: Implementation of memory windows for SIF sif_mw.h: Interface to internal IB memory window logic for SIF sif_pd.c: Implementation of IB protection domains for SIF sif_pd.h: Internal interface to protection domains sif_pqp.c: Privileged QP handling sif_pqp.h: Privileged QP handling Signed-off-by: Knut Omang --- diff --git a/drivers/infiniband/hw/sif/sif_fwa.c b/drivers/infiniband/hw/sif/sif_fwa.c new file mode 100644 index 0000000000000..c6501db7d6521 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_fwa.c @@ -0,0 +1,589 @@ +/* + * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_fwa.c: Firmware access API (netlink based out-of-band comm) + * + */ +#include "sif_dev.h" +#include "sif_fwa.h" + +#include +#include +#include "sif_enl.h" +#include "sif_defs.h" +#include "sif_query.h" +#include "sif_base.h" +#include "sif_qp.h" +#include "psif_hw_csr.h" +#include "sif_drvapi.h" + +/* Generic netlink protocol family definition */ +static struct genl_family sif_enl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "sif_enl", + .version = 1, + .maxattr = 16 +}; + +/* Netlink request handlers */ +static int sif_fwa_req(struct sk_buff *skb, struct genl_info *info); +static int sif_fwa_drv_req(struct sk_buff *skb, struct genl_info *info); + +/* Netlink req operation definition */ +static struct genl_ops sif_enl_ops[] = { + { + .cmd = SIF_ENL_CMD_REQ, + .flags = 0, + .policy = sif_enl_policy, + .doit = sif_fwa_req, + .dumpit = NULL, + }, + + { + .cmd = SIF_ENL_CMD_REQ_DRV, + .flags = 0, + .policy = sif_enl_policy, + .doit = sif_fwa_drv_req, + .dumpit = NULL, + } +}; + + +/* Global datastructure to keep track of instances and number of active + * processes: + */ + +struct fwa_data { + struct list_head sdev_list; /* Access to devices */ + spinlock_t lock; /* Protects device list */ +}; + +static struct fwa_data fwa; + + +/* Called from sif_init/exit to set up/clean up global data structures + * such as netlink communication and device registry: + */ +int sif_fwa_init(void) +{ + int stat; + + INIT_LIST_HEAD(&fwa.sdev_list); + spin_lock_init(&fwa.lock); + + stat = genl_register_family_with_ops(&sif_enl_family, sif_enl_ops); + if (stat) + goto fail; + + sif_log0(SIF_INIT, "Enabled firmware access API"); + return 0; +fail: + sif_log0(SIF_INIT, "ERROR: Failed to enable firmware access API - error %d", stat); + return stat; +} + +void sif_fwa_exit(void) +{ + sif_log0(SIF_INIT, "Disabling firmware access API"); + genl_unregister_family(&sif_enl_family); +} + + +/* Called from probe to register a new device */ +int sif_fwa_register(struct sif_dev *sdev) +{ + struct pci_dev *pdev = sdev->pdev; + + sif_log(sdev, SIF_INIT, "register device %02x:%02x.%d", + pdev->bus->number, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); + spin_lock(&fwa.lock); + list_add_tail(&sdev->fwa.list, &fwa.sdev_list); + spin_unlock(&fwa.lock); + return 0; +} + +/* Called from remove to unregister a device */ +void sif_fwa_unregister(struct sif_dev *sdev) +{ + spin_lock(&fwa.lock); + list_del(&sdev->fwa.list); + spin_unlock(&fwa.lock); +} + + +static struct sif_dev *fwa_find_dev(struct genl_info *info) +{ + struct sif_dev *sdev = NULL; + struct sif_dev *s; + + u16 domain = nla_get_u16(info->attrs[SIF_ENL_A_COMPLEX]); + u16 bus = nla_get_u16(info->attrs[SIF_ENL_A_BUS]); + u16 devfn = nla_get_u16(info->attrs[SIF_ENL_A_DEVFN]); + + /* TBD: Ref.count access to sdev */ + sif_log0(SIF_FWA, "bus %x devfn %x", + bus, devfn); + + spin_lock(&fwa.lock); + list_for_each_entry(s, &fwa.sdev_list, fwa.list) { + if (domain == pci_domain_nr(s->pdev->bus) && + bus == s->pdev->bus->number && + devfn == s->pdev->devfn) { + sdev = s; + break; + } + sif_log(s, SIF_FWA, "bus %x devfn %x", s->pdev->bus->number, s->pdev->devfn); + } + spin_unlock(&fwa.lock); + return sdev; +} + + +static int fwa_valid_opcode(struct sif_dev *sdev, struct psif_epsc_csr_req *req, + enum psif_mbox_type eps_num) +{ + switch (req->opcode) { + case EPSC_SETUP: + case EPSC_SETUP_BASEADDR: + case EPSC_SET_BASEADDR: + case EPSC_SET_BASEADDR_EQ: + case 
EPSC_SET_ONE_CSR: + /* These are kernel only */ + return -EPERM; + case EPSC_HOST_INT_CHANNEL_CTRL: + case EPSC_HOST_INT_COMMON_CTRL: + case EPSC_SET_LID: + case EPSC_SET_EOIB_MAC: + case EPSC_UF_RESET: + case EPSC_MODIFY_QP: + case EPSC_GET_SINGLE: + case EPSC_GET_ONE_CSR: + case EPSC_QUERY: + case EPSC_SET: + case EPSC_QUERY_QP: + case EPSC_QUERY_DEVICE: + case EPSC_QUERY_PORT_1: + case EPSC_QUERY_PORT_2: + case EPSC_QUERY_PKEY: + case EPSC_QUERY_GID: + case EPSC_MODIFY_DEVICE: + case EPSC_MODIFY_PORT_1: + case EPSC_MODIFY_PORT_2: + case EPSC_MC_ATTACH: + case EPSC_MC_DETACH: + case EPSC_MC_QUERY: + case EPSC_FLASH_START: + case EPSC_FLASH_ERASE_SECTOR: + case EPSC_FLASH_RD: + case EPSC_FLASH_WR: + case EPSC_FLASH_STOP: + case EPSC_A_CONTROL: + case EPSC_LINK_CNTRL: + case EPSC_UPDATE: + case EPSC_UF_CTRL: + case EPSC_VIMMA_CTRL: + /* These are not meaningful for the EPSAs for now */ + if (eps_num == sdev->mbox_epsc) + return 0; + else + return -EPERM; + case EPSC_NOOP: + case EPSC_MAILBOX_PING: + case EPSC_KEEP_ALIVE: + case EPSC_EVENT_ACK: + case EPSC_EVENT_INDEX: + case EPSC_TEST_HOST_RD: + case EPSC_TEST_HOST_WR: + case EPSC_FW_VERSION: + case EPSC_LOG_CTRL: + case EPSC_LOG_REQ_NOTIFY: + case EPSC_A_COMMAND: + case EPSC_EXERCISE_MMU: + case EPSC_CLI_ACCESS: + break; + case EPSC_LAST_OP: + default: + /* Fail on all unknown operations: */ + sif_log(sdev, SIF_FWA, "Unknown operation %d", req->opcode); + return -EINVAL; + } + return 0; +} + + +static int sif_fwa_verify_find_dev(struct genl_info *info, struct sif_dev **sdev_p, int payload_len) +{ + struct sif_dev *sdev; + int len; + + if (!info->attrs[SIF_ENL_A_COMPLEX]) { + sif_log0(SIF_FWA, "PCI complex no. not set!"); + return -EINVAL; + } + + if (!info->attrs[SIF_ENL_A_BUS]) { + sif_log0(SIF_FWA, "PCI bus no. not set!"); + return -EINVAL; + } + + if (!info->attrs[SIF_ENL_A_DEVFN]) { + sif_log0(SIF_FWA, "PCI device/function not set!"); + return -EINVAL; + } + + if (!info->attrs[SIF_ENL_A_PAYLOAD]) { + sif_log0(SIF_FWA, "Received empty request!"); + return -EINVAL; + } + len = nla_len(info->attrs[SIF_ENL_A_PAYLOAD]); + if (len < payload_len) { + sif_log0(SIF_FWA, "Request too short!"); + return -EFAULT; + } + + /* TBD: Better input checking... 
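+ * (currently only attribute presence and a minimum payload length are verified)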
*/ + + sdev = fwa_find_dev(info); + if (!sdev) { + sif_log0(SIF_FWA, "No such device found!"); + return -ENODEV; + } + *sdev_p = sdev; + return 0; +} + + +static int sif_fwa_drv_req(struct sk_buff *skb, struct genl_info *info) +{ + int msg_sz; + int stat; + size_t data_sz = 0; + struct sif_dev *sdev; + struct sif_drv_req *req = NULL; + struct sif_drv_rsp rsp; + enum psif_mbox_type eps_num; + struct sk_buff *resp_skb; + void *data; + int ret; + + if (!capable(CAP_NET_ADMIN)) { + sif_log0(SIF_FWA, "Request from client without the CAP_NET_ADMIN privilege"); + return -EPERM; + } + + ret = sif_fwa_verify_find_dev(info, &sdev, sizeof(struct sif_drv_req)); + if (ret) + return ret; + + req = nla_data(info->attrs[SIF_ENL_A_PAYLOAD]); + + sif_log(sdev, SIF_FWA, "op %d", req->opcode); + + if (IS_SIBS(sdev)) { + sif_log(sdev, SIF_FWA, "Device does not have any EPS-A modules"); + return -EINVAL; + } + + eps_num = epsa_to_mbox(req->u.epsa.epsa); + if (eps_num == (enum psif_mbox_type)-1) { + sif_log(sdev, SIF_FWA, "Unknown EPS-A %d", req->u.epsa.epsa); + return -EINVAL; + } + + switch (req->opcode) { + case SIF_DRV_CMD_EPSA_SETUP: + ret = sif_activate_epsa(sdev, eps_num); + rsp.opcode = SIF_DRV_CMD_EPSA_SETUP; + rsp.eps_rsp.status = ret; + break; + case SIF_DRV_CMD_EPSA_TEARDOWN: + break; + } + + if (ret) + return ret; + + /* Start building a response */ + msg_sz = NLMSG_DEFAULT_SIZE + data_sz; + resp_skb = nlmsg_new(msg_sz, GFP_KERNEL); + if (!resp_skb) + return -ENOMEM; + + data = genlmsg_put_reply(resp_skb, info, &sif_enl_family, + 0, SIF_ENL_CMD_RSP_DRV); + if (data == NULL) { + stat = -ENOMEM; + goto put_fail; + } + + stat = nla_put(resp_skb, SIF_ENL_A_PAYLOAD, sizeof(struct sif_drv_rsp), &rsp); + if (stat) { + sif_log(sdev, SIF_FWA, "failed to append response to netlink packet"); + goto put_fail; + } + + /* Recompute message header */ + genlmsg_end(resp_skb, data); + + stat = genlmsg_reply(resp_skb, info); + if (stat) { + sif_log(sdev, SIF_FWA, "failed to send reply - status %d", stat); + goto put_fail; + } + + sif_log(sdev, SIF_FWA, "Sent response with drv opcode %d msg sz %d", + rsp.opcode, msg_sz); + return 0; +put_fail: + nlmsg_free(resp_skb); + return stat; +} + +static int sif_fwa_req(struct sk_buff *skb, struct genl_info *info) +{ + int len; + int stat; + int msg_sz; + struct sif_dev *sdev; + enum psif_mbox_type eps_num; + struct sif_eps *es; + void *data; + size_t data_sz = 0; + struct psif_epsc_csr_req *req = NULL; + struct psif_epsc_csr_rsp rsp; + struct psif_query_qp *qqp; + struct sk_buff *resp_skb; + void *kaddr = NULL; + + if (!capable(CAP_NET_ADMIN)) { + sif_log0(SIF_FWA, "Request from client without the CAP_NET_ADMIN privilege"); + return -EPERM; + } + + stat = sif_fwa_verify_find_dev(info, &sdev, sizeof(struct psif_epsc_csr_req)); + if (stat) + return stat; + + req = nla_data(info->attrs[SIF_ENL_A_PAYLOAD]); + + if (info->attrs[SIF_ENL_A_INDEX]) { + eps_num = nla_get_u32(info->attrs[SIF_ENL_A_INDEX]); + if (IS_SIBS(sdev)) { + if (eps_num == MBOX_EPSC) + eps_num = SIBS_MBOX_EPSC; + else { + sif_log(sdev, SIF_FWA, "Invalid EPS selection (%d)", eps_num); + return -EINVAL; + } + } + if (eps_num >= sdev->eps_cnt) { + sif_log(sdev, SIF_FWA, "Invalid EPS selection (%d)", eps_num); + return -EINVAL; + } + } else { + /* Default to use the EPSC (bw.comp) */ + eps_num = sdev->mbox_epsc; + } + + sif_log(sdev, SIF_FWA, "%s to %s", + string_enum_psif_epsc_csr_opcode(req->opcode), + string_enum_psif_mbox_type(eps_num)); + + es = &sdev->es[eps_num]; + if (es->state != ES_ACTIVE) { + 
sif_log0(SIF_FWA, "Communication with EPS%s has not been set up (state = %d)!", + eps_name(sdev, eps_num), es->state); + return -ENODEV; + } + + /* Check that this opcode is valid in this context */ + stat = fwa_valid_opcode(sdev, req, eps_num); + if (stat) { + if (stat == -EPERM) + sif_log(sdev, SIF_FWA, + "Operation %s not permitted for EPS%s from user space", + string_enum_psif_epsc_csr_opcode(req->opcode), + eps_name(sdev, eps_num)); + return stat; + } + + + /* The below opcodes picks up additional data from (fixed) buffers */ + switch (req->opcode) { + case EPSC_QUERY_DEVICE: + req->u.query_hw.address = + (u64)es->data_dma_hdl + + offsetof(struct sif_epsc_data, dev); + kaddr = &es->data->dev; + data_sz = sizeof(struct psif_epsc_device_attr); + break; + case EPSC_QUERY_PORT_1: + req->u.query_hw.address = + (u64)es->data_dma_hdl + + offsetof(struct sif_epsc_data, port[0]); + kaddr = &es->data->port[0]; + data_sz = sizeof(struct psif_epsc_port_attr); + break; + case EPSC_QUERY_PORT_2: + req->u.query_hw.address = + (u64)es->data_dma_hdl + + offsetof(struct sif_epsc_data, port[1]); + kaddr = &es->data->port[1]; + data_sz = sizeof(struct psif_epsc_port_attr); + break; + case EPSC_QUERY_QP: + { + struct sif_qp *qps; + u32 qp_idx = req->u.query_qp.ctrl.qp_num; + + if (qp_idx >= sdev->ba[qp].entry_cnt) + return -ENOENT; + qps = get_sif_qp(sdev, qp_idx); + kaddr = qqp = &qps->qqp; + req->u.query_qp.address = sif_qqp_dma_addr(sdev, qps); + data_sz = sizeof(struct psif_query_qp); + break; + } + case EPSC_FLASH_RD: + case EPSC_FLASH_WR: + data_sz = req->u.flash.length; + if (data_sz) + kaddr = &es->data->flash; + + /* Use the reserved 'flash' buffer allocated with the EPSC's resp.queue: */ + req->u.flash.host_addr = es->data_dma_hdl + + offsetof(struct sif_epsc_data, flash); + req->u.flash.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx; + break; + case EPSC_CLI_ACCESS: + data_sz = MAX_FWA_NL_PAYLOAD; + kaddr = &es->data->epsc_cli; + + /* Use the reserved 'epsc_cli' buffer allocated with the EPSC's resp. queue: */ + req->u.cli.host_addr = es->data_dma_hdl + + offsetof(struct sif_epsc_data, epsc_cli); + req->u.cli.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx; + break; + case EPSC_VIMMA_CTRL: + data_sz = MAX_FWA_NL_PAYLOAD; + kaddr = &es->data->vimm_agent; + + /* Use the reserved 'vimm_agent' buffer allocated with the EPSC's resp. 
queue: */ + req->u.vimma_ctrl.host_addr = es->data_dma_hdl + + offsetof(struct sif_epsc_data, vimm_agent); + req->u.vimma_ctrl.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx; + break; + case EPSC_UPDATE: + switch (req->u.update.opcode) { + case EPSC_UPDATE_OP_READ: + case EPSC_UPDATE_OP_WRITE: + /* Use the reserved 'flash' buffer allocated with the EPSC's resp.queue: */ + req->u.update.host_addr = es->data_dma_hdl + + offsetof(struct sif_epsc_data, flash); + req->u.update.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx; + /* fall through */ + case EPSC_UPDATE_OP_POLL: + data_sz = req->u.update.length; + kaddr = &es->data->flash; + break; + default: + break; + } + break; + default: + break; + } + + /* Copy any extra input data to the kernel buffer: */ + if (info->attrs[SIF_ENL_A_DATA]) { + len = nla_len(info->attrs[SIF_ENL_A_DATA]); + data = nla_data(info->attrs[SIF_ENL_A_DATA]); + switch (req->opcode) { + case EPSC_UPDATE: + if (req->u.update.opcode != EPSC_UPDATE_OP_WRITE) + break; + /* fall through */ + case EPSC_FLASH_WR: + case EPSC_CLI_ACCESS: + case EPSC_VIMMA_CTRL: + if (kaddr) { + memcpy(kaddr, data, len); + sif_log(sdev, SIF_FWA, "dma kaddr %p data %p len %x", + kaddr, data, len); + mb(); + } else + sif_log(sdev, SIF_FWA, "Found aux.data input but no data area"); + break; + default: + sif_log(sdev, SIF_FWA, "Found aux.data input in unexpected op %s", + string_enum_psif_epsc_csr_opcode(req->opcode)); + break; + } + } + + stat = sif_eps_wr(sdev, eps_num, req, &rsp); + switch (stat) { + case -ETIMEDOUT: + return stat; + default: + break; + } + + if (data_sz > MAX_FWA_NL_PAYLOAD) + return -EMSGSIZE; + + /* Start building a response */ + msg_sz = NLMSG_DEFAULT_SIZE + data_sz; + resp_skb = nlmsg_new(msg_sz, GFP_KERNEL); + if (!resp_skb) { + sif_log(sdev, SIF_FWA, "failed to allocate netlink packet"); + return -ENOMEM; + } + + data = genlmsg_put_reply(resp_skb, info, &sif_enl_family, + 0, SIF_ENL_CMD_RSP); + if (data == NULL) { + sif_log(sdev, SIF_FWA, "failed to add generic netlink header"); + stat = -ENOMEM; + goto put_fail; + } + + stat = nla_put(resp_skb, SIF_ENL_A_PAYLOAD, sizeof(struct psif_epsc_csr_rsp), &rsp); + if (stat) { + sif_log(sdev, SIF_FWA, "failed to append response to netlink packet"); + goto put_fail; + } + + if (kaddr && req->opcode != EPSC_FLASH_WR && + !(req->opcode == EPSC_UPDATE && req->u.update.opcode == EPSC_UPDATE_OP_WRITE)) { + stat = nla_put(resp_skb, SIF_ENL_A_DATA, data_sz, kaddr); + if (stat) { + sif_log(sdev, SIF_FWA, "failed to append %ld bytes of data", data_sz); + goto put_fail; + } + } + + /* Recompute message header */ + genlmsg_end(resp_skb, data); + + stat = genlmsg_reply(resp_skb, info); + if (stat) { + sif_log(sdev, SIF_FWA, "failed to send reply - status %d", stat); + goto put_fail; + } + + sif_log(sdev, SIF_FWA, "Sent response with opcode %s msg sz %d", + string_enum_psif_epsc_csr_opcode(rsp.opcode), msg_sz); + return 0; +put_fail: + nlmsg_free(resp_skb); + return stat; +} diff --git a/drivers/infiniband/hw/sif/sif_fwa.h b/drivers/infiniband/hw/sif/sif_fwa.h new file mode 100644 index 0000000000000..dd806c3bacc37 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_fwa.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_fwa.h: Low level access to a SIF device + * + * Allows access to low level functions such as (re)programming the EPSC flash + * via direct access to the EPSC protocol proxied via Netlink. + * Requires CAP_NETADMIN privileges. + */ + +#ifndef __SIF_FWA_H +#define __SIF_FWA_H +#include + +struct sif_dev; + +/* The max size we support sending/receiving from user space + * in a single netlink message. + * Limited by a 4k max netlink message size: + */ +#define MAX_FWA_NL_PAYLOAD 0x800 + +/* Per instance data structure */ +struct sif_fwa { + struct list_head list; /* Linkage for the global list */ +}; + +/* Called from sif_init/exit to set up/clean up global data structures + * such as netlink communication and device registry: + */ +int sif_fwa_init(void); +void sif_fwa_exit(void); + +/* Called from probe to register a new device */ +int sif_fwa_register(struct sif_dev *sdev); + +/* Called from remove to unregister a device */ +void sif_fwa_unregister(struct sif_dev *sdev); + +/* Value definition for the fwa module parameter: */ +#define SIF_FWA_MR_ENABLE 0x1 /* Enable FWA mode */ + +#endif diff --git a/drivers/infiniband/hw/sif/sif_hwi.c b/drivers/infiniband/hw/sif/sif_hwi.c new file mode 100644 index 0000000000000..0c07b45e9ce10 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_hwi.c @@ -0,0 +1,602 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_hwi.c: Hardware init for SIF - combines the various init steps for psif + */ + +#include "sif_dev.h" +#include "sif_hwi.h" +#include "sif_base.h" +#include "sif_cq.h" +#include "sif_pqp.h" +#include "sif_qp.h" +#include "sif_ibqp.h" +#include "sif_pd.h" +#include "sif_eq.h" +#include "sif_xrc.h" +#include "sif_defs.h" +#include "sif_query.h" +#include "psif_hw_data.h" +#include "psif_hw_setget.h" +#include +#include +#include + +/* Create the special SIF privileged QP which is used + * for special sif specific work requests such as for instance + * requesting completion event notification on a cq. + */ + +static void sif_pqp_fini(struct sif_dev *sdev); + + +static int sif_chip_init(struct sif_dev *sdev); +static void sif_chip_deinit(struct sif_dev *sdev); + + +static int sif_pqp_init(struct sif_dev *sdev) +{ + struct sif_pqp *pqp; + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + int i; + int ret = 0; + uint n_pqps = es->eqs.cnt - 2; + + sdev->pqp = sif_kmalloc(sdev, sizeof(struct sif_pqp *) * n_pqps, GFP_KERNEL | __GFP_ZERO); + if (!sdev->pqp) + return -ENOMEM; + + for (i = 0; i < n_pqps; i++) { + pqp = sif_create_pqp(sdev, i); + if (IS_ERR(pqp)) { + if ((i > 0) && + !(eps_version_ge(es, 0, 42))) { + sif_log(sdev, SIF_INFO, + "SIF device has an old FW version that only supports one pqp"); + break; + } + ret = PTR_ERR(pqp); + goto failed; + } + sdev->pqp[i] = pqp; + } + sdev->pqp_cnt = i; + atomic_set(&sdev->next_pqp, 0); + return 0; + +failed: + sdev->pqp_cnt = i; + sif_pqp_fini(sdev); + return ret; +} + + +static void sif_pqp_fini(struct sif_dev *sdev) +{ + /* we must maintain a consistent state of the PQP array + * during takedown as these operations themselves + * generate PQP requests.. 
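+ * (each PQP except the last is unlinked from the array before it is destroyed, so the remaining entries stay usable for the requests that takedown itself generates)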
+ */ + while (sdev->pqp_cnt > 0) { + int i = sdev->pqp_cnt - 1; + struct sif_pqp *pqp = sdev->pqp[i]; + + if (i > 0) { + /* Remove ourselves first, except the final PQP */ + sdev->pqp[i] = NULL; + sdev->pqp_cnt--; + } + sif_destroy_pqp(sdev, pqp); + if (i == 0) + sdev->pqp_cnt--; + } + kfree(sdev->pqp); + sdev->pqp = NULL; +} + + +static void sif_ki_spqp_fini(struct sif_dev *sdev); + +static int sif_ki_spqp_init(struct sif_dev *sdev) +{ + int i; + int ret = 0; + int n = max(sif_ki_spqp_size, 0U); + int bm_len = max(1, n/8); + + mutex_init(&sdev->ki_spqp.lock); + sdev->ki_spqp.spqp = +#ifdef CONFIG_NUMA + kmalloc_node(sizeof(struct sif_st_pqp *) * n, GFP_KERNEL | __GFP_ZERO, + sdev->pdev->dev.numa_node); +#else + kmalloc(sizeof(struct sif_st_pqp *) * n, GFP_KERNEL | __GFP_ZERO); +#endif + if (!sdev->ki_spqp.spqp) + return -ENOMEM; + + sdev->ki_spqp.bitmap = +#ifdef CONFIG_NUMA + kmalloc_node(sizeof(ulong) * bm_len, GFP_KERNEL | __GFP_ZERO, + sdev->pdev->dev.numa_node); +#else + kmalloc(sizeof(ulong) * bm_len, GFP_KERNEL | __GFP_ZERO); +#endif + if (!sdev->ki_spqp.bitmap) { + ret = -ENOMEM; + goto bm_failed; + } + + for (i = 0; i < n; i++) { + struct sif_st_pqp *spqp = sif_create_inv_key_st_pqp(sdev); + + if (IS_ERR(spqp)) { + ret = PTR_ERR(spqp); + break; + } + sdev->ki_spqp.spqp[i] = spqp; + spqp->index = i; + } + sdev->ki_spqp.pool_sz = i; + if (ret && i) { + sif_log(sdev, SIF_INFO, "Failed to create %d INVALIDATE_KEY stencil QPs", i); + sif_ki_spqp_fini(sdev); + } + + if (i) + sif_log(sdev, SIF_INFO, "Created %d INVALIDATE_KEY stencil QPs", i); +bm_failed: + if (ret) + kfree(sdev->ki_spqp.spqp); + return 0; /* Never fail on stencil PQP allocation */ +} + + +static void sif_ki_spqp_fini(struct sif_dev *sdev) +{ + int i; + + if (!sdev->ki_spqp.spqp) + return; + for (i = sdev->ki_spqp.pool_sz - 1; i >= 0; i--) + sif_destroy_st_pqp(sdev, sdev->ki_spqp.spqp[i]); + kfree(sdev->ki_spqp.bitmap); + kfree(sdev->ki_spqp.spqp); + sdev->ki_spqp.spqp = NULL; +} + + +static void sif_hw_kernel_cb_fini(struct sif_dev *sdev) +{ + int i; + + while (sdev->kernel_cb_cnt > 0) { + int j = sdev->kernel_cb_cnt - 1; + + for (i = 0; i < 2; i++) + if (sdev->kernel_cb[i][j]) + release_cb(sdev, sdev->kernel_cb[i][j]); + sdev->kernel_cb_cnt--; + } + for (i = 0; i < 2; i++) + kfree(sdev->kernel_cb[i]); +} + + + +static int sif_hw_kernel_cb_init(struct sif_dev *sdev) +{ + int i; + uint n_cbs = min(sif_cb_max, num_present_cpus()); + + if (!n_cbs) + n_cbs = 1; + + for (i = 0; i < 2; i++) { + sdev->kernel_cb[i] = kcalloc(n_cbs, sizeof(struct sif_cb *), GFP_KERNEL); + if (!sdev->kernel_cb[i]) + goto alloc_failed; + } + + for (i = 0; i < n_cbs; i++) { + sdev->kernel_cb[0][i] = alloc_cb(sdev, false); + if (!sdev->kernel_cb[0][i]) + goto alloc_failed; + sdev->kernel_cb[1][i] = alloc_cb(sdev, true); + if (!sdev->kernel_cb[1][i]) + goto alloc_failed; + } + sdev->kernel_cb_cnt = i; + return 0; + +alloc_failed: + sdev->kernel_cb_cnt = i; + sif_hw_kernel_cb_fini(sdev); + return -ENOMEM; +} + + +static int get_tsl_map(struct sif_dev *sdev, + int opcode, + int port, + struct psif_tsl_map *map) +{ + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + + /* EPSC supports the new requests starting from v.0.56 */ + if (eps_fw_version_ge(&sdev->es[sdev->mbox_epsc], 0, 56)) { + int ret = 0; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_QUERY; + req.u.query.data.op = opcode; + req.u.query.data.index = port; + + ret = sif_epsc_wr(sdev, &req, &rsp); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to query sl to tsl map, 
opcode %s, port %d", + string_enum_psif_epsc_query_op(opcode) + strlen("EPSC_QUERY_"), + port); + return ret; + } + *map = *((struct psif_tsl_map *)&rsp.data); + return 0; + } + + sif_log(sdev, SIF_INFO, "PSIF API %s has fw version less than %s. Cannot retrieve SL2TSL map", + "0.98", "0.56"); + return -EOPNOTSUPP; +} + + +static void setup_sl2tsl_map(struct sif_dev *sdev) +{ + int port; + int sl; + int qosl; + + + /* TBD: separate bulk and rcv pqp vcb/tsl */ + for (port = 0; port < 2; ++port) { + sdev->pqp_rcn_tsl[port] = TSL_PRIV; + sdev->pqp_bulk_tsl[port] = TSL_PRIV; + sdev->pqp_qosl_rcn_hint[port] = QOSL_LOW_LATENCY; + sdev->pqp_qosl_bulk_hint[port] = QOSL_LOW_LATENCY; + } + + /* Default or least aggressive common denominator */ + memset(sdev->sl2tsl + 0, TSL_DATA, sizeof(sdev->sl2tsl)); + memset(sdev->qp0_tsl + 0, TSL_DATA, sizeof(sdev->qp0_tsl)); + + if (eps_fw_version_lt(&sdev->es[sdev->mbox_epsc], 0, 50)) { + sif_log(sdev, SIF_INFO, "Using a single TSL for regular QPs (fw < 0.50)"); + return; + } + + /* See BZ 3883 and https://cod.no.oracle.com/gerrit/r/#/c/6587/ */ + for (sl = 0; sl < 16; ++sl) + for (port = 0; port < 2; ++port) + for (qosl = QOSL_HIGH_BANDWIDTH; qosl <= QOSL_LOW_LATENCY; ++qosl) + sdev->sl2tsl[sl][port][qosl] = port ? TSL_DATA_1 : TSL_DATA; + + if (eps_fw_version_lt(&sdev->es[sdev->mbox_epsc], 0, 56)) { + sif_log(sdev, SIF_INFO, "Setting up TSL per port (0.50 <= fw <= 0.56)"); + return; + } + +#define GET_TSL(i) map.m ## i ## _tsl +#define GET_QOS(i) map.m ## i ## _tqos + + { + struct psif_tsl_map map; + int opc; + + sif_log(sdev, SIF_TSL, "Retrieving SL to TSL map from epsc (fw >= 0.56)"); + + for (port = 0; port < 2; ++port) { + if (get_tsl_map(sdev, EPSC_QUERY_MAP_PQP_TO_TSL, port + 1, &map)) + return; + /* RCN pqp info in first entry, bulk in second */ + sdev->pqp_rcn_tsl[port] = GET_TSL(0); + sdev->pqp_bulk_tsl[port] = GET_TSL(1); + sdev->pqp_qosl_rcn_hint[port] = GET_QOS(0); + sdev->pqp_qosl_bulk_hint[port] = GET_QOS(1); + } + + for (opc = EPSC_QUERY_MAP_SL_TO_TSL_LO; opc <= EPSC_QUERY_MAP_SL_TO_TSL_HI; ++opc) { + bool last8 = opc == EPSC_QUERY_MAP_SL_TO_TSL_HI; + + for (port = 0; port < 2; ++port) { + if (get_tsl_map(sdev, opc, port + 1, &map)) + return; + for (qosl = QOSL_HIGH_BANDWIDTH; qosl <= QOSL_LOW_LATENCY; ++qosl) { + sdev->sl2tsl[8*last8 + 0][port][qosl] = GET_TSL(0); + sdev->sl2tsl[8*last8 + 1][port][qosl] = GET_TSL(1); + sdev->sl2tsl[8*last8 + 2][port][qosl] = GET_TSL(2); + sdev->sl2tsl[8*last8 + 3][port][qosl] = GET_TSL(3); + sdev->sl2tsl[8*last8 + 4][port][qosl] = GET_TSL(4); + sdev->sl2tsl[8*last8 + 5][port][qosl] = GET_TSL(5); + sdev->sl2tsl[8*last8 + 6][port][qosl] = GET_TSL(6); + sdev->sl2tsl[8*last8 + 7][port][qosl] = GET_TSL(7); + + sdev->qp_qosl_hint[8*last8 + 0][port] = GET_QOS(0); + sdev->qp_qosl_hint[8*last8 + 1][port] = GET_QOS(1); + sdev->qp_qosl_hint[8*last8 + 2][port] = GET_QOS(2); + sdev->qp_qosl_hint[8*last8 + 3][port] = GET_QOS(3); + sdev->qp_qosl_hint[8*last8 + 4][port] = GET_QOS(4); + sdev->qp_qosl_hint[8*last8 + 5][port] = GET_QOS(5); + sdev->qp_qosl_hint[8*last8 + 6][port] = GET_QOS(6); + sdev->qp_qosl_hint[8*last8 + 7][port] = GET_QOS(7); + } + } + } + + if (!eps_version_ge(&sdev->es[sdev->mbox_epsc], 1, 6)) { + sif_log(sdev, SIF_INFO, "FW version does not not support special QP0 TSL"); + return; + } + for (port = 0; port < 2; ++port) { + if (get_tsl_map(sdev, EPSC_QUERY_MAP_QP0_TO_TSL, port + 1, &map)) + return; + sdev->qp0_tsl[port] = GET_TSL(0); + sdev->qp0_qosl_hint[port] = GET_QOS(0); + } + } +#undef GET_TSL +#undef 
GET_QOS +} + + +static void dump_sl2tsl_map(struct sif_dev *sdev) +{ + int sl; + int port; + int qosl; + + for (port = 0; port < 2; ++port) { + sif_log(sdev, SIF_TSL, "rcn pqp port:%d tsl:%2d fw_hint:%s", + port + 1, sdev->pqp_rcn_tsl[port], + string_enum_psif_tsu_qos(sdev->pqp_qosl_rcn_hint[port]) + strlen("QOSL_")); + sif_log(sdev, SIF_TSL, "bulk pqp port:%d tsl:%2d fw_hint:%s", + port + 1, sdev->pqp_bulk_tsl[port], + string_enum_psif_tsu_qos(sdev->pqp_qosl_bulk_hint[port]) + strlen("QOSL_")); + } + + for (port = 0; port < 2; ++port) + for (sl = 0; sl < 16; ++sl) + for (qosl = QOSL_HIGH_BANDWIDTH; qosl <= QOSL_LOW_LATENCY; ++qosl) + sif_log(sdev, SIF_TSL, + "plain qp port:%d sl:%2d qosl:%-14s tsl:%2d fw_hint:%s", + port + 1, sl, string_enum_psif_tsu_qos(qosl) + strlen("QOSL_"), + sdev->sl2tsl[sl][port][qosl], + string_enum_psif_tsu_qos(sdev->qp_qosl_hint[sl][port]) + + strlen("QOSL_")); + + for (port = 0; port < 2; ++port) { + sif_log(sdev, SIF_TSL, "qp0 port:%d tsl:%2d fw_hint:%s", + port + 1, sdev->qp0_tsl[port], + string_enum_psif_tsu_qos(sdev->qp0_qosl_hint[port]) + strlen("QOSL_")); + } +} + +/* Device is degraded; set limited mode and report cause */ +static int sif_handle_degraded(struct sif_dev *sdev) +{ + int ret = 0; + + sdev->limited_mode = true; + if (eps_version_ge(&sdev->es[sdev->mbox_epsc], 1, 0)) { + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + + /* Ask the EPSC if it's running in degraded mode */ + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_QUERY; + req.u.query.data.op = EPSC_QUERY_DEGRADED_CAUSE; + ret = sif_epsc_wr(sdev, &req, &rsp); + if (ret) { + sif_log(sdev, SIF_INFO, + "Request to the EPSC for degraded cause failed with %d", ret); + return ret; + } + if (rsp.data != 0) + epsc_report_degraded(sdev, rsp.data); + sif_log(sdev, SIF_EPS, "Device reports degraded mode, mask 0x%llx", rsp.data); + } + return ret; +} + + +int sif_hw_init(struct sif_dev *sdev) +{ + int i; + int ret = -ENOMEM; + struct sif_pd *pd = NULL; + + /* PSIF 2.x requires MRRS to be at least 512, ref BZ #3301 */ + if (pcie_get_readrq(sdev->pdev) < 512) { + sif_log(sdev, SIF_INFO, "PSIF 2.x requires MRRS to be at least 512 bytes"); + ret = -EINVAL; + goto chip_init_failed; + } + + sif_mem_init(sdev); + + /* Misc. 
PSIF chip version specific + * configuration (must be before base_init): + */ + ret = sif_chip_init(sdev); + if (ret) + goto chip_init_failed; + + ret = sif_base_init(sdev); + if (ret) + goto base_failed; + + /* Allocate collect buffers for kernel usage */ + ret = sif_hw_kernel_cb_init(sdev); + if (ret) + goto cb_alloc_failed; + + ret = sif_init_pd(sdev); + if (ret) + goto pd_init_failed; + + /* We need a kernel protection domain for resource allocation */ + pd = alloc_pd(sdev); + if (!pd) + goto pd_alloc_failed; + pd->ibpd.device = &sdev->ib_dev; + sdev->pd = pd; + if (sdev->degraded) + sif_handle_degraded(sdev); + if (sdev->limited_mode) { + sif_log(sdev, SIF_INFO, "Running in limited mode\n"); + return 0; + } + + /* Initialize the SL to TSL map, before any QPs are created */ + setup_sl2tsl_map(sdev); + dump_sl2tsl_map(sdev); + + /* Reserve indices for qp 0 and 1, ports 1 and 2 */ + for (i = 0; i <= 3; i++) + sif_alloc_qp_idx(pd); + + ret = sif_pqp_init(sdev); + if (ret) + goto pqp_failed; + + ret = sif_ki_spqp_init(sdev); + if (ret) + goto ki_spqp_failed; + + ret = sif_init_xrcd(sdev); + if (ret) + goto xrcd_failed; + + return 0; + +xrcd_failed: + sif_ki_spqp_fini(sdev); +ki_spqp_failed: + sif_pqp_fini(sdev); +pqp_failed: + /* Release indices for qp 0 and 1 */ + for (i = 3; i >= 0; i--) + sif_free_qp_idx(pd, i); + dealloc_pd(pd); + +pd_alloc_failed: + sif_deinit_pd(sdev); +pd_init_failed: + sif_hw_kernel_cb_fini(sdev); +cb_alloc_failed: + sif_base_deinit(sdev); +base_failed: + sif_chip_deinit(sdev); +chip_init_failed: + return ret; +} + +void sif_hw_deinit(struct sif_dev *sdev) +{ + int i; + + if (!sdev->limited_mode) { + sif_log(sdev, SIF_PQP, "enter"); + sif_ki_spqp_fini(sdev); + sif_pqp_fini(sdev); + + /* Release indices for qp 0 and 1 */ + for (i = 3; i >= 0; i--) + sif_free_qp_idx(sdev->pd, i); + } + + dealloc_pd(sdev->pd); + sif_deinit_pd(sdev); + sif_hw_kernel_cb_fini(sdev); + sif_base_deinit(sdev); + sif_chip_deinit(sdev); +} + + +int force_pcie_link_retrain(struct sif_dev *sdev) +{ + int err, parent_pcie_cap; + u16 parent_lnkctl; + + parent_pcie_cap = pci_find_capability(sdev->pdev->bus->self, PCI_CAP_ID_EXP); + err = pci_read_config_word(sdev->pdev, parent_pcie_cap + PCI_EXP_LNKCTL, &parent_lnkctl); + parent_lnkctl |= PCI_EXP_LNKCTL_RL; + err = pci_write_config_word(sdev->pdev->bus->self, parent_pcie_cap + PCI_EXP_LNKCTL, + parent_lnkctl); + return err; +} + + +static int sif_chip_init(struct sif_dev *sdev) +{ + u16 devid; + + /* Chip version specific config */ + devid = sdev->pdev->device; + switch (devid) { + case PCI_DEVICE_ID_PSIF_VF: + sdev->is_vf = true; + sdev->num_vfs = 0; + sdev->mbox_epsc = MBOX_EPSC; + sdev->eps_cnt = MBOX_EPSC + 1; + break; + + case PCI_DEVICE_ID_PSIF_PF: + sdev->is_vf = false; + sdev->mbox_epsc = MBOX_EPSC; + sdev->eps_cnt = MBOX_EPSC + 1; + break; + + case PCI_DEVICE_ID_SN1_VF: + sdev->is_vf = true; + sdev->num_vfs = 0; + sdev->mbox_epsc = SIBS_MBOX_EPSC; + sdev->eps_cnt = SIBS_MBOX_EPSC + 1; + break; + + case PCI_DEVICE_ID_SN1_PF: + sdev->is_vf = false; + sdev->mbox_epsc = SIBS_MBOX_EPSC; + sdev->eps_cnt = SIBS_MBOX_EPSC + 1; + break; + + default: + sif_log(sdev, SIF_INFO, "Unknown device id %x", devid); + return -ENODEV; + } + + if (!sif_vf_en && sdev->is_vf) { + sif_log(sdev, SIF_INFO, "Parameter vf_en=0: VF driver load disabled"); + return -EINVAL; + } + + + sdev->es = kcalloc(sdev->eps_cnt, sizeof(struct sif_eps), GFP_KERNEL); + if (!sdev->es) + return -ENOMEM; + + return 0; +} + + +static void sif_chip_deinit(struct sif_dev *sdev) +{ + 
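/* Release the per-EPS state array allocated in sif_chip_init() */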
kfree(sdev->es); + sdev->es = NULL; +} diff --git a/drivers/infiniband/hw/sif/sif_hwi.h b/drivers/infiniband/hw/sif/sif_hwi.h new file mode 100644 index 0000000000000..a67141229098a --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_hwi.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_hwi.h: Hardware init for SIF + */ + +#ifndef _SIF_HWI_H +#define _SIF_HWI_H +#include +#include "sif_cq.h" +#include "sif_r3.h" + +struct sif_dev; +struct sif_pqp; +struct sif_qp; +struct sif_compl; +struct sif_cqe; +struct psif_wr; +struct psif_cq_entry; +enum psif_wr_type; + +/* Main calls for hardware specific initialization/deinitialization */ + +int force_pcie_link_retrain(struct sif_dev *sdev); +int sif_hw_init(struct sif_dev *sdev); +void sif_hw_deinit(struct sif_dev *sdev); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_ibcq.h b/drivers/infiniband/hw/sif/sif_ibcq.h new file mode 100644 index 0000000000000..1c5476e582438 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_ibcq.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_ibcq.h: External interface to IB completion queue logic for SIF + */ + +#ifndef __SIF_IBCQ_H +#define __SIF_IBCQ_H + +struct ib_cq *sif_create_cq(struct ib_device *ibdev, int cqe, + int comp_vector, struct ib_ucontext *context, + struct ib_udata *udata, + enum sif_proxy_type proxy); + +int sif_modify_cq(struct ib_cq *ibcq, u16 cq_count, u16 cq_period); +int sif_destroy_cq(struct ib_cq *ibcq); +int sif_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); +int sif_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int sif_peek_cq(struct ib_cq *ibcq, int wc_cnt); + +int sif_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +int sif_req_ncomp_notif(struct ib_cq *ibcq, int wc_cnt); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_ibpd.h b/drivers/infiniband/hw/sif/sif_ibpd.h new file mode 100644 index 0000000000000..41773fe124bf0 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_ibpd.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_ibpd.h: External interface to (IB) protection domains for SIF + */ + +#ifndef __SIF_IBPD_H +#define __SIF_IBPD_H + +struct ib_pd *sif_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *ibucontext, + struct ib_udata *udata); + +int sif_dealloc_pd(struct ib_pd *ibpd); + +struct ib_shpd *sif_alloc_shpd(struct ib_device *ibdev, + struct ib_pd *ibpd, + struct ib_udata *udata); + +struct ib_pd *sif_share_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata, + struct ib_shpd *shpd); + +int sif_remove_shpd(struct ib_device *ibdev, + struct ib_shpd *shpd, + int atinit); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_ibqp.h b/drivers/infiniband/hw/sif/sif_ibqp.h new file mode 100644 index 0000000000000..bde0740570bed --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_ibqp.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_ibqp.h: External interface to IB queue pair logic for sif + */ + +#ifndef __SIF_IBQP_H +#define __SIF_IBQP_H + +struct ib_qp *sif_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata); +int sif_modify_qp(struct ib_qp *ibqp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_udata *udata); + +int sif_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); +int sif_destroy_qp(struct ib_qp *ibqp); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_idr.c b/drivers/infiniband/hw/sif/sif_idr.c new file mode 100644 index 0000000000000..ff726bd5d8371 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_idr.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_idr.c: Synchronized ID ref allocation + */ + +#include "sif_idr.h" + +int sif_idr_init(struct sif_idr *sidr, int id_min, int id_max) +{ + int ret = 0; + idr_init(&sidr->idr); + mutex_init(&sidr->lock); + sidr->id_min = id_min; + sidr->id_max = id_max; + return ret; +} + + +void sif_idr_deinit(struct sif_idr *sidr) +{ + idr_destroy(&sidr->idr); +} + + +int sif_idr_alloc(struct sif_idr *sidr, void *ref, gfp_t gfp_mask) +{ + int index; + + mutex_lock(&sidr->lock); + index = idr_alloc(&sidr->idr, ref, sidr->id_min, sidr->id_max, gfp_mask); + mutex_unlock(&sidr->lock); + return index; +} + +void sif_idr_remove(struct sif_idr *sidr, int index) +{ + mutex_lock(&sidr->lock); + idr_remove(&sidr->idr, index); + mutex_unlock(&sidr->lock); +} diff --git a/drivers/infiniband/hw/sif/sif_idr.h b/drivers/infiniband/hw/sif/sif_idr.h new file mode 100644 index 0000000000000..4bdfcfd575d51 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_idr.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. 
+ * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_idr.h: simple id allocation and deallocation for SIF + */ + +#ifndef _SIF_IDR_H +#define _SIF_IDR_H +#include +#include +#include + +/* Synchronized ID ref allocation */ + +struct sif_idr { + struct idr idr; + struct mutex lock; + int id_min; + int id_max; +}; + +int sif_idr_init(struct sif_idr *sidr, int id_min, int id_max); +void sif_idr_deinit(struct sif_idr *sidr); + +int sif_idr_alloc(struct sif_idr *sidr, void *ref, gfp_t gfp_mask); +void sif_idr_remove(struct sif_idr *sidr, int index); + + +#endif diff --git a/drivers/infiniband/hw/sif/sif_int_user.h b/drivers/infiniband/hw/sif/sif_int_user.h new file mode 100644 index 0000000000000..bed597d1b6946 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_int_user.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_int_user.h: This file defines special internal data structures used + * to communicate between libsif and the sif driver. + * This file is included both from user space and kernel space so + * it must not contain any kernel/user specific header file includes. + * This file is internal to libsif/sif driver since it relies on HW specific + * include files. + */ + +#ifndef _SIF_INT_USER_H +#define _SIF_INT_USER_H + + +#include "psif_hw_data.h" + +/* Do this the brute force way, since structs are used in user-space */ +#if defined(__x86_64__) || defined(__sparc__) +#define SIF_CACHE_BYTES 64 +#else +#define SIF_CACHE_BYTES 64 +#endif + +/* We use the extension here to communicate with the driver + * (for correct debugfs reporting) + */ + +/* sif_sq_sw flags definition + */ +enum sq_sw_state { + FLUSH_SQ_IN_PROGRESS = 0, + FLUSH_SQ_IN_FLIGHT = 1, +}; + +struct sif_sq_sw { + struct psif_sq_sw d; /* Hardware visible descriptor */ + __u8 fill[SIF_CACHE_BYTES - sizeof(struct psif_sq_sw)]; /* separate the cache lines */ + __u16 last_seq; /* Last used sq seq.num (req. sq->lock) */ + __u16 head_seq; /* Last sq seq.number seen in a compl (req. 
cq->lock) */ + __u16 trusted_seq; /* Last next_seq that was either generate or exist in the cq */ + __u8 tsl; /* Valid after transition to RTR */ + unsigned long flags; /* Flags, using unsigned long due to test_set/test_and_set_bit */ +}; + +/* sif_rq_sw flags definition + */ +enum rq_sw_state { + FLUSH_RQ_IN_PROGRESS = 0, + FLUSH_RQ_IN_FLIGHT = 1, + FLUSH_RQ_FIRST_TIME = 2, + RQ_IS_INVALIDATED = 3, +}; + +struct sif_rq_sw { + struct psif_rq_sw d; /* Hardware visible descriptor */ + __u8 fill[SIF_CACHE_BYTES - sizeof(struct psif_rq_sw)]; /* separate the cache lines */ + atomic_t length; /* current length of queue as #posted - #completed */ + __u32 next_seq; /* First unused sequence number */ + unsigned long flags; /* Flags, using unsigned long due to test_set/test_and_set_bit */ +}; + +enum cq_sw_state { + CQ_POLLING_NOT_ALLOWED = 0, + CQ_POLLING_IGNORED_SEQ = 1, + FLUSH_SQ_FIRST_TIME = 2, +}; + +struct sif_cq_sw { + struct psif_cq_sw d; /* Hardware visible descriptor */ + __u8 fill[SIF_CACHE_BYTES - sizeof(struct psif_cq_sw)]; /* separate the cache lines */ + __u32 next_seq; /* First unused sequence number */ + __u32 cached_head; /* Local copy kept in sync w/hw visible head_indx */ + __u32 last_hw_seq; /* Last next_seq reported in completion for req_notify_cq */ + __u32 armed; /* Set if req_notify_cq has been called but event not processed */ + __u32 miss_cnt; /* Number of in-flight completions observed by poll_cq */ + __u32 miss_occ; /* Number of times 1 or more in-flight completions was seen */ + unsigned long flags; /* Flags, using unsigned long due to test_set/test_and_set_bit */ +}; + +#endif diff --git a/drivers/infiniband/hw/sif/sif_ireg.c b/drivers/infiniband/hw/sif/sif_ireg.c new file mode 100644 index 0000000000000..2f19ce2b3aae4 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_ireg.c @@ -0,0 +1,930 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_ireg.c: Utilities and entry points needed for Infiniband registration + */ + +#include +#include +#include +#include +#include +#include +#include "sif_dev.h" +#include "sif_ireg.h" +#include "sif_user.h" +#include "sif_dma.h" +#include "sif_ibpd.h" +#include "sif_ibcq.h" +#include "sif_ibqp.h" +#include "sif_mr.h" +#include "sif_mw.h" +#include "sif_fmr.h" +#include "sif_ah.h" +#include "sif_srq.h" +#include "sif_xrc.h" +#include "sif_sndrcv.h" +#include "sif_hwi.h" +#include "sif_query.h" +#include "sif_pd.h" +#include "sif_base.h" +#include "version.h" + + +static ssize_t show_rev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(dev); + + return sprintf(buf, "%x\n", PSIF_REVISION(sdev)); +} + +static ssize_t show_fw_ver(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + + return sprintf(buf, "%hu.%hu.0\n", es->ver.fw_major, es->ver.fw_minor); +} + +static ssize_t show_eps_api_ver(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + + return sprintf(buf, "%hu.%hu\n", es->ver.epsc_major, es->ver.epsc_minor); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + + return sprintf(buf, "ORCL%d\n", PSIF_DEVICE(sdev)); +} + +static ssize_t show_board(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + const char *prod_str = get_product_str(sdev); + /* + * Paranoia level: avoid dumping the whole kernel to + * user-space if the zero termination character in the product + * string has been compromised + */ + const int n = min_t(int, 64, (int)strlen(prod_str)); + + return sprintf(buf, "%.*s\n", n, prod_str); +} + +static ssize_t show_stats(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + /* TBD: device specific counters, stats registers */ + sif_log(sdev, SIF_VERBS, "Not implemented"); + return -EOPNOTSUPP; +} + + +/* PSIF specific extensions */ + +/* Version information details (git revision of driver and firmware etc) */ +static ssize_t show_versioninfo(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + char **fwv = sdev->es[sdev->mbox_epsc].ver.fw_version; + + return snprintf(buf, PAGE_SIZE, "%s - build user %s at %s\n" + "sifdrv git tag:\n%s\n%s\n" + "EPSC firmware: build user %s at %s\nimage revision string %s\n" + "version tag:\n%s\n%s", + sif_version.git_repo, + sif_version.build_user, sif_version.build_git_time, + sif_version.last_commit, + (sif_version.git_status[0] != '\0' ? sif_version.git_psifapi_status : ""), + fwv[FWV_EPS_BUILD_USER], fwv[FWV_EPS_BUILD_GIT_TIME], + fwv[FWV_EPS_REV_STRING], fwv[FWV_EPS_GIT_LAST_COMMIT], + (fwv[FWV_EPS_GIT_STATUS][0] != '\0' ? 
fwv[FWV_EPS_GIT_STATUS] : "")); +} + + +static ssize_t show_resp_ms(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + + return sprintf(buf, "%d\n", jiffies_to_msecs(sdev->min_resp_ticks)); +} + + +static ssize_t set_resp_ms(struct device *device, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + size_t old_val = jiffies_to_msecs(sdev->min_resp_ticks); + size_t new_val; + int ret = kstrtoul(buf, 0, &new_val); + + if (ret || !new_val) + new_val = 1; + sif_log(sdev, SIF_INFO, "%ld ms -> %ld ms", old_val, new_val); + sdev->min_resp_ticks = msecs_to_jiffies(new_val); + return strlen(buf); +} + +static ssize_t show_irq_moderation(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + + return sprintf(buf, "%hu\n", sdev->es[sdev->mbox_epsc].eqs.irq_moderation); +} + +static ssize_t set_irq_moderation(struct device *device, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + u16 old_val = es->eqs.irq_moderation; + u16 new_val; + + int ret = kstrtou16(buf, 0, &new_val); + struct psif_epsc_csr_req req; /* local epsc wr copy */ + struct psif_epsc_csr_rsp resp; + + if (ret || !new_val) + new_val = 0; + + if (eps_version_ge(es, 0, 36)) { + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_HOST_INT_COMMON_CTRL; + req.uf = 0; + req.u.int_common.total_usec = (uintptr_t)new_val; + ret = sif_epsc_wr_poll(sdev, &req, &resp); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to configure device interrupt total moderation\n"); + return ret; + } + es->eqs.irq_moderation = new_val; + sif_log(sdev, SIF_INFO, "Interrupt total moderation: %d usecs -> %d usecs", + old_val, new_val); + return strlen(buf); + } else + return -1; +} + +static ssize_t show_mt_override(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + + switch (sdev->mt_override) { + case SIFMT_BYPASS: + sprintf(buf, "bypass\n"); + break; + case SIFMT_UMEM: + sprintf(buf, "umem (no override)\n"); + break; + case SIFMT_UMEM_SPT: + sprintf(buf, "spt\n"); + break; + case SIFMT_ZERO: + sprintf(buf, "zero\n"); + break; + default: + /* Sanity check for debugging the driver only */ + sprintf(buf, "***undefined***\n"); + break; + } + return strlen(buf); +} + + +static ssize_t set_mt_override(struct device *device, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + + if (strcmp(buf, "bypass\n") == 0) + sdev->mt_override = SIFMT_BYPASS; + else if (strcmp(buf, "umem\n") == 0 || strcmp(buf, "none\n") == 0) + sdev->mt_override = SIFMT_UMEM; + else if (strcmp(buf, "spt\n") == 0) + sdev->mt_override = SIFMT_UMEM_SPT; + else if (strcmp(buf, "zero\n") == 0) + sdev->mt_override = SIFMT_ZERO; + else + return -EINVAL; + return strlen(buf); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(eps_api_ver, S_IRUGO, show_eps_api_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL); +static DEVICE_ATTR(versioninfo, S_IRUGO, show_versioninfo, NULL); +static DEVICE_ATTR(min_resp_ms, S_IWUSR | S_IRUGO, 
show_resp_ms, set_resp_ms); +static DEVICE_ATTR(mt_override, S_IWUSR | S_IRUGO, show_mt_override, set_mt_override); +static DEVICE_ATTR(irq_moderation, S_IWUSR | S_IRUGO, show_irq_moderation, set_irq_moderation); + +static struct device_attribute *sif_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_eps_api_ver, + &dev_attr_hca_type, + &dev_attr_board_id, + &dev_attr_stats, + &dev_attr_versioninfo, + &dev_attr_min_resp_ms, + &dev_attr_mt_override, + &dev_attr_irq_moderation, +}; + +static u64 dev_show(const struct device *device, + struct device_attribute *attr, + char *buf, + int opcode) +{ + struct sif_dev *sdev = dev_get_drvdata(device); + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + + /* EPSC supports the new requests starting from v.0.43 */ + if (eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 43)) { + int ret = 0; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_QUERY; + req.u.query.data.op = opcode; + ret = sif_epsc_wr(sdev, &req, &rsp); + if (ret) + sif_log(sdev, SIF_INFO, "Failed to query tsu error counter\n"); + else + sprintf(buf, "%llu\n", rsp.data); + } + return strlen(buf); +} + +#define DEVICE_SHOW(field) \ +static ssize_t show_##field(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + return dev_show(dev, attr, buf, EPSC_QUERY_##field); \ +} + +DEVICE_SHOW(SQ_NUM_BRE); +DEVICE_SHOW(NUM_CQOVF); +DEVICE_SHOW(SQ_NUM_WRFE); +DEVICE_SHOW(RQ_NUM_WRFE); +DEVICE_SHOW(RQ_NUM_LAE); +DEVICE_SHOW(RQ_NUM_LPE); +DEVICE_SHOW(SQ_NUM_LLE); +DEVICE_SHOW(RQ_NUM_LLE); +DEVICE_SHOW(SQ_NUM_LQPOE); +DEVICE_SHOW(RQ_NUM_LQPOE); +DEVICE_SHOW(SQ_NUM_OOS); +DEVICE_SHOW(RQ_NUM_OOS); +DEVICE_SHOW(SQ_NUM_RREE); +DEVICE_SHOW(SQ_NUM_TREE); +DEVICE_SHOW(SQ_NUM_ROE); +DEVICE_SHOW(RQ_NUM_ROE); +DEVICE_SHOW(SQ_NUM_RAE); +DEVICE_SHOW(RQ_NUM_RAE); +DEVICE_SHOW(RQ_NUM_UDSDPRD); +DEVICE_SHOW(RQ_NUM_UCSDPRD); +DEVICE_SHOW(SQ_NUM_RIRE); +DEVICE_SHOW(RQ_NUM_RIRE); +DEVICE_SHOW(SQ_NUM_RNR); +DEVICE_SHOW(RQ_NUM_RNR); + +static ssize_t clear_diag(struct device *device, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + + struct sif_dev *sdev = dev_get_drvdata(device); + int ret; + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp resp; + + if (strcmp(buf, "1\n") == 0) { + + memset(&req, 0, sizeof(req)); + memset(&resp, 0, sizeof(resp)); + + req.opcode = EPSC_SET; + req.u.set.data.op = EPSC_QUERY_RESET_CBLD_DIAG_COUNTERS; + req.u.set.data.value = 0xffffff; + ret = sif_epsc_wr_poll(sdev, &req, &resp); + if (ret) + sif_log(sdev, SIF_INFO, "Failed to clear psif diag counters\n"); + } else + return -EINVAL; + + return strlen(buf); +} + +static DEVICE_ATTR(clear_diag, S_IWUSR, NULL, clear_diag); +static DEVICE_ATTR(sq_num_bre, S_IRUGO, show_SQ_NUM_BRE, NULL); +static DEVICE_ATTR(num_cqovf, S_IRUGO, show_NUM_CQOVF, NULL); +static DEVICE_ATTR(sq_num_wrfe, S_IRUGO, show_SQ_NUM_WRFE, NULL); +static DEVICE_ATTR(rq_num_wrfe, S_IRUGO, show_RQ_NUM_WRFE, NULL); +static DEVICE_ATTR(rq_num_lae, S_IRUGO, show_RQ_NUM_LAE, NULL); +static DEVICE_ATTR(rq_num_lpe, S_IRUGO, show_RQ_NUM_LPE, NULL); +static DEVICE_ATTR(sq_num_lle, S_IRUGO, show_SQ_NUM_LLE, NULL); +static DEVICE_ATTR(rq_num_lle, S_IRUGO, show_RQ_NUM_LLE, NULL); +static DEVICE_ATTR(sq_num_lqpoe, S_IRUGO, show_SQ_NUM_LQPOE, NULL); +static DEVICE_ATTR(rq_num_lqpoe, S_IRUGO, show_RQ_NUM_LQPOE, NULL); +static DEVICE_ATTR(sq_num_oos, S_IRUGO, show_SQ_NUM_OOS, NULL); +static DEVICE_ATTR(rq_num_oos, S_IRUGO, show_RQ_NUM_OOS, NULL); +static DEVICE_ATTR(sq_num_rree, S_IRUGO, 
show_SQ_NUM_RREE, NULL); +static DEVICE_ATTR(sq_num_tree, S_IRUGO, show_SQ_NUM_TREE, NULL); +static DEVICE_ATTR(sq_num_roe, S_IRUGO, show_SQ_NUM_ROE, NULL); +static DEVICE_ATTR(rq_num_roe, S_IRUGO, show_RQ_NUM_ROE, NULL); +static DEVICE_ATTR(sq_num_rae, S_IRUGO, show_SQ_NUM_RAE, NULL); +static DEVICE_ATTR(rq_num_rae, S_IRUGO, show_RQ_NUM_RAE, NULL); +static DEVICE_ATTR(rq_num_udsdprd, S_IRUGO, show_RQ_NUM_UDSDPRD, NULL); +static DEVICE_ATTR(rq_num_ucsdprd, S_IRUGO, show_RQ_NUM_UCSDPRD, NULL); +static DEVICE_ATTR(sq_num_rire, S_IRUGO, show_SQ_NUM_RIRE, NULL); +static DEVICE_ATTR(rq_num_rire, S_IRUGO, show_RQ_NUM_RIRE, NULL); +static DEVICE_ATTR(sq_num_rnr, S_IRUGO, show_SQ_NUM_RNR, NULL); +static DEVICE_ATTR(rq_num_rnr, S_IRUGO, show_RQ_NUM_RNR, NULL); + +static struct attribute *sif_diag_counters_class_attributes[] = { + &dev_attr_clear_diag.attr, + &dev_attr_sq_num_bre.attr, + &dev_attr_num_cqovf.attr, + &dev_attr_sq_num_wrfe.attr, + &dev_attr_rq_num_wrfe.attr, + &dev_attr_rq_num_lae.attr, + &dev_attr_rq_num_lpe.attr, + &dev_attr_sq_num_lle.attr, + &dev_attr_rq_num_lle.attr, + &dev_attr_sq_num_lqpoe.attr, + &dev_attr_rq_num_lqpoe.attr, + &dev_attr_sq_num_oos.attr, + &dev_attr_rq_num_oos.attr, + &dev_attr_sq_num_rree.attr, + &dev_attr_sq_num_tree.attr, + &dev_attr_sq_num_roe.attr, + &dev_attr_rq_num_roe.attr, + &dev_attr_sq_num_rae.attr, + &dev_attr_rq_num_rae.attr, + &dev_attr_rq_num_udsdprd.attr, + &dev_attr_rq_num_ucsdprd.attr, + &dev_attr_sq_num_rire.attr, + &dev_attr_rq_num_rire.attr, + &dev_attr_sq_num_rnr.attr, + &dev_attr_rq_num_rnr.attr, + NULL, +}; + +static struct attribute_group diag_counters_attr_group = { + .attrs = sif_diag_counters_class_attributes, + .name = "diag_counters", +}; + +static struct ib_ucontext *sif_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + int ret; + struct sif_dev *sdev = to_sdev(ibdev); + struct sif_ucontext *s_uc; + + s_uc = kzalloc(sizeof(*s_uc), GFP_KERNEL); + if (!s_uc) + return NULL; + + s_uc->pd = alloc_pd(sdev); + if (!s_uc->pd) { + ret = -ENOMEM; + goto alloc_pd_failed; + } + s_uc->pd->ibpd.device = ibdev; + + s_uc->cb = alloc_cb(sdev, false); + if (!s_uc->cb) { + ret = -ENOMEM; + goto alloc_cb_failed; + } + + if (udata) { + struct sif_get_context_ext cmd; + struct sif_get_context_resp_ext resp; + u16 major_ver, minor_ver; + + memset(&cmd, 0, sizeof(cmd)); + ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + + s_uc->abi_version = cmd.abi_version; + major_ver = s_uc->abi_version >> 8; + minor_ver = s_uc->abi_version & 0xff; + if (major_ver != SIF_UVERBS_ABI_MAJOR_VERSION) { + if (major_ver < 10 && major_ver > 0) { + sif_log(sdev, SIF_INFO, + "User verbs abi version mismatch - driver has v.%d.%d - libsif has v.%d.%d", + SIF_UVERBS_ABI_MAJOR_VERSION, SIF_UVERBS_ABI_MINOR_VERSION, + major_ver, minor_ver); + ret = -EINVAL; + goto udata_copy_failed; + } else { + static bool printed; + /* TBD: remove - bw comp - in this case probably not set */ + /* Set to final version that does not report to us */ + if (!printed) { + sif_log(sdev, SIF_INFO, + "Invalid version info - upgrade libsif!"); + printed = true; + } + s_uc->abi_version = SIF_UVERBS_VERSION(3, 1); + } + } + memset(&resp, 0, sizeof(resp)); + resp.sq_sw_ext_sz = sdev->ba[sq_sw].ext_sz; + resp.sq_hw_ext_sz = sdev->ba[sq_hw].ext_sz; + resp.rq_ext_sz = sdev->ba[rq_sw].ext_sz; + resp.cq_ext_sz = sdev->ba[cq_sw].ext_sz; + resp.sq_entry_per_block = sdev->ba[sq_sw].entry_per_block; + resp.rq_entry_per_block = sdev->ba[rq_sw].entry_per_block; + resp.cq_entry_per_block = 
sdev->ba[cq_sw].entry_per_block; + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) + goto udata_copy_failed; + } + + sif_log(sdev, SIF_VERBS_V, " at %p with pd %d used for CQs libsif abi v.%d.%d", + s_uc, s_uc->pd->idx, s_uc->abi_version >> 8, s_uc->abi_version & 0xff); + return &s_uc->ib_uc; + +udata_copy_failed: + release_cb(sdev, s_uc->cb); +alloc_cb_failed: + dealloc_pd(s_uc->pd); +alloc_pd_failed: + kfree(s_uc); + return ERR_PTR(ret); +} + +static int sif_dealloc_ucontext(struct ib_ucontext *ib_uc) +{ + int ret; + u32 pd_idx = 0; + struct sif_dev *sdev = to_sdev(ib_uc->device); + struct sif_ucontext *s_uc = + container_of(ib_uc, struct sif_ucontext, ib_uc); + + sif_logs(SIF_VERBS_V, pd_idx = s_uc->pd->idx); + + ret = dealloc_pd(s_uc->pd); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed (status %d) to deallocate pd %d", ret, s_uc->pd->idx); + return ret; + } + + release_cb(sdev, s_uc->cb); + kfree(s_uc); + sif_log(sdev, SIF_VERBS_V, "at %p done (cq pd index %d)", s_uc, pd_idx); + return 0; +} + + +static int sif_mmap_block(struct sif_ucontext *uc, struct vm_area_struct *vma, + enum sif_tab_type type, u32 index, int vm_flags) +{ + struct sif_dev *sdev = to_sdev(uc->ib_uc.device); + struct sif_table *tp = &sdev->ba[type]; + struct sif_table_block *b; + struct sif_pd *pd; + u64 start, block_sz; + off_t len; + off_t offset; + int ret; + + if (tp->entry_per_block <= 1) { + sif_log(sdev, SIF_INFO, + "Failed to map %s block index %d: direct user access not available with flat_alloc scheme", + sif_table_name(type), index); + return -EPERM; + } + if (tp->block_cnt <= index) { + sif_log(sdev, SIF_INFO, "Failed to map %s block index %d: out of range - block_cnt %d", + sif_table_name(type), index, tp->block_cnt); + return -EINVAL; + } + + b = sif_get_block(tp, index); + pd = b->pd; + if (!pd) { + sif_log(sdev, SIF_INFO, "Failed to map %s block index %d: not allocated", + sif_table_name(type), index); + return -ENODEV; + } + if (pd == uc->pd) + goto pd_ok; /* CQ case */ + + if (!sif_is_user_pd(pd)) { + sif_log(sdev, SIF_INFO, "Failed to map %s block index %d, pd %d - owned by kernel space", + sif_table_name(type), index, pd->idx); + return -EACCES; + } + + /* TBD: Security aspects of XRC domain access + * (in the xrc case, we don't have a user context at the moment) + */ + if (pd->ibpd.uobject && pd->ibpd.uobject->context != &uc->ib_uc) { + sif_log(sdev, SIF_INFO, "Failed to map %s block index %d: belongs to another user context", + sif_table_name(type), index); + return -EACCES; + } +pd_ok: + block_sz = tp->ext_sz * tp->entry_per_block; + len = vma->vm_end - vma->vm_start; + if (block_sz != len) { + sif_log(sdev, SIF_INFO, "Failed to map %s block index %d: Expected map len %lld, got %ld", + sif_table_name(type), index, + block_sz, len); + return -EINVAL; + } + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_flags |= vm_flags; + start = vma->vm_start; + + offset = block_sz * index; + + ret = sif_mem_vma_map_part(tp->mem, vma, offset, len); + if (ret) + return ret; + + /* TBD: ehca uses a vm_operations_struct and vma->private_data to ref.count + * but MLX does not - is it necessary? + * Also remap_pfn_range requires the mm sema to be held, but other drivers dont take it + * - is it already held by the caller here? 
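/*
 * A minimal sketch of the length contract enforced by sif_mmap_block()
 * above: user space is expected to map exactly one block, whose size and
 * offset follow from the ext_sz and entry_per_block values returned in
 * sif_get_context_resp_ext. The helper names are illustrative only.
 */
static inline u64 example_block_map_len(u32 ext_sz, u32 entry_per_block)
{
	/* must equal vma->vm_end - vma->vm_start, or the mmap is rejected */
	return (u64)ext_sz * entry_per_block;
}

static inline u64 example_block_map_offset(u64 block_sz, u32 index)
{
	/* byte offset of block @index within the underlying table memory */
	return block_sz * index;
}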
+ */ + return 0; +} + + +static int sif_mmap_cb(struct sif_ucontext *uc, struct vm_area_struct *vma, u32 index) +{ + struct sif_dev *sdev = to_sdev(uc->ib_uc.device); + struct sif_cb *cb = sif_cb_from_uc(uc, index); + off_t len; + dma_addr_t cb_start; + int ret; + + if (!cb) { + sif_log(sdev, SIF_INFO, "Failed to associate cb %d with context", index); + return -EINVAL; + } + + len = vma->vm_end - vma->vm_start; + if (len != PAGE_SIZE) { + sif_log(sdev, SIF_INFO, "Failed to map cb index %d: Expected map len %ld, got %ld", + index, PAGE_SIZE, len); + return -EINVAL; + } + cb_start = pci_resource_start(sdev->pdev, SIF_CBU_BAR) + index * PAGE_SIZE; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_flags |= VM_WRITE; + ret = io_remap_pfn_range(vma, vma->vm_start, cb_start >> PAGE_SHIFT, + PAGE_SIZE, vma->vm_page_prot); + if (ret) + sif_log(sdev, SIF_INFO, "io_remap_pfn_range failed with %d", ret); + return ret; +} + + +#define def_map_queue(type) \ +static int sif_mmap_##type(struct sif_ucontext *uc, struct vm_area_struct *vma, u32 index)\ +{\ + struct sif_dev *sdev = to_sdev(uc->ib_uc.device);\ + struct sif_##type *type;\ + u64 q_sz;\ + off_t len;\ + \ + type = safe_get_sif_##type(sdev, index);\ + if (!type) {\ + sif_log(sdev, SIF_INFO, "Failed to map " #type \ + " index %d out of range", index);\ + sif_log(sdev, SIF_INFO, "%p : %p", sdev->ba[type##_hw].bitmap, sdev->ba[qp].bitmap);\ + return -EINVAL;\ + } \ + \ + q_sz = type->mem->size;\ + len = vma->vm_end - vma->vm_start;\ + if (q_sz < len) {\ + sif_log(sdev, SIF_INFO, "Failed to map " #type " index %d: "\ + "Expected map req for <= %lld bytes, got %ld", index, q_sz, len);\ + return -EINVAL;\ + } \ + \ + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;\ + vma->vm_flags |= VM_READ|VM_WRITE;\ + \ + return sif_mem_vma_map_part(type->mem, vma, 0, len);\ +} + +def_map_queue(sq) +def_map_queue(rq) +def_map_queue(cq) + +static int sif_mmap(struct ib_ucontext *ib_uc, struct vm_area_struct *vma) +{ + enum sif_mmap_cmd cmd; + u32 index; + struct sif_dev *sdev = to_sdev(ib_uc->device); + struct sif_ucontext *s_uc = to_sctx(ib_uc); + + mmap_get_cmd(vma->vm_pgoff << PAGE_SHIFT, &cmd, &index); + + sif_log(sdev, SIF_MMAP, + "pg offset 0x%lx start 0x%lx, end 0x%lx len 0x%lx, flags 0x%lx index %d", + vma->vm_pgoff, vma->vm_start, vma->vm_end, vma->vm_end - vma->vm_start, + vma->vm_flags, index); + + switch (cmd) { + case SIF_MAP_SQ_SW: + return sif_mmap_block(s_uc, vma, sq_sw, index, VM_READ|VM_WRITE); + case SIF_MAP_RQ_SW: + return sif_mmap_block(s_uc, vma, rq_sw, index, VM_READ|VM_WRITE); + case SIF_MAP_CQ_SW: + return sif_mmap_block(s_uc, vma, cq_sw, index, VM_READ|VM_WRITE); + case SIF_MAP_SQ_HW: + return sif_mmap_block(s_uc, vma, sq_hw, index, VM_READ); + case SIF_MAP_RQ_HW: + return sif_mmap_block(s_uc, vma, rq_hw, index, VM_READ); + case SIF_MAP_CQ_HW: + return sif_mmap_block(s_uc, vma, cq_hw, index, VM_READ); + case SIF_MAP_CB: + return sif_mmap_cb(s_uc, vma, index); + case SIF_MAP_SQ: + return sif_mmap_sq(s_uc, vma, index); + case SIF_MAP_RQ: + return sif_mmap_rq(s_uc, vma, index); + case SIF_MAP_CQ: + return sif_mmap_cq(s_uc, vma, index); + default: + break; + } + sif_log(sdev, SIF_MMAP, "cmd %d not implemented", cmd); + return -EOPNOTSUPP; +} + +static int sif_get_protocol_stats(struct ib_device *ibdev, + union rdma_protocol_stats *stats) +{ + struct sif_dev *sdev = to_sdev(ibdev); + + sif_log(sdev, SIF_VERBS, "Not implemented"); + return -EOPNOTSUPP; +} + + +static enum rdma_link_layer sif_get_link_layer(struct ib_device *ibdev, 
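/*
 * For readability, this is approximately what def_map_queue(sq) above
 * expands to after preprocessing (formatting and comments added here;
 * the logic comes verbatim from the macro body):
 */
static int example_expanded_sif_mmap_sq(struct sif_ucontext *uc,
					struct vm_area_struct *vma, u32 index)
{
	struct sif_dev *sdev = to_sdev(uc->ib_uc.device);
	struct sif_sq *sq;
	u64 q_sz;
	off_t len;

	/* range-checked lookup of the send queue by index */
	sq = safe_get_sif_sq(sdev, index);
	if (!sq) {
		sif_log(sdev, SIF_INFO, "Failed to map sq index %d out of range", index);
		sif_log(sdev, SIF_INFO, "%p : %p", sdev->ba[sq_hw].bitmap, sdev->ba[qp].bitmap);
		return -EINVAL;
	}

	/* the queue memory must be large enough to back the requested mapping */
	q_sz = sq->mem->size;
	len = vma->vm_end - vma->vm_start;
	if (q_sz < len) {
		sif_log(sdev, SIF_INFO, "Failed to map sq index %d: "
			"Expected map req for <= %lld bytes, got %ld", index, q_sz, len);
		return -EINVAL;
	}

	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
	vma->vm_flags |= VM_READ|VM_WRITE;

	return sif_mem_vma_map_part(sq->mem, vma, 0, len);
}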
u8 port_num) +{ + struct sif_dev *sdev = to_sdev(ibdev); + + sif_log(sdev, SIF_VERBS, "returns IB_LINK_LAYER_INFINIBAND for port %d", port_num); + return IB_LINK_LAYER_INFINIBAND; +} + +static int sif_port_callback(struct ib_device *ibdev, u8 portno, struct kobject *obj) +{ + struct sif_dev *sdev = to_sdev(ibdev); + + sif_log(sdev, SIF_VERBS, "port %d", portno); + return 0; +} + +static inline struct ib_cq *sif_ib_create_cq(struct ib_device *ibdev, int cqe, + int comp_vector, struct ib_ucontext *context, + struct ib_udata *udata) +{ + return sif_create_cq(ibdev, cqe, comp_vector, context, udata, SIFPX_OFF); +} + +/* putting this function here to avoid sif_epsc.h from being rdma/ib_verbs.h dependent */ +static int sif_eps_wr_ex(struct ib_device *ibdev, enum psif_mbox_type eps_num, + struct psif_epsc_csr_req *req, struct psif_epsc_csr_rsp *cqe) +{ + struct sif_dev *sdev = to_sdev(ibdev); + + return sif_eps_wr(sdev, eps_num, req, cqe); + +} + +int sif_register_ib_device(struct sif_dev *sdev) +{ + int ret = 0; + int i; + struct ib_device *dev = &sdev->ib_dev; + struct psif_epsc_device_attr epsdev; + + /* We need to do a query_device to get the node_guid */ + ret = epsc_query_device(sdev, &epsdev); + if (ret) + return ret; + + strlcpy(dev->name, "sif%d", IB_DEVICE_NAME_MAX); + + dev->owner = THIS_MODULE; + dev->uverbs_abi_ver = SIF_UVERBS_ABI_VERSION; + + /* SIF supported user verbs */ + dev->uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_MODIFY_AH) | + (1ull << IB_USER_VERBS_CMD_QUERY_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_REG_SMR) | + (1ull << IB_USER_VERBS_CMD_REREG_MR) | + (1ull << IB_USER_VERBS_CMD_QUERY_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_BIND_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_PEEK_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV) + | (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | + (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD) | + (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP) + | (1ull << IB_USER_VERBS_CMD_ALLOC_SHPD) | + (1ull << IB_USER_VERBS_CMD_SHARE_PD) + ; + + dev->get_protocol_stats = sif_get_protocol_stats; + + dev->query_device = sif_query_device; + dev->modify_device = sif_modify_device; + + dev->query_port = sif_query_port; + dev->modify_port = sif_modify_port; + + dev->get_link_layer = sif_get_link_layer; + dev->query_gid = sif_query_gid; + 
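/*
 * A tiny sketch of how a command mask like the one built above is
 * typically consumed: the uverbs core only dispatches a user command
 * whose bit is set in the device's uverbs_cmd_mask.
 */
static inline bool example_uverbs_cmd_enabled(u64 cmd_mask, unsigned int cmd)
{
	return (cmd_mask & (1ull << cmd)) != 0;
}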
dev->query_pkey = sif_query_pkey; + + dev->alloc_ucontext = sif_alloc_ucontext; + dev->dealloc_ucontext = sif_dealloc_ucontext; + dev->mmap = sif_mmap; + + dev->alloc_pd = sif_alloc_pd; + dev->dealloc_pd = sif_dealloc_pd; + dev->create_ah = sif_create_ah; + dev->destroy_ah = sif_destroy_ah; + dev->query_ah = sif_query_ah; + + dev->create_srq = sif_create_srq; + dev->modify_srq = sif_modify_srq; + dev->query_srq = sif_query_srq; + dev->destroy_srq = sif_destroy_srq; + + dev->create_qp = sif_create_qp; + dev->modify_qp = sif_modify_qp; + dev->query_qp = sif_query_qp; + dev->destroy_qp = sif_destroy_qp; + + dev->post_send = sif_post_send; + dev->post_recv = sif_post_recv; + dev->post_srq_recv = sif_post_srq_recv; + + dev->create_cq = sif_ib_create_cq; + dev->destroy_cq = sif_destroy_cq; + dev->resize_cq = sif_resize_cq; + dev->poll_cq = sif_poll_cq; + dev->peek_cq = sif_peek_cq; + dev->req_notify_cq = sif_req_notify_cq; + dev->req_ncomp_notif = sif_req_ncomp_notif; + + dev->get_dma_mr = sif_get_dma_mr; + dev->reg_phys_mr = sif_reg_phys_mr; + dev->rereg_phys_mr = sif_rereg_phys_mr; + dev->reg_user_mr = sif_reg_user_mr; + dev->dereg_mr = sif_dereg_mr; + dev->query_mr = sif_query_mr; + + dev->alloc_fmr = sif_alloc_fmr; + dev->map_phys_fmr = sif_map_phys_fmr; + dev->unmap_fmr = sif_unmap_phys_fmr_list; + dev->dealloc_fmr = sif_dealloc_fmr; + + dev->attach_mcast = sif_multicast_attach; + dev->detach_mcast = sif_multicast_detach; + + /* All our mad handling happens via the normal QP0 paths + * this function is for devices which implements the SMA + * in software: + */ + dev->process_mad = NULL; + + dev->alloc_xrcd = sif_alloc_xrcd; + dev->dealloc_xrcd = sif_dealloc_xrcd; + dev->alloc_shpd = sif_alloc_shpd; + dev->share_pd = sif_share_pd; + dev->remove_shpd = sif_remove_shpd; + + dev->node_guid = cpu_to_be64(epsdev.node_guid); + + snprintf(dev->node_desc, sizeof(dev->node_desc), "sif_%s", + init_utsname()->nodename); + + dev->node_type = RDMA_NODE_IB_CA; + dev->phys_port_cnt = sdev->limited_mode ? 
0 : epsdev.phys_port_cnt; + dev->num_comp_vectors = sdev->es[sdev->mbox_epsc].eqs.cnt - 2; + + ret = ib_register_device(dev, sif_port_callback); + if (ret) { + sif_log(sdev, SIF_VERBS, "Fail to register IB device: error %d", + -ret); + goto err_ibreg; + } + + for (i = 0; i < ARRAY_SIZE(sif_class_attributes); ++i) { + ret = device_create_file(&dev->dev, sif_class_attributes[i]); + if (ret) { + sif_log(sdev, SIF_VERBS, + "Fail to register with sysfs: error %d!", -ret); + goto err_sysfsreg; + } + } + + /* Diag_counters */ + ret = sysfs_create_group(&dev->dev.kobj, &diag_counters_attr_group); + if (ret) { + sif_log(sdev, SIF_VERBS, + "Fail to register diag_counters with sysfs: error %d!", -ret); + goto err_sysfsreg; + } + + /* Populate the external kernel API (see sif_verbs.h): */ + sdev->sv.eps_wr = sif_eps_wr_ex; + sdev->sv.create_cq = sif_create_cq; + sdev->ib_dev.local_dma_lkey = sdev->dma_mr->index; + + sdev->registered = true; + sif_log(sdev, SIF_VERBS_V, "%s registered with IB", sdev->ib_dev.name); + return 0; + +err_sysfsreg: + ib_unregister_device(dev); +err_ibreg: + sif_log(sdev, SIF_INFO, "Exit - error %d", -ret); + return ret; +} + +void sif_unregister_ib_device(struct sif_dev *sdev) +{ + struct ib_device *ibdev = &sdev->ib_dev; + + sdev->registered = false; + ib_unregister_device(ibdev); + sif_logi(ibdev, SIF_VERBS, "done unregistering device"); +} diff --git a/drivers/infiniband/hw/sif/sif_ireg.h b/drivers/infiniband/hw/sif/sif_ireg.h new file mode 100644 index 0000000000000..724b6df9c6e19 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_ireg.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_ireg.h: support functions used in setup of sif as an IB HCA + */ + +#ifndef __SIF_IREG_H +#define __SIF_IREG_H + +/* User context of a user level ib call */ +struct sif_ucontext { + struct ib_ucontext ib_uc; + struct sif_pd *pd; /* A protection domain for completion queues */ + struct sif_cb *cb; /* The collect buffer for the user process */ + u32 abi_version; /* User level library's abi version */ +}; + +static inline struct sif_ucontext *to_sctx(struct ib_ucontext *context) +{ + return container_of(context, struct sif_ucontext, ib_uc); +} + +int sif_register_ib_device(struct sif_dev *sdev); +void sif_unregister_ib_device(struct sif_dev *sdev); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_main.c b/drivers/infiniband/hw/sif/sif_main.c new file mode 100644 index 0000000000000..1890a1a6cb651 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_main.c @@ -0,0 +1,635 @@ +/* + * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
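/*
 * A minimal sketch of the container_of pattern used by to_sctx() in
 * sif_ireg.h above: the IB core hands back a pointer to the embedded
 * struct ib_ucontext, and the driver recovers its private wrapper by
 * subtracting the member offset. The struct and helper names here are
 * illustrative only.
 */
struct example_wrapper {
	int private_state;
	struct ib_ucontext ib_uc;	/* embedded core object */
};

static inline struct example_wrapper *example_to_wrapper(struct ib_ucontext *uc)
{
	return container_of(uc, struct example_wrapper, ib_uc);
}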
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_main.c: main entry points and initialization + */ + +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include +#include +#include "sif_dev.h" +#include "sif_fwa.h" +#include "sif_mmu.h" +#include "sif_mr.h" +#include "sif_hwi.h" +#include "sif_r3.h" +#include "sif_vf.h" +#include "sif_pt.h" +#include "sif_ireg.h" +#include "sif_debug.h" +#include "psif_hw_csr.h" +#include "version.h" +#include + + +#define PSIF_VERSION_STR "0.1.0.6+" + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Oracle SIF Infiniband HCA driver"); +MODULE_VERSION(PSIF_VERSION_STR); +MODULE_AUTHOR("Knut Omang"); + +/* The device(s) we support */ + +static const struct pci_device_id pci_table[] = { + {PCI_DEVICE(PCI_VENDOR_ID_SUN, PCI_DEVICE_ID_PSIF_PF)}, + {PCI_DEVICE(PCI_VENDOR_ID_SUN, PCI_DEVICE_ID_PSIF_VF)}, + {PCI_DEVICE(PCI_VENDOR_ID_SUN, PCI_DEVICE_ID_SN1_PF)}, + {PCI_DEVICE(PCI_VENDOR_ID_SUN, PCI_DEVICE_ID_SN1_VF)}, + {0,} +}; + +MODULE_DEVICE_TABLE(pci, pci_table); + +/* module entry points */ +static int __init sif_init(void); +static void __exit sif_exit(void); + +/* device entry points */ +static int sif_probe(struct pci_dev *pdev, + const struct pci_device_id *id); +static void sif_remove(struct pci_dev *dev); + +static int sif_suspend(struct pci_dev *dev, pm_message_t state) +{ + struct sif_dev *sdev = pci_get_drvdata(dev); + + sif_log(sdev, SIF_INFO, " "); + return 0; +} + +static int sif_resume(struct pci_dev *dev) +{ + struct sif_dev *sdev = pci_get_drvdata(dev); + + sif_log(sdev, SIF_INFO, " "); + return 0; +} + +static void sif_shutdown(struct pci_dev *dev) +{ + struct sif_dev *sdev = pci_get_drvdata(dev); + + sif_log(sdev, SIF_INFO, " "); +} + +static struct pci_driver sif_driver = { + .name = "sif", + .id_table = pci_table, + .probe = sif_probe, + .remove = sif_remove, + .suspend = sif_suspend, + .resume = sif_resume, + .shutdown = sif_shutdown, + .sriov_configure = sif_vf_enable, +}; + +/* Driver parameters: */ + +ulong sif_debug_mask = 0x3; +module_param_named(debug_mask, sif_debug_mask, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(debug_mask, "Selective enabling of debugging output to the system log"); + +#ifdef SIF_TRACE_MASK +ulong sif_trace_mask = 0x0; +module_param_named(trace_mask, sif_trace_mask, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(trace_mask, "Selective enabling of debugging output to the ftrace buffer"); +#endif + +ulong sif_feature_mask = 0; +module_param_named(feature_mask, sif_feature_mask, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(feature_mask, "Selective enabling of sif driver features"); + +ulong sif_vendor_flags = 0; +module_param_named(vendor_flags, sif_vendor_flags, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(vendor_flags, "Selective enabling of sif driver vendor specific mode flags"); + +uint sif_max_pqp_wr = SIF_SW_MAX_SQE; +module_param_named(max_pqp_wr, sif_max_pqp_wr, uint, S_IRUGO); +MODULE_PARM_DESC(max_pqp_wr, "Maximum number of outstanding privileged QP requests supported"); + +uint sif_ki_spqp_size = 1; +module_param_named(ki_spqp_size, sif_ki_spqp_size, uint, S_IRUGO); +MODULE_PARM_DESC(ki_spqp_size, "Number of privileged QPs for key invalidate stencils to set up"); + +/* pqp_size == cq_eq_max */ +uint sif_cq_eq_max = 12; +module_param_named(cq_eq_max, sif_cq_eq_max, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(cq_eq_max, "Upper limit on no. 
of EQs to distribute completion events among"); + +uint sif_cb_max = 100; +module_param_named(cb_max, sif_cb_max, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(cb_max, "Upper limit on no. of CBs."); + +/* TBD - This is a debug feature to evaluate performance. */ +ushort sif_perf_sampling_threshold = 100; +module_param_named(perf_sampling_threshold, sif_perf_sampling_threshold, ushort, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(perf_sampling_threshold, "The performance measurement based on every N samples"); + +uint sif_fmr_cache_flush_threshold = 512; +module_param_named(fmr_cache_flush_threshold, sif_fmr_cache_flush_threshold, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(fmr_cache_flush_threshold, "PF limit for when to use fast-path full MMU flush for FMR unmap"); + + +/* In principle, SIF can allow any max inline size but at the cost of more memory + * allocated per QP. This variable sets the upper limit for any QP by defining + * the max extent of the sq entries, which means that the real max size is slightly + * less, depending on the max number of sges requested: + */ +uint sif_max_inline = 0x400; +module_param_named(max_inline, sif_max_inline, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(max_inline, "Max configurable inline data per QP"); + +uint sif_vf_en = 1; +module_param_named(vf_en, sif_vf_en, uint, S_IRUGO); +MODULE_PARM_DESC(vf_en, "If set to 0, refuse to load VF drivers"); + +ulong sif_eps_log_size = 0; +module_param_named(eps_log_size, sif_eps_log_size, ulong, S_IRUGO); +MODULE_PARM_DESC(eps_log_size, "Enable log redirection - value is size of log buffer to allocate"); + +ushort sif_eps_log_level = EPS_LOG_INFO; +module_param_named(eps_log_level, sif_eps_log_level, ushort, S_IRUGO); +MODULE_PARM_DESC(eps_log_level, "Level of logging to set for EPS redirect at load"); + +static int sif_bar_init(struct pci_dev *pdev); +static void sif_bar_deinit(struct pci_dev *pdev); + + +static int sif_set_check_max_payload(struct sif_dev *sdev) +{ + struct pci_dev *parent; + u16 devctl, devcap, pdevctl, pdevcap; + int pcie_cap, pcie_parent_cap, min_cap_mps, err; + + u8 payload_sz, payload_sz_cap; + u8 parent_payload_sz, parent_payload_sz_cap; + + pcie_cap = pci_find_capability(sdev->pdev, PCI_CAP_ID_EXP); + + /* read PSIF max payload size capability and setting */ + err = pci_read_config_word(sdev->pdev, pcie_cap + PCI_EXP_DEVCTL, &devctl); + if (err) + return err; + + payload_sz = (devctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5; + + err = pci_read_config_word(sdev->pdev, pcie_cap + PCI_EXP_DEVCAP, &devcap); + if (err) + return err; + + payload_sz_cap = (devcap & PCI_EXP_DEVCAP_PAYLOAD); + + if (sif_feature(max_supported_payload)) { + parent = pci_upstream_bridge(sdev->pdev); + if (!parent) { + sif_log(sdev, SIF_INFO, + "No parent bridge device, cannot determine atomic capabilities!"); + return PSIF_PCIE_ATOMIC_OP_NONE; + } + + pcie_parent_cap = pci_find_capability(parent, PCI_CAP_ID_EXP); + if (!pcie_parent_cap) { + sif_log(sdev, SIF_INFO, + "Unable to find any PCIe capability in parent device - assuming payload size is ok"); + return 0; + } + + /* read root complex (port) max payload size */ + err = pci_read_config_word(parent, pcie_parent_cap + PCI_EXP_DEVCTL, &pdevctl); + if (err) + return err; + + err = pci_read_config_word(parent, pcie_parent_cap + PCI_EXP_DEVCAP, &pdevcap); + if (err) + return err; + + parent_payload_sz = (pdevctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5; + parent_payload_sz_cap = (pdevcap & PCI_EXP_DEVCAP_PAYLOAD); + + min_cap_mps = min(parent_payload_sz_cap, payload_sz_cap); + + /* adjusting 
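/*
 * A small helper sketch for the encoding handled above: the 3-bit max
 * payload size field in PCI_EXP_DEVCTL encodes 128 << n bytes, so
 * 0 -> 128, 1 -> 256, ... 5 -> 4096.
 */
static inline unsigned int example_mps_to_bytes(u16 devctl)
{
	u8 encoded = (devctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5;

	return 128U << encoded;
}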
the RC max payload size to the supported max payload size */ + if (parent_payload_sz != min_cap_mps) { + sif_log(sdev, SIF_INFO, + "Adjusting RC max payload sz to %d\n", 128 << parent_payload_sz_cap); + err = pci_write_config_word(parent, + pcie_parent_cap + PCI_EXP_DEVCTL, + (pdevctl & ~PCI_EXP_DEVCTL_PAYLOAD) + (min_cap_mps << 5)); + } + + /* Adjusting the max payload size to the supported max payload size */ + if (payload_sz != min_cap_mps) { + sif_log(sdev, SIF_INFO, + "Adjusting max payload sz to %d\n", 128 << parent_payload_sz_cap); + err = pci_write_config_word(sdev->pdev, + pcie_cap + PCI_EXP_DEVCTL, + (devctl & ~PCI_EXP_DEVCTL_PAYLOAD) + (min_cap_mps << 5)); + } + + if (min_cap_mps == 0) { + sif_log(sdev, SIF_INFO, + "PCI express max payload size is set to 128 which triggers a rev1 bug"); + } + } + return err; +} + +/* Entry of new instance */ +static int sif_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + int err = 0; + + /* TBD: Zeroed memory from ib_alloc_device? */ + struct sif_dev *sdev = + (struct sif_dev *)ib_alloc_device(sizeof(struct sif_dev)); + if (!sdev) { + err = -ENOMEM; + goto pfail_ib_alloc; + } + + sdev->pdev = pdev; + sdev->dfs = NULL; + sdev->fw_vfs = -1; /* #of VFS enabled in firmware not known yet */ + sdev->ib_dev.dma_device = &pdev->dev; + sdev->limited_mode = sif_feature(force_limited_mode) ? true : false; + + strlcpy(sdev->ib_dev.name, "sif%d", IB_DEVICE_NAME_MAX); + + pci_set_drvdata(pdev, sdev); + sif_log(sdev, SIF_INFO, + "%s found, device id 0x%x, subsystem id 0x%x, revision %d, at 0x%p", + get_product_str(sdev), PSIF_DEVICE(sdev), + PSIF_SUBSYSTEM(sdev), PSIF_REVISION(sdev), sdev); + + sdev->wq = create_singlethread_workqueue(sdev->ib_dev.name); + if (!sdev->wq) { + sif_log(sdev, SIF_INFO, "Failed to allocate kernel work queue"); + err = -ENOMEM; + goto wq_fail; + } + + err = sif_set_check_max_payload(sdev); + if (err) + goto wq_fail; + + /* Ask PCI drivers to enable the device and set up BARs etc */ + err = pci_enable_device_mem(pdev); + if (err) + goto pfail_enable; + + /* Check if 64 bits DMA is supported */ + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (!err) { + sif_log(sdev, SIF_INIT, "64 bit DMA supported"); + pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + } else { + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (!err) { + sif_log(sdev, SIF_INIT, "32 bit DMA supported"); + pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + } else { + sif_log(sdev, SIF_INIT, "No DMA support!?"); + goto pfail_dma; + } + } + + pci_enable_pcie_error_reporting(pdev); + + /* Set up BAR access */ + err = sif_bar_init(pdev); + if (err) + goto pfail_bar; + + if (xen_pv_domain()) { + /* The Xen PV domain may return huge pages that are misaligned + * in DMA space, see Orabug: 21690736. + * Also we have to turn off the inline sge optimization, as it assumes + * that (guest) physical and DMA addresses are equal, which is not + * the case for the PV domain - see Orabug: 23012335. 
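/*
 * The 64-bit-then-32-bit DMA mask probing in sif_probe() below can also be
 * written with dma_set_mask_and_coherent(), which sets the streaming and
 * coherent masks in one call. This is only a sketch of that newer idiom,
 * not what this patch uses:
 */
static int example_set_dma_masks(struct pci_dev *pdev)
{
	if (!dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)))
		return 0;	/* full 64-bit DMA available */

	/* fall back to 32-bit addressing */
	return dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
}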
+ */ + sif_log(sdev, SIF_INFO, "xen pv domain: Restricting resource allocation.."); + sif_feature_mask |= SIFF_no_huge_pages | SIFF_disable_inline_first_sge; + sif_qp_size = min(sif_qp_size, 0x1000U); + sif_mr_size = min(sif_mr_size, 0x1000U); + sif_ah_size = min(sif_ah_size, 0x1000U); + sif_cq_size = min(sif_cq_size, 0x1000U); + sif_rq_size = min(sif_rq_size, 0x1000U); + sif_max_pqp_wr = min(sif_max_pqp_wr, 0x1000U); + } + + /* Timeout scaling factor: + * This value is used as a factor to calculate sensible + * timeout values throughout the driver: + */ + sdev->min_resp_ticks = SIF_HW_TIMEOUT; + /* Type UMEM means no override - initialize */ + sdev->mt_override = SIFMT_UMEM; + + err = sif_dfs_register(sdev); + if (err) + goto pfail_dfs; + + /* PSIF initialization */ + err = sif_hw_init(sdev); + if (err) + goto pfail_psif_base; + + err = sif_fwa_register(sdev); + if (err) + goto fwa_reg_failed; + + /* Reserve key 0 as an invalid key for sanity checking + * See #3323 for details + */ + sdev->dma_inv_mr = sif_alloc_invalid_mr(sdev->pd); + if (IS_ERR(sdev->dma_inv_mr)) { + err = PTR_ERR(sdev->dma_inv_mr); + goto pfail_dma_inv_mr; + } + + /* Create a DMA MR (mapping the whole address space) + * for use with the local_dma_lkey + */ + sdev->dma_mr = create_dma_mr(sdev->pd, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + + if (IS_ERR(sdev->dma_mr)) { + err = PTR_ERR(sdev->dma_mr); + goto pfail_dma_mr; + } + + if (PSIF_REVISION(sdev) <= 3) { + err = sif_r3_init(sdev); + if (err) + goto pfail_r3_init; + } + + /* Successful device init */ + + err = sif_register_ib_device(sdev); + if (err) + goto pfail_ibreg; + + /* Now that an IB device name exists, create a symlink in debugfs */ + sif_dfs_link_to_ibdev(sdev); + + + sif_log(sdev, SIF_INFO, "Successfully probed and set up device"); + return 0; +pfail_ibreg: + sif_r3_deinit(sdev); +pfail_r3_init: + sif_dealloc_mr(sdev, sdev->dma_mr); +pfail_dma_mr: + sif_dealloc_mr(sdev, sdev->dma_inv_mr); +pfail_dma_inv_mr: + sif_fwa_unregister(sdev); +fwa_reg_failed: + sif_hw_deinit(sdev); +pfail_psif_base: + sif_dfs_unregister(sdev); +pfail_dfs: + sif_bar_deinit(pdev); +pfail_bar: + pci_disable_pcie_error_reporting(pdev); +pfail_dma: + pci_disable_device(pdev); +pfail_enable: + destroy_workqueue(sdev->wq); +wq_fail: + ib_dealloc_device(&sdev->ib_dev); +pfail_ib_alloc: + sif_log0(SIF_INIT, "sif_probe failed with status %d\n", err); + return err; +} + +/* Exit of instance */ +static void sif_remove(struct pci_dev *dev) +{ + struct sif_dev *sdev = pci_get_drvdata(dev); + + sif_log0(SIF_INIT, "Enter: sif_remove"); + + sif_vf_disable(sdev); + + sif_unregister_ib_device(sdev); + sif_r3_deinit(sdev); + sif_dealloc_mr(sdev, sdev->dma_mr); + sif_dealloc_mr(sdev, sdev->dma_inv_mr); + sif_fwa_unregister(sdev); + sif_hw_deinit(sdev); + sif_dfs_unregister(sdev); + sif_bar_deinit(dev); + pci_clear_master(dev); + pci_disable_device(dev); + flush_workqueue(sdev->wq); + destroy_workqueue(sdev->wq); + ib_dealloc_device(&sdev->ib_dev); + sif_log0(SIF_INIT, "exit sif_remove"); +} + +static int sif_bar_init(struct pci_dev *pdev) +{ + struct sif_dev *sdev = pci_get_drvdata(pdev); + int err; + phys_addr_t start; + size_t length; + + /* Request access to the device space in BAR0 for this driver */ + err = pci_request_region(pdev, SIF_CBU_BAR, "sif_cb"); + if (err) { + sif_log(sdev, SIF_INIT, "Failed to request cb region"); + goto pfail_bar0; + } + + /* Then map all of it to allow access */ + start = pci_resource_start(pdev, SIF_CBU_BAR); + + /* This should 
not happen - kernel or BIOS bug? + * TBD: Check this from the CPU ID? (M bit?) + */ + if (start > (1ULL << 52)) { + sif_log(sdev, SIF_INIT, + "pci_resource_start returned a physical address beyond CPU max phys.addr (%llx)", + start); + err = -ENOMEM; + goto pfail_ioremap0; + } + + length = pci_resource_len(pdev, SIF_CBU_BAR); + + sdev->cbu_mtrr = -1; /* Avoid attempt to free mtrr 0 */ + + /* + * Need iomap_wc() in order to get write-combining to work, + * even when using explicit write-combining instructions. + */ + sdev->cb_base = ioremap_wc(start, length); + if (!sdev->cb_base) { + sif_log(sdev, SIF_INIT, + "ioremap_wc - failed to map cb BAR (start %llx len %lx)", + start, length); + err = -ENOMEM; + goto pfail_ioremap0; + } + sdev->cb_sz = length; + + sif_log(sdev, SIF_INIT, "BAR%d (cb) mapped at kva %p start %llx len %lx", + SIF_CBU_BAR, sdev->cb_base, start, length); + + err = pci_request_region(pdev, SIF_MSIX_BAR, "sif_msix"); + if (err) { + sif_log(sdev, SIF_INIT, "Failed to request msix region"); + goto pfail_bar2; + } + + start = pci_resource_start(pdev, SIF_MSIX_BAR); + length = pci_resource_len(pdev, SIF_MSIX_BAR); + sdev->msi_base = ioremap_nocache(start, length); + if (!sdev->msi_base) { + sif_log(sdev, SIF_INIT, + "ioremap_nocache - failed to map msix BAR%d (start %llx len %lx)", + SIF_MSIX_BAR, start, length); + err = -ENOMEM; + goto pfail_ioremap2; + } + sdev->msi_sz = length; + sif_log(sdev, SIF_INIT, "BAR%d (msix) mapped at kva %p start %llx len %lx", + SIF_MSIX_BAR, sdev->msi_base, start, length); + + err = pci_request_region(pdev, SIF_EPS_BAR, "sif_csr"); + if (err) { + sif_log(sdev, SIF_INIT, "Failed to request eps region"); + goto pfail_bar4; + } + + start = pci_resource_start(pdev, SIF_EPS_BAR); + length = pci_resource_len(pdev, SIF_EPS_BAR); + sdev->eps_base = ioremap_nocache(start, length); + if (!sdev->eps_base) { + sif_log(sdev, SIF_INIT, "Failed to map eps BAR%d (start %llx len %lx)", + SIF_EPS_BAR, start, length); + err = -ENOMEM; + goto pfail_ioremap4; + } + sdev->eps = (struct __iomem psif_pcie_mbox *)sdev->eps_base; + sdev->eps_sz = length; + + sif_log(sdev, SIF_INIT, "BAR%d (eps) mapped at kva %p start %llx len %lx", + SIF_EPS_BAR, sdev->eps, start, length); + return 0; + +pfail_ioremap4: + pci_release_region(pdev, SIF_EPS_BAR); +pfail_bar4: + iounmap(sdev->msi_base); +pfail_ioremap2: + pci_release_region(pdev, SIF_CBU_BAR); +pfail_bar2: + iounmap(sdev->cb_base); +pfail_ioremap0: +#ifdef CONFIG_X86 + if (sdev->cbu_mtrr >= 0) + mtrr_del(sdev->cbu_mtrr, + pci_resource_start(pdev, SIF_CBU_BAR), + pci_resource_len(pdev, SIF_CBU_BAR)); +#endif + pci_release_region(pdev, SIF_MSIX_BAR); +pfail_bar0: + return err; +} + +static void sif_bar_deinit(struct pci_dev *pdev) +{ + struct sif_dev *sdev = pci_get_drvdata(pdev); + + iounmap(sdev->eps); + pci_release_region(pdev, 4); + iounmap(sdev->msi_base); + pci_release_region(pdev, 2); + iounmap(sdev->cb_base); +#ifdef CONFIG_X86 + if (sdev->cbu_mtrr >= 0) + mtrr_del(sdev->cbu_mtrr, + pci_resource_start(pdev, SIF_CBU_BAR), + pci_resource_len(pdev, SIF_CBU_BAR)); +#endif + pci_release_region(pdev, 0); +} + + + +/* Statically register this driver with the kernel */ + +static int __init sif_init(void) +{ + int stat = 0; + + sif_log0(SIF_INFO, "**** Oracle development driver - internal use only! 
****"); + sif_log0(SIF_INFO, "%s - build user %s at %s", sif_version.git_repo, + sif_version.build_user, sif_version.build_git_time); + sif_log0(SIF_INFO, "sifdrv git tag:\n%s", sif_version.last_commit); + if (sif_version.git_status[0] != '\0') + sif_log0(SIF_INFO, " *** sifdrv git status at build time: ***\n%s", sif_version.git_status); + sif_log0(SIF_INFO, "psifapi git tag:\n%s", sif_version.last_psifapi_commit); + if (sif_version.git_psifapi_status[0] != '\0') + sif_log0(SIF_INFO, " *** psifapi git status at build time ***\n%s", + sif_version.git_psifapi_status); + + sif_log0(SIF_INIT, "hw header release \"%s\"", PSIF_RELEASE_STR); + sif_log0(SIF_INIT, "built for PSIF version %d.%d, EPSC API version %d.%d", + PSIF_MAJOR_VERSION, PSIF_MINOR_VERSION, EPSC_MAJOR_VERSION, EPSC_MINOR_VERSION); + sif_log0(SIF_INIT, "sif debug mask 0x%lx", sif_debug_mask); + if (sif_feature_mask) { + u64 undef = sif_feature_mask & ~SIFF_all_features; + + if (undef) { + sif_log0(SIF_INFO, + "***** Invalid feature mask - undefined bits %llx - get rid of legacy bits!", + undef); + return -EINVAL; + } + sif_log0(SIF_INFO, "sif feature mask 0x%lx", sif_feature_mask); + } + + stat = sif_pt_init(); + if (stat) + goto pt_init_failed; + + stat = sif_fwa_init(); + if (stat) + goto fwa_init_failed; + + return pci_register_driver(&sif_driver); + +fwa_init_failed: + sif_pt_exit(); +pt_init_failed: + return stat; +} + +static void __exit sif_exit(void) +{ + sif_fwa_exit(); + pci_unregister_driver(&sif_driver); + sif_pt_exit(); + sif_log0(SIF_INIT, "done unregistering"); +} + +module_init(sif_init); +module_exit(sif_exit); diff --git a/drivers/infiniband/hw/sif/sif_mem.c b/drivers/infiniband/hw/sif/sif_mem.c new file mode 100644 index 0000000000000..2f2629b116d38 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_mem.c @@ -0,0 +1,1109 @@ +/* + * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_mem.c: SIF table memory and page table management + */ + +#include +#include +#include +#include +#include "sif_dev.h" +#include "sif_mem.h" +#include "sif_dma.h" +#include "sif_pt.h" + +/* Defined below */ +static int sif_mem_fixup_dma(struct scatterlist *sg); + +/* Initialization of global per device info */ +void sif_mem_init(struct sif_dev *sdev) +{ + struct sif_mem_info *mi = &sdev->mi; + + if (sif_feature(toggle_page_size)) { + mi->page_shift = PAGE_SHIFT == 12 ? 13 : 12; + mi->page_size = PAGE_SIZE == 0x1000 ? 
0x2000 : 0x1000; + } else { + mi->page_shift = PAGE_SHIFT; + mi->page_size = PAGE_SIZE; + } + mi->level_shift = 9; + mi->max_shift = mi->page_shift + mi->level_shift * PT_LEVELS; + mi->ptes_per_page = 1 << mi->level_shift; + mi->page_mask = ~(mi->page_size - 1); +} + +/* Some utilities */ + +inline size_t mem_type_to_page_shift(struct sif_dev *sdev, enum sif_mem_type mem_type) +{ + switch (mem_type) { + case SIFMT_2M: + return sdev->mi.page_shift + sdev->mi.level_shift; + default: + return sdev->mi.page_shift; + } +} + + +static u32 sif_mem_fmr_max_page_shift(struct sif_mem *mem) +{ + struct sif_dev *sdev = mem->sdev; + u32 max_shift = sdev->mi.max_shift; + u64 end = 0; + u32 bits = sizeof(dma_addr_t) << 3; + int i; + u64 incr = 1 << mem->m.fmr.page_shift; + + BUG_ON(mem->mem_type != SIFMT_FMR); + + for (i = 0; i < mem->m.fmr.page_list_len; i++) { + u64 next_addr = mem->m.fmr.page_list[i]; + + if (end && end != next_addr) { + unsigned long border = end | next_addr; + u32 shift = find_first_bit(&border, bits); + + if (shift < max_shift) { + sif_log(sdev, SIF_MEM_V, + "%4d: start 0x%llx, sz 0x%llx, prev.end 0x%llx shift %d -> %d", + i, next_addr, incr, end, max_shift, shift); + max_shift = shift; + if (max_shift == mem->m.fmr.page_shift) /* No point in continuing */ + break; + } + } + end = next_addr + incr; + } + sif_log(sdev, SIF_MEM_SG, "found max shift %d from inspecting %d sges", max_shift, i); + return max_shift; +} + + +/* Calculate the max.possible page_shift for this memory + * based on alignment of the DMA + */ +static u32 sif_mem_max_page_shift(struct sif_mem *mem) +{ + struct sif_dev *sdev = mem->sdev; + u32 max_shift = sdev->mi.max_shift; + u64 end = 0; + u32 bits = sizeof(dma_addr_t) << 3; + u32 sg_cnt = 0; + + struct scatterlist *sg = sif_mem_get_sgl(mem); + + if (!sg) + return sdev->mi.page_shift; + for (; sg; sg = sg_next(sg)) { + u64 dma_start = sg_dma_address(sg); + + sg_cnt++; +#ifdef __sparc__ + /* TBD: Fix bug in umem: + * SG lists are not always properly terminated + */ + if (!sg_dma_len(sg)) + break; +#endif + if (end && end != dma_start) { + unsigned long border = end | dma_start; + u32 shift = find_first_bit(&border, bits); + + if (shift < max_shift) { + sif_log(sdev, SIF_MEM_V, + "%4d: start 0x%llx, sz %x, prev.end 0x%llx shift %d -> %d", + sg_cnt, dma_start, sg_dma_len(sg), end, max_shift, shift); + max_shift = shift; + if (max_shift == sdev->mi.page_shift) /* No point in continuing */ + break; + /* BUG_ON(max_shift < sdev->mi.page_shift); */ + if (max_shift < sdev->mi.page_shift) { + sif_log(sdev, SIF_INFO, + "Failed to find a valid page shift: max_shift %d sdev->mi.page_shift %d", + max_shift, sdev->mi.page_shift); + return max_shift; + } + } + } + end = sg_dma_address(sg) + sg_dma_len(sg); + } + sif_log(sdev, SIF_MEM_SG, "found max shift %d from inspecting %d sges", max_shift, sg_cnt); + return max_shift; +} + +/* External observer: + * Return the largest page size (represented by page shift bits) usable for this memory + */ +u32 sif_mem_page_shift(struct sif_mem *mem) +{ + /* If a maximum has been calculated, use it: */ + if (mem->max_page_shift) + return mem->max_page_shift; + return mem_type_to_page_shift(mem->sdev, mem->mem_type); +} + +static struct scatterlist *sg_alloc_list(struct sif_dev *sdev, unsigned int nelems, gfp_t flag) +{ + struct scatterlist *sg = sif_kmalloc(sdev, sizeof(struct scatterlist) * nelems, flag); + + if (sg) { + sif_log0(SIF_MMU, "start at %p, %d elems allocated", sg, nelems); + sg_init_table(sg, nelems); + } + return sg; +} + + +/* 
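/*
 * A minimal sketch of the alignment trick used by the max_page_shift
 * helpers above: at each discontinuity, the largest page size that can
 * still cover both sides is bounded by the lowest set bit of
 * (previous_end | next_start), i.e. the coarsest power-of-two boundary
 * the two addresses share. __ffs64()/find_first_bit() give that bit index.
 */
static inline unsigned int example_max_shift_at_border(u64 prev_end, u64 next_start)
{
	u64 border = prev_end | next_start;

	/* a zero border means the segments are contiguous: no constraint */
	return border ? (unsigned int)__ffs64(border) : 64;
}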
API for managing a sif_kmem object */ + +/** Allocate a set of pages of size (1 << page_shift). + * Prepare for scatterlist(s) of fixed length @sg_size (in number of elements) + * and allocate an initial @sz bytes (must be multiple of 1 << page_shift) + * @sz must be less than what fits within the initial scatterlist. + * If sg_size is 0, figure out the optimal sg_size. + */ +int sif_kmem_init(struct sif_dev *sdev, struct sif_kmem *kmem, size_t sg_size, size_t sz, + u32 page_shift, gfp_t flag, enum dma_data_direction dir) +{ + int ret; + + memset(kmem, 0, sizeof(*kmem)); + kmem->page_shift = page_shift; + + if (!sg_size) + sg_size = sz >> page_shift; + kmem->sg_size = sg_size; + kmem->dir = dir; + kmem->sg_max = 0; /* Indicates an empty list with no end mark set yet */ + + if (sz == 0) + return 0; + + ret = sif_kmem_extend(sdev, kmem, sz, flag); + if (ret < 0) + return ret; + + return 0; +} + + +static void sif_kmem_free_pages(struct sif_kmem *kmem, struct scatterlist *sg, u32 nelems) +{ + int i; + int order = kmem->page_shift - PAGE_SHIFT; + + for (i = 0; i < nelems; i++) { + __free_pages(sg_page(sg), order); + sg = sg_next(sg); + } +} + + +static void sif_kmem_free_sgls(struct sif_kmem *kmem, struct scatterlist *sgl, u32 nlists) +{ + for (; nlists > 0; nlists--) { + struct scatterlist *nsgl = sg_chain_ptr(&sgl[kmem->sg_size]); + + kfree(sgl); + sgl = nsgl; + } +} + +/* Find the @n'th scatterlist array within kmem */ +static struct scatterlist *sif_kmem_find_sg_head_idx(struct sif_kmem *kmem, u32 n) +{ + int i = 0; + struct scatterlist *sgl = kmem->sg; + + for (; n > i; i++) + sgl = sg_chain_ptr(&sgl[kmem->sg_size]); + return sgl; +} + + +/* Find the scatterlist element with index idx within kmem */ +struct scatterlist *sif_kmem_find_sg_idx(struct sif_kmem *kmem, u32 idx) +{ + struct scatterlist *sgl; + int n = idx / kmem->sg_size; + + sgl = sif_kmem_find_sg_head_idx(kmem, n); + return &sgl[idx % kmem->sg_size]; +} + + +void sif_kmem_free(struct sif_dev *sdev, struct sif_kmem *kmem) +{ + int npages = kmem->sg_max - kmem->sg_start; + struct scatterlist *sg = sif_kmem_find_sg_idx(kmem, kmem->sg_start); + + ib_dma_unmap_sg(&sdev->ib_dev, sg, npages, kmem->dir); + + sif_kmem_free_pages(kmem, sg, npages); + sif_kmem_free_sgls(kmem, sg, kmem->nlists); + kmem->sg = NULL; +} + + +/* Extend a kmem object by allocating more sg entries if necessary, then + * allocate pages and dma map them. 
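/*
 * A sketch of the index arithmetic behind sif_kmem_find_sg_idx() below:
 * each chained array holds sg_size usable entries plus one chain entry,
 * so a global element index splits into an array number and a slot within
 * that array. The helper name is illustrative only.
 */
static inline void example_split_sg_index(u32 idx, u32 sg_size,
					  u32 *array_no, u32 *slot)
{
	*array_no = idx / sg_size;	/* which chained scatterlist array */
	*slot = idx % sg_size;		/* entry within that array */
}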
The invariant upon exit is that + * all allocated pages are dma mapped, which means that we must + * clean up pages that did not get mapped, if mapping fails midway: + */ + +int sif_kmem_extend(struct sif_dev *sdev, struct sif_kmem *kmem, size_t sz, gfp_t flag) +{ + u32 i; + int ret; + int order; + struct page *page; + struct scatterlist *sg; + struct scatterlist *sg_prev = NULL; + struct scatterlist *sg_start = NULL; + size_t page_size = 1UL << kmem->page_shift; + u64 page_mask = page_size - 1; + u32 sg_size = (sz + page_mask) >> kmem->page_shift; + + u32 nl = kmem->nlists; + long free_sg = nl * kmem->sg_size - kmem->sg_max; + + sif_log(sdev, SIF_MEM, "enter, kmem at %p, sz 0x%lx", kmem, sz); + + /* Make room in sg list */ + for (; free_sg < sg_size; free_sg += kmem->sg_size) { + sg = sg_alloc_list(sdev, kmem->sg_size + 1, flag); + if (!sg) { + ret = -ENOMEM; + goto failed; + } + if (kmem->last_sg) + sg_chain(kmem->last_sg, kmem->sg_size + 1, sg); + else + kmem->sg = sg; + kmem->last_sg = sg; + kmem->nlists++; + } + + /* The end mark is always in the last used element, not the first available one + * which sg_max points to: + */ + if (kmem->sg_max) { + sg_prev = sif_kmem_find_sg_idx(kmem, kmem->sg_max - 1); + sg_unmark_end(sg_prev); + sg = sg_next(sg_prev); + } else + sg = sif_kmem_find_sg_idx(kmem, 0); + + sg_start = sg; + order = kmem->page_shift - PAGE_SHIFT; + + /* Allocate the new memory */ + for (i = 0; i < sg_size; i++) { + sif_log(sdev, SIF_MEM_V, "i = %d, sg %p", i, sg); + page = sif_alloc_pages(sdev, flag | __GFP_ZERO, order); + if (!page) { + ret = -ENOMEM; + sg_size = i; + sg_mark_end(sg); + goto map_failed; + } + BUG_ON(!sg); + sg_set_page(sg, page, page_size, 0); + sg_prev = sg; + sg = sg_next(sg); + } + sg_mark_end(sg_prev); + + ret = ib_dma_map_sg(&sdev->ib_dev, sg_start, sg_size, kmem->dir); + if (ret < 0) { + sif_log(sdev, SIF_INFO, "ib_dma_map_sg failed with %d", ret); + ret = -EFAULT; + goto map_failed; + } + + sif_logs(SIF_PT_VV, sif_dump_sg(sg_start)); + + /* TBD: Remove this when issues with wrong alignments of DMA addresses + * has been resolved (both Sparc and OVM, see Orabug: 21690736 + * For 2M seg_size, check that all DMA addresses are 2M aligned: + */ + if (page_size >= PMD_SIZE) { + for (sg = sg_start, i = 0; sg != NULL; sg = sg_next(sg), i++) { + if (sg_dma_address(sg) & ~PMD_MASK) { + sif_log(sdev, SIF_INFO, + "**** Orabug: 21690736 - aligned PA maps to unaligned IOVA: i = %d, pa %llx dma %pad", + i, + (u64)sg_phys(sg), &sg_dma_address(sg)); + ret = -EIO; + goto map_failed; + } + sif_log(sdev, SIF_MEM_V, "i = %d, pa %llx dma %pad", i, + (u64)sg_phys(sg), &sg_dma_address(sg)); + } + } + + /* To enable direct lookup, we rely on the s/g list not being + * collapsed by dma mapping. This holds on x86 but eg. on sparc we see + * collapsed lists where the IOMMU delivers the whole DMA range in a single entry + * at the start. 
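/*
 * A small sketch of the alignment test applied below for 2M-page mappings:
 * a DMA address can only back a huge-page leaf if it is PMD (2M) aligned.
 */
static inline bool example_iova_is_pmd_aligned(dma_addr_t iova)
{
	return (iova & ~PMD_MASK) == 0;
}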
Handle this case too by rewriting the DMA list + * to comply with our needs, otherwise fail (and dump the sg list to the trace buffer + * for analysis): + */ + if (sg_size != ret) { + if (ret == 1) { + sif_log(sdev, SIF_MEM, "Fixing up collapsed sg list (%d/%d)", + ret, sg_size); + ret = sif_mem_fixup_dma(sg_start); + if (ret) + goto map_failed; + sif_logs(SIF_PT_VV, sif_dump_sg(sg_start)); + } else { + /* This should not happen, but sanity check it anyway */ + sif_log(sdev, SIF_INFO, + "** Detected unhandled layout of s/g list (%d/%d) **", + ret, sg_size); + ret = -EPROTOTYPE; + goto map_failed; + } + } + i = kmem->sg_max; + kmem->sg_max += ret; + kmem->size += sz; + return i; +map_failed: + sif_dump_sg(sg_start); + if (sg_size) + sif_kmem_free_pages(kmem, sg_start, sg_size); +failed: + return ret; +} + + +/* Map a part of the @kmem object given by @offset, @size to the user space + * vm context given in @vma. The part must be page aligned and page sized: + */ + +static int sif_kmem_vma_map_part(struct sif_dev *sdev, struct sif_kmem *kmem, struct vm_area_struct *vma, + off_t start_off, size_t size) +{ + off_t sg_index = start_off >> kmem->page_shift; + u64 page_size = 1 << kmem->page_shift; + u64 page_mask = (page_size - 1); + off_t off = start_off & page_mask; /* start offset within mem page */ + off_t sz = min_t(off_t, size, page_size - off); + struct scatterlist *sg; + dma_addr_t pfn, sg_phy; + u64 start = vma->vm_start; + u64 rem = size; + int ret; + + BUG_ON(off & ~PAGE_MASK); + + sg = sif_kmem_find_sg_idx(kmem, sg_index); + + sif_log(sdev, SIF_MMAP, "size %lx, off %lx start sg idx: %ld", + size, off, sg_index); + + for (; rem > 0; sg = sg_next(sg)) { + sg_phy = sg_phys(sg); + pfn = (sg_phy + off) >> PAGE_SHIFT; + sif_log(sdev, SIF_MMAP, "pfn %pad, sz %lx sg_phys %pad off %lx", + &pfn, sz, &sg_phy, off); + ret = remap_pfn_range(vma, start, pfn, sz, vma->vm_page_prot); + if (ret) + return ret; + rem -= sz; + start += sz; + sz = min(rem, page_size); + off = 0; + } + return 0; +} + + +static int sif_vma_map_sg_part(struct sif_dev *sdev, struct scatterlist *sg, + struct vm_area_struct *vma, off_t start_off, size_t size) +{ + u64 start = vma->vm_start; + off_t off = start_off; + dma_addr_t pfn, sg_phy; + off_t rem = size; + off_t sz; + int ret; + + BUG_ON(off & ~PAGE_MASK); + + sif_log(sdev, SIF_MMAP, "size %lx, off %lx", + size, start_off); + + while (off > sg->length) { + off -= sg->length; + sg = sg_next(sg); + } + sz = min_t(off_t, rem, sg->length - off); + + for (;;) { + sg_phy = sg_phys(sg); + pfn = (sg_phy + off) >> PAGE_SHIFT; + sif_log(sdev, SIF_MMAP, "pfn %pad, sz %lx sg_phys %pad off %lx", + &pfn, sz, &sg_phy, off); + ret = remap_pfn_range(vma, start, pfn, sz, vma->vm_page_prot); + if (ret) + return ret; + rem -= sz; + start += sz; + off = 0; + if (rem <= 0) + break; + sg = sg_next(sg); + sz = min_t(off_t, rem, sg->length); + } + return 0; +} + + +/* Remove a set of sg entries from the list starting at page index sg_idx + * and unlink from the linked list. + * + * We have to make sure we maintain consistency for index lookups, + * so no scatterlist vectors can be deleted from the middle of the list, + * only head and tail removal is allowed, + * and if we remove scatterlists from the head of the list, we must update the offset. + */ + +int sif_kmem_shrink(struct sif_dev *sdev, struct sif_kmem *kmem, int sg_idx, size_t size) +{ + /* TBD: Implement this! 
*/ + return -EOPNOTSUPP; +} + + +/************************************ + * API for managing different higher level (scatter) memory segment abstractions + * used by SIF: + */ + +/* Set up a sif_mem structure for handling a memory + * segment of initial size @size. + */ +struct sif_mem *sif_mem_create(struct sif_dev *sdev, size_t sg_size, + size_t size, enum sif_mem_type mem_type, + gfp_t flag, enum dma_data_direction dir) +{ + int ret; + u32 page_shift = mem_type_to_page_shift(sdev, mem_type); + struct sif_mem *mem = kzalloc(sizeof(*mem), flag); + + if (!mem) + return NULL; + + BUG_ON(mem_type != SIFMT_2M && mem_type != SIFMT_4K); + + + ret = sif_kmem_init(sdev, &mem->m.km, sg_size, + size, page_shift, flag, dir); + if (ret) + goto failed; + + mem->sdev = sdev; + mem->size = size; + mem->mem_type = mem_type; + mem->max_page_shift = 0; + return mem; +failed: + kfree(mem); + return NULL; +} + +/* Create a sif_mem object from an umem object (User level memory) + * The sif_mem object resumes ownership of the umem: + */ +struct sif_mem *sif_mem_create_umem(struct sif_dev *sdev, + struct ib_umem *umem, + enum sif_mem_type mem_type, + gfp_t flag, enum dma_data_direction dir) +{ + struct sif_mem *mem; + u64 dma_addr; + + if (mem_type != SIFMT_BYPASS && !umem) { + sif_log(sdev, SIF_INFO, "Invalid umem setup"); + return NULL; + } + mem = kzalloc(sizeof(*mem), flag); + if (!mem) + return NULL; + + BUG_ON(!umem); + BUG_ON(mem_type != SIFMT_UMEM && + mem_type != SIFMT_UMEM_RO && + mem_type != SIFMT_BYPASS); + + mem->sdev = sdev; + mem->m.u.umem = umem; + mem->size = umem->length; + mem->mem_type = mem_type; + + /* See commit eeb8461e - sg chain safe impl of umem in 3.15 */ + mem->m.u.sg = umem->sg_head.sgl; + mem->m.u.start_offset = umem->address & ~PAGE_MASK; + mem->vmap_base = (void *)umem->address; + mem->max_page_shift = sif_mem_max_page_shift(mem); + dma_addr = sg_dma_address(mem->m.u.sg); + sif_log(sdev, SIF_MEM, "vaddr %p, sg dma start 0x%llx, umem start_offset %llx", + mem->vmap_base, dma_addr, mem->m.u.start_offset); + if (umem->nmap < umem->npages) { + int ret; + + sif_log(sdev, SIF_MEM, "Fixing up collapsed sg list (%d/%d)", + umem->nmap, umem->npages); + sif_logs(SIF_MEM, sif_dump_sg(mem->m.u.sg)); + ret = sif_mem_fixup_dma(mem->m.u.sg); + if (ret) { + sif_log(sdev, SIF_INFO, "sg list fixup failed"); + sif_dump_sg(mem->m.u.sg); + kfree(mem); + return NULL; + } + } + sif_logs(SIF_PT_VV, sif_dump_sg(mem->m.u.sg)); + return mem; +} + +/* Create a sif_mem object from a phys array of length @num_phys + * The phys array is owned by caller: + */ +struct sif_mem *sif_mem_create_phys(struct sif_dev *sdev, void *kvaddr, + struct ib_phys_buf *phys_buf, int num_phys, + gfp_t flag) +{ + int i; + u64 size = 0; + struct sif_mem *mem = kzalloc(sizeof(*mem), flag); + + if (!mem) + return NULL; + + mem->sdev = sdev; + mem->m.phys.phys_buf = phys_buf; + mem->m.phys.phys_buf_len = num_phys; + for (i = 0; i < num_phys; i++) { + sif_log(sdev, SIF_MMU_V, "phys_buf addr 0x%llx size 0x%llx", + phys_buf[i].addr, phys_buf[i].size); + size += phys_buf[i].size; + } + /* TBD: We could calculate this above but phys_mr is scheduled to be removed */ + mem->max_page_shift = 0; + mem->vmap_base = kvaddr; + mem->size = size; + mem->mem_type = SIFMT_PHYS; + return mem; +} + +struct sif_mem *sif_mem_create_fmr(struct sif_dev *sdev, size_t max_pages, u32 page_shift, + gfp_t flag) +{ + size_t size = max_pages << page_shift; + struct sif_mem *mem = sif_mem_create_ref(sdev, SIFMT_PTONLY, 0, size, flag); + + if (mem) + 
mem->m.fmr.page_shift = page_shift; + sif_log(sdev, SIF_FMR, "page_shift %d, size 0x%lx", page_shift, size); + return mem; +} + +/* Create a sif_mem object from a memory pointer array of length @num_pages + * The memory pointer array is owned by caller: + */ +int sif_mem_map_fmr(struct sif_mem *mem, u64 iova, + u64 *page_list, int num_pages) +{ + u64 actual_size = num_pages << mem->m.fmr.page_shift; + + if (iova & (mem->m.fmr.page_shift - 1)) { + sif_log(mem->sdev, SIF_INFO, "Misaligned FMR start - iova 0x%llx", iova); + return -EINVAL; + } + if (actual_size > mem->size) { + /* This is really now an artificial limit for us, except for performance */ + sif_log(mem->sdev, SIF_INFO, "Attempt to map 0x%llx bytes, max for this FMR is 0x%llx", + actual_size, mem->size); + return -ENOMEM; + } + mem->vmap_base = (void *)iova; + mem->m.fmr.page_list = page_list; + mem->m.fmr.page_list_len = num_pages; + mem->mem_type = SIFMT_FMR; + + /* We save the max mem size to be able to restore it later */ + mem->m.fmr.max_size = mem->size; + mem->size = actual_size; + mem->max_page_shift = sif_mem_fmr_max_page_shift(mem); + return 0; +} + +void sif_mem_unmap_fmr(struct sif_mem *mem) +{ + mem->vmap_base = NULL; + mem->size = mem->m.fmr.max_size; + mem->m.fmr.page_list = NULL; + mem->m.fmr.page_list_len = 0; + mem->mem_type = SIFMT_PTONLY; +} + +/* Create a sif_mem object mapped dma contiguous, suitable for + * BYPASS mapping (size constraints..) + */ +struct sif_mem *sif_mem_create_dmacont(struct sif_dev *sdev, size_t size, + gfp_t flag, enum dma_data_direction dir) +{ + struct sif_mem *mem = kzalloc(sizeof(*mem), flag); + dma_addr_t dma_handle; + struct scatterlist *sg; + + if (!mem) + return NULL; + + /* The __GFP_DMA32 bit is not supported by page_alloc in all kernels */ + if (unlikely(flag & __GFP_DMA32)) { + u64 dma_addr; + + mem->vmap_base = ib_dma_alloc_coherent(&sdev->ib_dev, size, + &dma_addr, flag); + dma_handle = dma_addr; + mem->m.u.flags = SMF_DMA32; + } else + mem->vmap_base = sif_dma_alloc_aligned(&sdev->ib_dev, size, &dma_handle, + flag, dir); + if (!mem->vmap_base) + goto dma_alloc_failed; + mem->sdev = sdev; + mem->mem_type = SIFMT_BYPASS; + mem->max_page_shift = sdev->mi.max_shift; + mem->size = size; + mem->m.u.dir = dir; + mem->m.u.umem = NULL; + sg = mem->m.u.sg = &mem->m.u.sg0; + sg_init_one(sg, mem->vmap_base, mem->size); + sg->dma_address = dma_handle; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = mem->size; +#endif + return mem; +dma_alloc_failed: + kfree(mem); + return NULL; +} + + +/* Create a sif_mem object with no own memory backing - to use for CB, SQ_CMPL and + * kernel full passthrough cases to have a "shallow" mem object: + */ +struct sif_mem *sif_mem_create_ref(struct sif_dev *sdev, enum sif_mem_type mem_type, + u64 sif_vaddr, size_t size, gfp_t flag) +{ + struct sif_mem *mem = kzalloc(sizeof(*mem), flag); + + if (!mem) + return NULL; + + BUG_ON(mem_type != SIFMT_PTONLY && mem_type != SIFMT_NOMEM && mem_type != SIFMT_CS); + + mem->sdev = sdev; + mem->mem_type = mem_type; + mem->vmap_base = (void *)sif_vaddr; + mem->size = size; + mem->max_page_shift = 0; + return mem; +} + + +/* Free a sif_mem previously created with sif_mem_create */ +int sif_mem_free(struct sif_mem *mem) +{ + switch (mem->mem_type) { + case SIFMT_2M: + case SIFMT_4K: + sif_kmem_free(mem->sdev, &mem->m.km); + break; + case SIFMT_BYPASS: + /* BYPASS mode can be used from kernel or user space + * If umem is set, it is a user space mapping: + */ + if (!mem->m.u.umem) { + if (mem->m.u.flags & SMF_DMA32) + 
ib_dma_free_coherent(&mem->sdev->ib_dev, mem->size, + mem->vmap_base, sif_mem_dma(mem, 0)); + else + sif_dma_free_aligned(&mem->sdev->ib_dev, mem->size, + mem->vmap_base, sif_mem_dma(mem, 0), mem->m.u.dir); + } + /* Deliberate fall-through */ + case SIFMT_UMEM: + case SIFMT_UMEM_RO: + if (mem->m.u.umem) + ib_umem_release(mem->m.u.umem); + break; + default: + break; /* Nothing extra to do */ + } + kfree(mem); + return 0; +} + + +/* Allocate some (more) memory for this sif_mem + * Return a pointer to the start of that memory and increase ref.cnt for the sif_mem + */ +int sif_mem_extend(struct sif_mem *mem, size_t size, gfp_t flag) +{ + int sg_idx; + + if (mem->mem_type != SIFMT_2M && mem->mem_type != SIFMT_4K) + return -EINVAL; + + sg_idx = sif_kmem_extend(mem->sdev, &mem->m.km, size, flag); + mem->size = mem->m.km.size; + return sg_idx; +} + +/* Free a subrange of this memory object starting at @sg and dereference the + * sif_mem object. Assumes there is no other references to this subrange: + */ +int sif_mem_shrink(struct sif_mem *mem, int sg_idx, size_t size) +{ + int ret; + + if (mem->mem_type != SIFMT_2M && mem->mem_type != SIFMT_4K) + return -EINVAL; + + ret = sif_kmem_shrink(mem->sdev, &mem->m.km, sg_idx, size); + mem->size = mem->m.km.size; + return ret; +} + + +bool sif_mem_has_umem(struct sif_mem *mem) +{ + switch (mem->mem_type) { + case SIFMT_UMEM: + case SIFMT_UMEM_RO: + case SIFMT_BYPASS: + return mem->m.u.umem != NULL; + default: + break; + } + return false; +} + + +/* Find kernel virtual address at @offset within map */ +void *sif_mem_kaddr(struct sif_mem *mem, off_t offset) +{ + switch (mem->mem_type) { + case SIFMT_2M: + case SIFMT_4K: + { + off_t off = offset & ((1 << mem->m.km.page_shift) - 1); + u32 i = offset >> mem->m.km.page_shift; + struct scatterlist *sg = sif_kmem_find_sg_idx(&mem->m.km, i); + + return sg_virt(sg) + off; + } + case SIFMT_BYPASS: + case SIFMT_UMEM: + case SIFMT_UMEM_RO: + case SIFMT_NOMEM: + case SIFMT_PHYS: + case SIFMT_FMR: + return mem->vmap_base + offset; + default: + break; + } + + sif_log(mem->sdev, SIF_INFO, "Not implemented for type %d", + mem->mem_type); + return NULL; +} + +/* Find DMA address at @offset within map */ +dma_addr_t sif_mem_dma(struct sif_mem *mem, off_t offset) +{ + switch (mem->mem_type) { + case SIFMT_PTONLY: + return offset; + case SIFMT_2M: + case SIFMT_4K: + { + off_t off = offset & ((1 << mem->m.km.page_shift) - 1); + u32 i = offset >> mem->m.km.page_shift; + struct scatterlist *sg = sif_kmem_find_sg_idx(&mem->m.km, i); + + return sg_dma_address(sg) + off; + } + case SIFMT_BYPASS: + return sg_dma_address(mem->m.u.sg) + offset; + case SIFMT_UMEM: + case SIFMT_UMEM_RO: + { + struct scatterlist *sg = mem->m.u.sg; + /* umem objects have page aligned sg lists but may start at an offset */ + offset += mem->m.u.start_offset; + while (sg && offset >= sg->length) { + offset -= sg->length; + sg = sg_next(sg); + } + return sg_dma_address(sg) + offset; + } + case SIFMT_PHYS: + { + struct ib_phys_buf *pb = mem->m.phys.phys_buf; + + while (offset >= pb->size) { + offset -= pb->size; + pb++; + } + return pb->addr + offset; + } + case SIFMT_FMR: + { + u32 pageno = offset >> mem->m.fmr.page_shift; + off_t off = offset & ((1 << mem->m.fmr.page_shift) - 1); + + return mem->m.fmr.page_list[pageno] + off; + } + default: + break; + } + + sif_log(mem->sdev, SIF_INFO, "Not implemented for type %d", + mem->mem_type); + BUG(); + return 0ull; +} + + +struct scatterlist *sif_mem_get_sgl(struct sif_mem *mem) +{ + switch (mem->mem_type) { + case 
SIFMT_UMEM: + case SIFMT_UMEM_RO: + case SIFMT_BYPASS: + return mem->m.u.sg; + case SIFMT_2M: + case SIFMT_4K: + return mem->m.km.sg; + default: + sif_log(mem->sdev, SIF_INFO, "unsupported memory type %d", mem->mem_type); + break; + } + return NULL; +} + + +/* If map is continuous, get start of dma mapping + * otherwise return an error pointer: + */ +dma_addr_t sif_mem_dma_if_cont(struct sif_mem *mem) +{ + struct scatterlist *sg; + size_t sz = 1 << sif_mem_max_page_shift(mem); + + if (sz < mem->size) { + sif_log(mem->sdev, SIF_INFO, + "size: %lld - max possible page sz %ld: mmu bypass not possible", + mem->size, sz); + return (u64)ERR_PTR(-EPERM); + } + sg = sif_mem_get_sgl(mem); + if (unlikely(!sg)) + return (u64)ERR_PTR(-EINVAL); + return sg_dma_address(sg); +} + + +int sif_mem_vma_map_part(struct sif_mem *mem, struct vm_area_struct *vma, + off_t start_off, size_t size) +{ + switch (mem->mem_type) { + case SIFMT_2M: + case SIFMT_4K: + return sif_kmem_vma_map_part(mem->sdev, &mem->m.km, vma, start_off, size); + case SIFMT_BYPASS: + case SIFMT_BYPASS_RO: + return sif_vma_map_sg_part(mem->sdev, mem->m.u.sg, vma, start_off, size); + default: + sif_log(mem->sdev, SIF_INFO, "not implemented for mem.type %d", mem->mem_type); + return -EOPNOTSUPP; + } +} + + +/* Map the memory referenced by @mem to the user space vma */ +int sif_mem_vma_map(struct sif_mem *mem, struct vm_area_struct *vma) +{ + return sif_mem_vma_map_part(mem, vma, 0, mem->size); +} + +/* sif_mem iterator support (mainly for the types that do not expose a scatterlist) */ + +int sif_mem_iter_init(struct sif_mem *mem, struct sif_mem_iter *it) +{ + it->mem = mem; + switch (mem->mem_type) { + case SIFMT_PHYS: + case SIFMT_FMR: + case SIFMT_PTONLY: + it->phys.i = 0; + break; + default: + it->sg = sif_mem_get_sgl(mem); + if (!it->sg) + return -EINVAL; + } + it->offset = 0; + return 0; +} + + +int sif_mem_iter_advance(struct sif_mem_iter *it, u64 incr) +{ + switch (it->mem->mem_type) { + case SIFMT_PHYS: + { + long left = it->mem->m.phys.phys_buf[it->phys.i].size - it->offset; + + if (left > incr) + it->offset += incr; + else { + it->offset = incr - left; + it->phys.i++; + } + if (it->phys.i >= it->mem->m.phys.phys_buf_len) + return -ENOMEM; + return 0; + } + case SIFMT_FMR: + { + long page_size = 1 << it->mem->m.fmr.page_shift; + long left = page_size - it->offset; + + if (left > incr) + it->offset += incr; + else { + it->offset = incr - left; + it->phys.i++; + } + if (it->phys.i >= it->mem->m.fmr.page_list_len) + return -ENOMEM; + return 0; + } + case SIFMT_PTONLY: + it->offset += incr; + if (it->offset >= it->mem->size) + return -ENOMEM; + return 0; + default: + it->offset += incr; + while (it->offset >= it->sg->length) { + it->offset = it->offset - it->sg->length; + it->sg = sg_next(it->sg); + } + if (it->sg) + return 0; + else + return -ENOMEM; + } +} + +dma_addr_t sif_mem_iter_dma(struct sif_mem_iter *it) +{ + switch (it->mem->mem_type) { + case SIFMT_PHYS: + return it->mem->m.phys.phys_buf[it->phys.i].addr + it->offset; + case SIFMT_FMR: + return it->mem->m.fmr.page_list[it->phys.i] + it->offset; + case SIFMT_PTONLY: + return 0; /* For future fmr use: populate with empty ptes to be filled later */ + default: + return sg_dma_address(it->sg) + it->offset; + } +} + + +/* DMA is mapped continuously and the map is reflected in a "collapsed" sg list for DMA, + * The rest of the list is still valid for the pa/va part - we need to loop through and + * make it consistent for our usage: + */ +static int sif_mem_fixup_dma(struct scatterlist 
*sg) +{ + struct scatterlist *from_sg = sg; + struct scatterlist *last_sg = sg; + dma_addr_t dma_addr = sg_dma_address(from_sg); + size_t dma_size = sg_dma_len(sg); + size_t sg_len = sg->length; /* Save the "homogeneous" length */ + + while (sg) { + if (dma_size < sg->length) + return -EINVAL; /* should not happen */ + + if (sg->dma_address && sg->dma_address != (dma_addr_t)-1) { + /* This entry is part of the collapsed list + * must keep address and dma_length until we have "consumed" it, + * Since all lengths are homogeneous in the resulting list we + * can temporarily "misuse" the length field in this entry to + * store the new dma_address, and just leave the dma_length + * for later consumption: + */ + sg->length = sg->dma_address; + } else + sg->dma_length = sg_len; + + sg->dma_address = dma_addr; + dma_addr += sg_len; + dma_size -= sg_len; + last_sg = sg; + sg = sg_next(sg); + + if (!dma_size) { + /* Clean up our "temporary store" (see below comment) */ + from_sg->length = from_sg->dma_length = sg_len; + from_sg = sg_next(from_sg); + dma_addr = from_sg->length; /* from temp store */ + dma_size = sg_dma_len(from_sg); + } + } + return 0; +} + +/* A utility for dumping an sg list to the trace buffer */ +void sif_dump_sg(struct scatterlist *sgl) +{ + struct scatterlist *sg = sgl; + int cnt = 0; + + trace_printk(" **** sg dump - start at %p ****\n", sg); + trace_printk("%16s: %16s %8s %16s %16s %8s %8s %4s\n", + "sg", "dma", "dmalen", "pa", "kva", "length", "offset", "end mark"); + while (sg) { + u64 dma_addr = sg_dma_address(sg); + u64 pa = sg_phys(sg); + + trace_printk("%p: %#16llx %#8x %#16llx %p %#8x %#8x %4s\n", + sg, dma_addr, sg_dma_len(sg), pa, + sg_virt(sg), sg->length, sg->offset, + (sg_is_last(sg) ? "[last]" : "")); + sg = sg_next(sg); + cnt++; + } + trace_printk(" **** tot.%d elements ****\n", cnt); +} diff --git a/drivers/infiniband/hw/sif/sif_mem.h b/drivers/infiniband/hw/sif/sif_mem.h new file mode 100644 index 0000000000000..1b91a8fd72854 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_mem.h @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_mem.h: A common interface for all memory used by + * SIF for queue, table and page table management + */ + +#ifndef _SIF_MEM_H +#define _SIF_MEM_H +#include +#include "sif_user.h" + +/* We need to support 4 interfaces to memory; abbreviated umem, fmr, + * phys and kmem below, to be compatible with the different ways we are called. + * This is due to be cleaned up in the core IB stack, + * by allowing the use of scatterlists for all types of s/g memory + * provided to rdma devices. + */ + +/* Allocation of table and queue memory: + * The Linux buddy allocator should guarantee us lots of up to 4M I/O contiguous + * memory segments through alloc_pages provided the system has enough memory. + * Assume that we get at least 4M standalone and any number of (aligned) 2M entries after that + * + * This way we allocate contiguous memory and use bypass/passthrough mapping if + * alloc_sz <= 4M, and revert to GVA2GPA if needs are larger, but allocate in 2M blocks + * and use PSIF 2M pages for this. 
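+ *
+ * Rough illustration only (alloc_sz is a hypothetical caller variable;
+ * the SIFMT_* values are the memory types defined below):
+ *
+ *	if (alloc_sz <= SZ_4M)
+ *		mem_type = SIFMT_BYPASS;	(contiguous block, passthrough)
+ *	else
+ *		mem_type = SIFMT_2M;		(2M blocks, GVA2GPA page table)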
+ */ + +struct ib_umem; +struct sif_dev; + +/* Per device memory configuration info + * embedded in sif_dev: + */ +struct sif_mem_info { + u8 page_shift; /* number of bits within the smallest SIF level 0 page (depends on config) */ + u8 level_shift; /* number of bits to shift to the next level in the page table */ + u8 max_shift; /* Highest number of bits within the highest level page */ + u32 ptes_per_page; /* Page table entries per page table page */ + u64 page_size; /* size of a SIF level 0 page (as configured) */ + u64 page_mask; /* All bits beyond page_shift set */ +}; + +/* Valid for SIFMT_2M, SIFMT_4K and SIFMT_BYPASS_RO: + * Represented as a pool of equally sized pages. + * Allows direct page offset lookup from the kernel side. + * All pages are the same size. + * To maintain offset indexes, interior pages cannot be removed. + * sg_start will be > 0 if there are empty entries at the start, allowing + * indexes to remain valid if entries are deleted from the head + */ +struct sif_kmem { + u64 size; /* Size of the mapped memory of this kmem */ + u32 page_shift; /* Represents page size of each scatter element */ + u32 sg_size; /* Allocated number of (usable!) elements in (each) scatter list */ + u32 sg_start; /* Current start offset into the sg list */ + u32 sg_max; /* Last entry in use + 1 (<= sg_size * nlists) */ + u32 nlists; /* Number of (sg_size+1'd) sg lists linked through sg */ + enum dma_data_direction dir; /* DMA direction used for dma mapping */ + struct scatterlist *sg; /* Pointer to start of scatterlist array */ + struct scatterlist *last_sg; /* The start of the last list array in the sg list linkage */ +}; + +/* Valid for SIFMT_FMR (when called from ib_map_phys_fmr) */ +struct sif_mem_fmr { + u64 *page_list; /* Array of dma addresses of buffers */ + u32 page_list_len; /* length of page_list array */ + u32 page_shift; /* Represents page size of each scatter element */ + u64 max_size; /* Saved maximal size of the FMR as supplied during creation */ +}; + +/* Valid for SIFMT_PHYS (when called from ib_reg_phys_mr) + * It is called "phys" but should have been called "dma" as it is used + * with dma addresses in at least 1 of the 2 use cases in the kernel... + * not important to support this API, but keep for completeness: + */ +struct sif_mem_phys { + struct ib_phys_buf *phys_buf; /* Array of dma address/size pairs of buffers */ + u64 phys_buf_len; /* length of phys_buf array */ +}; + +/* Flag values so far only used by 'flags' in sif_mem_umem: */ +enum sif_mem_flags { + SMF_DMA32 = 0x1 /* Set if this memory is allocated from the DMA32 space */ +}; + +/* Memory types mapped from user space: + * Valid for SIFMT_UMEM, SIFMT_UMEM_RO, SIFMT_BYPASS: + */ +struct sif_mem_umem { + struct ib_umem *umem; /* User memory, NULL if this is a kernel bypass mapping */ + struct scatterlist *sg; /* A pointer to a valid scatterlist (user and kernel) */ + u64 start_offset; /* Stored misalignment according to the scatter element size */ + enum dma_data_direction dir; /* DMA direction used for dma mapping */ + u32 flags; + struct scatterlist sg0; /* Inline storage for bypass mode */ +}; + + +/* The generic sif s/g memory representation + * + */ +struct sif_mem { + struct sif_dev *sdev; + enum sif_mem_type mem_type; /* Logical type of mapping */ + u16 max_page_shift; /* 0: unknown, >= 0: Largest page size that can be mapped cont. 
*/ + u64 size; /* Size of mapping */ + void *vmap_base; /* Kernel address of the start of a vmap cont.mapping, if any */ + union { + struct sif_mem_umem u; /* SIFMT_{UMEM*,BYPASS} */ + struct sif_kmem km; /* SIFMT_{2M,CS,4K} */ + struct sif_mem_fmr fmr; /* SIFMT_FMR */ + struct sif_mem_phys phys; /* SIFMT_PHYS */ + } m; +}; + + +/* Initialization of global per device info - called from sif_hwi.c */ +void sif_mem_init(struct sif_dev *sdev); + +/* API for managing a sif_kmem object */ + +/* Allocate a memory object of size @size and populate an sg list + * with it: + */ +int sif_kmem_init(struct sif_dev *sdev, struct sif_kmem *kmem, size_t sg_size, size_t size, + u32 page_shift, gfp_t flag, enum dma_data_direction dir); + +/* sg unmap and free the memory referenced by mem */ +void sif_kmem_free(struct sif_dev *sdev, struct sif_kmem *mem); + +/* Extend the kmem object with a total size of @size - return sg_index of the first + * allocated element: + */ +int sif_kmem_extend(struct sif_dev *sdev, struct sif_kmem *kmem, + size_t size, gfp_t flag); +int sif_kmem_shrink(struct sif_dev *sdev, struct sif_kmem *mem, int sg_idx, size_t size); + +/* Find the scatterlist element with index idx within kmem */ +struct scatterlist *sif_kmem_find_sg_idx(struct sif_kmem *kmem, u32 idx); + +/************************************ + * API for managing different higher level (scatter) memory segment abstractions + * used by SIF: + */ + +/* Set up a sif_mem structure for handling a memory + * segment of initial size @size. + */ +struct sif_mem *sif_mem_create(struct sif_dev *sdev, size_t sg_size, size_t size, + enum sif_mem_type mem_type, + gfp_t flag, + enum dma_data_direction dir); + +/* Create a sif_mem object from an umem object (User level memory) + * The sif_mem object resumes ownership of the umem: + */ +struct sif_mem *sif_mem_create_umem(struct sif_dev *sdev, + struct ib_umem *umem, + enum sif_mem_type mem_type, + gfp_t flag, enum dma_data_direction dir); + +/* Create a sif_mem object from a phys array of length @num_phys + * The phys array is owned by caller: + */ +struct sif_mem *sif_mem_create_phys(struct sif_dev *sdev, void *iova_start, + struct ib_phys_buf *phys, int num_phys, + gfp_t flag); + +/* Create a sif_mem object from a memory pointer array of length @num_pages + * The memory pointer array is owned by caller: + */ +struct sif_mem *sif_mem_create_fmr(struct sif_dev *sdev, size_t size, u32 page_shift, + gfp_t flag); + +/* Create a sif_mem object with no own memory backing - to use for CB, SQ_CMPL and + * kernel full passthrough cases to have a "shallow" mem object: + */ +struct sif_mem *sif_mem_create_ref(struct sif_dev *sdev, enum sif_mem_type mem_type, + u64 sif_vaddr, size_t size, gfp_t flag); + +/* Create an aligned sif_mem object mapped coherent dma contiguous, suitable for + * BYPASS mapping (size constraints..) + */ +struct sif_mem *sif_mem_create_dmacont(struct sif_dev *sdev, size_t size, gfp_t flag, + enum dma_data_direction dir); + +/* Free a sif_mem previously created with sif_mem_create */ +int sif_mem_free(struct sif_mem *mem); + +/* Map a previously created sif_mem ref object from a memory pointer array of length @num_pages + * The memory pointer array is owned by caller: + * Returns -ENOMEM if the sif_mem ref object does not have a sufficiently large size. 
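+ *
+ * Hedged usage sketch (error handling omitted; iova, page_list and
+ * num_pages are caller-supplied placeholders):
+ *
+ *	mem = sif_mem_create_fmr(sdev, max_size, page_shift, GFP_KERNEL);
+ *	ret = sif_mem_map_fmr(mem, iova, page_list, num_pages);
+ *	... DMA through the FMR mapping ...
+ *	sif_mem_unmap_fmr(mem);
+ *	sif_mem_free(mem);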
+ */ +int sif_mem_map_fmr(struct sif_mem *mem, u64 iova, + u64 *page_list, int num_pages); + +/* Unmap and reset a mem object previously set up with sif_mem_map_fmr */ +void sif_mem_unmap_fmr(struct sif_mem *mem); + +/* Allocate some (more) memory for this sif_mem + * Return an s/g index (page offset to the start of that memory + * or -errval if an error. + */ +int sif_mem_extend(struct sif_mem *mem, size_t size, gfp_t flag); + +/* Free a subrange of this memory object starting at @sg_idx and dereference the + * sif_mem object. Assumes there is no other references to this subrange, and that + * this subrange corresponds exactly to a prior allocation with either create or extend above + * returns 0 upon success or a negative errno if failure: + */ +int sif_mem_shrink(struct sif_mem *mem, int sg_idx, size_t size); + +/* Returns true if this memory is represented internally by an umem object */ +bool sif_mem_has_umem(struct sif_mem *mem); + +/* Return the largest page size (represented by page shift bits) usable for this memory */ +u32 sif_mem_page_shift(struct sif_mem *mem); + +/* Find kernel virtual address at @offset within map */ +void *sif_mem_kaddr(struct sif_mem *mem, off_t offset); + +/* Find dma address at @offset within map */ +dma_addr_t sif_mem_dma(struct sif_mem *mem, off_t offset); + +/* If map is continuous, get start of dma mapping + * otherwise return an error pointer: + */ +dma_addr_t sif_mem_dma_if_cont(struct sif_mem *mem); + +/* Return the start of the s/g list for this mem object */ +struct scatterlist *sif_mem_get_sgl(struct sif_mem *mem); + +/* Map a part of the @mem object given by @offset, @size to the user space + * vm context given in @vma. The part must be page aligned and page sized: + */ + +int sif_mem_vma_map_part(struct sif_mem *mem, struct vm_area_struct *vma, + off_t start_off, size_t size); + +/* Map the memory referenced by @mem to the user space vma */ +int sif_mem_vma_map(struct sif_mem *mem, struct vm_area_struct *vma); + + +/* sif_mem iterator (mainly for the types that do not expose a scatterlist) */ +struct sif_mem_iter { + struct sif_mem *mem; + union { + struct { + int i; /* Index used by SIFMT_PHYS and SIFMT_FMR */ + } phys; + struct scatterlist *sg; /* Used by scatterlist based types */ + }; + size_t offset; /* Current offset within element */ +}; + +int sif_mem_iter_init(struct sif_mem *mem, struct sif_mem_iter *it); +int sif_mem_iter_advance(struct sif_mem_iter *it, u64 incr); +dma_addr_t sif_mem_iter_dma(struct sif_mem_iter *mi); + +/* A utility for dumping an sg list to the trace buffer */ +void sif_dump_sg(struct scatterlist *sgl); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_mmu.c b/drivers/infiniband/hw/sif/sif_mmu.c new file mode 100644 index 0000000000000..ba4f1a0ba88f1 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_mmu.c @@ -0,0 +1,751 @@ +/* + * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_mmu.c: main entry points and initialization + */ + +#include "sif_mmu.h" +#include "sif_dev.h" +#include "sif_base.h" +#include "sif_dma.h" +#include "sif_hwi.h" +#include "sif_mem.h" +#include "sif_spt.h" +#include "sif_xmmu.h" +#include "sif_pt.h" +#include "sif_mr.h" +#include "sif_query.h" + +#include +#include +#include +#include +#include +#include +#include "psif_hw_setget.h" +#include "sif_defs.h" + +static int sif_map_gva_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + bool write); + +static int sif_map_bypass_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + bool write); + +static int sif_map_cs_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + bool write); + +#ifndef __sparc__ +/* Special handling for PHYS memory types which don't have any sg list: */ +static int sif_map_special_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + bool write); +#endif + +static int sif_mmu_invalidate_tlb(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, enum wr_mode mode); + +void set_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + enum psif_table_level level, + u64 val) +{ + struct psif_mmu_cntx *hw_ctx = &ctx->mctx; + + val &= ~PSIF_TABLE_PTR_MASK; + hw_ctx->table_ptr = ((val) >> PT_PAGE_SHIFT); + hw_ctx->table_level = level; + sif_log(sdev, SIF_MMU, "%p ptr 0x%08llx level %d", hw_ctx, val, level); +} + + + +int sif_map_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + u64 virt_base, u64 size, bool write) +{ + /* hw_ctx entry assumed to be set up in pass through + * prior to the call (all null bytes) + */ + ctx->type = MMU_GVA2GPA_MODE; + ctx->base = virt_base; + ctx->size = size; + ctx->mt = mem->mem_type; + + switch (mem->mem_type) { + case SIFMT_BYPASS: + case SIFMT_BYPASS_RO: + case SIFMT_NOMEM: + return sif_map_bypass_ctx(sdev, ctx, mem, write); + case SIFMT_UMEM: + case SIFMT_UMEM_RO: + case SIFMT_2M: + case SIFMT_4K: + return sif_map_gva_ctx(sdev, ctx, mem, write); + case SIFMT_CS: + return sif_map_cs_ctx(sdev, ctx, write); + case SIFMT_ZERO: + return sif_zero_map_gva_ctx(sdev, ctx, mem, write); + case SIFMT_PTONLY: + return 0; /* Nothing to map yet */ +#ifndef __sparc__ + case SIFMT_PHYS: + return sif_map_special_ctx(sdev, ctx, mem, write); + case SIFMT_UMEM_SPT: + return sif_spt_map_gva_ctx(sdev, ctx, mem, write); +#endif + default: + sif_log(sdev, SIF_INFO, "Unimplemented mem_type %d %s", + mem->mem_type, sif_mem_type_str(mem->mem_type)); + return -EOPNOTSUPP; + } + return -EINVAL; +} + +void sif_unmap_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx) +{ + switch (ctx->mt) { + case SIFMT_BYPASS: + case SIFMT_BYPASS_RO: + case SIFMT_NOMEM: + break; + case SIFMT_UMEM: + case SIFMT_UMEM_RO: + case SIFMT_PHYS: + case SIFMT_FMR: + case SIFMT_2M: + case SIFMT_4K: + case SIFMT_CS: + case SIFMT_PTONLY: + sif_unmap_gva_ctx(sdev, ctx); + break; +#ifndef __sparc__ + case SIFMT_ZERO: + sif_zero_unmap_gva_ctx(sdev, ctx); + break; + case SIFMT_UMEM_SPT: + sif_spt_unmap_gva_ctx(sdev, ctx); + break; +#endif + default: + sif_log(sdev, SIF_INFO, "Unimplemented mem type %d, ctx at %p", ctx->mt, ctx); + BUG(); /* Should not happen - throwing the cards */ + } +} + +static size_t num_pages(u64 base, u64 size, u32 page_shift) +{ + size_t pg_sz = 1 << page_shift; + + return aligned_size(base, size, pg_sz) >> page_shift; +} + +/* May return -1 or a valid enum value for psif_page_size */ +static int 
hw_leaf_page_sz(struct sif_dev *sdev, u32 page_shift) +{ + /* Page size not supported by device configuration */ + if (sdev->mi.page_shift > page_shift) { + sif_log(sdev, SIF_INFO, + "Cannot support page shift %d - min.page shift supported in this configuration is %d", + page_shift, sdev->mi.page_shift); + return -1; + } + + switch (sdev->mi.page_shift) { + case 12: /* Device configured for Intel page sizes */ + if (page_shift < 21) + return PAGE_SIZE_IA32E_4KB; + if (page_shift < 30) + return PAGE_SIZE_IA32E_2MB; + return PAGE_SIZE_IA32E_1GB; + case 13: /* Device configured for Sparc page sizes */ + if (page_shift < 16) + return PAGE_SIZE_S64_8KB; + if (page_shift < 19) + return PAGE_SIZE_S64_64KB; + if (page_shift < 22) + return PAGE_SIZE_S64_512KB; + if (page_shift < 25) + return PAGE_SIZE_S64_4MB; + if (page_shift < 28) + return PAGE_SIZE_S64_32MB; + if (page_shift < 34) + return PAGE_SIZE_S64_2GB; + return PAGE_SIZE_S64_16GB; + } + sif_log(sdev, SIF_INFO, "Cannot support page shift %d", page_shift); + return -1; +} + + +static inline enum psif_table_level hw_leaf_level(enum psif_page_size pg_sz) +{ + switch (pg_sz) { + case PAGE_SIZE_IA32E_2MB: + case PAGE_SIZE_S64_4MB: + return PAGE_LEVEL1; + case PAGE_SIZE_IA32E_1GB: + case PAGE_SIZE_S64_2GB: + return PAGE_LEVEL2; + default: + return PAGE_LEVEL0; + } +} + + +static int sif_map_bypass_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + bool write) +{ + u64 addr = 0; + int ret = 0; + + ctx->type = MMU_PASS_THROUGH0; + + if (mem->mem_type == SIFMT_NOMEM) + ctx->mt = SIFMT_BYPASS; + if (write) + ctx->mctx.wr_access = 1; + + if (mem->m.u.umem) { + addr = sif_mem_dma_if_cont(mem); + if (IS_ERR((void *)addr)) + return PTR_ERR((void *)addr); + } else if (mem->mem_type != SIFMT_NOMEM) + addr = sif_mem_dma(mem, 0); + + if (mem->mem_type == SIFMT_BYPASS || mem->mem_type == SIFMT_BYPASS_RO) + ctx->uv2dma = addr - ctx->base; + ctx->base = addr; + return ret; +} + + +static int sif_map_gva_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + bool write) +{ + struct psif_mmu_cntx *hw_ctx = &ctx->mctx; + bool multipage; + u64 page_size; + u64 page_mask; + enum psif_table_level leaf_level; + u64 aligned_base; + u64 aligned_sz; + u32 page_shift = sif_mem_page_shift(mem); + u8 pt_leaf_level = 0; + u8 pt_pte_extent = 1; + u64 dma_addr; + + /* Adjust to a supported page shift */ + int ret = find_optimal_leaf_level(sdev, page_shift, + ctx->base, sif_mem_dma(mem, 0), ctx->size, + &pt_leaf_level, &pt_pte_extent); + if (ret) + return ret; + + page_shift = sdev->mi.page_shift + pt_leaf_level * sdev->mi.level_shift; + page_size = 1ULL << page_shift; + page_mask = ~(page_size - 1); + + hw_ctx->wr_access = write; + hw_ctx->translation_type = MMU_GVA2GPA_MODE; + hw_ctx->page_size = hw_leaf_page_sz(sdev, page_shift); + + aligned_base = ctx->base & page_mask; + aligned_sz = aligned_size(ctx->base, ctx->size, page_size); + multipage = sdev->single_pte_pt || aligned_sz > page_size; + leaf_level = hw_leaf_level(hw_ctx->page_size); + dma_addr = sif_mem_dma(mem, 0); + + sif_log(sdev, SIF_MMU_V, "base 0x%llx dma base 0x%llx size 0x%llx page shift %d size %s", + ctx->base, dma_addr, ctx->size, page_shift, + string_enum_psif_page_size(hw_ctx->page_size)); + + if (multipage) { + ctx->pt = sif_pt_create(sdev, sif_mem_get_sgl(mem), + ctx->base, ctx->size, page_shift, false, false); + if (!ctx->pt) + return -ENOMEM; + set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt)); + } else { + dma_addr_t 
aligned_dma_addr = dma_addr & ~((1 << page_shift) - 1); + + set_ctx(sdev, ctx, leaf_level, aligned_dma_addr); + } + return 0; +} + + +static int sif_map_cs_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + bool write) +{ + struct psif_mmu_cntx *hw_ctx = &ctx->mctx; + + hw_ctx->wr_access = write; + hw_ctx->translation_type = MMU_GVA2GPA_MODE; + hw_ctx->page_size = PAGE_SIZE_IA32E_4KB; + + /* Just create a page table with an empty top level page */ + ctx->pt = sif_pt_create_empty(sdev, ctx->base, SIFMT_CS); + if (!ctx->pt) + return -ENOMEM; + set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt)); + return 0; +} + +#ifndef __sparc__ +static int sif_map_special_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + bool write) +{ + struct psif_mmu_cntx *hw_ctx = &ctx->mctx; + bool multipage = aligned_size(ctx->base, ctx->size, PAGE_SIZE) > PAGE_SIZE; + + sif_log(sdev, SIF_MMU_V, "base 0x%llx size 0x%llx", ctx->base, ctx->size); + + hw_ctx->page_size = PAGE_SIZE_IA32E_4KB; + hw_ctx->wr_access = write; + hw_ctx->translation_type = MMU_GVA2GPA_MODE; + + if (multipage) { + ctx->pt = sif_pt_create_for_mem(mem, ctx->base, 12, true, true); + if (!ctx->pt) + return -ENOMEM; + set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt)); + } else + set_ctx(sdev, ctx, PAGE_LEVEL0, sif_mem_dma(mem, 0)); + return 0; +} +#endif + +/* map an existing context to a new memory object + * Reuse key, page table and mmu context if possible + */ +int sif_map_fmr_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem) +{ + struct psif_mmu_cntx *hw_ctx = &ctx->mctx; + struct psif_key *key = get_key(sdev, ctx->lkey); + bool multipage; + u64 vstart = (u64)mem->vmap_base; + u64 page_size; + u64 page_mask; + enum psif_table_level leaf_level; + u64 aligned_base; + u64 aligned_sz; + u32 page_shift = sif_mem_page_shift(mem); + u8 pt_leaf_level = 0; + u8 pt_pte_extent = 1; + int ret; + + /* Adjust to a supported page shift */ + ret = find_optimal_leaf_level(sdev, page_shift, + vstart, sif_mem_dma(mem, 0), mem->size, + &pt_leaf_level, &pt_pte_extent); + if (ret) + return ret; + + page_shift = sdev->mi.page_shift + pt_leaf_level * sdev->mi.level_shift; + page_size = 1ULL << page_shift; + page_mask = ~(page_size - 1); + + hw_ctx->wr_access = true; + hw_ctx->translation_type = MMU_GVA2GPA_MODE; + hw_ctx->page_size = hw_leaf_page_sz(sdev, page_shift); + + aligned_base = ctx->base & page_mask; + aligned_sz = aligned_size(vstart, mem->size, page_size); + multipage = sdev->single_pte_pt || aligned_sz > page_size; + leaf_level = hw_leaf_level(hw_ctx->page_size); + + /* Now page sizes may have changed too, if so we cannot reuse the page table, delete it: */ + if (ctx->pt && page_shift > ctx->pt->page_shift) { + sif_pt_free(ctx->pt); + ctx->pt = NULL; + } + + /* For FMRs we reuse the mmu context and modify the existing key */ + ctx->base = (u64)mem->vmap_base; + ctx->size = mem->size; + + set_psif_key__base_addr(key, ctx->base); + set_psif_key__lkey_state(key, PSIF_DMA_KEY_VALID); + set_psif_key__rkey_state(key, PSIF_DMA_KEY_VALID); + set_psif_key__length(key, mem->size); + + sif_log(sdev, SIF_FMR, "key %d: base now at %llx (sz %llx - mem sz %llx)", + ctx->lkey, ctx->base, ctx->size, mem->size); + + /* We have two cases: + * 1) a single page pointer: Pointer must be set to new address - keep page size and everything + * 2) a page table of any depth: + * appropriate ptes must be set to refer to new pages + */ + if (!multipage) { + dma_addr_t 
dma_addr = sif_mem_dma(mem, 0); + dma_addr_t aligned_dma_addr = dma_addr & ~((1 << page_shift) - 1); + + set_ctx(sdev, ctx, leaf_level, aligned_dma_addr); + } else if (!ctx->pt) { + ctx->pt = sif_pt_create_for_mem(mem, ctx->base, page_shift, true, false); + if (!ctx->pt) + return -ENOMEM; + set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt)); + } else { + sif_pt_remap_for_mem(ctx->pt, mem, page_shift, ctx->base); + /* Only the level of the top node may have changed, the page is + * guaranteed to be the same, but the previous use could + * have been a single page - just set it every time for now: + */ + set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt)); + } + /* Update the used network endian context */ + set_psif_key__mmu_context(key, *((u64 *)&ctx->mctx)); + return 0; +} + +void sif_unmap_gva_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx) +{ + /* TLB invalidate is not available at teardown, instead we + * invalidate the whole MMU as a final operation before taking down the + * communication with the EPSC. + */ + if (likely(sdev->registered) && ctx->pt && !sif_feature(disable_invalidate_tlb)) + sif_mmu_invalidate_tlb(sdev, ctx, PCM_WAIT); + if (ctx->pt) + sif_pt_free(ctx->pt); +} + + +void sif_unmap_fmr_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, enum wr_mode mode) +{ + sif_log(sdev, SIF_FMR, "key %d", ctx->lkey); + if (!sif_feature(disable_invalidate_tlb)) + sif_mmu_invalidate_tlb(sdev, ctx, mode); +} + + +static int sif_mmu_invalidate_tlb_partial(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, + u64 start, u64 len, enum wr_mode mode) +{ + struct psif_wr wr; + int ncompleted; + int ret = 0; + u32 lkey = ctx->lkey; + u32 npages; + u32 shift; + u32 sq_entry_idx; + int pqp_sq_idx; + struct sif_sq *sq; + struct sif_pqp *pqp; + struct psif_cq_entry *cqe; + DECLARE_SIF_CQE_POLL(sdev, lcqe); + + pqp = lcqe.pqp; + + if (!lkey) { + lkey = allocate_invalidate_key(ctx); + if (!lkey) { + sif_log(sdev, SIF_INFO, + "Failed to allocate a TLB invalidation key!"); + return -ENOMEM; + } + } + + /* Do no invalidate TLB if page table is NULL. + * However, if mode == PCM_WAIT, need to generate + * a completion to itself to ensure that all the + * previous posted invalidate TLB pqp operations + * have completed. + * + * This is mainly to cater for invalidating the TLB of a + * list of fmr ctx. This is done here within the function as + * the generated completion needs to know the selected + * pqp. The caller sif_unmap_phys_fmr_list doesn't + * know the pqp until DECLARE_SIF_CQE_POLL. + * In a scenario for invalidating TLB for a ctx, + * the ctx->pt is checked before calling this function + * so that no additional completion will be generated. + * e.g in sif_unmap_gva_ctx. 
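+ *
+ * Illustration only - the fmr list pattern below is inferred from the
+ * description above, not copied from sif_unmap_phys_fmr_list:
+ *
+ *	sif_unmap_gva_ctx:	calls this only when ctx->pt != NULL (PCM_WAIT)
+ *	fmr list teardown:	posts invalidates per ctx, then one PCM_WAIT
+ *				call that may see ctx->pt == NULL and thus
+ *				only generates and polls a pqp completion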
+ */ + if (unlikely(!ctx->pt)) { + if (mode == PCM_WAIT) { + ret = gen_pqp_cqe(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, + "cqe %p gen_pqp_cqe returned %d", + &lcqe, ret); + return ret; + } + ret = poll_cq_waitfor(&lcqe); + if (ret < 0) { + sif_log(sdev, SIF_INFO, + "cqe %p poll_cq_waitfor returned %d", + &lcqe, ret); + } + } + return ret; + } + + memset(&wr, 0, sizeof(struct psif_wr)); + wr.op = PSIF_WR_INVALIDATE_TLB; + wr.details.su.key = lkey; + + shift = sif_pt_page_shift(ctx->pt); + npages = num_pages(ctx->base, len, shift); + + while (npages) { + /* TLB invalidate only uses the lower 16 bits of the length field */ + u32 n = min_t(u32, npages, 0xffff); + + wr.details.su.addr = start; + wr.details.su.length = n; + npages -= n; + if (npages > 0) { + int sts = sif_pqp_post_send(sdev, &wr, NULL); + + if (sts) { + sif_log(sdev, SIF_INFO, + "Partial invalidate TLB for key %d, base %llx, length %x failed, sts %d", + lkey, start, n << shift, sts); + return sts; + } + } else + break; + /* reset checksum for the next calculation */ + wr.checksum = 0; + start += n << shift; + } + + /* We can allow async post only if we do not depend on deleting the key after + * the request has completed: + */ + if (mode != PCM_WAIT && ctx->lkey) { + wr.completion = (mode == PCM_POST) ? 0 : 1; + return sif_pqp_post_send(sdev, &wr, NULL); + } + + wr.completion = 1; + + sif_log(sdev, SIF_PQP, "Invalidate TLB for key %d, base %llx, length %x", + lkey, start, wr.details.su.length << shift); + + ncompleted = sif_pqp_poll_wr(sdev, &wr, &lcqe); + + if (ncompleted < 0) { + sif_log(sdev, SIF_INFO, "%s completion for pqp request", + (ncompleted ? "Error" : "No")); + ret = ncompleted; + goto out; + } + + /* Note that we operate on 3 different indices here! */ + cqe = &lcqe.cqe; + pqp_sq_idx = pqp->qp->qp_idx; + sq = get_sif_sq(sdev, pqp_sq_idx); + + /* sq_id.sq_seq_num contains the send queue sequence number for this completion + * and by this driver's definition the index into the send queue will + * be this number modulo the length of the send queue: + */ + sq_entry_idx = cqe->wc_id.sq_id.sq_seq_num & sq->mask; + + if (cqe->status != PSIF_WC_STATUS_SUCCESS) { + sif_log(sdev, SIF_INFO, + "base %llx, length %x: failed with status %s(%d) for cq_seq %d", + start, wr.details.su.length << shift, + string_enum_psif_wc_status(cqe->status), cqe->status, cqe->seq_num); + sif_logs(SIF_INFO, write_struct_psif_cq_entry(NULL, 0, cqe)); + ret = -EIO; + atomic_inc(&pqp->cq->error_cnt); + goto out; + } + + sif_log(sdev, SIF_PQP, "cq_seq %d sq_seq %d, sq_entry_idx %d", + cqe->seq_num, cqe->wc_id.sq_id.sq_seq_num, sq_entry_idx); +out: + if (!ctx->lkey) + release_invalidate_key(sdev, lkey); + return ret; +} + + +static int sif_mmu_invalidate_tlb(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, enum wr_mode mode) +{ + return sif_mmu_invalidate_tlb_partial(sdev, ctx, ctx->base, ctx->size, mode); +} + + +/* extend an mmu context with DMA addresses from @mem. 
+ * Only GVA2GPA memory types supports this: + */ +int sif_map_ctx_part(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + u64 virt_base, u64 size) +{ + int ret; + + if (ctx->type != MMU_GVA2GPA_MODE) + return -EINVAL; + + ret = sif_pt_extend(ctx->pt, sif_mem_get_sgl(mem), virt_base, size); + if (ret >= 0 && ctx->mt == SIFMT_CS && ctx->pt->vsize == size) + set_ctx(sdev, ctx, sif_pt_root_table_level(ctx->pt), sif_pt_dma_root(ctx->pt)); + return ret; +} + + +/* invalidate a pte range in an already existing context's page table + * Only GVA2GPA memory types supports this: + */ + +int sif_unmap_gva_ctx_part(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, + u64 virt_base, u64 size) +{ + int ret = sif_pt_free_part(ctx->pt, virt_base, size); + + if (ret < 0) + return ret; + + if (unlikely(!sdev->registered)) { + /* TLB invalidate is not available at teardown */ + return 0; + } + + /* Invalidate this range of the page table with PSIF - assume async call is ok */ + return sif_mmu_invalidate_tlb_partial(sdev, ctx, virt_base, size, PCM_POST); +} + + + +const char *sif_mem_type_str(enum sif_mem_type mem_type) +{ + switch (mem_type) { + case SIFMT_BYPASS: + return "SIFMT_BYPASS"; + case SIFMT_UMEM: + return "SIFMT_UMEM"; + case SIFMT_UMEM_RO: + return "SIFMT_UMEM_RO"; + case SIFMT_BYPASS_RO: + return "SIFMT_BYPASS_RO"; + case SIFMT_UMEM_SPT: + return "SIFMT_UMEM_SPT"; + case SIFMT_2M: + return "SIFMT_2M"; + case SIFMT_4K: + return "SIFMT_4K"; + case SIFMT_CS: + return "SIFMT_CS"; + case SIFMT_ZERO: + return "SIFMT_ZERO"; + case SIFMT_PHYS: + return "SIFMT_PHYS"; + case SIFMT_FMR: + return "SIFMT_FMR"; + case SIFMT_NOMEM: + return "SIFMT_NOMEM"; + case SIFMT_PTONLY: + return "SIFMT_PTONLY"; + case SIFMT_MAX: + return "SIFMT_MAX"; + default: + break; + } + return "(undefined sif_mem_type)"; +} + + +struct psif_mmu_cntx sif_mmu_ctx_passthrough(bool write) +{ + struct psif_mmu_cntx ctx = { .wr_access = 1 }; + return ctx; +} + + +#define TSU_MMU_FLUSH_CACHES_ADDR 0x00200003L + +/* Post a command to flush the TLBs PTE cache. + * If @ptw_cache is set, also flush the PTW cache. + */ +int sif_post_flush_tlb(struct sif_dev *sdev, bool ptw_cache) +{ + int ret; + + if (eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 100)) { + struct psif_epsc_csr_rsp resp; + struct psif_epsc_csr_req req; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_FLUSH_CACHES; + req.u.flush_caches.flush_mmu_caches.flush_mmu_cache = 1; + if (ptw_cache) + req.u.flush_caches.flush_mmu_caches.flush_ptw_cache = 1; + ret = sif_epsc_wr_poll(sdev, &req, &resp); + } else { + int bits = (ptw_cache ? 0x3 : 0x1); + + ret = sif_write_global_csr(sdev, TSU_MMU_FLUSH_CACHES_ADDR, bits); + } + if (ret) { + sif_log(sdev, SIF_INFO, + "clearing MMU cache failed with error %d ", ret); + } + return ret; +} + + +/* Wait for a previously posted flush_tlb to complete */ +int sif_complete_flush_tlb(struct sif_dev *sdev) +{ + ulong start_time = jiffies; + ulong timeout = sdev->min_resp_ticks * 4; + ulong timeout_time = start_time + timeout; + u64 val; + int cnt = 0; + int ret; + int ms; + + if (eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 100)) { + /* For API ver. 
>= 100, we already wait for completion in mailbox operation */ + return 0; + } + do { + val = sif_read_global_csr(sdev, TSU_MMU_FLUSH_CACHES_ADDR); + cnt++; + } while (val != -1LL && (val & 0x4) != 0x4 && time_is_after_jiffies(timeout_time)); + if (val == -1LL) + sif_log(sdev, SIF_INFO, "CSR error waiting for mmu cache flush to finish"); + if (time_is_before_jiffies(timeout_time)) { + sif_log(sdev, SIF_INFO, "timeout waiting for mmu cache flush to finish, val = %lld", + val); + return -ETIMEDOUT; + } + ret = sif_write_global_csr(sdev, TSU_MMU_FLUSH_CACHES_ADDR, 0x0); + ms = jiffies_to_msecs(jiffies - start_time); + if (ret) + sif_log(sdev, SIF_INFO, "failed to turn off mmu cache flush mode in %d ms", ms); + else + sif_log(sdev, SIF_INFO_V, "flushing completed in %d ms, cnt %d", + ms, cnt); + return ret; +} diff --git a/drivers/infiniband/hw/sif/sif_mmu.h b/drivers/infiniband/hw/sif/sif_mmu.h new file mode 100644 index 0000000000000..6624f9455856c --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_mmu.h @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_mmu.h: API for management of sif's on-chip mmu. + */ + +#ifndef _SIF_MMU_H +#define _SIF_MMU_H + +#include +#include "psif_hw_data.h" +#include "sif_user.h" + +struct sif_mem; +struct psif_mmu_cntx; +struct sif_dev; + +enum wr_mode { + PCM_POST, /* Post WR without requesting send completion */ + PCM_POST_COMPL, /* Post WR requesting send completion but do not wait(poll) for it */ + PCM_WAIT /* Post WR requesting send completion and wait(poll) for it to arrive */ +}; + +enum post_mode { + PM_WRITE, /* Write the WR into the SQ but don't trigger any posting */ + PM_DOORBELL, /* Post request and trigger doorbell (send queue mode) */ + PM_CB, /* "Normal" collect buffer mode */ +}; + +/* The driver's representation of an MMU context: + * The key is the only means for referring the MMU context wrt invalidation + * (TLB_INVALIDATE) but this is only necessary to do for GVA2GPA contexts + * [TBD: with level > 0 (?)] + */ + +struct sif_mmu_ctx { + u64 base; /* Start of mapping (byte resolution) */ + u64 size; /* Size of mapping (byte resolution) */ + u32 lkey; /* Key to use for invalidation - only valid if nonzero */ + enum sif_mem_type mt; /* Logical type of mapping */ + enum psif_mmu_translation type; /* Defined in psif_hw_data */ + struct psif_mmu_cntx mctx; /* host order version of MMU context populated by sif_map_ctx */ + struct sif_pt *pt; /* sif page table this mmu context points into (only GVA2GPA types) */ + off_t uv2dma; /* For bypass: user_va + uv2dma = actual dma_addr */ + u64 phys_sz; /* Only used by SIFMT_ZERO mappings */ +}; + + +/* Prepare a new mmu context + * ctx points to storage for this mmu context + * mem points to a DMA mapped memory object to map + * + * - prepare any page tables needed for dma + * and/or allocate private structures + * - fill in information for hw in ctx->hw_ctx + * + * NB! 
hw_ctx is assumed to be set to values for + * MMU_PASS_THROUGH (all null bytes) by default + * + * Return 0 upon success or -errno + */ +int sif_map_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + struct sif_mem *mem, + u64 virt_base, u64 size, + bool write); + +/* Release any resources associated with + * the mmu context c. This will typically be + * any driver managed page tables and any I/O mappings + * (pinning) of page table memory + */ +void sif_unmap_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *c); + +/* Populate/invalidate a pte range in an already existing context's page table + * Only GVA2GPA memory types supports this: + * page_list should contain the corresponding list of dma_addresses to map: + */ +int sif_map_ctx_part(struct sif_dev *sdev, + struct sif_mmu_ctx *c, + struct sif_mem *mem, + u64 virt_base, u64 size); + +int sif_unmap_gva_ctx_part(struct sif_dev *sdev, struct sif_mmu_ctx *c, + u64 virt_base, u64 size); + +/* Remap an existing context to a new memory object + * (of the same size) + */ +int sif_map_fmr_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *c, + struct sif_mem *mem); + +void sif_unmap_fmr_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx, enum wr_mode mode); + +/*** internal mmu code - used by sif_xmmu.h ***/ + +void sif_unmap_gva_ctx(struct sif_dev *sdev, struct sif_mmu_ctx *ctx); + +const char *sif_mem_type_str(enum sif_mem_type mem_type); + +void set_ctx(struct sif_dev *sdev, + struct sif_mmu_ctx *ctx, + enum psif_table_level level, + u64 val); + +/* Return an mmu context in passthrough mode */ +struct psif_mmu_cntx sif_mmu_ctx_passthrough(bool write); + +/* The I/O side virtual address as seen from sif */ +static inline u64 sif_mmu_vaddr(struct sif_mmu_ctx *ctx, off_t offset) +{ + return ctx->base + offset; +} + +/* Post a command to flush the TLBs PTE cache. + * If @ptw_cache is set, also flush the PTW cache. + */ +int sif_post_flush_tlb(struct sif_dev *sdev, bool ptw_cache); + +/* Wait for a previously posted flush_tlb to complete */ +int sif_complete_flush_tlb(struct sif_dev *sdev); + +/* Flush the TLB and wait for the flush to complete */ +static inline int sif_flush_tlb(struct sif_dev *sdev) +{ + int ret = sif_post_flush_tlb(sdev, true); + + if (ret) + return ret; + return sif_complete_flush_tlb(sdev); +} + +#endif diff --git a/drivers/infiniband/hw/sif/sif_mr.c b/drivers/infiniband/hw/sif/sif_mr.c new file mode 100644 index 0000000000000..9632f1e759ac8 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_mr.c @@ -0,0 +1,511 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_mr.c: Implementation of memory regions support for SIF + */ + +#include +#include + +#include "sif_dev.h" +#include "psif_hw_data.h" +#include "psif_hw_setget.h" +#include "sif_defs.h" +#include "sif_base.h" +#include "sif_mr.h" +#include "sif_pd.h" +#include "sif_mmu.h" +#include "sif_pt.h" +#include "sif_user.h" +#include +#include "sif_user.h" + +struct sif_mr *sif_alloc_invalid_mr(struct sif_pd *pd) +{ + struct sif_dev *sdev = to_sdev(pd->ibpd.device); + u64 bad_addr = (~0ull) ^ (PAGE_SIZE-1); + struct sif_mem *mem = + sif_mem_create_ref(sdev, SIFMT_NOMEM, bad_addr, 0, GFP_KERNEL); + if (!mem) + return ERR_PTR(-ENOMEM); + + return alloc_mr(sdev, pd, mem, 0, 0); +} + +struct sif_mr *create_dma_mr(struct sif_pd *pd, int acc_fl) +{ + /* Use a common MR (in bypass mode) + * covering the whole memory space (for each pd which needs it) + */ + struct sif_dev *sdev = to_sdev(pd->ibpd.device); + struct sif_mr *mr; + struct sif_mem *mem = + sif_mem_create_ref(sdev, SIFMT_NOMEM, 0ull, (~0ull) ^ (PAGE_SIZE-1), GFP_KERNEL); + if (!mem) + return ERR_PTR(-ENOMEM); + + mr = alloc_mr(sdev, pd, mem, 0, acc_fl); + if (IS_ERR(mr)) + goto alloc_mr_failed; + return mr; + +alloc_mr_failed: + sif_mem_free(mem); + return mr; +} + + +struct ib_mr *sif_get_dma_mr(struct ib_pd *ibpd, int acc_fl) +{ + struct sif_mr *mr = create_dma_mr(to_spd(ibpd), acc_fl); + + return mr ? &mr->ibmr : NULL; +} + + +struct ib_mr *sif_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 virt_addr, int acc_fl, + struct ib_udata *udata) +{ + enum sif_mem_type mem_type = SIFMT_UMEM; + struct sif_dev *sdev = to_sdev(ibpd->device); + struct sif_mr *mr; + void *ret; + struct ib_umem *umem; + struct sif_mem *mem; + ulong user_flags = 0; + u64 map_length = 0; + u64 phys_length = 0; + u64 umem_length = length; + enum dma_data_direction dma_dir = DMA_BIDIRECTIONAL; + DEFINE_DMA_ATTRS(attrs); + + if (udata) { + struct sif_reg_mr_ext cmd; + int rv; + + rv = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + if (rv) + return ERR_PTR(-EFAULT); + user_flags = cmd.flags; + if (sif_vendor_enable(MMU_special, user_flags)) { + mem_type = + sdev->mt_override == SIFMT_UMEM ? 
cmd.mem_type : sdev->mt_override; + map_length = cmd.map_length; + phys_length = cmd.phys_length; + if (mem_type == SIFMT_BYPASS_RO || mem_type == SIFMT_UMEM_RO) + dma_dir = DMA_TO_DEVICE; + if (mem_type == SIFMT_CS) + umem_length = phys_length; + } + } + + sif_log(sdev, SIF_MR, "start 0x%llx len 0x%llx virt_addr 0x%llx flags 0x%lx", + start, length, virt_addr, user_flags); + + /* Pin user memory */ + umem = ib_umem_get_attrs(ibpd->uobject->context, start, umem_length, acc_fl, + dma_dir, &attrs); + + if (IS_ERR(umem)) { + int ev = PTR_ERR(umem); + + ret = (void *)umem; + sif_log(sdev, SIF_MR, + "#### Failed to get umem [err %d] (start %llx length %llx vaddr %llx, udata at %p)", + ev, start, length, virt_addr, udata); + return ret; + } + + if (map_length) { + if (map_length < length) { + sif_log(sdev, SIF_INFO, "illegal map_length 0x%llx - must be > length 0x%llx", + map_length, length); + return ERR_PTR(-EINVAL); + } + length = map_length; + } + + mem = sif_mem_create_umem(sdev, umem, mem_type, GFP_KERNEL, dma_dir); + if (!mem) { + mr = (void *)ERR_PTR(-ENOMEM); + goto err_create_mem; + } + + mr = alloc_mr(sdev, to_spd(ibpd), mem, start, acc_fl); + if (IS_ERR(mr)) + goto err_mmu_ctx; + + if (udata) { + struct sif_reg_mr_resp_ext resp; + int rv; + + memset(&resp, 0, sizeof(resp)); + resp.uv2dma = mr->mmu_ctx.uv2dma; + rv = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rv) { + /* Exit here as ib_umem_release is implicit via dealloc_mr */ + dealloc_mr(sdev, mr); + return ERR_PTR(-EFAULT); + } + } + + sif_log(sdev, SIF_MR, "Exit: ibmr 0x%p - uv2dma %lx", &mr->ibmr, mr->mmu_ctx.uv2dma); + return &mr->ibmr; + +err_mmu_ctx: + sif_mem_free(mem); /* owns and frees the umem as well */ + return (void *)mr; +err_create_mem: + ib_umem_release(umem); + return (void *)mr; +} + + +struct ib_mr *sif_reg_phys_mr(struct ib_pd *ibpd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int acc_fl, u64 *iova_start) +{ + struct sif_dev *sdev = to_sdev(ibpd->device); + struct sif_mr *mr; + struct sif_mem *mem; + + if ((num_phys_buf <= 0) || !phys_buf_array) { + sif_log(sdev, SIF_INFO, "input error: num_phys_buf 0%x phys_buf_array %p", + num_phys_buf, phys_buf_array); + mr = ERR_PTR(-EINVAL); + goto param_err; + } + + sif_log(sdev, SIF_MR, " num_phys_buf %d, flags 0x%x, iova_start %p", + num_phys_buf, acc_fl, iova_start); + + mem = sif_mem_create_phys(sdev, iova_start, phys_buf_array, num_phys_buf, + GFP_KERNEL); + if (!mem) { + sif_log(sdev, SIF_INFO, "Failed to create mem object (ENOMEM)"); + mr = ERR_PTR(-ENOMEM); + goto param_err; + } + + mr = alloc_mr(sdev, to_spd(ibpd), mem, (u64)iova_start, acc_fl); + if (IS_ERR(mr)) + goto alloc_mr_failed; + + return &mr->ibmr; +alloc_mr_failed: + sif_mem_free(mem); +param_err: + return (void *)mr; +} + + +int sif_rereg_phys_mr(struct ib_mr *ibmr, int mr_rereg_mask, + struct ib_pd *ibpd, + struct ib_phys_buf *phys_buf_array, int num_phys_buf, + int mr_access_flags, u64 *iova_start) +{ + struct sif_dev *sdev = to_sdev(ibpd->device); + + sif_log(sdev, SIF_INFO, "Not implemented"); + return -EOPNOTSUPP; +} + + + +struct sif_mr *alloc_mr(struct sif_dev *sdev, struct sif_pd *pd, + struct sif_mem *mem, u64 map_start, int acc_fl) +{ + struct sif_mr *mr; + volatile struct psif_key *key; + struct psif_key lkey; + bool write; + int index; + int ret = 0; + u64 length = mem ? 
mem->size : ((~0ull) ^ (PAGE_SIZE-1)); + + index = sif_alloc_key_idx(sdev); + if (index < 0) { + sif_log(sdev, SIF_MR, "Failed to allocate key idx"); + ret = -ENOMEM; + goto err_reg_mr; + } + + mr = kzalloc(sizeof(struct sif_mr), GFP_KERNEL); + if (!mr) { + sif_log(sdev, SIF_MR, "Failed to allocate memory for sif_mr"); + ret = -ENOMEM; + goto err_mr_alloc; + } + + memset(mr, 0, sizeof(struct sif_mr)); + memset(&lkey, 0, sizeof(struct psif_key)); + mr->index = index; + mr->mem = mem; + set_sif_mr(sdev, index, mr); + key = get_key(sdev, index); + + if (length) { + /* MR will always have L/R keys associated with them.*/ + lkey.lkey_state = PSIF_DMA_KEY_VALID; + lkey.rkey_state = PSIF_DMA_KEY_VALID; + } else { + /* Allocation is for a special invalid key */ + lkey.lkey_state = PSIF_DMA_KEY_INVALID; + lkey.rkey_state = PSIF_DMA_KEY_INVALID; + } + + /* Access flags */ + lkey.local_access_rd = 1; + if (acc_fl & IB_ACCESS_LOCAL_WRITE) + lkey.local_access_wr = 1; + if (acc_fl & IB_ACCESS_REMOTE_READ) + lkey.remote_access_rd = 1; + if (acc_fl & IB_ACCESS_REMOTE_WRITE) + lkey.remote_access_wr = 1; + if (acc_fl & IB_ACCESS_REMOTE_ATOMIC) + lkey.remote_access_atomic = 1; + /* TBD: IB_ACCESS_MW_BIND (what to do with that?) + * and also conditonal_wr + */ + + write = (lkey.local_access_wr ? 1:0) || (lkey.remote_access_wr ? 1:0); + + lkey.pd = pd->idx; + + ret = sif_map_ctx(sdev, &mr->mmu_ctx, mem, map_start, length, write); + if (ret) + goto err_map_ctx; + + mr->mmu_ctx.lkey = index; + if (length) + lkey.base_addr = mr->mmu_ctx.base; + else + lkey.base_addr = (u64)-1LL; + lkey.length = mr->mmu_ctx.size; + lkey.mmu_context = mr->mmu_ctx.mctx; + + sif_logs(SIF_DUMP, write_struct_psif_key(NULL, 0, &lkey)); + + /* Write to HW descriptor */ + copy_conv_to_hw(key, &lkey, sizeof(lkey)); + + mr->ibmr.lkey = mr->ibmr.rkey = mr->index; + + sif_log(sdev, SIF_MR, "type %s - key %d (pd %d) - success", + sif_mem_type_str(mem->mem_type), + mr->index, pd->idx); + return mr; +err_map_ctx: + kfree(mr); + set_sif_mr(sdev, index, NULL); +err_mr_alloc: + sif_clear_key(sdev, index); + sif_free_key_idx(sdev, index); +err_reg_mr: + sif_log(sdev, SIF_MR, "Exit: failed with status %d", ret); + return ERR_PTR(ret); +} + +int sif_query_mr(struct ib_mr *ibmr, struct ib_mr_attr *mr_attr) +{ + sif_logi(ibmr->device, SIF_MR, "Not implemented"); + return -EOPNOTSUPP; +} + + +/* If the MMU is involved (not pass-through mode) + * PSIF MR deregistration is asyncronous and five-step (see #2002): + * 1) Invalidate associated dma validation entry but first + * make sure it is in the special MMU_VALID state which does not + * allow uses of it from IB but allows it to be used for invalidation + * operations. The invalidate req causes a flush of the entry in + * VAL's cache. + * 2) Invalidate MMU context (TLB_INVALIDATE) + * This will lead to a fetch of the key again, this time with + * state == MMU_VALID. 
+ * 3) Issue another key invalidate + * 4) NIL validation entry - make valid = 0 + * 5) Unpin/release memory associated with it + */ + +void dealloc_mr(struct sif_dev *sdev, struct sif_mr *mr) +{ + int index = mr->index; + int sts; + struct psif_key *key = get_key(sdev, index); + bool need_5_step = mr->mmu_ctx.type == MMU_GVA2GPA_MODE; + + /* We do not invalidate the invalid key at index 0 */ + bool do_invalidate_key = index != 0 && !sif_feature(disable_invalidate_key); + + if (do_invalidate_key) { + if (need_5_step) { + set_psif_key__lkey_state(key, PSIF_DMA_KEY_MMU_VALID); + set_psif_key__rkey_state(key, PSIF_DMA_KEY_MMU_VALID); + } else { + set_psif_key__lkey_state(key, PSIF_DMA_KEY_INVALID); + set_psif_key__rkey_state(key, PSIF_DMA_KEY_INVALID); + } + + /* Flush this DMA validation entry */ + sts = sif_invalidate_key(sdev, index, PCM_WAIT); + if (sts) { + sif_log(sdev, SIF_INFO, + "Invalidate key failed"); + } + } + + /* Invalidate and unmap MMU context */ + sif_unmap_ctx(sdev, &mr->mmu_ctx); + + if (need_5_step && do_invalidate_key) { + set_psif_key__lkey_state(key, PSIF_DMA_KEY_INVALID); + set_psif_key__rkey_state(key, PSIF_DMA_KEY_INVALID); + + /* Flush this DMA validation entry - the final operation, must be synchronous: */ + sts = sif_invalidate_key(sdev, index, PCM_WAIT); + if (sts) { + sif_log(sdev, SIF_INFO, + "Invalidate key failed"); + } + } + + kfree(mr); + set_sif_mr(sdev, index, NULL); + + if (!sif_feature(disable_invalidate_key)) { + /* Release memory associated with this key */ + sif_clear_key(sdev, index); + sif_free_key_idx(sdev, index); + } +} + + +void sif_dealloc_mr(struct sif_dev *sdev, struct sif_mr *mr) +{ + struct sif_mem *mem = mr->mem; + + dealloc_mr(sdev, mr); + sif_mem_free(mem); +} + + +int sif_dereg_mr(struct ib_mr *ibmr) +{ + struct sif_mr *mr = to_smr(ibmr); + struct sif_dev *sdev = to_sdev(ibmr->device); + int index = mr->ibmr.lkey; + + sif_logi(ibmr->device, SIF_MR, "Enter: mr 0x%p key 0x%x", mr, + index); + + sif_dealloc_mr(sdev, mr); + sif_log(sdev, SIF_MR, "Exit: success"); + return 0; +} + +struct ib_mr *sif_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list_len) +{ + sif_logi(ibpd->device, SIF_FMR, "Not implemented"); + return ERR_PTR(-EOPNOTSUPP); +} + +struct ib_fast_reg_page_list *sif_alloc_fast_reg_page_list(struct ib_device + *ibdev, + int page_list_len) +{ + sif_logi(ibdev, SIF_FMR, "Not implemented"); + return ERR_PTR(-EOPNOTSUPP); +} + +void sif_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl) +{ + sif_logi(pl->device, SIF_FMR, "Not implemented"); +} + + +/* Line printer for debugfs file */ +void sif_dfs_print_key(struct seq_file *s, struct sif_dev *sdev, loff_t pos) +{ + struct psif_key *key; + struct psif_key lkey; + const char *typestr; + char l_state, r_state; + + if (unlikely(pos < 0)) { + seq_printf(s, "# %61s State %s\n", "", "Page table info"); + seq_printf(s, "# Index %18s %18s %16s LR %s\n", + "Base address(hex)", "Length(hex)", "MMU ctx type", " top leaf pages"); + return; + } + + key = get_key(sdev, pos); + copy_conv_to_sw(&lkey, key, sizeof(struct psif_key)); + typestr = string_enum_psif_mmu_translation(lkey.mmu_context.translation_type) + 4; + l_state = string_enum_psif_dma_vt_key_states(lkey.lkey_state)[13]; + r_state = string_enum_psif_dma_vt_key_states(lkey.rkey_state)[13]; + + seq_printf(s, "%7lld %18llx %18llx %16s %c%c ", pos, lkey.base_addr, lkey.length, + typestr, l_state, r_state); + sif_pt_dfs_print(s, sdev, pos); +} + + +/* API to allocate/release a key for TLB invalidation only + * Note that 0 is 
considered an invalid key! + */ +u32 allocate_invalidate_key(struct sif_mmu_ctx *ctx) +{ + /* This call is only meaningful for contexts with a valid page table: */ + struct sif_dev *sdev = ctx->pt->sdev; + int index; + struct psif_key lkey; + volatile struct psif_key *key; + + index = sif_alloc_key_idx(sdev); + if (index < 0) + return 0; + + key = get_key(sdev, index); + memset(&lkey, 0, sizeof(struct psif_key)); + lkey.lkey_state = PSIF_DMA_KEY_MMU_VALID; + lkey.rkey_state = PSIF_DMA_KEY_MMU_VALID; + lkey.base_addr = ctx->base; + lkey.length = ctx->size; + lkey.mmu_context = ctx->mctx; + + /* Write to HW descriptor */ + copy_conv_to_hw(key, &lkey, sizeof(lkey)); + return (u32)index; +} + +/* Release and invalidate a previously allocated TLB invalidation key */ +void release_invalidate_key(struct sif_dev *sdev, u32 index) +{ + int sts; + struct psif_key *key = get_key(sdev, index); + + set_psif_key__lkey_state(key, PSIF_DMA_KEY_INVALID); + set_psif_key__rkey_state(key, PSIF_DMA_KEY_INVALID); + + /* Flush this DMA validation entry - we do not really depend on the result + * so safe to make it asynchronous: + */ + sts = sif_invalidate_key(sdev, index, PCM_POST); + if (sts) + sif_log(sdev, SIF_INFO, + "Invalidate key failed"); + + /* Release memory associated with this key */ + sif_clear_key(sdev, index); + sif_free_key_idx(sdev, index); +} diff --git a/drivers/infiniband/hw/sif/sif_mr.h b/drivers/infiniband/hw/sif/sif_mr.h new file mode 100644 index 0000000000000..959f8b407887f --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_mr.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_mr.h: Interface to internal IB memory registration logic for SIF + */ + +#ifndef __SIF_MR_H +#define __SIF_MR_H +#include "sif_mmu.h" + +struct ib_umem; +struct sif_mem; + +struct sif_mr { + struct ib_mr ibmr; + int index; + struct sif_mem *mem; + struct sif_mmu_ctx mmu_ctx; +}; + +static inline struct sif_mr *to_smr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct sif_mr, ibmr); +} + +struct ib_mr *sif_get_dma_mr(struct ib_pd *ibpd, int mr_access_flags); +struct sif_mr *sif_alloc_invalid_mr(struct sif_pd *pd); +struct ib_mr *sif_reg_phys_mr(struct ib_pd *ibpd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, int mr_access_flags, + u64 *iova_start); + +struct ib_mr *sif_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata); + +int sif_query_mr(struct ib_mr *ibmr, struct ib_mr_attr *mr_attr); +int sif_dereg_mr(struct ib_mr *ibmr); + +struct ib_mr *sif_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list_len); +struct ib_fast_reg_page_list *sif_alloc_fast_reg_page_list(struct ib_device + *ibdev, + int page_list_len); + +void sif_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); +int sif_rereg_phys_mr(struct ib_mr *ibmr, + int mr_rereg_mask, + struct ib_pd *ibpd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, int mr_access_flags, u64 *iova_start); + +/* Deallocate MR - assumes ownership of mr->mem and deletes that as well. 
+ * To be used with high level mr allocation operations that create their own + * sif_mem object: + */ +void sif_dealloc_mr(struct sif_dev *sdev, struct sif_mr *mr); + +struct sif_dev; +struct seq_file; +struct sif_pd; +enum psif_mmu_translation; + +/* Line printer for debugfs file */ +void sif_dfs_print_key(struct seq_file *s, struct sif_dev *sdev, loff_t pos); + +/* Internal mr allocation/deallocation functions: + * Allocate an IB MR for the memory object @mem + * If length == 0, allocate an invalid map. + * The mr does not own the @mem object + */ +struct sif_mr *alloc_mr(struct sif_dev *sdev, struct sif_pd *pd, + struct sif_mem *mem, u64 map_start, int acc_fl); +struct sif_mr *create_dma_mr(struct sif_pd *pd, int acc_fl); + +void dealloc_mr(struct sif_dev *sdev, struct sif_mr *mr); + + +/* API to allocate/release a key for TLB invalidation only + * Note that 0 is considered an invalid key! + */ +u32 allocate_invalidate_key(struct sif_mmu_ctx *ctx); + +/* Release and invalidate a previously allocated TLB invalidation key */ +void release_invalidate_key(struct sif_dev *sdev, u32 lkey); + + +#endif diff --git a/drivers/infiniband/hw/sif/sif_mw.c b/drivers/infiniband/hw/sif/sif_mw.c new file mode 100644 index 0000000000000..a9099c1f16dfc --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_mw.c @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_mw.c: Implementation of memory windows for SIF + */ + +#include +#include "sif_mw.h" +#include "sif_dev.h" + +struct ib_mw *sif_alloc_mw(struct ib_pd *ibpd) +{ + sif_logi(ibpd->device, SIF_INFO, "Not implemented"); + return ERR_PTR(-EOPNOTSUPP); +} + +int sif_bind_mw(struct ib_qp *ibqp, + struct ib_mw *ibmw, struct ib_mw_bind *mw_bind) +{ + sif_logi(ibqp->device, SIF_INFO, "Not implemented"); + return -EOPNOTSUPP; +} + +int sif_dealloc_mw(struct ib_mw *ibmw) +{ + sif_logi(ibmw->device, SIF_INFO, "Not implemented"); + return -EOPNOTSUPP; +} diff --git a/drivers/infiniband/hw/sif/sif_mw.h b/drivers/infiniband/hw/sif/sif_mw.h new file mode 100644 index 0000000000000..4067f36ec0de6 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_mw.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_mw.h: Interface to internal IB memory window logic for SIF + */ + +#ifndef __SIF_MW_H +#define __SIF_MW_H + +struct ib_mw *sif_alloc_mw(struct ib_pd *ibpd); +int sif_bind_mw(struct ib_qp *ibqp, struct ib_mw *ibmw, + struct ib_mw_bind *mw_bind); +int sif_dealloc_mw(struct ib_mw *ibmw); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_pd.c b/drivers/infiniband/hw/sif/sif_pd.c new file mode 100644 index 0000000000000..e1fc92f5fa33f --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_pd.c @@ -0,0 +1,544 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. 
+ * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_pd.c: Implementation of IB protection domains for SIF + */ + +#include +#include "sif_dev.h" +#include "sif_ibpd.h" +#include "sif_pd.h" +#include "sif_defs.h" +#include "sif_base.h" +#include "sif_mmu.h" +#include "sif_mr.h" +#include "sif_xrc.h" +#include "sif_query.h" + + +int sif_init_pd(struct sif_dev *sdev) +{ + /* Avoid using pd == 0 to have HW trap use of blank AHs: */ + return sif_idr_init(&sdev->pd_refs, 1, SIF_MAX_PD_INDEX); +} + + +void sif_deinit_pd(struct sif_dev *sdev) +{ + sif_idr_deinit(&sdev->pd_refs); +} + + +inline void cancel_cb(struct psif_cb __iomem *cb) +{ + u64 __iomem *c_adr = (u64 __iomem *)((u8 __iomem *)cb + 0xff8); + u64 c_val = PSIF_WR_CANCEL_CMD_BE; + + __raw_writeq(cpu_to_be64(c_val), c_adr); +} + + +struct sif_pd *alloc_pd(struct sif_dev *sdev) +{ + struct sif_pd *pd = kzalloc(sizeof(struct sif_pd), GFP_KERNEL); + + if (!pd) + return NULL; + + pd->idx = sif_idr_alloc(&sdev->pd_refs, pd, GFP_KERNEL); + spin_lock_init(&pd->lock); + INIT_LIST_HEAD(&pd->qp_list); + INIT_LIST_HEAD(&pd->cq_list); + INIT_LIST_HEAD(&pd->rq_list); + + sif_log(sdev, SIF_PD, "pd idx %d", pd->idx); + return pd; +} + + +int dealloc_pd(struct sif_pd *pd) +{ + struct sif_dev *sdev = to_sdev(pd->ibpd.device); + + sif_log(sdev, SIF_PD, "pd idx %d", pd->idx); + + if (!list_empty(&pd->qp_list)) { + sif_log(sdev, SIF_INFO, "pd idx %d: failed - still active qp blocks", pd->idx); + return -EBUSY; + } + if (!list_empty(&pd->cq_list)) { + sif_log(sdev, SIF_INFO, "pd idx %d: failed - still active cq blocks", pd->idx); + return -EBUSY; + } + if (!list_empty(&pd->rq_list)) { + sif_log(sdev, SIF_INFO, "pd idx %d: failed - still active rq blocks", pd->idx); + return -EBUSY; + } + + sif_idr_remove(&sdev->pd_refs, pd->idx); + kfree(pd); + return 0; +} + + +/* IB Verbs level interfaces (sif_ibpd.h) */ + + +struct ib_pd *sif_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, struct ib_udata *udata) +{ + struct sif_dev *sdev = to_sdev(ibdev); + struct sif_pd *pd; + int ret; + + pd = alloc_pd(sdev); + if (!pd) + return ERR_PTR(-ENOMEM); + + /* For bw comp with libsif */ + if (udata) { + struct sif_ucontext *uc = to_sctx(context); + struct sif_alloc_pd_resp_ext resp; + + memset(&resp, 0, sizeof(resp)); + resp.cb_idx = uc->cb->idx; + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) { + dealloc_pd(pd); + return ERR_PTR(-EFAULT); + } + } + return &pd->ibpd; +} + +int sif_dealloc_pd(struct ib_pd *ibpd) +{ + return ibpd->shpd ? 
0 : dealloc_pd(to_spd(ibpd)); +} + +struct ib_shpd *sif_alloc_shpd(struct ib_device *ibdev, + struct ib_pd *ibpd, + struct ib_udata *udata) +{ + struct sif_dev *sdev = to_sdev(ibdev); + struct sif_pd *pd = to_spd(ibpd); + struct sif_shpd *shpd; + + shpd = kzalloc(sizeof(struct sif_shpd), GFP_KERNEL); + if (!shpd) + return ERR_PTR(-ENOMEM); + + shpd->ibshpd.device = &sdev->ib_dev; + shpd->pd = pd; + + return &shpd->ibshpd; +} + +struct ib_pd *sif_share_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata, + struct ib_shpd *ibshpd) +{ + struct sif_shpd *shpd = to_sshpd(ibshpd); + struct sif_pd *pd = shpd->pd; + int ret; + + if (udata) { + struct sif_ucontext *uc = to_sctx(context); + struct sif_share_pd_resp_ext resp; + + memset(&resp, 0, sizeof(resp)); + resp.cb_idx = uc->cb->idx; + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) + return ERR_PTR(-EFAULT); + } + + return &pd->ibpd; +} + +int sif_remove_shpd(struct ib_device *ibdev, + struct ib_shpd *ibshpd, + int atinit) +{ + struct sif_shpd *shpd = to_sshpd(ibshpd); + + if (!atinit && shpd->pd) + dealloc_pd(shpd->pd); + + kfree(ibshpd); + + return 0; +} + +/* Collect buffer management */ + + +/* Obtain information about lat_cb and bw_cb resources + * We cannot use the ba structs yet as they are not initialized at this point: + */ +static void sif_cb_init(struct sif_dev *sdev) +{ + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + + /* EPSC supports the new requests starting from v.0.36 */ + if (eps_version_ge(es, 0, 37)) { + int ret = 0; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_QUERY; + req.u.query.data.op = EPSC_QUERY_CAP_VCB_LO; + req.u.query.info.op = EPSC_QUERY_CAP_VCB_HI; + ret = sif_epsc_wr(sdev, &req, &rsp); + if (ret) + sif_log(sdev, SIF_INFO, "Request for VCB info failed with %d", ret); + else { + sdev->bw_cb_cnt = rsp.data; + sdev->lat_cb_cnt = rsp.info; + sif_log(sdev, SIF_INFO, "Got %ld bw_cbs and %ld lat_cbs", + sdev->bw_cb_cnt, sdev->lat_cb_cnt); + } + } +} + + +/* Called from sif_base.c to initialize each of the cb tables */ +void sif_cb_table_init(struct sif_dev *sdev, enum sif_tab_type type) +{ + struct sif_table *tp; + + BUG_ON(!is_cb_table(type)); + tp = &sdev->ba[type]; + + /* Update table values with EPSC data: */ + if (type == bw_cb) { + sif_cb_init(sdev); + if (sdev->bw_cb_cnt) { + tp->entry_cnt = sdev->bw_cb_cnt; + tp->table_sz = tp->ext_sz * tp->entry_cnt; + } + tp->sif_off = sdev->cb_base; + } else { + /* lat_cb */ + if (sdev->lat_cb_cnt) { + tp->entry_cnt = sdev->lat_cb_cnt; + tp->table_sz = tp->ext_sz * tp->entry_cnt; + tp->sif_off = sdev->cb_base + sdev->ba[bw_cb].table_sz; + } else + tp->entry_cnt = 0; + } + + tp->mem = sif_mem_create_ref(sdev, SIFMT_NOMEM, tp->sif_base, + tp->table_sz, GFP_KERNEL); +} + + +struct sif_cb *alloc_cb(struct sif_dev *sdev, bool lat_cb) +{ + int idx; + struct sif_cb *cb = kzalloc(sizeof(struct sif_cb), GFP_KERNEL); + + if (!cb) + return NULL; + + if (unlikely(lat_cb)) { + idx = sif_alloc_lat_cb_idx(sdev); + if (idx < 0) { + sif_log(sdev, SIF_INFO, "Unable to allocate lat_cb - trying bw_cb instead"); + lat_cb = false; + } else + cb->cb = get_lat_cb(sdev, idx); + } + + if (likely(!lat_cb)) { + idx = sif_alloc_bw_cb_idx(sdev); + if (idx < 0) + goto err_index; + cb->cb = get_bw_cb(sdev, idx); + } + + /* Reset Collect buffer */ + cb->idx = idx; + cb->is_lat_cb = lat_cb; + + cancel_cb(cb->cb); + + spin_lock_init(&cb->lock); + return cb; +err_index: + kfree(cb); + 
return NULL; +} + + +void release_cb(struct sif_dev *sdev, struct sif_cb *cb) +{ + cancel_cb(cb->cb); + if (unlikely(cb->is_lat_cb)) + sif_free_lat_cb_idx(sdev, cb->idx); + else + sif_free_bw_cb_idx(sdev, cb->idx); + kfree(cb); +} + + +/* Find the driver struct for a collect buffer index, if associated with @uc + */ +struct sif_cb *sif_cb_from_uc(struct sif_ucontext *uc, u32 index) +{ + if (uc->cb->idx == index) + return uc->cb; + return NULL; +} + + +/* + * Write a prepared work request (in wqe) to the associated collect buffer: + * Return 0 on success otherwise -EBUSY if lock is held + */ +int sif_cb_write(struct sif_qp *qp, struct psif_wr *wqe, int cp_len) +{ + unsigned long flags; + struct sif_cb *cb = get_cb(qp); + + if (!spin_trylock_irqsave(&cb->lock, flags)) + return -EBUSY; + + wmb(); /* Previous memory writes must be ordered wrt the I/O writes */ + copy_conv_to_mmio(cb->cb, wqe, cp_len); + wc_wmb(); /* I/O writes must be completed before we let go of the lock! */ + spin_unlock_irqrestore(&cb->lock, flags); + + return 0; +} + + +#define SQS_START_DOORBELL 0xfc0 +#define SQS_STOP_DOORBELL 0xf80 + +/* + * Notify about a work request to the cb doorbell - triggering SQ mode: + */ +void sif_doorbell_write(struct sif_qp *qp, struct psif_wr *wqe, bool start) +{ + unsigned long flags; + u16 doorbell_offset = start ? SQS_START_DOORBELL : SQS_STOP_DOORBELL; + struct sif_cb *cb = get_cb(qp); + struct sif_dev *sdev = to_sdev(qp->ibqp.pd->device); + + sif_log(sdev, SIF_QP, "%s sqs for qp %d sq_seq %d", (start ? "start" : "stop"), + qp->qp_idx, wqe->sq_seq); + spin_lock_irqsave(&cb->lock, flags); + wmb(); + copy_conv_to_mmio((u8 __iomem *)cb->cb + doorbell_offset, wqe, 8); + + /* Flush write combining */ + wc_wmb(); + spin_unlock_irqrestore(&cb->lock, flags); +} + + +/* + * Force the SQS to process an already posted WR: + */ + +void sif_doorbell_from_sqe(struct sif_qp *qp, u16 seq, bool start) +{ + u16 doorbell_offset = start ? SQS_START_DOORBELL : SQS_STOP_DOORBELL; + struct sif_cb *cb = get_cb(qp); + struct sif_dev *sdev = to_sdev(qp->ibqp.pd->device); + struct sif_sq *sq = get_sif_sq(sdev, qp->qp_idx); + u64 *wqe = (u64 *)get_sq_entry(sq, seq); + + /* Pick the 1st 8 bytes directly from the sq entry: */ + wmb(); + __raw_writeq(*wqe, ((u8 __iomem *)cb->cb + doorbell_offset)); + + /* Flush write combining */ + wc_wmb(); + sif_log(sdev, SIF_QP, "%s sqs for qp %d sq_seq %d", (start ? 
"start" : "stop"), + qp->qp_idx, seq); +} + + +static struct list_head *type_to_list(struct sif_pd *pd, enum sif_tab_type type) +{ + switch (type) { + case cq_hw: + return &pd->cq_list; + case rq_hw: + return &pd->rq_list; + case qp: + return &pd->qp_list; + default: + BUG(); + } + return NULL; +} + + +/* Allocate a free index from a block: + * The index is a global index + */ +static int alloc_from_block(struct sif_table_block *b, enum sif_tab_type type) +{ + int next = 0; + int index; + int loc_idx; + + struct sif_table *table = b->table; + + if (table->alloc_rr) + next = (b->last_used + 1) & (table->entry_per_block - 1); + loc_idx = find_next_zero_bit(b->bitmap, table->entry_per_block, next); + if (table->alloc_rr && loc_idx >= table->entry_per_block) + loc_idx = find_next_zero_bit(b->bitmap, table->entry_per_block, 0); + if (loc_idx < table->entry_per_block) { + set_bit(loc_idx, b->bitmap); + if (table->alloc_rr) + b->last_used = loc_idx; + index = loc_idx + b->offset; + sif_log(table->sdev, SIF_IDX2, + "%s[%d:%d] -> %d ", sif_table_name(type), + b->offset / table->entry_per_block, loc_idx, index); + return index; + } + return -1; +} + + +/* Free a used index back to a block: + * The index is a global index + */ +static void free_to_block(struct sif_table_block *b, enum sif_tab_type type, int index) +{ + struct sif_table *table = b->table; + size_t ext_sz = table->ext_sz; + char *desc = sif_mem_kaddr(table->mem, index * ext_sz); + + /* Get from global index to block index */ + index -= b->offset; + + /* Clean descriptor entry for reuse: + * note that we clean the whole extent here which + * includes all of sif_##type for inlined types: + */ + if (type == rq_hw) /* only zero out driver data structure */ + memset(desc + sizeof(struct psif_rq_hw), 0, ext_sz - sizeof(struct psif_rq_hw)); + else if (!is_cb_table(type) && type != qp && type != cq_hw) + memset(desc, 0, ext_sz); + + sif_log(table->sdev, SIF_IDX2, + "%s[%d:%d] ", sif_table_name(type), + b->offset / table->entry_per_block, index); + clear_bit(index, b->bitmap); +} + + +/* Support for per protection domain table index allocations (2nd level allocation): + * Invariants: + * - sif_table_block entries are 0-initialized, and initialized to real values on demand. + * - We keep a list of blocks and try to allocate starting from the first in the list + * assuming that the last added block has the most free entries. 
+ */ + +int sif_pd_alloc_index(struct sif_pd *pd, enum sif_tab_type type) +{ + struct sif_dev *sdev = to_sdev(pd->ibpd.device); + struct sif_table *tp = &sdev->ba[type]; + struct list_head *list = type_to_list(pd, type); + struct sif_table_block *b; + int idx = -1; + + if (tp->entry_per_block == 1) /* Handle 1-level alloc case */ + return sif_alloc_index(sdev, type); + + spin_lock(&pd->lock); + list_for_each_entry(b, list, pd_list) { + idx = alloc_from_block(b, type); + if (idx >= 0) + break; + } + if (idx < 0) { + /* Allocate a new block */ + int blk_idx = sif_alloc_index(sdev, type); + + if (blk_idx >= 0) { + b = sif_get_block(tp, blk_idx); + sif_log(sdev, SIF_IDX2, "%s blk_idx %d: %p [%ld/%d]", + sif_table_name(type), blk_idx, b, + sizeof(struct sif_table_block), tp->block_ext); + b->table = tp; + b->pd = pd; + b->offset = blk_idx * tp->entry_per_block; + /* Don't modify last_used as we want it to survive (de)allocations */ + list_add(&b->pd_list, list); + idx = alloc_from_block(b, type); + } + } + spin_unlock(&pd->lock); + return idx; +} + + +void sif_pd_free_index(struct sif_pd *pd, enum sif_tab_type type, int index) +{ + struct sif_dev *sdev = to_sdev(pd->ibpd.device); + struct sif_table *tp = &sdev->ba[type]; + struct sif_table_block *b; + int bits_used; + int blk_idx = index / tp->entry_per_block; + + if (tp->entry_per_block == 1) /* Handle 1-level alloc case */ + return sif_free_index(sdev, type, index); + + b = sif_get_block(tp, blk_idx); + if (!b->table) { + /* BUG */ + sif_log(sdev, SIF_INFO, "index %d: block table ptr NULL - blk_idx %d table %s", + index, blk_idx, sif_table_name(type)); + return; + } + spin_lock(&pd->lock); + free_to_block(b, type, index); + bits_used = bitmap_weight(b->bitmap, tp->entry_per_block); + if (!bits_used) { + list_del(&b->pd_list); + sif_free_index(sdev, type, blk_idx); + } + spin_unlock(&pd->lock); +} + + +bool sif_pd_index_used(struct sif_table *tp, int idx) +{ + struct sif_table_block *b; + int blk_idx = idx / tp->entry_per_block; + + if (!test_bit(blk_idx, tp->bitmap)) + return false; + b = sif_get_block(tp, blk_idx); + return test_bit(idx % tp->entry_per_block, b->bitmap); +} + + +bool sif_is_user_pd(struct sif_pd *pd) +{ + if (pd->ibpd.uobject) + return true; + /* TBD: We don't know if an XRC domain originates from user space, + * as it does not get any uobject + */ + if (pd->xrcd) /* TBD: && pd->xrcd->ib_xrcd.uobject) */ + return true; + return false; +} diff --git a/drivers/infiniband/hw/sif/sif_pd.h b/drivers/infiniband/hw/sif/sif_pd.h new file mode 100644 index 0000000000000..aa0277a80b12f --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_pd.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_pd.h: Internal interface to protection domains + * and collect buffer management for SIF + */ + +#ifndef __SIF_PD_H +#define __SIF_PD_H + +struct sif_dev; +struct sif_pd; +struct sif_cb; +struct sif_qp; +struct sif_ucontext; + +/**** Protection domains ****/ + +/* SIF supports a 24 bit PD index: */ +#define SIF_MAX_PD_INDEX ((1 << 24) - 1) + +struct sif_pd { + struct ib_pd ibpd; + int idx; /* index of this pd */ + struct sif_xrcd *xrcd; /* If set, this pd is owned by an xrcd */ + spinlock_t lock; /* Protects lists and their bitmaps while owned by us */ + /* List of blocks of descriptor entries owned by this pd */ + struct list_head qp_list; + struct list_head cq_list; + struct list_head rq_list; +}; + +struct sif_shpd { + struct ib_shpd ibshpd; + struct sif_pd *pd; +}; + +/* Initialize/deinitialize the pd subsystem */ +int sif_init_pd(struct sif_dev *sdev); +void sif_deinit_pd(struct sif_dev *sdev); + +struct sif_pd *alloc_pd(struct sif_dev *sdev); +int dealloc_pd(struct sif_pd *pd); + + +/* Per protection domain table index allocations (2nd level allocation) */ +int sif_pd_alloc_index(struct sif_pd *pd, enum sif_tab_type type); +void sif_pd_free_index(struct sif_pd *pd, enum sif_tab_type type, int index); + +/* 2-level and 1-level safe index usage check: + * idx is the entry index (not block index) + * and is assumed to be within bounds: + * + */ +bool sif_pd_index_used(struct sif_table *tp, int idx); + +bool sif_is_user_pd(struct sif_pd *pd); + + +/**** Collect buffers ****/ + +static inline bool is_cb_table(enum sif_tab_type type) +{ + return type == bw_cb || type == lat_cb; +} + + +/* Called from sif_base.c to initialize the cb tables */ +void sif_cb_table_init(struct sif_dev *sdev, enum sif_tab_type type); + + +/* per collect buffer struct */ +struct sif_cb { + int idx; /* index of this cb */ + bool is_lat_cb; /* High bandwidth or low latency cb */ + spinlock_t lock; /* Serializes access to this cb */ + u64 reqs; /* Number of requests on this cb */ + struct psif_cb __iomem *cb; /* Pointer to the actual collect buffer space */ +}; + +/* Allocation and deallocation of collect buffers + * If @lat_cb is set, allocate low latency CB instead of high bandwidth one: + */ +struct sif_cb *alloc_cb(struct sif_dev *sdev, bool lat_cb); +void release_cb(struct sif_dev *sdev, struct sif_cb *cb); + +/* Find the driver struct for a collect buffer index, if associated with @uc + */ +struct sif_cb *sif_cb_from_uc(struct sif_ucontext *uc, u32 index); + + +/* + * Write a prepared work request (in wqe) to the associated collect buffer: + * Return 0 on success otherwise -EBUSY if lock is held + */ +int sif_cb_write(struct sif_qp *qp, struct psif_wr *wqe, int cp_len); + + +/* + * Notify about a work request to the cb doorbell - triggering SQ mode: + */ +void sif_doorbell_write(struct sif_qp *qp, struct psif_wr *wqe, bool start); + + +/* + * Force the SQS to process an already posted WR: + */ +void sif_doorbell_from_sqe(struct sif_qp *qp, u16 seq, bool start); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_pqp.c b/drivers/infiniband/hw/sif/sif_pqp.c new file mode 100644 index 0000000000000..fd22824e29af6 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_pqp.c @@ -0,0 +1,1048 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. 
+ * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_pqp.c: Privileged QP handling + * The privileged QPs are SIFs internal send only QPs for management operations + */ + +#include "sif_dev.h" +#include "sif_cq.h" +#include "sif_sq.h" +#include "sif_base.h" +#include "psif_hw_data.h" +#include "psif_hw_setget.h" +#include "sif_pqp.h" +#include "sif_qp.h" +#include "sif_hwi.h" +#include "sif_ibqp.h" +#include "sif_checksum.h" +#include "sif_defs.h" + +static inline struct sif_qp *__create_init_qp(struct sif_dev *sdev, struct sif_cq *cq) +{ + struct sif_qp *qp; + struct ib_qp_init_attr init_attr = { + .event_handler = NULL, + .send_cq = &cq->ibcq, + .recv_cq = NULL, /* receive side not used */ + .srq = NULL, + .cap = { + .max_send_wr = sif_max_pqp_wr, + .max_recv_wr = 0, + .max_send_sge = 0, + .max_recv_sge = 0, + .max_inline_data = 0 + }, + .qp_type = IB_QPT_UD, + }; + struct sif_qp_init_attr sif_attr = { + .pd = sdev->pd, + .qp_type = PSIF_QP_TRANSPORT_MANSP1, + .qosl = QOSL_LOW_LATENCY, + .sq_hdl_sz = sizeof(struct sif_sq_hdl), + }; + + qp = create_qp(sdev, &init_attr, &sif_attr); + if (!IS_ERR(qp)) + qp->ibqp.pd = &sdev->pd->ibpd; + return qp; +} + + + +static struct sif_pqp *_sif_create_pqp(struct sif_dev *sdev, size_t alloc_sz, int comp_vector) +{ + struct sif_pqp *pqp; + struct sif_cq *cq; + struct sif_qp *qp; + struct sif_sq *sq = NULL; + int ret = 0; + + /* The privileged QP only supports state in modify_qp */ + struct ib_qp_attr mod_attr = { + .qp_state = IB_QPS_INIT + }; + + pqp = kzalloc(alloc_sz, GFP_KERNEL); + if (!pqp) { + sif_log(sdev, SIF_INFO, "Failed to allocate memory for priv.qp"); + return NULL; + } + + cq = create_cq(sdev->pd, sif_max_pqp_wr, comp_vector, SIFPX_OFF, false); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + goto cq_alloc_failed; + } + cq->ibcq.device = &sdev->ib_dev; + pqp->cq = cq; + cq->pqp = pqp; + init_completion(&pqp->nonfull); + + /* Now create a queue pair. + * TBD: Use a separate pqp for req_notify_cq and use low latency.. 
+ */ + qp = __create_init_qp(sdev, cq); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto qp_alloc_failed; + } + + pqp->qp = qp; + sq = get_sif_sq(sdev, qp->qp_idx); + /* Reserve 1/2 or at least 1 entry for pqp requests with completion on the PQP */ + pqp->lowpri_lim = sq->entries - min_t(int, sq->entries/2, 2); + + /* Run the required qp modify sequence */ + ret = sif_modify_qp(&qp->ibqp, &mod_attr, + IB_QP_STATE, NULL); + if (ret) + goto qp_alloc_failed; + + mod_attr.qp_state = IB_QPS_RTR; + ret = sif_modify_qp(&qp->ibqp, &mod_attr, + IB_QP_STATE, NULL); + if (ret) + goto qp_alloc_failed; + + mod_attr.qp_state = IB_QPS_RTS; + mod_attr.sq_psn = 0; + ret = sif_modify_qp(&qp->ibqp, &mod_attr, + IB_QP_STATE, NULL); + if (ret) + goto qp_alloc_failed; + + atomic64_set(&pqp->qp->arm_srq_holdoff_time, 0); + + sif_log(sdev, SIF_QP, "success"); + return pqp; + +qp_alloc_failed: + /* Special destruction order, see below: */ + destroy_cq(cq); + if (sq) + sq->cq_idx = -1; + + if (pqp->qp) + destroy_qp(sdev, qp); +cq_alloc_failed: + kfree(pqp); + sif_log(sdev, SIF_QP, "failed with %d", ret); + return ERR_PTR(ret); +} + + +int sif_destroy_pqp(struct sif_dev *sdev, struct sif_pqp *pqp) +{ + struct sif_sq *sq = get_sif_sq(sdev, pqp->qp->qp_idx); + bool self_destruct = get_pqp(sdev) == pqp; + /* For the last pqp we make an exception from the IB std reqs + * in that we keep the PQP itself up to invalidate the CQ using the + * PQP to send the invalidate, **before** we take down the QP itself. + * The hardware will make sure that for this special case + * the completion is sent before the CQ entry is invalidated. + */ + int ret; + + if (self_destruct) { + sif_log(sdev, SIF_PQP, "self destruct CQ %d", pqp->cq->index); + ret = destroy_cq(pqp->cq); + if (ret < 0) + return ret; + + if (sq) + sq->cq_idx = -1; + } + + ret = destroy_qp(sdev, pqp->qp); + if (ret < 0) + return ret; + + /* Support the normal destruction order as long as we have + * other PQPs in the system: + */ + if (!self_destruct) { + ret = destroy_cq(pqp->cq); + if (ret < 0) + return ret; + + if (sq) + sq->cq_idx = -1; + } + kfree(pqp); + return 0; +} + + +struct sif_pqp *sif_create_pqp(struct sif_dev *sdev, int comp_vector) +{ + return _sif_create_pqp(sdev, sizeof(struct sif_pqp), comp_vector); +} + + +static void pqp_complete_nonfull(struct sif_pqp *pqp) +{ + int ql; + unsigned long flags; + struct sif_dev *sdev = to_sdev(pqp->cq->ibcq.device); + struct sif_sq *sq = get_sif_sq(sdev, pqp->qp->qp_idx); + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, pqp->qp->qp_idx); +return; + spin_lock_irqsave(&sq->lock, flags); + ql = sq_length(sq, sq_sw->head_seq, sq_sw->last_seq); + if (ql <= sq->mask && atomic_read(&pqp->waiters)) + complete(&pqp->nonfull); + spin_unlock_irqrestore(&sq->lock, flags); +} + + +static inline void __pqp_complete_sq(struct sif_sq *sq, u32 sq_seq) +{ + /* TBD: Allow pqp posters to wait for completions */ +} + + + +static void pqp_reset_cmpl(struct sif_cqe *lcqe) +{ + struct sif_pqp *pqp = lcqe->pqp; + struct sif_cq *cq = pqp->cq; + struct sif_dev *sdev = to_sdev(cq->ibcq.device); + struct sif_sq *sq = get_sif_sq(sdev, pqp->qp->qp_idx); + struct sif_sq_hdl *wh = get_sq_hdl(sq, lcqe->sq_seq); + unsigned long flags; + + spin_lock_irqsave(&cq->lock, flags); + wh->wr_id = 0; + wh->used = false; + spin_unlock_irqrestore(&cq->lock, flags); +} + + + +/* Process all received completions on @cq - must be only PQP completions! + * Return the number processed, or -errno upon errors: + * Assumes the cq lock is held. 
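+ *
+ * Completions are matched back to the poster through the sq handle
+ * (struct sif_sq_hdl): sif_pqp_write_send() stores a pointer to the poster's
+ * struct sif_cqe in wh->wr_id, and this function copies the hardware CQE into
+ * that struct in host order, sets ->written and, if requested, signals ->cmpl.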
+ * If first_err is set, check for completion errors and return the first one with errors: + */ + +/* TBD: Clean up memory barriers in this function */ +static int __pqp_process_cqe(struct sif_pqp *pqp, struct sif_cqe *first_err) +{ + struct sif_cq *cq = pqp->cq; + struct sif_dev *sdev = to_sdev(cq->ibcq.device); + struct sif_sq_sw *sq_sw; + volatile struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index); + struct sif_sq *sq; + u32 seqno = cq_sw->next_seq; + volatile struct psif_cq_entry *cqe_be = get_cq_entry(cq, seqno); + int npolled = 0; + int cqe_cnt = 0; + u64 wci; + struct psif_send_completion_id *wc_id = (struct psif_send_completion_id *)&wci; + int sq_seq; + struct sif_cqe *lcqe; + struct sif_sq_hdl *wh; + int ql = 0; + u64 dbg_mask; + bool err_seen = false; + + for (; seqno == get_psif_cq_entry__seq_num(cqe_be); npolled++) { + enum psif_wc_status status = get_psif_cq_entry__status(cqe_be); + int sq_idx = get_psif_cq_entry__qp(cqe_be); + bool dump_it = false; + + sq = get_sif_sq(sdev, sq_idx); + sq_sw = get_sif_sq_sw(sdev, sq_idx); + wci = get_psif_cq_entry__wc_id(cqe_be); + sq_seq = wc_id->sq_seq_num; + wh = get_sq_hdl(sq, sq_seq); + + if (unlikely(status != PSIF_WC_STATUS_SUCCESS)) { + sif_log(sdev, SIF_INFO, "error completion polled"); + dump_it = true; + } + + if (pqp->qp->flags & SIF_QPF_KI_STENCIL) + goto cont_check_first_err; + + if (unlikely(!wh)) { + sif_log(sdev, SIF_INFO, + "cqe %d for cq %d refers sq(qp) %d which has not been initialized", + seqno, cq->index, sq_idx); + dump_it = true; + goto cont_no_wh; + } + if (unlikely(!wh->used)) { + sif_log(sdev, SIF_INFO, + "ignoring unused cqe %d for cq %d, sq %d, sq_seq %d", + seqno, cq->index, sq_idx, sq_seq); + dump_it = true; + goto cont; + } + if (unlikely(wh->sq_seq != sq_seq)) { + sif_log(sdev, SIF_INFO, + "wrong cqe %d for cq %d: got sq_seq %d, expected %d", + seqno, cq->index, sq_seq, wh->sq_seq); + dump_it = true; + goto cont; + } + + lcqe = (struct sif_cqe *)wh->wr_id; + if (lcqe) { + wh->wr_id = 0; + cqe_cnt++; + mb(); + sif_log(sdev, SIF_PQP, "copying to caller cqe at %p", &lcqe->cqe); + copy_conv_to_sw(&lcqe->cqe, cqe_be, sizeof(struct psif_cq_entry)); + wmb(); + lcqe->written = true; + if (lcqe->need_complete) + complete(&lcqe->cmpl); + } +cont_check_first_err: + if (unlikely(first_err && (status != PSIF_WC_STATUS_SUCCESS))) { + sif_log(sdev, SIF_PQP, "error completion received - aborting"); + copy_conv_to_sw(&first_err->cqe, cqe_be, sizeof(struct psif_cq_entry)); + err_seen = true; + first_err->written = true; + npolled++; + } +cont: + wh->used = 0; +cont_no_wh: + if (dump_it) { + sif_logs(SIF_INFO, + write_struct_psif_cq_entry(NULL, 1, + (const struct psif_cq_entry *)cqe_be); + printk("\n")); + } + + mb(); + sq_sw->head_seq = sq_seq; + seqno = ++cq_sw->next_seq; + + if (cq_length(cq, cq_sw->cached_head, seqno) >= cq->high_watermark) { + /* Update CQ hardware pointer */ + set_psif_cq_sw__head_indx(&cq_sw->d, seqno); + cq_sw->cached_head = seqno; + } + + ql = sq_length(sq, sq_seq, sq_sw->last_seq); + if (ql <= sq->mask) + pqp_complete_nonfull(pqp); + mb(); + if (unlikely(err_seen)) + break; + cqe_be = get_cq_entry(cq, seqno); + } + + dbg_mask = npolled ? 
SIF_PQP : SIF_IPOLL; + sif_log(sdev, dbg_mask, "processed %d (%d with waiters) requests - seqno 0x%x, ql %d", + npolled, atomic_read(&pqp->waiters), + seqno, ql); + + if (npolled > 0) { + /* reset timeout each time we see a new completion: */ + pqp->timeout = jiffies + sdev->min_resp_ticks * 4; + } + return npolled; +} + + +static int pqp_process_cqe(struct sif_pqp *pqp, struct sif_cqe *first_err) +{ + unsigned long flags; + int npolled; + struct sif_cq *cq = pqp->cq; + + /* If someone else holds the lock, the CQEs are handled */ + if (!spin_trylock_irqsave(&cq->lock, flags)) + return -EBUSY; + npolled = __pqp_process_cqe(pqp, first_err); + spin_unlock_irqrestore(&cq->lock, flags); + return npolled; +} + + +static struct sif_pqp *find_any_pqp(struct sif_dev *sdev) +{ + int cpu; + + for (cpu = 0; cpu < sdev->pqp_cnt; cpu++) + if (sdev->pqp[cpu]) + return sdev->pqp[cpu]; + return NULL; +} + +/* Get the right PQP for the same EQ*/ +struct sif_pqp *get_pqp_same_eq(struct sif_dev *sdev, int comp_vector) +{ + unsigned int pqp_index = comp_vector - 2; + struct sif_pqp *pqp = sdev->pqp_cnt ? sdev->pqp[pqp_index % sdev->pqp_cnt] : NULL; + + if (unlikely(!pqp)) { + /* Typically during take down */ + return find_any_pqp(sdev); + } + return pqp; +} + + +/* Get the right PQP for the current CPU */ +struct sif_pqp *get_pqp(struct sif_dev *sdev) +{ + unsigned int cpu = smp_processor_id(); + struct sif_pqp *pqp = sdev->pqp_cnt ? sdev->pqp[cpu % sdev->pqp_cnt] : NULL; + + if (unlikely(!pqp)) { + /* Typically during take down */ + return find_any_pqp(sdev); + } + return pqp; +} + +/* Get the next PQP in a round robin fashion */ +struct sif_pqp *get_next_pqp(struct sif_dev *sdev) +{ + struct sif_pqp *pqp; + int next = atomic_inc_return(&sdev->next_pqp) % sdev->pqp_cnt; + + pqp = sdev->pqp[next]; + if (unlikely(!pqp)) { + /* Typically during take down */ + return find_any_pqp(sdev); + } + return pqp; +} + +struct sif_cb *get_cb(struct sif_qp *qp) +{ + struct sif_dev *sdev = to_sdev(qp->ibqp.pd->device); + unsigned int cpu = smp_processor_id(); + return sdev->kernel_cb[qp->qosl][cpu % sdev->kernel_cb_cnt]; +} + + +inline bool pqp_req_gets_completion(struct sif_pqp *pqp, struct psif_wr *wr, enum post_mode mode) +{ + return mode == PM_WRITE || (wr->op != PSIF_WR_GENERATE_COMPLETION && wr->completion) || + wr->cq_desc_vlan_pri_union.cqd_id == pqp->cq->index; +} + +/* Fill in common parts and post a work request to the management QP for the current CPU + * If @cqe is non-null, a completion will be requested and the result put there in + * host order when it is found (by __pqp_process_cqe()) + */ +int sif_pqp_write_send(struct sif_pqp *pqp, struct psif_wr *wr, struct sif_cqe *cqe, + enum post_mode mode) +{ + struct sif_qp *qp = pqp->qp; + u32 qp_idx = qp->qp_idx; + struct sif_dev *sdev = to_sdev(pqp->qp->ibqp.device); + struct sif_pd *pd = sdev->pd; + struct sif_sq *sq = get_sif_sq(sdev, qp_idx); + struct psif_sq_entry *sqe; + struct sif_sq_hdl *wh; + unsigned long flags; + bool ring_doorbell; + int q_sz; + int ret = 0; + u16 head, sq_seq; + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, qp_idx); + unsigned long timeout = sdev->min_resp_ticks * 4; + u16 limit = pqp_req_gets_completion(pqp, wr, mode) ? sq->entries : pqp->lowpri_lim; + /* Per IBTA 11.4.1.1, error is only returned + * when the QP is in the RESET, INIT or RTR states. 
+ */
+	if (qp->last_set_state < IB_QPS_RTS)
+		return -EINVAL; /* The pqp is not ready */
+
+	pqp->timeout = jiffies + timeout;
+
+	wr->local_qp = qp_idx;
+	wr->tsu_qosl = qp->qosl;
+	wr->tsu_sl = qp->tsl;
+
+restart:
+	/* Make sure emptying the queue takes preference over filling it up: */
+	if (mode != PM_WRITE)
+		ret = pqp_process_cqe(pqp, NULL);
+	if (ret > 0 || ret == -EBUSY)
+		ret = 0; /* Got some reqs */
+	else if (ret < 0)
+		return ret;
+
+	spin_lock_irqsave(&sq->lock, flags);
+	sq_seq = sq_sw->last_seq;
+	head = sq_sw->head_seq;
+	q_sz = sq_length(sq, head, sq_seq);
+
+	if (q_sz >= limit) {
+		if (sq_seq != pqp->last_full_seq) {
+			sif_log(sdev, SIF_PQP,
+				"Privileged qp full - head %d sq_seq %d q_sz %d/%d",
+				head, sq_seq, q_sz, sq->entries);
+			pqp->last_full_seq = sq_seq;
+		}
+		spin_unlock_irqrestore(&sq->lock, flags);
+
+		if (limit < sq->entries && sq_seq != pqp->last_nc_full) {
+			/* Avoid spinning creating more sync completions
+			 * - block on next try unless sequence number has changed:
+			 */
+			pqp->last_nc_full = sq_seq;
+			return -EAGAIN;
+		}
+
+		/* PQP requests to a full queue should not be generated at interrupt level */
+		BUG_ON(in_interrupt());
+		if (time_is_after_jiffies(pqp->timeout)) {
+			if (sq_seq != pqp->last_full_seq)
+				sif_log(sdev, SIF_PQP, "priv.qp %d: spin waiting for slot in queue",
+					pqp->qp->qp_idx);
+			goto restart;
+		} else {
+			sif_log(sdev, SIF_INFO,
+				"Timeout waiting for previous response (seq %d) to complete",
+				sq_sw->head_seq);
+			return -ETIMEDOUT;
+		}
+	}
+	sq_seq = ++sq_sw->last_seq;
+
+	/* Store longest send queue observed */
+	if (unlikely(q_sz > sq->max_outstanding && mode != PM_WRITE))
+		sq->max_outstanding = q_sz;
+
+	/* For GENERATE_COMPLETION the CQ id to generate in is put here
+	 * and no completion is expected on the PQP.
+	 */
+	if (wr->op == PSIF_WR_GENERATE_COMPLETION) {
+		/* Are we generating a completion on our own QP? 
*/ + if (wr->details.su.u2.target_qp == pqp->qp->qp_idx) + wr->details.su.wc_id.sq_id.sq_seq_num = sq_seq; + } else + wr->cq_desc_vlan_pri_union.cqd_id = sq->cq_idx; + + wh = get_sq_hdl(sq, sq_seq); + wh->wr_id = (u64)cqe; + wh->sq_seq = sq_seq; + wh->used = true; + + if (cqe) { + if ((wr->op != PSIF_WR_GENERATE_COMPLETION) || (wr->se)) { + cqe->sq_seq = sq_seq; + wr->completion = 1; + } + BUG_ON(cqe->written); + } + + sqe = get_sq_entry(sq, sq_seq); + + sif_log(sdev, SIF_PQP, "pd %d cq_idx %d sq_idx %d sq.seqn %d op %s", + pd->idx, wr->cq_desc_vlan_pri_union.cqd_id, sq->index, sq_seq, + string_enum_psif_wr_type(wr->op)); + + if (likely(mode != PM_WRITE)) { + u64 csum; + + wr->sq_seq = sq_seq; + + /* Collect_length is always 0 for privileged wr's - they have no data */ + csum = csum32_partial(wr, sizeof(*wr), qp->magic); + csum = csum32_fold(csum); + wr->checksum = csum; + + sif_log(sdev, SIF_PQP, "PQP checksum %x", wr->checksum); + } + + sif_logs(SIF_DUMP, write_struct_psif_wr(NULL, 0, wr)); + + /* update send queue */ + copy_conv_to_hw(sqe, wr, sizeof(struct psif_wr)); + + if (likely(mode != PM_WRITE)) { + /* Flush writes before updating the sw pointer, + * This is necessary to ensure that the sqs do not see + * an incomplete entry: + */ + wmb(); + + /* Update sw pointer visible to hw */ + set_psif_sq_sw__tail_indx(&sq_sw->d, sq_seq); + + /* Finally write to collect buffer - implicit barriers before/after I/O writes + * + * Workaround #3595: ring doorbell if SQS in SQ-mode + */ + ring_doorbell = qp->flags & SIF_QPF_FORCE_SQ_MODE || + !(get_psif_sq_hw__sq_next(&sq->d) & 0x1) || + mode == PM_DOORBELL; + + if (ring_doorbell) + sif_doorbell_from_sqe(qp, sq_seq, true); + else if (sif_cb_write(qp, wr, sizeof(struct psif_wr))) { + /* vcb lock busy, use db mode instead */ + sif_doorbell_from_sqe(qp, sq_seq, true); + } + } + + spin_unlock_irqrestore(&sq->lock, flags); + return ret; +} + + +int sif_pqp_post_send(struct sif_dev *sdev, struct psif_wr *wr, struct sif_cqe *cqe) +{ + struct sif_pqp *pqp = cqe ? cqe->pqp : get_pqp(sdev); + enum post_mode mode = pqp->qp->flags & SIF_QPF_FORCE_SQ_MODE ? PM_DOORBELL : PM_CB; + + return sif_pqp_write_send(pqp, wr, cqe, mode); +} + +int sif_pqp_poll_wr(struct sif_dev *sdev, struct psif_wr *wr, struct sif_cqe *cqe) +{ + int ret = sif_pqp_post_send(sdev, wr, cqe); + + if (ret) { + sif_log(sdev, SIF_INFO, "PQP wr %d post failed on QP %d, CQ %d", + cqe->pqp->qp->qp_idx, cqe->pqp->cq->index, wr->sq_seq); + return ret; + } + + ret = poll_cq_waitfor(cqe); + if (ret < 0) + sif_log(sdev, SIF_INFO, "poll_cq_waitfor, pqp QP %d, CQ %d failed with %d", + cqe->pqp->qp->qp_idx, cqe->pqp->cq->index, ret); + return ret; +} + + +/* Poll and process incoming (internal) completions + * while waiting for this particular completion + */ +int poll_cq_waitfor(struct sif_cqe *lcqe) +{ + struct sif_pqp *pqp = lcqe->pqp; + struct sif_cq *cq = pqp->cq; + struct sif_dev *sdev = to_sdev(cq->ibcq.device); + int ret = 0; + volatile bool *written = &lcqe->written; + u64 min_resp_ticks = sdev->min_resp_ticks; + + /* TBD: This timeout is unsafe - we just keep it now to allow runs be aborted + * without having to reboot. 
Keep value for it a factor larger than other timeouts: + */ + pqp->timeout = jiffies + min_resp_ticks * 4; + + while (!(*written)) { + ret = pqp_process_cqe(pqp, NULL); + if (ret == -EBUSY) { + ret = 0; + continue; + } else if (ret < 0) + break; + else if (ret == 0) { + if (time_is_before_jiffies(pqp->timeout)) { + if (sif_feature(pcie_trigger)) + force_pcie_link_retrain(sdev); + sif_log(sdev, SIF_INFO, + "cq %d: poll for cqe %p timed out", cq->index, lcqe); + atomic_inc(&cq->timeout_cnt); + + sif_logs(SIF_PQPT, + struct sif_sq *sq = get_sif_sq(sdev, pqp->qp->qp_idx); + struct psif_sq_entry *sqe = + get_sq_entry(sq, lcqe->sq_seq); + write_struct_psif_sq_entry(NULL, 1, sqe)); + ret = -ETIMEDOUT; + break; + } + if (!in_interrupt()) /* TBD: Fix this as well */ + cond_resched(); + else + cpu_relax(); + + if (sdev->min_resp_ticks != min_resp_ticks) { + /* Give us a quick way out by changing min_resp_ticks */ + pqp->timeout -= (min_resp_ticks - sdev->min_resp_ticks) * 4; + min_resp_ticks = sdev->min_resp_ticks; + } + continue; + } + } + + if (ret < 0) + pqp_reset_cmpl(lcqe); + return ret; +} + + +/* Poll for any pqp completion, return the number of completions polled */ +static int poll_cq_waitfor_any(struct sif_pqp *pqp, struct sif_cqe *first_err) +{ + struct sif_cq *cq = pqp->cq; + struct sif_dev *sdev = to_sdev(cq->ibcq.device); + int ret = 0; + u64 min_resp_ticks = sdev->min_resp_ticks; + + pqp->timeout = jiffies + min_resp_ticks * 4; + + while (!ret) { + ret = pqp_process_cqe(pqp, first_err); + if (ret == -EBUSY) { + ret = 0; + continue; + } else if (ret < 0) + break; + else if (ret == 0) { + if (time_is_before_jiffies(pqp->timeout)) { + if (sif_feature(pcie_trigger)) + force_pcie_link_retrain(sdev); + sif_log(sdev, SIF_INFO, + "cq %d: poll timed out", cq->index); + atomic_inc(&cq->timeout_cnt); + ret = -ETIMEDOUT; + break; + } + if (!in_interrupt()) /* TBD: Fix this as well */ + cond_resched(); + else + cpu_relax(); + + if (sdev->min_resp_ticks != min_resp_ticks) { + /* Give us a quick way out by changing min_resp_ticks */ + pqp->timeout -= (min_resp_ticks - sdev->min_resp_ticks) * 4; + min_resp_ticks = sdev->min_resp_ticks; + } + } + } + sif_log(sdev, SIF_PQP, "ret = %d", ret); + return ret; +} + + +/***** Generic completion generation *****/ + +static int __gen_cqe(struct sif_dev *sdev, u32 target_cq, u64 wc_id, u32 target_qp, + enum psif_wc_opcode opcode, enum psif_wc_status status, struct sif_cqe *cqe, + bool event) +{ + struct psif_wr wr; + + memset(&wr, 0, sizeof(struct psif_wr)); + wr.op = PSIF_WR_GENERATE_COMPLETION; + wr.cq_desc_vlan_pri_union.cqd_id = target_cq; + wr.details.su.completion_status = status; + wr.details.su.completion_opcode = opcode; + + if (opcode >= PSIF_WC_OPCODE_RECEIVE_SEND) + wr.details.su.wc_id.rq_id = wc_id; + else + wr.details.su.wc_id.sq_id.sq_seq_num = wc_id; + + wr.details.su.u2.target_qp = target_qp; + /* set the IB_CQ_SOLICITED flag because the CQ might be armed + * and the consumer might be interested in getting these events. + * Setting IB_CQ_SOLICITED is generally safe because it is a + * subset of IB_CQ_NEXT_COMP. 
+ */ + if (event) + wr.se = 1; + + return sif_pqp_post_send(sdev, &wr, cqe); +} + + +/* Generate a SUCCESS completion on the PQP itself + * We use this to be able to wait for a set of generated completions to other + * CQs to have been completed: + */ +int gen_pqp_cqe(struct sif_cqe *cqe) +{ + struct sif_pqp *pqp = cqe->pqp; + struct sif_dev *sdev = to_sdev(pqp->cq->ibcq.device); + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, pqp->qp->qp_idx); + + if (cqe) + cqe->written = false; + + sif_log(sdev, SIF_PQP, " for sq %d, last_nc_full %d, head_seq %d last_seq %d", + pqp->qp->qp_idx, pqp->last_nc_full, sq_sw->head_seq, sq_sw->last_seq); + return __gen_cqe(sdev, pqp->cq->index, 0, pqp->qp->qp_idx, + PSIF_WC_OPCODE_GENERATE_COMPLETION, PSIF_WC_STATUS_SUCCESS, + cqe, true); +} + + +/* Post a request to generate a completion with the given values + * on the cq identified by @target_cq. + * This request generates no completion on the PQP itself: + */ +static int sif_gen_cqe(struct sif_dev *sdev, u32 target_cq, u64 wc_id, u32 target_qp, + enum psif_wc_opcode opcode, enum psif_wc_status status, bool event) +{ + return __gen_cqe(sdev, target_cq, wc_id, target_qp, opcode, status, NULL, event); +} + +/* Post a request to generate a completion for an outstanding rq entry + * on the given qp. This request generates no completion on the PQP itself: + */ + +static int sif_gen_rq_cqe(struct sif_dev *sdev, struct sif_rq *rq, u32 rq_seq, + struct sif_qp *target_qp, enum psif_wc_opcode opcode, + enum psif_wc_status status) +{ + struct psif_rq_entry *rqe = get_rq_entry(rq, rq_seq); + u64 wc_id = get_psif_rq_entry__rqe_id(rqe); + u32 cq_idx = get_psif_qp_core__rcv_cq_indx(&target_qp->d.state); + + sif_log(sdev, SIF_PQP, "on rq %d, rq_seq %d, wc_id %llx, cq %d (target_qp %d)", + rq->index, rq_seq, wc_id, cq_idx, target_qp->qp_idx); + + return sif_gen_cqe(sdev, cq_idx, wc_id, target_qp->qp_idx, opcode, status, true); +} + + +int sif_gen_rq_flush_cqe(struct sif_dev *sdev, struct sif_rq *rq, + u32 rq_seq, struct sif_qp *target_qp) +{ + return sif_gen_rq_cqe(sdev, rq, rq_seq, target_qp, + PSIF_WC_OPCODE_RECEIVE_SEND, PSIF_WC_STATUS_WR_FLUSH_ERR); +} + +/* Post a request to generate a completion for an outstanding sq entry + * on the given qp. This request generates no completion on the PQP itself: + */ + +static int sif_gen_sq_cqe(struct sif_dev *sdev, struct sif_sq *sq, u32 sq_seq, u32 target_qp, + enum psif_wc_opcode opcode, enum psif_wc_status status, bool event) +{ + struct psif_sq_entry *sqe = get_sq_entry(sq, sq_seq); + u64 wc_id = get_psif_wr__sq_seq(&sqe->wr); + + sif_log(sdev, SIF_PQP, "on sq %d, sq_seq %d, wc_id %llx, cq %d (target_qp %d)", + sq->index, sq_seq, wc_id, sq->cq_idx, target_qp); + + return sif_gen_cqe(sdev, sq->cq_idx, wc_id, target_qp, opcode, status, event); +} + + +int sif_gen_sq_flush_cqe(struct sif_dev *sdev, struct sif_sq *sq, + u32 sq_seq, u32 target_qp, bool event) +{ + return sif_gen_sq_cqe(sdev, sq, sq_seq, target_qp, + PSIF_WC_OPCODE_SEND, PSIF_WC_STATUS_WR_FLUSH_ERR, event); +} + + +/***** Stencil PQP support **** + * + * A stencil PQP is a PQP set up fully populated with WRs ready + * for parallel batch processing (using SQSes) of particularly performance + * critical PQP operations. 
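+ *
+ * From the consumer side the intended usage is roughly as follows (sketch
+ * only - see sif_alloc_ki_spqp/sif_inv_key_update_st in sif_pqp.h; the
+ * key_index value and the fallback path are placeholders):
+ *
+ *	struct sif_st_pqp *spqp = sif_alloc_ki_spqp(sdev);
+ *
+ *	if (!spqp) {
+ *		... fall back to a plain PQP invalidate request ...
+ *	} else {
+ *		ret = sif_inv_key_update_st(spqp, key_index, PCM_WAIT);
+ *		sif_release_ki_spqp(spqp);
+ *	}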
+ * + * The idea is to lay this out to allow the WRs to be reused with minimal + * updates: + */ + +struct sif_st_pqp *sif_create_inv_key_st_pqp(struct sif_dev *sdev) +{ + int i; + struct sif_st_pqp *spqp = (struct sif_st_pqp *)_sif_create_pqp(sdev, sizeof(*spqp), 0); + struct sif_pqp *pqp; + int qp_idx; + struct sif_sq *sq; + struct sif_sq_sw *sq_sw; + struct psif_sq_entry *sqe; + struct psif_wr lwr; + u16 max_db_int; + + if (IS_ERR(spqp)) + return spqp; + + pqp = &spqp->pqp; + qp_idx = pqp->qp->qp_idx; + sq = get_sif_sq(sdev, qp_idx); + sq_sw = get_sif_sq_sw(sdev, qp_idx); + max_db_int = (sq->entries >> 3); + + /* Pre-populate the SQ */ + for (i = 0; i < sq->entries; i++) + sif_write_invalidate(pqp, key, 0, NULL, PCM_POST, PM_WRITE); + + /* Now, to start using the stencil at seq.1 (as normal SQs) + * we must reset the sw tail pointer which + * was updated by sif_write_invalidate: + */ + sq_sw->last_seq = 0; + spqp->doorbell_seq = 1; + + spqp->doorbell_interval = min_t(u16, SPQP_DOORBELL_INTERVAL, max_db_int); + spqp->next_doorbell_seq = spqp->doorbell_interval + 1; + spqp->req_compl = 0; + spqp->next_poll_seq = (sq->entries >> 1); + spqp->sq = sq; + spqp->sq_sw = sq_sw; + spqp->pqp.qp->flags |= SIF_QPF_KI_STENCIL; + + /* Calculate a partial checksum + * - they are all the same since the fields we change + * are calculated with 0-values to ease checksum mod. later: + */ + sqe = get_sq_entry(sq, 0); + copy_conv_to_sw(&lwr, &sqe->wr, sizeof(lwr)); + spqp->checksum = csum32_partial(&lwr, sizeof(lwr), pqp->qp->magic); + + sif_log(sdev, SIF_PQPT, "done qp %d, sq sz %d, next_poll_seq %d", qp_idx, + sq->entries, spqp->next_poll_seq); + return spqp; +} + + +int sif_destroy_st_pqp(struct sif_dev *sdev, struct sif_st_pqp *spqp) +{ + return sif_destroy_pqp(sdev, &spqp->pqp); +} + + +/* Update a new invalidate key request into a preconfigured stencil pqp + * Assumes exclusive access to the PQP SQ. + */ +int sif_inv_key_update_st(struct sif_st_pqp *spqp, int index, enum wr_mode mode) +{ + struct sif_sq *sq = spqp->sq; + struct sif_sq_sw *sq_sw = spqp->sq_sw; + u16 sq_seq = ++sq_sw->last_seq; + struct psif_sq_entry *sqe = get_sq_entry(sq, sq_seq); + struct sif_dev *sdev = to_sdev(spqp->pqp.cq->ibcq.device); + bool poll_prev = false; + int ret = 1; + u64 csum_inc = (u64)index + (u64)sq_seq; + u64 csum; + int q_sz; + u16 head; + DECLARE_SIF_CQE_POLL(sdev, first_err); + + /* Modify the request to our need */ + set_psif_wr_su__key(&sqe->wr.details.su, index); + set_psif_wr__sq_seq(&sqe->wr, sq_seq); + + head = sq_sw->head_seq; + q_sz = sq_length(sq, head, sq_seq); + + if (unlikely(q_sz > (int)sq->entries)) { + sif_log(sdev, SIF_INFO, "Error: Stencil pqp (qp %d) is full at seq %d, head %d", + sq->index, sq_seq, sq_sw->head_seq); + sq_sw->last_seq--; + return -ENOMEM; + } + + /* Store longest send queue observed */ + if (unlikely(q_sz > sq->max_outstanding)) + sq->max_outstanding = q_sz; + + if (unlikely(mode == PCM_WAIT || sq_seq == spqp->next_poll_seq)) { + set_psif_wr__completion(&sqe->wr, 1); + spqp->req_compl++; + sif_log(sdev, SIF_PQPT, "sq %d: requesting completion for seq %d (%d)", + sq->index, sq_seq, spqp->req_compl); + poll_prev = spqp->req_compl > 1; + if (sq_seq == spqp->next_poll_seq) + spqp->next_poll_seq += (sq->entries >> 1); + csum_inc += 0x80000000; + } else { + /* Reset the completion bit in case it was set in the previous generation! 
*/ + set_psif_wr__completion(&sqe->wr, 0); + } + + /* Add the changes to the checksum */ + csum = csum32_partial(&csum_inc, 8, spqp->checksum); + csum = csum32_fold(csum); + set_psif_wr__checksum(&sqe->wr, csum); + + sif_log(sdev, SIF_PQP, "cq %d, sq %d, sq seq %d%s", spqp->pqp.cq->index, + sq->index, sq_seq, (poll_prev ? " (poll prev)" : "")); + + if (unlikely(mode == PCM_WAIT || sq_seq == spqp->next_doorbell_seq)) { + sif_log(sdev, SIF_PQPT, "sq %d: writing doorbell at seq %d - tail at %d%s", + sq->index, spqp->doorbell_seq, sq_seq, (mode == PCM_WAIT ? " [wait]" : "")); + wmb(); + set_psif_sq_sw__tail_indx(&sq_sw->d, sq_seq); + sif_doorbell_from_sqe(spqp->pqp.qp, spqp->doorbell_seq, true); + spqp->doorbell_seq = sq_seq + 1; + spqp->next_doorbell_seq = sq_seq + spqp->doorbell_interval + 1; + } + + if (poll_prev) { + sif_log(sdev, SIF_PQPT, "enter wait (poll_prev) (%d)", spqp->req_compl); + ret = poll_cq_waitfor_any(&spqp->pqp, &first_err); + if (ret < 0) + goto out; + if (unlikely(first_err.written)) { + sif_log(sdev, SIF_INFO, "error completion with status %s", + string_enum_psif_wc_status(first_err.cqe.status)); + goto out; + } + sif_log(sdev, SIF_PQPT, "polled %d completions", ret); + spqp->req_compl -= ret; + } + + if (unlikely(mode == PCM_WAIT)) { + while (sq_sw->head_seq != sq_seq) { + sif_log(sdev, SIF_PQPT, "enter wait (%d) seq %d/%d", + spqp->req_compl, sq_sw->head_seq, sq_seq); + ret = poll_cq_waitfor_any(&spqp->pqp, &first_err); + if (ret < 0) + break; + spqp->req_compl -= ret; + sif_log(sdev, SIF_PQPT, "done wait - head now %d - rem.cmpl %d", + sq_sw->head_seq, spqp->req_compl); + } + } + + if (ret == 0) + ret = -ENOMEM; + else if (ret > 0) + ret = 0; + +out: + sif_log(sdev, SIF_PQP, "done ret = %d", ret); + return ret; +} + + +/* get exclusive access to a stencil pqp */ +struct sif_st_pqp *sif_alloc_ki_spqp(struct sif_dev *sdev) +{ + int index; + struct sif_st_pqp *spqp = NULL; + + mutex_lock(&sdev->ki_spqp.lock); + index = find_next_zero_bit(sdev->ki_spqp.bitmap, sdev->ki_spqp.pool_sz, 0); + if (index < sdev->ki_spqp.pool_sz) { + set_bit(index, sdev->ki_spqp.bitmap); + spqp = sdev->ki_spqp.spqp[index]; + } + mutex_unlock(&sdev->ki_spqp.lock); + sif_log(sdev, SIF_PQPT, "bit index %d", index); + return spqp; +} + +void sif_release_ki_spqp(struct sif_st_pqp *spqp) +{ + struct sif_dev *sdev = to_sdev(spqp->pqp.cq->ibcq.device); + + mutex_lock(&sdev->ki_spqp.lock); + clear_bit(spqp->index, sdev->ki_spqp.bitmap); + mutex_unlock(&sdev->ki_spqp.lock); + sif_log(sdev, SIF_PQPT, "bit index %d", spqp->index); +} diff --git a/drivers/infiniband/hw/sif/sif_pqp.h b/drivers/infiniband/hw/sif/sif_pqp.h new file mode 100644 index 0000000000000..55bcd7ce38809 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_pqp.h @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_pqp.h: Privileged QP handling + */ + +#ifndef __SIF_PQP_H +#define __SIF_PQP_H + +struct sif_qp; +struct sif_cq; +struct sif_rq; +struct sif_sq; +struct completion; +enum post_mode; + +/* Data structure used by PQP requesters to get the completion information, + * and optionally block waiting for it to arrive: + */ +struct sif_cqe { + struct psif_cq_entry cqe; /* host order copy of hw cqe */ + struct completion cmpl; /* a completion to wait on for response */ + struct sif_pqp *pqp; /* Priv.qp to wait on */ + bool need_complete; /* cmpl is initialized and a waiter is present */ + bool written; /* Set to true when a completion has been copied here */ + u16 sq_seq; /* set by post_send to allow us to reset ourselves */ +}; + +/* + * Declare and initialize data structure to receive a poll completion + * cqe.status initialized tosomething != SUCCESS + */ +#define DECLARE_SIF_CQE_POLL(d_, c_)\ + struct sif_cqe c_ = { \ + .cqe.status = PSIF_WC_STATUS_FIELD_MAX,\ + .pqp = get_pqp(d_),\ + .need_complete = false,\ + .written = false,\ + } + +#define DECLARE_SIF_CQE_WITH_SAME_EQ(d_, c_, e_) \ + struct sif_cqe c_ = { \ + .cqe.status = PSIF_WC_STATUS_FIELD_MAX,\ + .pqp = get_pqp_same_eq(d_, e_), \ + .need_complete = false,\ + .written = false,\ + } + + +#define DECLARE_SIF_CQE_WAIT(d_, c_)\ + struct sif_cqe c_ = { \ + .cqe.status = PSIF_WC_STATUS_FIELD_MAX,\ + .pqp = get_pqp(d_),\ + .need_complete = true,\ + .written = false,\ + };\ + init_completion(&c_.cmpl) + +#define DECLARE_SIF_CQE_POLL_WITH_RR_PQP(d_, c_)\ + struct sif_cqe c_ = { \ + .cqe.status = PSIF_WC_STATUS_FIELD_MAX,\ + .pqp = get_next_pqp(d_),\ + .need_complete = false,\ + .written = false,\ + } + + +struct sif_pqp { + struct sif_qp *qp; /* The qp used */ + struct sif_cq *cq; /* Associated completion queue for this priv.QP */ + unsigned long timeout; /* rescheduled when new completions observed */ + struct completion nonfull; /* allow a poster to wait for a cred */ + atomic_t waiters; /* number of waiters on nonfull */ + u16 last_full_seq; /* For logging purposes, record when last observed full */ + u16 last_nc_full; /* Track when to return EAGAIN to flush non-compl.entries */ + u16 lowpri_lim; /* Max number of outstanding low priority reqs */ +}; + +struct sif_pqp *sif_create_pqp(struct sif_dev *sdev, int comp_vector); +int sif_destroy_pqp(struct sif_dev *sdev, struct sif_pqp *pqp); + +/* Get the right PQP for the current CPU */ +struct sif_pqp *get_pqp(struct sif_dev *sdev); + +/* Get the right PQP with the same EQ */ +struct sif_pqp *get_pqp_same_eq(struct sif_dev *sdev, int comp_vector); + +/* Get the next PQP in round robin fashion */ +struct sif_pqp *get_next_pqp(struct sif_dev *sdev); + +/* Get the right CB for the current CPU for the given QP */ +struct sif_cb *get_cb(struct sif_qp *qp); + +static inline struct sif_cq *pqp_cq(struct sif_dev *sdev) +{ + return (get_pqp(sdev))->cq; +} + +static inline struct sif_qp *pqp_qp(struct sif_dev *sdev) +{ + return (get_pqp(sdev))->qp; +} + +/* Fill in common parts and post a work request to the management QP for the current CPU + * If @cqe is non-null, a completion will be requested and eventually reflected to @cqe + * in host order. 
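+ *
+ * Minimal usage sketch (for illustration only - this is essentially what
+ * sif_pqp_poll_wr() below does; the wr setup is elided):
+ *
+ *	struct psif_wr wr;
+ *	DECLARE_SIF_CQE_POLL(sdev, lcqe);
+ *
+ *	memset(&wr, 0, sizeof(wr));
+ *	... fill in wr ...
+ *	ret = sif_pqp_post_send(sdev, &wr, &lcqe);
+ *	if (!ret)
+ *		ret = poll_cq_waitfor(&lcqe);
+ *
+ * (on success lcqe.written is true and lcqe.cqe holds the host order CQE)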
+ */ +int sif_pqp_post_send(struct sif_dev *sdev, struct psif_wr *wr, struct sif_cqe *cqe); + +/* Same as post send but allow post_mode - sif_pqp_post_send uses PM_CB */ +int sif_pqp_write_send(struct sif_pqp *pqp, struct psif_wr *wr, struct sif_cqe *cqe, + enum post_mode mode); + + +/* Poll and process incoming (internal) completions + * while waiting for this particular completion + */ +int poll_cq_waitfor(struct sif_cqe *lcqe); + +int sif_pqp_poll_wr(struct sif_dev *sdev, struct psif_wr *wr, struct sif_cqe *cqe); + + + +/* Generate a SUCCESS completion on the PQP itself + * We use this to be able to wait for a set of generated completions to other + * CQs to have been completed: + */ +int gen_pqp_cqe(struct sif_cqe *cqe); + +/* Post a request to generate a flushed-in-error completion for an outstanding rq entry + * on the given qp. This request generates no completion on the PQP itself: + */ +int sif_gen_rq_flush_cqe(struct sif_dev *sdev, struct sif_rq *rq, + u32 rq_seq, struct sif_qp *target_qp); + +/* Post a request to generate a flushed-in-error completion for an outstanding sq entry + * on the given qp. This request generates no completion on the PQP itself: + */ +int sif_gen_sq_flush_cqe(struct sif_dev *sdev, struct sif_sq *sq, + u32 sq_seq, u32 target_qp, bool notify_ev); + +/* Stencil PQP support - pre-populated PQPs for special performance sensitive use cases */ + +#define SPQP_DOORBELL_INTERVAL 8192 + +struct sif_st_pqp { + struct sif_pqp pqp; /* The PQP to use - must be first */ + struct sif_sq *sq; /* Short path to sq */ + struct sif_sq_sw *sq_sw;/* Short path to sq_sw */ + int index; /* The index of this st_pqp within it's pool */ + u16 doorbell_interval; /* Interval between each doorbell write */ + u16 doorbell_seq; /* Seq.no to use in next doorbell */ + u16 next_doorbell_seq; /* Next seqno to ring doorbell */ + u16 req_compl; /* Number of completions requested */ + u16 next_poll_seq; /* Next seqno to set completion and wait/poll for one */ + u64 checksum; /* Host endian partial checksum of stencil WR entries */ +}; + + +/* Stencil PQP management */ +struct sif_spqp_pool { + struct mutex lock; /* Protects access to this pool */ + struct sif_st_pqp **spqp; /* Key invalidate stencil PQPs */ + u32 pool_sz; /* Number of stencil PQPs set up */ + ulong *bitmap; /* Bitmap for allocation from spqp */ +}; + + +struct sif_st_pqp *sif_create_inv_key_st_pqp(struct sif_dev *sdev); + +/* get exclusive access to a stencil pqp */ +struct sif_st_pqp *sif_alloc_ki_spqp(struct sif_dev *sdev); +void sif_release_ki_spqp(struct sif_st_pqp *spqp); + +/* Update a new invalidate key request into a preconfigured stencil pqp + * Assumes exclusive access to the PQP SQ. + */ +int sif_inv_key_update_st(struct sif_st_pqp *spqp, int index, enum wr_mode mode); + + +int sif_destroy_st_pqp(struct sif_dev *sdev, struct sif_st_pqp *spqp); + +#endif
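
A usage note on gen_pqp_cqe(): as its comment above says, it is used to wait
for a set of generated completions to other CQs to have been completed. A
sketch of that pattern follows; it is illustrative only (sq, first_seq, n and
target_qp are placeholders, error handling is omitted):

	DECLARE_SIF_CQE_POLL(sdev, lcqe);
	int i, ret;

	for (i = 0; i < n; i++)
		sif_gen_sq_flush_cqe(sdev, sq, first_seq + i, target_qp, false);

	/* A completion on the PQP itself indicates that the flush requests
	 * above have been processed:
	 */
	ret = gen_pqp_cqe(&lcqe);
	if (!ret)
		ret = poll_cq_waitfor(&lcqe);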