From: Knut Omang
Date: Wed, 25 May 2016 09:01:10 +0000 (+0200)
Subject: sif driver initial commit part 1
X-Git-Tag: v4.1.12-92~148^2~11
X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=ed2b400dc4f85d922f31c25b66dff750b9620fd2;p=users%2Fjedix%2Flinux-maple.git

sif driver initial commit part 1

sif_ah.c: Implementation of IB address handles for SIF
sif_ah.h: Interface to internal IB address handle logic for SIF
sif_base.c: Basic hardware setup of SIF
sif_base.h: Basic hardware setup of SIF
sif_checksum.c: Utilities for SIF specific 32 bit checksums
sif_checksum.h: Utilities for SIF specific 32 bit checksums
sif_cq.c: Implementation of completion queue logic for SIF
sif_cq.h: Internal interface to psif completion queue logic
sif_debug.c: Use of debugfs for dumping internal data structure info
sif_debug.h: Use of debugfs for dumping internal data structure info
sif_defs.c: IB-to-SIF Mapper.
sif_defs.h: Div. utility definitions and auxiliary data structures
sif_dev.h: Driver specific data structure definitions
sif_dma.c: DMA memory mapping
sif_dma.h: DMA memory mapping
sif_drvapi.h: Device specific operations available via the FWA access path
sif_elog.c: Log over PCIe support for firmware
sif_elog.h: Misc device for capturing log from the EPSC
sif_enl.h: Protocol definitions for the netlink protocol for EPSC access from
sif_epsc.c: Implementation of API for communication with the EPSC
sif_epsc.h: API for communication with the EPSC (and EPS-A's)
sif_eq.c: Setup of event queues and interrupt handling
sif_eq.h: Event queues and interrupt handling
sif_fmr.c: Implementation of fast memory registration for SIF
sif_fmr.h: Interface to internal IB Fast Memory Registration (FMR)

Credits:
The sif driver supports Oracle’s new Dual Port EDR and QDR IB Adapters
and the integrated IB devices on the new SPARC SoC.
The driver is placed under drivers/infiniband/hw/sif

This patch set is the result of direct or indirect contribution by
several people:

Code contributors:
Knut Omang, Vinay Shaw, Haakon Bugge, Wei Lin Guay, Lars Paul Huse,
Francisco Trivino-Garcia.

Minor patch/bug fix contributors:
Hans Westgaard Ry, Jesus Escudero, Robert Schmidt, Dag Moxnes,
Andre Wuttke, Predrag Hodoba, Roy Arntsen

Initial architecture adaptations:
Khalid Aziz (sparc64), Gerd Rausch (arm64)

Testing, Test development, Continuous integration, Bug haunting, Code review:
Knut Omang, Hakon Bugge, Åsmund Østvold, Francisco Trivino-Garcia,
Wei Lin Guay, Vinay Shaw, Hans Westgaard Ry,
+ numerous other people within Oracle.

Simulator development:
Andrew Manison, Hans Westgaard Ry, Knut Omang, Vinay Shaw

Orabug: 22529577

Reviewed-by: Hakon Bugge
Signed-off-by: Knut Omang
---

diff --git a/drivers/infiniband/hw/sif/Kconfig b/drivers/infiniband/hw/sif/Kconfig
new file mode 100644
index 0000000000000..6cea193a78085
--- /dev/null
+++ b/drivers/infiniband/hw/sif/Kconfig
@@ -0,0 +1,5 @@
+config INFINIBAND_SIF
+	tristate "Oracle Infiniband HCA support"
+	depends on PCI && 64BIT && HAS_DMA
+	---help---
+	  Low level driver for Oracle's family of Infiniband HCAs
diff --git a/drivers/infiniband/hw/sif/sif_ah.c b/drivers/infiniband/hw/sif/sif_ah.c
new file mode 100644
index 0000000000000..6d76057b43c9e
--- /dev/null
+++ b/drivers/infiniband/hw/sif/sif_ah.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_ah.c: Implementation of IB address handles for SIF + */ + +#include +#include +#include "sif_dev.h" +#include "psif_hw_data.h" +#include "sif_defs.h" +#include "sif_base.h" +#include "sif_ah.h" + + +struct ib_ah *sif_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *ah_attr, + struct ib_udata *udata) +{ + struct sif_ah *ah; + struct sif_dev *sdev = to_sdev(ibpd->device); + struct sif_pd *pd = to_spd(ibpd); + struct ib_ah *ret; + + volatile struct psif_ah *ah_p; + struct psif_ah lah; + int index; + + sif_log(sdev, SIF_AH, "for pd %d", pd->idx); + + index = sif_alloc_ah_idx(sdev); + if (index < 0) { + ret = ERR_PTR(-ENOMEM); + goto err_create_ah; + } + ah = get_sif_ah(sdev, index); + memset(ah, 0, sizeof(struct sif_ah)); + ah->index = index; + ah_p = &ah->d; + + /* TBD: Many attrs should come from device cap-limits and + * as provided by user + */ + + /* Update hw */ + memset(&lah, 0, sizeof(lah)); + lah.sl = ah_attr->sl; + lah.port = ah_attr->port_num - 1; + lah.pd = pd->idx; + lah.remote_lid = ah_attr->dlid; + lah.local_lid_path = ah_attr->src_path_bits; + lah.ipd = ah_attr->static_rate; /* TBD: Encoding + is this right? */ + lah.loopback = + (sdev->port[lah.port].lid | lah.local_lid_path) == ah_attr->dlid ? + LOOPBACK : NO_LOOPBACK; + + + if (ah_attr->ah_flags & IB_AH_GRH) { + lah.use_grh = USE_GRH; + /* We need to byte swap these an extra time as we are receiving + * them in big endian format, and they are subject to copy/convert as well: + */ + lah.grh_remote_gid_0 = cpu_to_be64(ah_attr->grh.dgid.global.subnet_prefix); + lah.grh_remote_gid_1 = cpu_to_be64(ah_attr->grh.dgid.global.interface_id); + lah.grh_flowlabel = ah_attr->grh.flow_label; + lah.grh_hoplmt = ah_attr->grh.hop_limit; + /* TBD: ah_attr->grh.sgid_index? */ + + sif_log(sdev, SIF_AH, " - with grh dgid %llx.%llx", + lah.grh_remote_gid_0, + lah.grh_remote_gid_1); + } + + copy_conv_to_hw(ah_p, &lah, sizeof(lah)); + + sif_log(sdev, SIF_AH, "ah %d - remote_lid 0x%x src_path_bits 0x%x sl %d, %s", + ah->index, lah.remote_lid, lah.local_lid_path, lah.sl, + (lah.loopback ? 
"(loopback)" : "")); + sif_logs(SIF_DUMP, write_struct_psif_ah(NULL, 0, &lah)); + + + if (udata) { + struct sif_create_ah_resp_ext resp; + int ret; + + memset(&resp, 0, sizeof(resp)); + resp.index = ah->index; + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) { + sif_destroy_ah(&ah->ibah); + return ERR_PTR(ret); + } + } + return &ah->ibah; +err_create_ah: + return ret; +} + +int sif_destroy_ah(struct ib_ah *ibah) +{ + struct sif_ah *ah = to_sah(ibah); + struct sif_dev *sdev = to_sdev(ibah->device); + int index = ah->index; + + sif_logi(ibah->device, SIF_AH, "index 0x%x", index); + + sif_clear_ah(sdev, index); + sif_free_ah_idx(sdev, index); + + return 0; +} + +int sif_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + sif_logi(ibah->device, SIF_AH, "Not implemented"); + return -EOPNOTSUPP; +} + +int sif_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + + struct sif_ah *ah = to_sah(ibah); + struct psif_ah lah; + + ah_attr->ah_flags = 0; + copy_conv_to_sw(&lah, &ah->d, sizeof(lah)); + ah_attr->sl = lah.sl; + ah_attr->port_num = lah.port + 1; + ah_attr->static_rate = lah.ipd; + ah_attr->dlid = lah.remote_lid; + + if (lah.use_grh == USE_GRH) { + ah_attr->ah_flags |= IB_AH_GRH; + ah_attr->grh.dgid.global.subnet_prefix = lah.grh_remote_gid_0; + ah_attr->grh.dgid.global.interface_id = lah.grh_remote_gid_1; + ah_attr->grh.flow_label = lah.grh_flowlabel; + ah_attr->grh.hop_limit = lah.grh_hoplmt; + } + + sif_logi(ibah->device, SIF_AH, "ah %d - remote_lid 0x%x src_path_bits 0x%x %s", + ah->index, lah.remote_lid, lah.local_lid_path, + (lah.loopback ? "(loopback)" : "")); + return 0; +} + + +void sif_dfs_print_ah(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + if (unlikely(pos < 0)) + seq_puts(s, "# Index Port PD Rem.lid\n"); + else { + struct psif_ah *ah_p = get_ah(sdev, pos); + struct psif_ah lah; + + copy_conv_to_sw(&lah, ah_p, sizeof(struct psif_ah)); + seq_printf(s, "%7lld %5d %5d %7d\n", + pos, lah.port + 1, lah.pd, lah.remote_lid); + } +} diff --git a/drivers/infiniband/hw/sif/sif_ah.h b/drivers/infiniband/hw/sif/sif_ah.h new file mode 100644 index 0000000000000..8ccbdf80ce543 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_ah.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_ah.h: Interface to internal IB address handle logic for SIF + */ + +#ifndef __SIF_AH_H +#define __SIF_AH_H + +struct sif_ah { + volatile struct psif_ah d; + struct ib_ah ibah; + int index; +}; + +static inline struct sif_ah *to_sah(struct ib_ah *ibah) +{ + return container_of(ibah, struct sif_ah, ibah); +} + +struct ib_ah *sif_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *ah_attr, + struct ib_udata *udata); +int sif_destroy_ah(struct ib_ah *ibah); +int sif_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); +int sif_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); + +struct seq_file; +struct sif_dev; + +/* Line printer for debugfs file */ +void sif_dfs_print_ah(struct seq_file *s, struct sif_dev *sdev, loff_t pos); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_base.c b/drivers/infiniband/hw/sif/sif_base.c new file mode 100644 index 0000000000000..a911d2991c5ee --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_base.c @@ -0,0 +1,1163 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_base.c: Basic hardware setup of SIF + */ +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include "sif_base.h" +#include "sif_hwi.h" +#include "sif_mmu.h" +#include "sif_dma.h" +#include "psif_hw_csr.h" +#include "sif_epsc.h" +#include "sif_query.h" +#include "sif_defs.h" + +/* Pretty printers for debugfs defined here: */ +#include "sif_qp.h" +#include "sif_sq.h" +#include "sif_ah.h" +#include "sif_mr.h" +#include "sif_eq.h" +#include "sif_cq.h" + +static int sif_init_bitmap(struct sif_table *table); +static void sif_free_bitmap(struct sif_table *table); + +#define psif_xrq_sw psif_rq_sw + +/* fallback cases for special entries below */ +static uint dummy_bw_cb_size = 16383; +static uint dummy_lat_cb_size = 1; + +/* Macro for generating parameter values for queues + * They are all read only after driver load + */ + +#define add_qsz_parameter(type, hwtype, initsize) \ +uint sif_##type##_size = initsize;\ +module_param_named(type##_size, sif_##type##_size, uint, S_IRUGO);\ +MODULE_PARM_DESC(type##_size, "Size of the " #type " descriptor table") + + +/* These are the queue size parameters we support + * e.g. for instance qp_size=2048 or ah_size=100 + * (all sizes will be rounded up to a power of two value) + */ +add_qsz_parameter(mr, key, 524288); +add_qsz_parameter(epsc, epsc_csr_req, 2048); +add_qsz_parameter(qp, qp, 131072); +add_qsz_parameter(rq, rq_hw, 131072); +add_qsz_parameter(cq, cq_hw, 131072); +add_qsz_parameter(ah, ah, 524288); +add_qsz_parameter(sq_ring, sq_ring, 262144); +add_qsz_parameter(sq_tvl, sq_tvl, 128); + +/* These sizes must be equal to QP size */ +#define sif_sq_rspq_size sif_qp_size +#define sif_rqsp_size sif_qp_size +#define sif_atsp_size sif_qp_size + +/* These can be set from the command line - no parameter needed */ +static uint sif_epsa0_size = 64; +static uint sif_epsa1_size = 64; +static uint sif_epsa2_size = 64; +static uint sif_epsa3_size = 64; + +/* This defines how small the smallest (sw) pointers can get. + * If set to <= 8, 512 sw descriptors will fit in one page. 
+ * This gives the smallest amount of internal overhead in each software descriptor + * but will yield a much larger block size which will require a larger amount of + * entries from both software and hardware descriptors to be reserved for each + * protection domain: + */ +uint sif_min_extent = 128; +module_param_named(min_extent, sif_min_extent, uint, S_IRUGO); +MODULE_PARM_DESC(min_extent, "The smallest entry size to use for descriptors"); + +/* These vars defines a minimal value for the number of extra eq entries + * to allocate. The driver will only update the EQ_SW_INDEX pointer + * when necessary. Necessary is defined by the absolute requirement that + * there must at any time be enough space in the event queue to store all possible + * sets of events occuring simultaenously. During setup, the driver will allocate + * enough entries to have at least @epsc_eq_headroom extra entries such that EQ_SW_INDEX + * need not be updated more often than for every @epsc_eq_headroom event: + */ +uint sif_epsc_eq_headroom = 64; +module_param_named(epsc_eq_headroom, sif_epsc_eq_headroom, uint, S_IRUGO); +MODULE_PARM_DESC(epsc_eq_headroom, "Minimal amount of extra headroom in the EPSC event queue"); + +uint sif_tsu_eq_headroom = 64; +module_param_named(tsu_eq_headroom, sif_tsu_eq_headroom, uint, S_IRUGO); +MODULE_PARM_DESC(tsu_eq_headroom, "Minimal amount of extra headroom in TSU event queue 0"); + + +/* sif_table_layout is a static struct used to organize + * base pointer size/layout data in a way that allows + * them to be configured by iteration: + */ + +struct sif_table_layout { + off_t off; /* Off. to corr. psif_base_addr within psif_csr */ + const char *name; /* Corresponding to enum name */ + const char *desc; /* Textual table desc (for logging) */ + uint *e_cnt_ref; /* Driver parameter ref for no.of entries to allocate */ + u32 entry_sz; /* Real size of entries in this table */ + u32 ext; /* Actual extent of (stride between) entries in this table */ + sif_dfs_printer dfs_printer; /* entry printing in debugfs */ + enum sif_tab_type xref; /* -1: No xref, else xref bitmap (read only) */ + bool wr_access; /* Whether or not PSIF should have write access */ + bool drv_ref; /* Keep track of driver structs via separate pointer array */ +}; + +/* Composition of static entries into the base_layout table below: + * + * This setup defines the memory layout of descriptors and inlined + * driver data structures. 
+ * + * add_layout : base layout of descriptors with no inlined struct and no debugfs print + * - a version: Include separate array of pointers to driver struct + * add_x_layout: layout with alternative type to define extent (inlined driver struct) + * - p version: provide a printer function for debugfs + * - d version: default naming of printer function + * - r version: "cross reference" the bitmap of another map - no separate allocation + */ + +#define add_xpr_layout(type, ec, _desc, _e_type, _dfs_printer, _xref, _wr_acc, _drv_ref) { \ + .off = offsetof(struct psif_csr_be, base_addr_##type),\ + .name = #type,\ + .desc = _desc,\ + .e_cnt_ref = &sif_##ec##_size,\ + .entry_sz = sizeof(struct _e_type),\ + .ext = roundup_pow_of_two(sizeof(struct _e_type)),\ + .dfs_printer = _dfs_printer,\ + .xref = _xref, \ + .wr_access = _wr_acc, \ + .drv_ref = _drv_ref, \ +} + +#define add_xp_layout(type, ec, _desc, _e_type, _dfs_printer, _wr_acc) \ + add_xpr_layout(type, ec, _desc, _e_type, _dfs_printer, -1, _wr_acc, false) + +#define add_x_layout(type, ec, _desc, _e_type, _wr_acc) \ + add_xp_layout(type, ec, _desc, _e_type, NULL, _wr_acc) + +#define add_xd_layout(type, ec, _desc, _e_type, _wr_acc) \ + add_xp_layout(type, ec, _desc, _e_type, sif_dfs_print_##type, _wr_acc) + +#define add_xdr_layout(type, ec, _desc, _e_type, _xref, _wr_acc) \ + add_xpr_layout(type, ec, _desc, _e_type, sif_dfs_print_##type, _xref, _wr_acc, false) + +#define add_layout(type, ec, _desc, _wr_acc) \ + add_x_layout(type, ec, _desc, psif_##type, _wr_acc) + +#define add_a_layout(type, ec, _desc, _wr_acc) \ + add_xpr_layout(type, ec, _desc, psif_##type, sif_dfs_print_##type, -1, _wr_acc, true) + +#define add_r_layout(type, ec, _desc, _xref, _wr_acc) \ + add_xpr_layout(type, ec, _desc, sif_##type, NULL, _xref, _wr_acc, false) + +#define add_d_layout(type, ec, _desc, _wr_acc) \ + add_xp_layout(type, ec, _desc, psif_##type, sif_dfs_print_##type, _wr_acc) + +/* For use with eps req */ +#define add_e_req_layout(type, _suff) { \ + .off = 0, \ + .name = #type "_csr_req", \ + .desc = "EPS" #_suff " Request queue", \ + .e_cnt_ref = &sif_##type##_size, \ + .entry_sz = sizeof(struct psif_epsc_csr_req),\ + .ext = roundup_pow_of_two(sizeof(struct psif_epsc_csr_req)), \ + .dfs_printer = sif_dfs_print_##type, \ + .xref = -1, \ + .wr_access = false, \ + .drv_ref = false, \ +} + +/* For use with eps rsp */ +#define add_e_rsp_layout(type, _suff) { \ + .off = 0, \ + .name = #type "_csr_rsp", \ + .desc = "EPS" #_suff " Response queue", \ + .e_cnt_ref = &sif_##type##_size, \ + .entry_sz = sizeof(struct psif_epsc_csr_rsp),\ + .ext = roundup_pow_of_two(sizeof(struct psif_epsc_csr_rsp)), \ + .dfs_printer = NULL, \ + .xref = type##_csr_rsp, \ + .wr_access = true,\ + .drv_ref = false,\ +} + + +/* This array is indexed by the sif_tab_type enum + * NB! 
If you change anything here (including order) + * remember to update + * - enum sif_tab_type in sif_dev.h + * - define_funcs call list in sif_base.h + */ + +static struct sif_table_layout base_layout[] = { + add_e_req_layout(epsc, C), + add_e_rsp_layout(epsc, C), + add_a_layout(key, mr, "Key validation", false), + add_xd_layout(qp, qp, "QP descriptor", sif_qp, true), + add_layout(rqsp, rqsp, "RQ scratch pad", true), + add_layout(atsp, atsp, "Atomic replay data", true), + add_xd_layout(ah, ah, "Address handle", sif_ah, false), + add_xd_layout(cq_hw, cq, "Compl.desc (hw)", sif_cq, true), + add_r_layout(cq_sw, cq, "Compl.desc (sw)", cq_hw, false), + add_xd_layout(rq_hw, rq, "Recv.queue (hw)", sif_rq, true), + add_r_layout(rq_sw, rq, "Recv.queue (sw)", rq_hw, false), + add_xdr_layout(sq_hw, qp, "Send queue (hw)", sif_sq, qp, true), + add_r_layout(sq_sw, qp, "Send queue (sw)", qp, false), + { + /* Special handling of the completion block's + * special send queue address map - see #944 + */ + .off = offsetof(struct psif_csr_be, base_addr_sq_cmpl), + .name = "sq_cmpl", + .desc = "cq: SQ addr.map", + .e_cnt_ref = &sif_qp_size, + .entry_sz = 0, /* Calculated later */ + .ext = 0, /* Calculated later */ + .dfs_printer = sif_dfs_print_sq_cmpl, + .xref = qp, /* Reference QP to have flat setup (used by dfs only) */ + .wr_access = false, + .drv_ref = false, + }, + add_layout(sq_ring, sq_ring, "SQS Ring buffer", true), + add_layout(sq_tvl, sq_tvl, "SQS Resp.queue TVL", true), + add_layout(sq_rspq, sq_rspq, "SQS Resp.queue", true), + { + /* Special handling of collect buffer entries */ + .off = 0, + .name = "bw_cb", + .desc = "High bandwith collect buffer", + .e_cnt_ref = &dummy_bw_cb_size, + .entry_sz = sizeof(struct psif_cb), + .ext = 4096, + .dfs_printer = NULL, + .xref = -1, + .wr_access = false, + .drv_ref = false, + }, + { + /* Special handling of collect buffer entries */ + .off = 0, + .name = "lat_cb", + .desc = "Low latency collect buffer", + .e_cnt_ref = &dummy_lat_cb_size, + .entry_sz = sizeof(struct psif_cb), + .ext = 4096, + .dfs_printer = NULL, + .xref = -1, + .wr_access = false, + .drv_ref = false, + }, + add_e_req_layout(epsa0, A-0), + add_e_rsp_layout(epsa0, A-0), + add_e_req_layout(epsa1, A-1), + add_e_rsp_layout(epsa1, A-1), + add_e_req_layout(epsa2, A-2), + add_e_rsp_layout(epsa2, A-2), + add_e_req_layout(epsa3, A-3), + add_e_rsp_layout(epsa3, A-3) +}; + + +const char *sif_table_name(enum sif_tab_type type) +{ + return base_layout[type].name; +} + + +static bool is_eps_req(enum sif_tab_type type) +{ + switch (type) { + case epsc_csr_req: + case epsa0_csr_req: + case epsa1_csr_req: + case epsa2_csr_req: + case epsa3_csr_req: + return true; + default: + break; + } + return false; +} + + +static bool is_eps_rsp(enum sif_tab_type type) +{ + switch (type) { + case epsc_csr_rsp: + case epsa0_csr_rsp: + case epsa1_csr_rsp: + case epsa2_csr_rsp: + case epsa3_csr_rsp: + return true; + default: + break; + } + return false; +} + + +sif_dfs_printer sif_table_dfs_printer(enum sif_tab_type type) +{ + /* At this point we have one common implementation: */ + return base_layout[type].dfs_printer; +} + + +static enum sif_tab_type get_sw_type(enum sif_tab_type type) +{ + switch (type) { + case cq_hw: + return cq_sw; + case rq_hw: + return rq_sw; + case qp: + case sq_hw: + return sq_sw; + default: + break; + } + return (enum sif_tab_type)0; +} + +static enum sif_tab_type get_hw_type(enum sif_tab_type type) +{ + switch (type) { + case cq_sw: + return cq_hw; + case rq_sw: + return rq_hw; + case sq_sw: + 
return sq_hw; + default: + break; + } + return (enum sif_tab_type)0; +} + +static bool is_sw_type(enum sif_tab_type type) +{ + switch (type) { + case cq_sw: + case rq_sw: + case sq_sw: + return true; + default: + break; + } + return false; +} + + +/* The user mapped types we need to adjust extent for + * based on min_extent + * qp is exempt from this list as it is not mapped to + * user space although part of two-level alloc: + */ +static bool is_user_mapped_type(enum sif_tab_type type) +{ + switch (type) { + case cq_sw: + case rq_sw: + case sq_sw: + case cq_hw: + case rq_hw: + case sq_hw: + return true; + default: + break; + } + return false; +} + + +static int init_blocks(struct sif_dev *sdev, enum sif_tab_type type) +{ + struct sif_table *tp = &sdev->ba[type]; + enum sif_tab_type sw_type; + size_t sw_eb; /* sw type's required minimal entries per block */ + + if (is_sw_type(type)) { + /* Pick up block settings from the hw type which has already been initialized */ + enum sif_tab_type hw_type = get_hw_type(type); + struct sif_table *tph = &sdev->ba[hw_type]; + + tp->entry_per_block = tph->entry_per_block; + tp->block_ext = tph->block_ext; + tp->block_cnt = tph->block_cnt; + tp->block = tph->block; + return 0; + } + + sw_type = get_sw_type(type); + /* Only the tables with a software type requires 2-level alloc */ + if (sw_type) + sw_eb = PAGE_SIZE / base_layout[sw_type].ext; + else + return 0; + + if (type == qp) { + /* Only relate to sq_hw and sq_sw + * (which hasn't been setup yet) for block size calc + */ + tp->entry_per_block = max(sw_eb, PAGE_SIZE / base_layout[sq_hw].ext); + } else { + /* blocks must fill a page of the smallest of the sw and hw pointer */ + tp->entry_per_block = max(sw_eb, PAGE_SIZE / tp->ext_sz); + } + tp->block_cnt = tp->entry_cnt / tp->entry_per_block; + + if (tp->entry_per_block > 1) { + /* Allocate an 8 byte aligned/end aligned room for the local bitmap + * right after the block struct: + */ + int bitmap_bytes = (((tp->entry_per_block + 7) >> 3) + 7) & ~7; + + sif_log(sdev, SIF_INFO, + "%s uses two-level alloc: entry_per_block %d, block_cnt %d bitmap_bytes %d", + sif_table_name(type), tp->entry_per_block, tp->block_cnt, + bitmap_bytes); + + tp->block_ext = sizeof(struct sif_table_block) + bitmap_bytes; + + if (unlikely(type == sq_hw)) /* Uses QP bitmap */ + tp->block = sdev->ba[qp].block; + else { + /* Zero-initialize the block struct - real initialize + * upon first allocation + */ + tp->block = kzalloc(tp->block_ext * tp->block_cnt, GFP_KERNEL); + } + if (!tp->block) + return -ENOMEM; + } + + if (tp->alloc_rr) { + size_t i; + /* Make sure we start at index 0 for readability + reserve QP 0 */ + for (i = 0; i < tp->block_cnt; i++) { + struct sif_table_block *b = sif_get_block(tp, i); + + b->last_used = tp->entry_per_block - 1; + } + } + return 0; +} + + +static void deinit_blocks(struct sif_dev *sdev, enum sif_tab_type type) +{ + struct sif_table *tp = &sdev->ba[type]; + + if (tp->block) { + /* SQ uses QP bitmap and sw types refs the corresponding hw type */ + if (likely(type != sq_hw && !is_sw_type(type))) + kfree(tp->block); + tp->block = NULL; + } +} + + +/* Set up the memory mapped table type given by @type + * with SIF based on information in the base_layout table. 
+ */ +int sif_table_init(struct sif_dev *sdev, enum sif_tab_type type) +{ + struct sif_table *tp = &sdev->ba[type]; + int extent; /* As log2 */ + int ret = 0; + struct psif_epsc_csr_req req; /* local epsc wr copy */ + struct psif_epsc_csr_rsp resp; + u64 alloc_sz; + u32 cfg_sz; + + memset(tp, 0, sizeof(*tp)); + tp->type = type; + tp->sdev = sdev; + cfg_sz = (u32)(*base_layout[type].e_cnt_ref); + if (cfg_sz & 0x80000000 || cfg_sz == 0) { + sif_log(sdev, SIF_INFO, "%s(%u): table size %#x out of bounds", + base_layout[type].desc, type, cfg_sz); + return -EINVAL; + } + + /* Only 2^n sized number of entries allowed: */ + tp->entry_cnt = roundup_pow_of_two(cfg_sz); + tp->ext_sz = base_layout[type].ext; + tp->table_sz = (size_t)tp->ext_sz * tp->entry_cnt; + + /* Set aside room for a sif_epsc_data struct at the end of + * the eps completion vectors so they can use the same mmu context in psif: + */ + alloc_sz = (is_eps_rsp_tab(type) ? + tp->table_sz + sizeof(struct sif_epsc_data) + sif_eps_log_size : + tp->table_sz); + + if (unlikely(type == sq_cmpl)) + sif_sq_cmpl_setup(tp); + else if (unlikely(is_cb_table(type))) + sif_cb_table_init(sdev, type); + else + sif_alloc_table(tp, alloc_sz); + + if (!tp->mem) { + sif_log(sdev, SIF_INFO, + "Failed to allocate 0x%lx bytes of memory for the %s table", + tp->table_sz, base_layout[type].desc); + return -ENOMEM; + } + + extent = order_base_2(tp->ext_sz); + + if (type == ah) /* Address handles can be allocated from intr.context */ + tp->from_interrupt = true; + + /* Allocate descriptors in a round robin fashion */ + tp->alloc_rr = is_cb_table(type) ? + sif_feature(alloc_cb_round_robin) : !sif_feature(disable_alloc_round_robin); + + /* single level defaults - then check for 2-level setup.. */ + tp->block_cnt = tp->entry_cnt; + tp->entry_per_block = 1; + + /* Enable one or two-level allocation */ + if (!sif_feature(flat_alloc)) + ret = init_blocks(sdev, type); + + if (ret) + goto err_init_blocks; + + if (tp->alloc_rr) + tp->last_used = tp->block_cnt - 1; /* Next will be the first entry */ + + sif_log(sdev, SIF_INFO, "%s(%d): entry cnt %d, entry sz %d, ext sz %d, extent %d, [%s]", + base_layout[type].desc, type, tp->entry_cnt, base_layout[type].entry_sz, tp->ext_sz, + extent, (base_layout[type].wr_access ? "writable" : "readonly")); + sif_log(sdev, SIF_INIT, " - table sz 0x%lx %s sif_base 0x%llx csr off 0x%lx", + tp->table_sz, sif_mem_type_str(tp->mem->mem_type), + tp->sif_base, base_layout[type].off); + + /* If xref is set to something other than -1 it means + * this table is not being allocated from individually, and thus + * need no bitmap, but rather is implicitly allocated from the referenced + * table entry (which must be lower in enum value to ensure that it is + * already allocated!) + * Also a table that references another this way is not allowed to allocate + * any indices.. 
+ */ + if (base_layout[type].xref != -1) + tp->bitmap = sdev->ba[base_layout[type].xref].bitmap; + else if (sif_init_bitmap(tp) != 0) { + ret = -ENOMEM; + goto err_init_bitmap; + } + + spin_lock_init(&tp->lock); + + if (is_cb_table(type)) + return 0; /* No base addr setup for CBs */ + + /* Base address setup - inform the EPS */ + memset(&req, 0, sizeof(req)); + + if (is_eps_req(type)) { + /* Both req and rsp gets posted when rsp is set up */ + ret = 0; + } else if (is_eps_rsp(type)) { + /* req,rsp and eq setup taken care of here: */ + ret = sif_eps_init(sdev, type); + if (ret) + goto err_map_ctx; /* No context mapped in this case */ + } else { + req.opcode = EPSC_SET_BASEADDR; + req.u.base_addr.address = tp->sif_base; + req.u.base_addr.num_entries = tp->entry_cnt; + req.u.base_addr.extent_log2 = extent; + ret = sif_map_ctx(sdev, &tp->mmu_ctx, tp->mem, + tp->sif_base, tp->table_sz, + base_layout[type].wr_access); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to set up mmu context for %s", + base_layout[type].desc); + goto err_map_ctx; + } + req.addr = base_layout[type].off; + + /* Fill in the mmu context from sif_map_ctx before submitting to the EPSC */ + req.u.base_addr.mmu_context = tp->mmu_ctx.mctx; + + ret = sif_epsc_wr_poll(sdev, &req, &resp); + if (ret) + goto err_epsc_comm; + } + return 0; + + +err_epsc_comm: + sif_unmap_ctx(sdev, &tp->mmu_ctx); +err_map_ctx: + if (base_layout[type].xref == -1) + sif_free_bitmap(tp); +err_init_bitmap: + deinit_blocks(sdev, type); +err_init_blocks: + sif_free_table(tp); + tp->mem = NULL; + return ret; +} + +static void sif_table_deinit(struct sif_dev *sdev, enum sif_tab_type type) +{ + struct sif_table *tp = &sdev->ba[type]; + + if (tp->mem) { + if (is_eps_rsp(type)) + sif_eps_deinit(sdev, type); + sif_unmap_ctx(sdev, &tp->mmu_ctx); + if (base_layout[type].xref == -1) + sif_free_bitmap(tp); + deinit_blocks(sdev, type); + sif_free_table(tp); + tp->mem = NULL; + } +} + + +static void sif_base_deinit_partly(struct sif_dev *sdev, int level) +{ + int i; + + for (i = level - 1; i >= 0; i--) + sif_table_deinit(sdev, i); +} + + +int sif_base_init(struct sif_dev *sdev) +{ + /* Setting up base registers */ + int ret = 0; + int i; + + /* extent less than 8 bytes not supported by hw */ + if (sif_min_extent < 8) + sif_min_extent = 8; + else + sif_min_extent = roundup_pow_of_two(sif_min_extent); + + if (!sif_feature(flat_alloc) && sif_min_extent > 2048) { + sif_log(sdev, SIF_INFO, + "cap'ing min_extent to 2048 - largest supported with two -level alloc"); + sif_min_extent = 2048; + } + + /* Update sw table extents with min_extent: */ + for (i = 0; i < sif_tab_init_max; i++) + if (is_user_mapped_type(i) && base_layout[i].ext < sif_min_extent) + base_layout[i].ext = sif_min_extent; + + for (i = 0; i < sif_tab_init_max; i++) { + ret = sif_table_init(sdev, i); + /* Allow some base address setup calls to fail. 
+ * This should allow us to work around some cases very old firmware + * just to perform firmware flash upgrade: + */ + if (ret) { + sif_log(sdev, SIF_INFO, "table init failed for the \"%s\" table", + sif_table_name(i)); + if (i <= epsc_csr_rsp || i == qp || i == key) + goto bi_failed; + } + } + + /* We rely upon 0-initialized table structs for the EPS-A entries as well */ + for (i = sif_tab_init_max; i < sif_tab_max; i++) { + struct sif_table *tp = &sdev->ba[i]; + + memset(tp, 0, sizeof(*tp)); + } + + /* Init complete */ + return 0; + +bi_failed: + sif_base_deinit_partly(sdev, i); + return ret; +} + + +void sif_base_deinit(struct sif_dev *sdev) +{ + sif_base_deinit_partly(sdev, sif_tab_max); +} + + +/* Send a base addr request to a given EPSA with address information for @type */ +int sif_table_update(struct sif_dev *sdev, enum psif_mbox_type eps_num, + enum sif_tab_type type) +{ + int ret; + struct sif_table *tp = &sdev->ba[type]; + struct psif_epsc_csr_req req; /* local epsc wr copy */ + struct psif_epsc_csr_rsp resp; + int extent = order_base_2(tp->ext_sz); + + /* GVA2GPA not supported by EPSes in rev2: */ + if (PSIF_REVISION(sdev) <= 2 && tp->mem->mem_type != SIFMT_BYPASS) + return -EOPNOTSUPP; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_SET_BASEADDR; + req.u.base_addr.address = tp->sif_base; + req.u.base_addr.num_entries = tp->entry_cnt; + req.u.base_addr.extent_log2 = extent; + req.u.base_addr.mmu_context = tp->mmu_ctx.mctx; + req.addr = base_layout[type].off; /* This is the type of request */ + + ret = sif_eps_wr(sdev, eps_num, &req, &resp); + return ret; +} + + +/* Write an invalidate request to the pqp. + * using the given modes. Note that if @lcqe is set, wr_mode must be + * set to PCM_WAIT, to avoid the cqe from living beyond it's caller's scope! + */ +int sif_write_invalidate(struct sif_pqp *pqp, enum sif_tab_type type, int index, + struct sif_cqe *lcqe, enum wr_mode wr_mode, enum post_mode p_mode) +{ + struct psif_wr wr; + enum psif_wr_type inv_op; + int ncompleted; + u32 sq_entry_idx; + int pqp_sq_idx; + struct sif_sq *sq; + struct psif_cq_entry *cqe; + bool self_destruct; + struct sif_dev *sdev = to_sdev(pqp->qp->ibqp.device); + + self_destruct = (type == cq_hw) && (index == pqp->cq->index); + + /* Figure out if an invalidate request is necessary */ + inv_op = sif_invalidate_opcode(type); + BUG_ON(inv_op == -1); + BUG_ON(lcqe && wr_mode != PCM_WAIT); + if (inv_op == -1) + return -ENODEV; + + sif_log(sdev, SIF_PQP, "sending inv.req. 
type %s (0x%x) target queue index %d", + sif_table_name(type), inv_op, index); + + memset(&wr, 0, sizeof(struct psif_wr)); + /* For this table type we need to send an explicit + * invalidate work request + */ + wr.op = inv_op; + switch (inv_op) { + case PSIF_WR_INVALIDATE_RKEY: + case PSIF_WR_INVALIDATE_LKEY: + case PSIF_WR_INVALIDATE_BOTH_KEYS: + wr.details.su.key = index; + break; + case PSIF_WR_INVALIDATE_RQ: + wr.details.su.u2.rq_id = index; + break; + case PSIF_WR_INVALIDATE_XRCSRQ: + wr.details.su.u2.xrq_id = index; + break; + case PSIF_WR_INVALIDATE_CQ: + wr.details.su.u2.cq_id = index; + break; + case PSIF_WR_INVALIDATE_SGL_CACHE: + wr.details.su.u2.target_qp = index; + break; + default: + /* Should never get here */ + return -ENODEV; + } + + if (self_destruct) { + /* A self destruct does not receive any completion + * instead we must poll for descriptor write-back + */ + int ret = 0; + int sts = sif_pqp_post_send(sdev, &wr, NULL); + + if (sts) { + sif_log(sdev, SIF_INFO, + "Posted self-destruct request on cq %d failed, sts %d", + index, sts); + } + + sif_log(sdev, SIF_INFO_V, "Posted self-destruct request on cq %d", index); + ret = poll_wait_for_cq_writeback(sdev, pqp->cq); + return ret; + } + + if (wr_mode != PCM_WAIT) { + int sts; + + wr.completion = (wr_mode == PCM_POST) ? 0 : 1; + sts = sif_pqp_write_send(pqp, &wr, NULL, p_mode); + if (sts != -EAGAIN) + return sts; + /* In the EAGAIN case, fall through to post a new request with completion + * to be able to use the quota beyond lowpri_lim + */ + } + + wr.completion = 1; + ncompleted = sif_pqp_poll_wr(sdev, &wr, lcqe); + + if (ncompleted < 0) { + sif_log(sdev, SIF_INFO, "pqp request failed with errno %d", ncompleted); + return ncompleted; + } + + /* Note that we operate on 3 different indices here! */ + cqe = &lcqe->cqe; + pqp_sq_idx = pqp->qp->qp_idx; + sq = get_sif_sq(sdev, pqp_sq_idx); + + /* sq_id.sq_seq_num contains the send queue sequence number for this completion + * and by this driver's definition the index into the send queue will + * be this number modulo the length of the send queue: + */ + sq_entry_idx = cqe->wc_id.sq_id.sq_seq_num & sq->mask; + + if (cqe->status != PSIF_WC_STATUS_SUCCESS) { + sif_log(sdev, SIF_INFO, "failed with status %s(%d) for cq_seq %d", + string_enum_psif_wc_status(cqe->status), cqe->status, cqe->seq_num); + sif_logs(SIF_INFO, write_struct_psif_cq_entry(NULL, 0, cqe)); + atomic_inc(&pqp->cq->error_cnt); + return -EIO; + } + + sif_log(sdev, SIF_PQP, "cq_seq %d sq_seq %d, sq_entry_idx %d", + cqe->seq_num, cqe->wc_id.sq_id.sq_seq_num, sq_entry_idx); + + return ncompleted < 0 ? 
ncompleted : 0; +} + +int sif_invalidate(struct sif_dev *sdev, enum sif_tab_type type, int index, + enum wr_mode wr_mode) +{ + struct sif_cqe *cqe = NULL; + DECLARE_SIF_CQE_POLL(sdev, lcqe); + struct sif_pqp *pqp = lcqe.pqp; + + if (unlikely(!pqp)) + return 0; /* Failed before any PQPs were set up */ + + if (wr_mode == PCM_WAIT) + cqe = &lcqe; + return sif_write_invalidate(pqp, type, index, cqe, wr_mode, PM_CB); +} + +#define table_lock(table, flags) \ + do {\ + if (unlikely(table->from_interrupt)) \ + spin_lock_irqsave(&table->lock, flags); \ + else \ + spin_lock(&table->lock); \ + } while (0) + + +#define table_unlock(table, flags) \ + do { \ + if (unlikely(table->from_interrupt)) \ + spin_unlock_irqrestore(&table->lock, flags); \ + else \ + spin_unlock(&table->lock); \ + } while (0) + + +/* 1st level bitmap index allocation scheme */ +static int sif_init_bitmap(struct sif_table *table) +{ + /* Allocate 1 bit for each block of entries */ + size_t bsz = max(sizeof(ulong), table->block_cnt / sizeof(ulong)); + + if (bsz > SIF_MAX_CONT) + table->bitmap = vzalloc(bsz); + else + table->bitmap = kzalloc(bsz, GFP_KERNEL); + if (!table->bitmap) { + sif_log0(SIF_INIT, + "Failed to allocate 0x%lx bytes of alloc.bitmap", bsz); + return -ENOMEM; + } + return 0; +} + +int sif_alloc_index(struct sif_dev *sdev, enum sif_tab_type type) +{ + int index; + int next = 0; + struct sif_table *table = &sdev->ba[type]; + unsigned long flags = 0; + + table_lock(table, flags); + if (table->alloc_rr) + next = (table->last_used + 1) & (table->block_cnt - 1); + + index = find_next_zero_bit(table->bitmap, table->block_cnt, next); + if (table->alloc_rr && index >= table->block_cnt) + index = find_next_zero_bit(table->bitmap, table->block_cnt, 0); + if (index < table->block_cnt) { + set_bit(index, table->bitmap); + if (table->alloc_rr) + table->last_used = index; + } else + index = -1; + table_unlock(table, flags); + sif_log(sdev, SIF_IDX, "%s[%d] (entries per block %d)", sif_table_name(type), index, + table->entry_per_block); + return index; +} + +void sif_free_index(struct sif_dev *sdev, enum sif_tab_type type, int index) +{ + struct sif_table *table = &sdev->ba[type]; + size_t ext_sz = table->ext_sz; + char *desc = sif_mem_kaddr(table->mem, index * ext_sz); + unsigned long flags = 0; + + if (!test_bit(index, table->bitmap)) { + /* This should not happen - inconsistency somewhere */ + sif_log(sdev, SIF_INFO, "XZW: index %d, table type %d/%d was not marked as used!", + index, type, sif_tab_init_max); + BUG(); + return; + } + + + if (table->entry_per_block == 1) { + /* Clean descriptor entry for reuse: + * note that we clean the whole extent here which + * includes all of sif_##type for inlined types: + */ + if (table->type == rq_hw) /* only zero out driver data structure */ + memset(desc + sizeof(struct psif_rq_hw), 0, ext_sz - sizeof(struct psif_rq_hw)); + else if (!is_cb_table(table->type) && table->type != qp && table->type != cq_hw) + memset(desc, 0, ext_sz); + } + + table_lock(table, flags); + clear_bit(index, table->bitmap); + table_unlock(table, flags); + sif_log(sdev, SIF_IDX, "%s[%d]", sif_table_name(type), index); +} + + +bool sif_index_used(struct sif_table *table, int index) +{ + if (unlikely(index < 0 || index >= table->entry_cnt)) + return NULL; + return test_bit(index, table->bitmap); +} + + +u32 sif_entries_used(struct sif_table *table) +{ + int bits_used = 0; + int i = 0; + unsigned long flags = 0; + + table_lock(table, flags); + if (table->entry_per_block == 1) + bits_used = 
bitmap_weight(table->bitmap, table->block_cnt); + else + for (;;) { + i = sif_next_used(table, i); + if (i < 0) + break; + bits_used++; + i++; + } + + table_unlock(table, flags); + return bits_used; +} + +static void sif_free_bitmap(struct sif_table *table) +{ + if (table->bitmap) { + size_t bsz = table->block_cnt / sizeof(ulong); + + if (bsz > SIF_MAX_CONT) + vfree(table->bitmap); + else + kfree(table->bitmap); + table->bitmap = NULL; + } +} + + +/* This function is used to traverse tables for the debugfs file system. + * @index is the descriptor index (not block index) so in case of + * two-level allocation (table->entry_per_block > 1) + * a two-level traversal is needed here: + */ +int sif_next_used(struct sif_table *table, int index) +{ + ulong *map = NULL; + int blk_idx, new_blk_idx, epb, old_idx; + struct sif_table_block *b; + + /* This is a queue - no bitmap */ + if (unlikely(table->type == epsc_csr_req)) + return sif_eps_next_used(table, index); + + /* TBD: Quick hack for now - the bitmap reference stuff does not work + * properly with two-level alloc: + */ + if (unlikely(table->type == sq_cmpl)) + table = &table->sdev->ba[qp]; + + map = table->bitmap; + if (!map) + return -1; + + if (table->entry_per_block == 1) { + index = find_next_bit(map, table->block_cnt, index); + if (index < table->block_cnt) + return index; + else + return -1; + } + old_idx = index; + + /* Two level allocation */ + epb = table->entry_per_block; + blk_idx = index / epb; +next_block: + index = index % epb; + new_blk_idx = find_next_bit(map, table->block_cnt, blk_idx); + if (new_blk_idx >= table->block_cnt) + return -1; + if (new_blk_idx != blk_idx) + index = 0; + + b = sif_get_block(table, new_blk_idx); + index = find_next_bit(b->bitmap, epb, index); + if (index >= epb) { + blk_idx++; + goto next_block; + } + index += b->offset; + return index; +} + +static int sif_alloc_sg_table(struct sif_table *tp, size_t size) +{ + struct sif_dev *sdev = tp->sdev; + size_t sg_size = size >> PMD_SHIFT; + enum sif_mem_type memtype = sif_feature(no_huge_pages) ? 
SIFMT_4K : SIFMT_2M; + + tp->mem = sif_mem_create(sdev, sg_size, size, memtype, + GFP_KERNEL, DMA_BIDIRECTIONAL); + if (!tp->mem) + return -ENOMEM; + return 0; +} + +int sif_alloc_table(struct sif_table *tp, size_t size) +{ + struct sif_dev *sdev = tp->sdev; + int ret; + + /* TBD: handle eqs in a better way */ + if (!tp->is_eq && base_layout[tp->type].drv_ref) { + size_t ref_tbl_sz = sizeof(void *) * tp->entry_cnt; + + tp->drv_ref = vzalloc(ref_tbl_sz); + if (!tp->drv_ref) { + sif_log(sdev, SIF_INFO, "unable to allocate %ld bytes of ref.table for table %s", + ref_tbl_sz, sif_table_name(tp->type)); + return -ENOMEM; + } + } + + /* The sqs ring buffer must be phys.cont to avoid PCIe deadlocks (#3477) + * and do not need to be zero initialized, its written by HW and read by HW + */ + if (size <= SIF_MAX_CONT || (tp->type == sq_ring && !tp->is_eq)) { + gfp_t flags = GFP_KERNEL; + + if (tp->type != sq_ring) + flags |= __GFP_ZERO; + + tp->mem = sif_mem_create_dmacont(sdev, size, flags, DMA_BIDIRECTIONAL); + if (!tp->mem) { + ret = -ENOMEM; + goto t_alloc_failed; + } + tp->sif_base = sif_mem_dma(tp->mem, 0); + if (tp->type == sq_ring) { + /* Avoid deadlocks on PCIe (#3484) */ + tp->mmu_ctx.mctx.ro = 1; + tp->mmu_ctx.mctx.ns = 1; + + /* + * BZ #3618: Make sure no dirty cache lines + * exists, which might be flushed out and + * overwrite the ring-buffer, after it has + * been written by PSIF + */ +#ifdef CONFIG_X86 + clflush_cache_range(tp->mem->vmap_base, size); +#else + sif_log(sdev, SIF_INFO, "Warning: implement flush cache for this architecture"); +#endif + } + return 0; + } + + ret = sif_alloc_sg_table(tp, size); + if (ret) + goto t_alloc_failed; + + /* Use some easily identifiable (nonzero) high virtual address range on the sif side */ + tp->sif_base = tp->is_eq ? + SIF_BASE_ADDR_EQ_START(tp->index) : + SIF_BASE_ADDR_START(tp->type); + return 0; + +t_alloc_failed: + if (tp->drv_ref) { + vfree(tp->drv_ref); + tp->drv_ref = NULL; + } + return ret; +} + + +void sif_free_table(struct sif_table *tp) +{ + sif_mem_free(tp->mem); + tp->mem = NULL; + + if (tp->drv_ref) { + vfree(tp->drv_ref); + tp->drv_ref = NULL; + } +} diff --git a/drivers/infiniband/hw/sif/sif_base.h b/drivers/infiniband/hw/sif/sif_base.h new file mode 100644 index 0000000000000..df91a7e392c42 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_base.h @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_base.h: Basic hardware setup of SIF + */ + +#ifndef __SIF_BASE_H +#define __SIF_BASE_H +#include "sif_dev.h" +#include "sif_debug.h" +#include "sif_pd.h" +#include "sif_qp.h" +#include "sif_cq.h" +#include "sif_ah.h" +#include "sif_int_user.h" + +/* Establish contact with the EPS and initialize the base descriptor setup */ +int sif_base_init(struct sif_dev *sdev); + +void sif_base_deinit(struct sif_dev *sdev); + +int sif_alloc_index(struct sif_dev *sdev, enum sif_tab_type type); +void sif_free_index(struct sif_dev *sdev, enum sif_tab_type type, int index); +u32 sif_entries_used(struct sif_table *table); + +bool sif_index_used(struct sif_table *table, int index); + +/* Find next used entry, starting at (and including) index + */ +int sif_next_used(struct sif_table *table, int index); + +int sif_invalidate(struct sif_dev *sdev, enum sif_tab_type type, int index, enum wr_mode mode); + +int sif_write_invalidate(struct sif_pqp *pqp, enum sif_tab_type type, int index, + struct sif_cqe *lcqe, enum wr_mode wr_mode, enum post_mode p_mode); + +#define sif_define_funcs(type) \ +static inline int sif_invalidate_##type(struct sif_dev *sdev, int index, \ + enum wr_mode mode)\ +{ \ + return sif_invalidate(sdev, type, index, mode); \ +} \ +static inline u32 sif_##type##_usage(struct sif_dev *sdev)\ +{\ + return sif_entries_used(&sdev->ba[type]); \ +} \ +static inline struct psif_##type *get_##type(struct sif_dev *sdev, int index)\ +{ \ + return (struct psif_##type *)(sif_mem_kaddr(sdev->ba[type].mem, \ + index * sdev->ba[type].ext_sz)); \ +} \ +static inline void sif_clear_##type(struct sif_dev *sdev, int index)\ +{ \ + struct psif_##type *p = get_##type(sdev, index);\ + memset(p, 0, sizeof(*p));\ +} + + +#define sif_def_pd_index_alloc(type)\ +static inline int sif_alloc_##type##_idx(struct sif_pd *pd)\ +{ \ + return sif_pd_alloc_index(pd, type); \ +} \ +static inline void sif_free_##type##_idx(struct sif_pd *pd, int index)\ +{ \ + sif_pd_free_index(pd, type, index); \ +} + +#define sif_def_global_index_alloc(type)\ +static inline int sif_alloc_##type##_idx(struct sif_dev *sdev)\ +{ \ + return sif_alloc_index(sdev, type); \ +} \ +static inline void sif_free_##type##_idx(struct sif_dev *sdev, int index)\ +{ \ + sif_free_index(sdev, type, index); \ +} + +const char *sif_table_name(enum sif_tab_type type); + +/* Exposed to sif_epsc only! */ + +/* Set up the table type @type and send a base addr request to the EPSC */ +int sif_table_init(struct sif_dev *sdev, enum sif_tab_type type); + +/* Send a base addr request to a given EPSA with address information for @type */ +int sif_table_update(struct sif_dev *sdev, enum psif_mbox_type eps_num, + enum sif_tab_type type); + +sif_dfs_printer sif_table_dfs_printer(enum sif_tab_type type); + +#define psif_bw_cb psif_cb __iomem +#define psif_lat_cb psif_cb __iomem + +sif_define_funcs(key) +sif_define_funcs(qp) +sif_define_funcs(cq_hw) +sif_define_funcs(cq_sw) +sif_define_funcs(ah) +sif_define_funcs(rq_sw) +sif_define_funcs(rq_hw) +sif_define_funcs(sq_sw) +sif_define_funcs(sq_hw) +sif_define_funcs(sq_rspq) +sif_define_funcs(bw_cb) +sif_define_funcs(lat_cb) + +/* These descriptors use 2-level alloc, + * 2nd level resource management is done by the protection domain. 
+ * The purpose of this is that elements that fits within the same page will always be + * owned by the same protection domain, to avoid that an ill-behaved application + * may accidentially modify the descriptors of an unrelated application. + * Changes in allocation levels here must be accompanied by changes in init_blocks + * in sif_base.c and type changes sdev <-> pd in the index allocation functions. + */ +sif_def_pd_index_alloc(qp) +sif_def_pd_index_alloc(rq_hw) +sif_def_pd_index_alloc(sq_hw) +sif_def_pd_index_alloc(cq_hw) + +/* These use global, single level alloc. + * CBs are unproblematic since they each occupy a full page. + * The rest is only used from kernel space + */ + +sif_def_global_index_alloc(key) +sif_def_global_index_alloc(ah) +sif_def_global_index_alloc(bw_cb) +sif_def_global_index_alloc(lat_cb) + +/* Lookup functions for sif structs inlined with hw descs */ +#define sif_define_lookup_funcs(type, hwtype)\ +static inline struct sif_##type *get_sif_##type(struct sif_dev *sdev, int idx)\ +{ \ + return container_of(get_##hwtype(sdev, idx),\ + struct sif_##type, d);\ +} \ +static inline struct sif_##type *safe_get_sif_##type(struct sif_dev *sdev, int idx)\ +{ \ + struct sif_table *tp = &sdev->ba[hwtype];\ + if (unlikely(idx < 0 || idx >= tp->entry_cnt)) \ + return NULL;\ + if (!sif_pd_index_used(tp, idx))\ + return NULL;\ + return get_sif_##type(sdev, idx);\ +} \ +extern uint sif_##type##_size + +sif_define_lookup_funcs(rq, rq_hw); +sif_define_lookup_funcs(rq_sw, rq_sw); +sif_define_lookup_funcs(sq, sq_hw); +sif_define_lookup_funcs(sq_sw, sq_sw); +sif_define_lookup_funcs(cq, cq_hw); +sif_define_lookup_funcs(cq_sw, cq_sw); +sif_define_lookup_funcs(qp, qp); +sif_define_lookup_funcs(ah, ah); + +/* Lookup functions for sif structs accessed via the + * "side-array" table->drv_ref + */ +#define sif_def_ref_lookup_funcs(type, hwtype) \ +static inline struct sif_##type *get_sif_##type(struct sif_dev *sdev, int idx) \ +{ \ + return ((struct sif_##type **)sdev->ba[hwtype].drv_ref)[idx]; \ +} \ +static inline void set_sif_##type(struct sif_dev *sdev, int idx, struct sif_##type *v) \ +{ \ + ((struct sif_##type **)sdev->ba[hwtype].drv_ref)[idx] = v; \ +} \ +static inline struct psif_##hwtype *safe_get_##hwtype(struct sif_dev *sdev, int idx)\ +{ \ + struct sif_table *tp = &sdev->ba[hwtype]; \ + if (!sif_index_used(tp, idx)) \ + return NULL;\ + return get_##hwtype(sdev, idx);\ +} \ +static inline struct sif_##type *safe_get_sif_##type(struct sif_dev *sdev, int idx)\ +{ \ + struct sif_table *tp = &sdev->ba[hwtype]; \ + if (!sif_index_used(tp, idx)) \ + return NULL;\ + return get_sif_##type(sdev, idx);\ +} \ +extern uint sif_##type##_size + +sif_def_ref_lookup_funcs(mr, key); + +static inline struct sif_table_block *sif_get_block(struct sif_table *tp, int index) +{ + return (struct sif_table_block *)(tp->block + tp->block_ext * index); +} + +extern uint sif_xrq_size; +extern uint sif_epsc_size; +extern uint sif_epsc_eq_headroom; +extern uint sif_tsu_eq_headroom; +extern uint sif_sq_ring_size; +extern uint sif_sq_tvl_size; +extern uint sif_min_extent; + +/* Multi-strategy allocation of table memory */ +int sif_alloc_table(struct sif_table *tp, size_t size); + +void sif_free_table(struct sif_table *tp); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_checksum.c b/drivers/infiniband/hw/sif/sif_checksum.c new file mode 100644 index 0000000000000..b64ef70797b99 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_checksum.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its 
affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_checksum.c: Utilities for SIF specific 32 bit checksums + * + */ +#include +#include +#include "sif_checksum.h" +#include + +/* + * 32 bit "IP/TCP"-like checksumming - modified from 16 to 32 bit + * from kernel/lib/checksum.c: + */ + +static inline u32 from64to32(u64 x) +{ + /* add up 32-bit and 32-bit for 32+c bit */ + x = (x & 0xffffffff) + (x >> 32); + /* add up carry.. */ + x = (x & 0xffffffff) + (x >> 32); + return x; +} + + +static u64 do_csum32(const unsigned char *buff, int len) +{ + int unaligned; + u64 result = 0; + + if (len <= 0) + goto out; + unaligned = 3 & (unsigned long) buff; + if (1 & (unsigned long) buff) { +#ifdef __LITTLE_ENDIAN + result += (*buff << 24); +#else + result = *buff; +#endif + len--; + buff++; + } + if (len >= 2) { + if (2 & (unsigned long) buff) { +#ifdef __LITTLE_ENDIAN + result += (*(u32 *) buff) << 16; +#else + result += *(u32 *) buff; +#endif + len -= 2; + buff += 2; + } + if (len >= 4) { + if (4 & (unsigned long) buff) { + result += *(u32 *) buff; + len -= 4; + buff += 4; + } + if (len >= 8) { + const unsigned char *end = buff + ((unsigned int)len & ~7); + unsigned int carry = 0; + + do { + u64 w = *(u64 *) buff; + + buff += 8; + result += carry; + result += w; + carry = (w > result); + } while (buff < end); + result += carry; + result = (result & 0xffffffff) + (result >> 32); + } + if (len & 4) { + result += *(u32 *) buff; + len -= 4; + buff += 4; + } + } + if (len & 2) { +#ifdef __LITTLE_ENDIAN + result += (*(unsigned short *) buff) << 16; +#else + result += *(unsigned short *) buff; +#endif + buff += 2; + } + } + if (len & 1) +#ifdef __LITTLE_ENDIAN + result += *buff; +#else + result += (*buff << 24); +#endif + result = from64to32(result); + switch (unaligned) { + case 1: + result = ((result >> 8) & 0xffffff) | ((result & 0xff) << 24); + break; + case 2: + result = ((result >> 16) & 0xffff) | ((result & 0xffff) << 16); + break; + case 3: + result = ((result >> 24) & 0xff) | ((result & 0xffffff) << 8); + break; + default: + break; + } +out: + return result; +} + + +u64 csum32_partial(const void *buff, int len, u64 wsum) +{ + u64 sum = (__force u64)wsum; + u64 result = do_csum32(buff, len); + + /* add in old sum, and carry.. */ + result += sum; + if (sum > result) + result += 1; + return (__force u64)result; +} +EXPORT_SYMBOL(csum32_partial); diff --git a/drivers/infiniband/hw/sif/sif_checksum.h b/drivers/infiniband/hw/sif/sif_checksum.h new file mode 100644 index 0000000000000..aaa17a1a5536b --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_checksum.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_checksum.h: Utilities for SIF specific 32 bit checksums + */ +#ifndef _SIF_CHECKSUM_H +#define _SIF_CHECKSUM_H + +/* + * 32 bit "IP/TCP"-like checksumming - modified from 16 to 32 bit + * from kernel/lib/checksum.c: + */ + +u64 csum32_partial(const void *buff, int len, u64 wsum); + +/* + * Fold a partial checksum + */ +static inline u32 csum32_fold(u64 csum) +{ + u64 sum = (__force u64)csum; + + sum = (sum & 0xffffffff) + (sum >> 32); + sum = (sum & 0xffffffff) + (sum >> 32); + return (__force u32)~sum; +} + +#endif diff --git a/drivers/infiniband/hw/sif/sif_cq.c b/drivers/infiniband/hw/sif/sif_cq.c new file mode 100644 index 0000000000000..26502cef38236 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_cq.c @@ -0,0 +1,1010 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_cq.c: Implementation of completion queue logic for SIF + */ +#include +#include +#include +#include + +#include "sif_dev.h" +#include "psif_hw_data.h" +#include "psif_hw_setget.h" +#include "sif_defs.h" +#include "sif_base.h" +#include "sif_mmu.h" +#include "sif_ibcq.h" +#include "sif_cq.h" +#include "sif_hwi.h" +#include "sif_dma.h" +#include "sif_user.h" +#include "sif_qp.h" +#include "sif_pqp.h" +#include "sif_hwi.h" +#include "sif_ibqp.h" +#include +#include + +static inline int translate_wr_id( + uint64_t *wr_id, + struct sif_dev *sdev, + struct sif_cq *cq, + struct sif_sq *sq, + struct psif_cq_entry *cqe, + u32 sq_seq_num, int qpn) +{ + struct sif_sq_hdl *wh = get_sq_hdl(sq, sq_seq_num); + + if (unlikely(!wh)) { + sif_log(sdev, SIF_INFO, + "cqe 0x%x for cq %d refers sq(qp) %d (not initialized), sts %d opc 0x%x", + cqe->seq_num, cq->index, qpn, cqe->status, cqe->opcode); + return -EFAULT; + } + if (!unlikely(wh->used)) { + if (sq_seq_num == wh->sq_seq) + sif_log(sdev, SIF_INFO, + "dupl cqe 0x%x for cq %d: got sq_seq 0x%x, last exp.0x%x, sts %d opc 0x%x", + cqe->seq_num, cq->index, sq_seq_num, wh->sq_seq, + cqe->status, cqe->opcode); + else + sif_log(sdev, SIF_INFO, + "unexp. cqe 0x%x for cq %d: got sq_seq 0x%x, last exp.0x%x, sts %d opc 0x%x", + cqe->seq_num, cq->index, sq_seq_num, wh->sq_seq, + cqe->status, cqe->opcode); + return -EFAULT; + } + if (unlikely(wh->sq_seq != sq_seq_num)) { + sif_log(sdev, SIF_INFO, + "wrong cqe 0x%x for cq %d: got sq_seq 0x%x, expected 0x%x, sts %d opc 0x%x", + cqe->seq_num, cq->index, sq_seq_num, wh->sq_seq, cqe->status, cqe->opcode); + return -EFAULT; + } + *wr_id = wh->wr_id; + wh->used = false; + + return 0; +} + + +struct ib_cq *sif_create_cq(struct ib_device *ibdev, int entries, + int comp_vector, + struct ib_ucontext *context, + struct ib_udata *udata, + enum sif_proxy_type proxy) +{ + struct sif_cq *cq = NULL; + struct sif_dev *sdev = to_sdev(ibdev); + struct sif_ucontext *uc = to_sctx(context); + struct sif_pd *pd = context ? 
uc->pd : sdev->pd; + ulong user_flags = 0; + bool user_mode = udata != NULL; + + if (entries < 1) + return ERR_PTR(-EINVAL); + + if (udata) { + struct sif_create_cq_ext cmd; + int rv = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + + if (rv) + return ERR_PTR(rv); + user_flags = cmd.flags; + if (sif_vendor_enable(proxy_mode, user_flags)) + proxy = cmd.proxy; + if (sif_vendor_enable(SVF_kernel_mode, user_flags)) + user_mode = false; + if (uc->abi_version < 0x0302) /* TBD: Remove - bw comp */ + user_mode = !user_mode; + } + + cq = create_cq(pd, entries, comp_vector, proxy, user_mode); + if (IS_ERR(cq)) + return (struct ib_cq *)cq; + + if (udata) { + struct sif_create_cq_resp_ext resp; + int ret; + + memset(&resp, 0, sizeof(resp)); + resp.cq_idx = cq->index; + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) { + destroy_cq(cq); + return ERR_PTR(-EFAULT); + } + } + atomic_inc(&sdev->cq_count); + sif_log(sdev, SIF_CQ, "new cq at %p entries %d (used %d)%s", + cq, entries, atomic_read(&sdev->cq_count), + (user_mode ? " (user mode)" : "")); + return &cq->ibcq; +} + + +struct sif_cq *create_cq(struct sif_pd *pd, int entries, + int comp_vector, + enum sif_proxy_type proxy, + bool user_mode) +{ + struct sif_dev *sdev = to_sdev(pd->ibpd.device); + struct sif_cq_sw *cq_sw; + struct psif_cq_sw lcq_sw; + struct psif_cq_entry *cqe; + struct sif_cq *cq; + struct sif_cq *ecq; + u32 entries_log2; + u64 alloc_sz; + int ret; + int index = sif_alloc_cq_hw_idx(pd); + + if (index < 0) { + ecq = ERR_PTR(-ENOMEM); + goto err_alloc_index; + } + + cq = get_sif_cq(sdev, index); + /* Use entries field to determine if entry has been used before */ + if (cq->entries) { + ret = poll_wait_for_cq_writeback(sdev, cq); + if (ret) + return ERR_PTR(ret); + } + + memset(cq, 0, sizeof(*cq)); + cq->pd = pd; + cq->index = index; + + cq_sw = get_sif_cq_sw(sdev, index); + cq_sw->next_seq = 0; + cq_sw->last_hw_seq = 0; + + /* Make sure we never fill the CQ completely on rev 1-3 - Bug #3657 */ + if (PSIF_REVISION(sdev) <= 3) + entries++; + + cq->entries = roundup_pow_of_two(entries); + cq->ibcq.cqe = cq->entries; + entries_log2 = order_base_2(cq->entries); + + /* Adjust available cqes on rev 1-3 - Bug #3657 */ + if (PSIF_REVISION(sdev) <= 3) + cq->ibcq.cqe--; + + /* See #2965: 5 bit size_log2 field in cq desc + * but counter is 32 bit. For simplicity to distinguish full from empty + * SIF can allow allocation of up to 2^30 (size_log2 = 0x1e) entries. + * Use the largest value tested, which should be enough + * + * TBD: Should perhaps limit to some fraction of physical memory available? 
+ */ + if (entries_log2 > SIF_SW_MAX_CQE_LOG2) { + sif_log(sdev, SIF_INFO, + "requested %d entries -> %d but sif only supports %d", + entries, cq->entries, 1 << SIF_SW_MAX_CQE_LOG2); + return ERR_PTR(-ENFILE); + } + + cq->mask = cq->entries - 1; + cq->extent = sizeof(struct psif_cq_entry); + + alloc_sz = cq->entries * cq->extent; + + /* Only whole pages must be exposed to user space */ + if (user_mode && (alloc_sz & ~PAGE_MASK)) + alloc_sz = (alloc_sz + PAGE_SIZE) & PAGE_MASK; + cq->user_mode = user_mode; + + if (alloc_sz <= SIF_MAX_CONT) + cq->mem = sif_mem_create_dmacont(sdev, alloc_sz, GFP_KERNEL | __GFP_ZERO, DMA_BIDIRECTIONAL); + else + cq->mem = sif_mem_create(sdev, alloc_sz >> PMD_SHIFT, + alloc_sz, SIFMT_2M, GFP_KERNEL | __GFP_ZERO, DMA_BIDIRECTIONAL); + if (!cq->mem) { + sif_log(sdev, SIF_INFO, "Failed to allocate %d CQ entries", entries); + ecq = ERR_PTR(-ENOMEM); + goto err_cdt_invalid; + } + + sif_log(sdev, SIF_CQ, "CQ: hw %p sw %p, base_adr %p, alloc_sz 0x%llx", + cq, cq_sw, sif_mem_kaddr(cq->mem, 0), alloc_sz); + + /* Since we assume seq.0 as the first valid sequence number, + * we must assume that the first entry we poll against is invalid to + * start with: + */ + cqe = get_cq_entry(cq, 0); + set_psif_cq_entry__seq_num(cqe, (u32)-1); + cq->cq_hw.size_log2 = entries_log2; + + /* Prefetch cq_sw when queue is half full: */ + cq->cq_hw.prefetch_threshold_log2 = entries_log2 - 1; + + cq->cq_hw.valid = 1; + cq->cq_hw.base_addr = sif_mem_dma(cq->mem, 0); + cq->cq_hw.sequence_number = cq_sw->next_seq; + + if (proxy != SIFPX_OFF) { + /* This is a proxy CQ */ + cq->cq_hw.proxy_en = 1; + cq->cq_hw.eps_core = (enum psif_eps_a_core)(proxy - 1); + } + + /* Allocate mmu context */ + ret = sif_map_ctx(sdev, &cq->mmu_ctx, cq->mem, cq->cq_hw.base_addr, + alloc_sz, true); + if (ret) { + ecq = ERR_PTR(-ENOMEM); + goto err_map_ctx; + } + + /* Designate an EQ to this CQ: + * Note that the two first queues as seen by the driver in rev2 + * - index 0 and 1, is reserved for EPSC and async events respectively. + * The index here refers to the first "normal" eq, e.g. eq[2] in + * driver sense: + */ + cq->cq_hw.int_channel = (sif_check_valid_eq_channel(sdev, comp_vector)) ? + comp_vector : sif_get_eq_channel(sdev, cq); + cq->eq_idx = cq->cq_hw.int_channel + 2; + + cq->next_logtime = jiffies; + init_completion(&cq->cleanup_ok); + cq->cq_hw.mmu_cntx = cq->mmu_ctx.mctx; + + copy_conv_to_hw(&cq->d, &cq->cq_hw, sizeof(cq->cq_hw)); + + /* Initialize sw part of descriptor */ + memset(&lcq_sw, 0, sizeof(lcq_sw)); + lcq_sw.head_indx = cq_sw->next_seq; + copy_conv_to_hw(&cq_sw->d, &lcq_sw, sizeof(lcq_sw)); + + spin_lock_init(&cq->lock); + + wmb(); + + /* to sync with event handling. + * NB! 
Must be the final operation here as there may events + * pending that only handles either a fully valid CQ or refcnt == 0 + */ + atomic_set(&cq->refcnt, 1); + + sif_log(sdev, SIF_CQ, "Exit: success cq %p index %d", cq, + cq->index); + return cq; + +err_map_ctx: + sif_mem_free(cq->mem); +err_cdt_invalid: + sif_free_cq_hw_idx(pd, cq->index); +err_alloc_index: + return ecq; +} + +int sif_modify_cq(struct ib_cq *ibcq, u16 cq_count, u16 cq_period) +{ + struct sif_dev *sdev = to_sdev(ibcq->device); + + sif_log(sdev, SIF_CQ, "Not implemented"); + return -EOPNOTSUPP; +} + + +int sif_destroy_cq(struct ib_cq *ibcq) +{ + struct sif_cq *cq = to_scq(ibcq); + struct sif_dev *sdev = to_sdev(ibcq->device); + int ret = destroy_cq(cq); + + if (!ret) + atomic_dec(&sdev->cq_count); + return ret; +} + + +int destroy_cq(struct sif_cq *cq) +{ + struct sif_dev *sdev = to_sdev(cq->ibcq.device); + u32 index = cq->index; + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, index); + int ret = 0; + u32 miss_cnt = cq_sw->miss_cnt; + u32 miss_occ = cq_sw->miss_occ; + + BUG_ON(atomic_read(&cq->ibcq.usecnt)); + + if (cq_sw->miss_cnt) { + atomic_add(miss_cnt, &sdev->cq_miss_cnt); + atomic_add(miss_occ, &sdev->cq_miss_occ); + } + ret = sif_invalidate_cq_hw(sdev, index, PCM_WAIT); + if (ret) { + sif_log(sdev, SIF_INFO, + "Releasing index %d in dirty state - ret %d", index, ret); + return 0; + } + + ret = sif_release_cq(sdev, index); + + sif_log(sdev, SIF_CQ, "Exit index %d ret %d miss cnt/occ %d/%d", + index, ret, miss_cnt, miss_occ); + return ret; +} + + + +int sif_release_cq(struct sif_dev *sdev, int index) +{ + struct sif_cq *cq = get_sif_cq(sdev, index); + struct sif_pd *pd = cq->pd; + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, index); + + /* Wait for any in-progress event queue entry for this CQ to be finished */ + if (atomic_dec_and_test(&cq->refcnt)) + complete(&cq->cleanup_ok); + wait_for_completion(&cq->cleanup_ok); + + /* Make sure any completions on the cq TLB invalidate + * for priv.qp does arrive before the cq is destroyed.. + */ + sif_unmap_ctx(sdev, &cq->mmu_ctx); + sif_mem_free(cq->mem); + + /* Clear sw descriptor - hw descriptor is cleared by hw write-back + * We verify that the write-back has been received before making + * use of the cq again. 
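+ * (create_cq() performs that verification: a descriptor whose entries
+ * field is still non-zero has been used before, so it calls
+ * poll_wait_for_cq_writeback() before reinitializing the entry.)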
+ */ + memset(cq_sw, 0, sizeof(*cq_sw)); + + if (!sif_feature(disable_invalidate_cq)) + sif_free_cq_hw_idx(pd, index); + return 0; +} + + +int sif_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +{ + sif_logi(ibcq->device, SIF_CQ, "Not implemented"); + return -EOPNOTSUPP; +} + + +/* @cqe contains little endian local copy of the associated + * completion queue entry + */ +static int handle_send_wc(struct sif_dev *sdev, struct sif_cq *cq, + struct ib_wc *wc, struct psif_cq_entry *cqe, bool qp_is_destroyed) +{ + /* send queue descriptor aligned with qp */ + int sq_idx = cqe->qp; + int ret; + struct sif_sq *sq = get_sif_sq(sdev, sq_idx); + struct sif_sq_sw *sq_sw = get_sif_sq_sw(sdev, sq_idx); + + /* This is a full 32 bit seq.num */ + u32 sq_seq_num = cqe->wc_id.sq_id.sq_seq_num; + + if (qp_is_destroyed) { + wc->wr_id = cqe->wc_id.rq_id; + + /* No more work, when QP is gone */ + return 0; + } + + ret = translate_wr_id(&wc->wr_id, sdev, cq, sq, cqe, sq_seq_num, cqe->qp); + if (ret) + return ret; + + wmb(); + /* Update head_seq after we have marked entry as unused since + * head_seq is used by post_send in the queue full check: + */ + sq_sw->head_seq = sq_seq_num; + + sif_log(sdev, SIF_CQ, + "wr_id 0x%llx on qp/sq %d sq_seq_num %d", + wc->wr_id, cqe->qp, sq_seq_num); + return 0; +} + +/* @cqe contains a host endian local copy of the associated + * completion queue entry. + */ +static struct sif_rq *find_rq(struct sif_dev *sdev, struct sif_cq *cq, + struct psif_cq_entry *cqe) +{ + struct sif_qp *qp = get_sif_qp(sdev, cqe->qp); + + if (qp->type == PSIF_QP_TRANSPORT_XRC) + return cq->xsrq; + else + return get_sif_rq(sdev, qp->rq_idx); +} + +/* @cqe contains a host endian local copy of the associated + * completion queue entry + */ +static int handle_recv_wc(struct sif_dev *sdev, struct sif_cq *cq, struct ib_wc *wc, + struct psif_cq_entry *cqe, bool qp_is_destroyed) +{ + struct sif_rq *rq = find_rq(sdev, cq, cqe); + struct sif_rq_sw *rq_sw = get_sif_rq_sw(sdev, rq->index); + u32 rq_len; + + wc->wr_id = cqe->wc_id.rq_id; + + /* If no QP, no further work */ + if (qp_is_destroyed) + return 0; + + rq_len = atomic_dec_return(&rq_sw->length); + + /* WA #622: For Responder Class A & C error, QP should have been + * marked in ERROR, flush RQ for remaining posted entries. + * + * Note: PSIF doesn't generate FLUSH_ERR completions, we see + * them due to s/w WA #622, do not flush again. 
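+ * (Hence the IB_WC_WR_FLUSH_ERR exclusion below: flushed-in-error
+ * completions are already the product of that software flush and must
+ * not trigger another one.)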
+ */ + if ((wc->status != IB_WC_WR_FLUSH_ERR) && + (wc->status != IB_WC_SUCCESS)) { + struct sif_qp *qp = to_sqp(wc->qp); + + if (is_regular_qp(qp) && !rq->is_srq + && IB_QPS_ERR == get_qp_state(qp)) { + if (sif_flush_rq(sdev, rq, qp, rq_len)) + sif_log(sdev, SIF_INFO, + "failed to flush RQ %d", rq->index); + } + } + + sif_log(sdev, SIF_CQ, "wr_id 0x%llx queue len %d", wc->wr_id, rq_len); + return 0; +} + +static bool fatal_err(enum ib_qp_type type, struct ib_wc *wc) +{ + if (wc->opcode == IB_WC_SEND || + wc->opcode == IB_WC_RDMA_WRITE || + wc->opcode == IB_WC_RDMA_READ || + wc->opcode == IB_WC_COMP_SWAP || + wc->opcode == IB_WC_FETCH_ADD || + wc->opcode == IB_WC_RECV || + wc->opcode == IB_WC_RECV_RDMA_WITH_IMM) { + switch (type) { + case IB_QPT_UD: + return wc->status == IB_WC_LOC_QP_OP_ERR || + wc->status == IB_WC_LOC_PROT_ERR; + case IB_QPT_RC: + return wc->status == IB_WC_LOC_LEN_ERR || + wc->status == IB_WC_LOC_QP_OP_ERR || + wc->status == IB_WC_LOC_PROT_ERR || + wc->status == IB_WC_BAD_RESP_ERR || + wc->status == IB_WC_REM_INV_REQ_ERR || + wc->status == IB_WC_REM_ACCESS_ERR || + wc->status == IB_WC_REM_OP_ERR || + wc->status == IB_WC_RETRY_EXC_ERR || + wc->status == IB_WC_RNR_RETRY_EXC_ERR; + case IB_QPT_UC: + return wc->status == IB_WC_LOC_QP_OP_ERR; + default: + /* Any other supported QP transport? */ + return false; + } + } else if (wc->status == IB_WC_FATAL_ERR || + wc->status == IB_WC_REM_ABORT_ERR) { + return true; + } + return false; +} + +/* Handle a single completion queue entry at pos @head + */ +static int handle_wc(struct sif_dev *sdev, struct sif_cq *cq, + volatile struct psif_cq_entry *cqe_p, struct ib_wc *wc) +{ + int ret = 0; + struct psif_cq_entry lcqe; + struct sif_qp *qp; + int qpn; + bool qp_is_destroyed; + + mb(); + + /* Read into local copy in host memory and order */ + copy_conv_to_sw(&lcqe, cqe_p, sizeof(lcqe)); + + /* Completion status ok - store generic info + * in ib_wc + */ + qpn = lcqe.qp; + + /* For qp 0/1 decode actual qp index: */ + if (qpn < 2) { + /* pkey_index only valid for qp 1 */ + if (qpn == IB_QPT_GSI) + wc->pkey_index = lcqe.pkey_indx; + qpn |= (lcqe.port << 1); + lcqe.qp = qpn; + } + + qp = get_sif_qp(sdev, qpn); + + sif_log(sdev, SIF_CQ, "CQ %d: Received cq seqn %d for QP %d opcode %s status %s", + cq->index, lcqe.seq_num, qpn, + string_enum_psif_wc_opcode(lcqe.opcode), + string_enum_psif_wc_status(lcqe.status)); + + wc->qp = &qp->ibqp; + wc->status = sif2ib_wc_status(lcqe.status); + qp_is_destroyed = lcqe.opcode & SIF_WC_QP_DESTROYED; + lcqe.opcode &= ~SIF_WC_QP_DESTROYED; + wc->opcode = sif2ib_wc_opcode(lcqe.opcode); + wc->wc_flags = 0; + + if (unlikely(is_epsa_tunneling_qp(qp->ibqp.qp_type))) { + /* if this is EPSA tunneling QP, always return 0. */ + wc->vendor_err = lcqe.vendor_err; + wc->wr_id = lcqe.wc_id.rq_id; + return 0; + } + + if (wc->status != IB_WC_SUCCESS) { + /* + * IBTA: only wr_id, status, qp_num, and vendor_err are valid + * when status != SUCCESS. + * + * Magne 2015-08-25: opcode is also always valid (this + * is required in order to deliver wr_id correct for + * sends when status != SUCCESS) + */ + + /* WA #3850: generate LAST_WQE event on SRQ*/ + struct sif_rq *rq = get_sif_rq(sdev, qp->rq_idx); + + int log_level = + (wc->status == IB_WC_WR_FLUSH_ERR) ? 
SIF_WCE_V : SIF_WCE; + + + if (!qp_is_destroyed && is_regular_qp(qp) && rq->is_srq) { + if (fatal_err(qp->ibqp.qp_type, wc)) { + struct ib_event ibe = { + .device = &sdev->ib_dev, + .event = IB_EVENT_QP_LAST_WQE_REACHED, + .element.qp = &qp->ibqp + }; + + if (qp->ibqp.event_handler) + qp->ibqp.event_handler(&ibe, qp->ibqp.qp_context); + } + } + + sif_log(sdev, log_level, + "Err.compl on cq %d seq %d raw wr_id %lld raw stat %s(%d) sif op %s(0x%x) qp# %d vendor_err 0x%x %s", + cq->index, lcqe.seq_num, lcqe.wc_id.rq_id, + string_enum_psif_wc_status(lcqe.status)+15, lcqe.status, + string_enum_psif_wc_opcode(lcqe.opcode)+15, lcqe.opcode, + qpn, lcqe.vendor_err, string_enum_psif_tsu_error_types(lcqe.vendor_err)); + + sif_logs(SIF_DUMP, write_struct_psif_cq_entry(NULL, 0, &lcqe)); + atomic_inc(&cq->error_cnt); + } + + /* then handle different types */ + switch (lcqe.opcode) { + case PSIF_WC_OPCODE_LSO: + case PSIF_WC_OPCODE_SEND: + case PSIF_WC_OPCODE_RDMA_WR: + /* Do send completions pass immd data ? */ + /* Answer: Send completions do not report back immediate data */ + if (lcqe.with_imm) + wc->wc_flags |= IB_WC_WITH_IMM; + case PSIF_WC_OPCODE_RDMA_READ: + case PSIF_WC_OPCODE_CMP_SWAP: + case PSIF_WC_OPCODE_FETCH_ADD: + ret = handle_send_wc(sdev, cq, wc, &lcqe, qp_is_destroyed); + break; + case PSIF_WC_OPCODE_RECEIVE_SEND: + case PSIF_WC_OPCODE_RECEIVE_RDMA_WR_IMM: + /* A heuristic mechanism to determine the traffic pattern. */ + qp->traffic_patterns.mask = (qp->traffic_patterns.mask << 1) & + HEUR_RX_DIRECTION; + ret = handle_recv_wc(sdev, cq, wc, &lcqe, qp_is_destroyed); + if (lcqe.with_imm) { + wc->ex.imm_data = be32_to_cpu(lcqe.seq_num_imm.imm); + wc->wc_flags |= IB_WC_WITH_IMM; + } + break; + + case PSIF_WC_OPCODE_MASKED_CMP_SWAP: + case PSIF_WC_OPCODE_MASKED_FETCH_ADD: + case PSIF_WC_OPCODE_INVALIDATE_RKEY: + case PSIF_WC_OPCODE_INVALIDATE_LKEY: + case PSIF_WC_OPCODE_INVALIDATE_BOTH_KEYS: + case PSIF_WC_OPCODE_INVALIDATE_TLB: + case PSIF_WC_OPCODE_RESIZE_CQ: + case PSIF_WC_OPCODE_SET_SRQ_LIM: + case PSIF_WC_OPCODE_SET_XRCSRQ_LIM: + case PSIF_WC_OPCODE_CMPL_NOTIFY_RCVD: + case PSIF_WC_OPCODE_REARM_CMPL_EVENT: + case PSIF_WC_OPCODE_INVALIDATE_RQ: + case PSIF_WC_OPCODE_INVALIDATE_CQ: + case PSIF_WC_OPCODE_INVALIDATE_RB: + case PSIF_WC_OPCODE_INVALIDATE_XRCSRQ: + case PSIF_WC_OPCODE_INVALIDATE_SGL_CACHE: + default: + sif_log(sdev, SIF_INFO, + "Unhandled wc opcode %s", string_enum_psif_wc_opcode(lcqe.opcode)); + ret = -EINVAL; + break; + } + + /* Need sif2ib_flags() */ + if (lcqe.grh == 1) { + wc->wc_flags |= IB_WC_GRH; + sif_log(sdev, SIF_CQ, "GRH present in payload"); + } + + wc->vendor_err = lcqe.vendor_err; + wc->byte_len = lcqe.byte_len; + + /* + * Brian Manula 2-august-2015: src_qp is zero on connected QP transports. + * + * IBTA: Remote node address and QP. Returned only for Datagram services. + */ + wc->src_qp = lcqe.src_qp; + wc->slid = lcqe.slid; + wc->sl = lcqe.sl; + wc->dlid_path_bits = lcqe.dlid_path_bits; + wc->port_num = lcqe.port + 1; /* Sif port numbers start at 0 */ + + if (qp->flags & (SIF_QPF_IPOIB | SIF_QPF_EOIB)) { + bool do_l3_csum; + bool do_l4_csum; + bool csum_l3_ok; + bool csum_l4_ok; + bool csum_ok; + struct psif_offload_info *oinfo; + + oinfo = &lcqe.offload_wc_id.offload; + do_l3_csum = + oinfo->packet_classification_ipv4 || + oinfo->packet_classification_ipv6; + do_l4_csum = + oinfo->packet_classification_tcp || + oinfo->packet_classification_udp; + + csum_l3_ok = do_l3_csum ? oinfo->l3_checksum_ok : true; + csum_l4_ok = do_l4_csum ? 
oinfo->l4_checksum_ok : true; + csum_ok = csum_l3_ok & csum_l4_ok; + + qp->ipoib_rx_csum_l3_ok += !!(do_l3_csum && csum_l3_ok); + qp->ipoib_rx_csum_l3_err += !!(do_l3_csum && !csum_l3_ok); + + qp->ipoib_rx_csum_l4_ok += !!(do_l4_csum && csum_l4_ok); + qp->ipoib_rx_csum_l4_err += !!(do_l4_csum && !csum_l4_ok); + /* set flag; could be ignored by next level if disabled */ + wc->wc_flags |= (csum_ok) ? IB_WC_IP_CSUM_OK : 0; + if (!csum_ok) { + sif_log(sdev, + SIF_WCE, + "checksum not ok for ipv4/ipv6 eth2 %d ip4 %d ip6 %d frag %d options %d arp %d arp_reply %d exthdr %d tcp %d udp %d l3_ok %d l4_ok %d", + oinfo->packet_classification_eth2, + oinfo->packet_classification_ipv4, + oinfo->packet_classification_ipv6, + oinfo->packet_classification_ip_frag, + oinfo->packet_classification_ip_options, + oinfo->packet_classification_arp, + oinfo->packet_classification_arp_reply, + oinfo->packet_classification_ip6_unsupported_exthdr, + oinfo->packet_classification_tcp, + oinfo->packet_classification_udp, + oinfo->l3_checksum_ok, + oinfo->l4_checksum_ok + ); + } + } + return ret; +} + + +/* + * When a QP is taken down and it has send completions that are not + * polled, we need to walk through the send CQ and update the wr_id, + * before the QP's SQ handle are de-allocated. To signal that the + * wr_id is correct, we set the SIF_WC_QP_DESTROYED bit in the wc + * opcode. + * + * Further, for a receive completion, we normally need the QP in order + * to retrieve the RQ number. Again, the QP might not exist. Hence, we + * mark receive CQEs the same way. + * + * Negative return implies an error, errno is set. Zero or greater + * return indicates numbers of CQEs that were marked with + * SIF_WC_QP_DESTROYED. + */ + +int sif_fixup_cqes(struct sif_cq *cq, struct sif_sq *sq, struct sif_qp *qp) +{ + volatile struct psif_cq_entry *cqe; + struct sif_dev *sdev = to_sdev(cq->ibcq.device); + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index); + u32 seqno; + u32 polled_value; + int n = 0; + int ret = 0; + unsigned long flags = 0; + + + spin_lock_irqsave(&cq->lock, flags); + + for (seqno = cq_sw->next_seq;; ++seqno) { + struct psif_cq_entry lcqe; + uint64_t wr_id_host_order = 0; + + cqe = get_cq_entry(cq, seqno); + polled_value = get_psif_cq_entry__seq_num(cqe); + + /* More CQEs to check? */ + if (seqno != polled_value) + break; + + /* Fixup only for this QP */ + if (get_psif_cq_entry__qp(cqe) != qp->qp_idx) + continue; + + /* Read into local copy in host memory order */ + copy_conv_to_sw(&lcqe, cqe, sizeof(lcqe)); + + /* Receive completion? 
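+ * (the test below relies on PSIF receive completion opcodes having
+ * bit 7 (0x80) set, while send side opcodes do not)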
*/ + if (lcqe.opcode & 0x80) { + struct sif_post_mortem_qp_info_in_cqe *post_mortem_info = + (struct sif_post_mortem_qp_info_in_cqe *) cqe->reserved + 0; + + /* if a receive completion, record some info to be used when cqe is polled */ + post_mortem_info->was_srq = has_srq(sdev, qp); + post_mortem_info->srq_idx = qp->rq_idx; + post_mortem_info->qpn = qp->qp_idx; + } else { + /* If a send completion, handle the wr_id */ + ret = translate_wr_id(&wr_id_host_order, sdev, cq, sq, &lcqe, + lcqe.wc_id.sq_id.sq_seq_num, lcqe.qp); + if (ret) + goto err; + + set_psif_cq_entry__wc_id(cqe, wr_id_host_order); + } + + /* Tell sub-sequent poll_cq() that the wr_id is OK */ + set_psif_cq_entry__opcode(cqe, get_psif_cq_entry__opcode(cqe) | SIF_WC_QP_DESTROYED); + ++n; + } + + ret = n; + +err: + spin_unlock_irqrestore(&cq->lock, flags); + + + return ret; +} + + +/* standard poll function called from ib_poll_cq + * driver internal completion handling uses special logic in sif_pqp.c + * + * All types of QP ownership can use this function for peek operations + * [ via sif_peek_cq (with @wc = NULL) ] + */ +int sif_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct sif_cq *cq = to_scq(ibcq); + struct sif_dev *sdev = to_sdev(ibcq->device); + volatile struct psif_cq_entry *cqe; + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index); + + u32 seqno; + u32 polled_value = 0; + int npolled = 0; + unsigned long flags = 0; + int ret = 0; + /* TBD: Replace lock with atomic ops */ + spin_lock_irqsave(&cq->lock, flags); + + seqno = cq_sw->next_seq; + cqe = get_cq_entry(cq, seqno); + + sif_log_cq(cq, SIF_POLL, "cq %d (requested %d entries), next_seq %d %s", + cq->index, num_entries, cq_sw->next_seq, (wc ? "" : "(peek)")); + + while (npolled < num_entries) { + /* TBD - maybe should hide this as a function in sif_r3.c */ + if ((test_bit(CQ_POLLING_NOT_ALLOWED, &cq_sw->flags))) + break; + + polled_value = get_psif_cq_entry__seq_num(cqe); + + if ((test_bit(CQ_POLLING_IGNORED_SEQ, &cq_sw->flags)) && ~seqno == polled_value) { + seqno = ++cq_sw->next_seq; + clear_bit(CQ_POLLING_IGNORED_SEQ, &cq_sw->flags); + continue; + } + + if (seqno == polled_value) + npolled++; + else + break; + + if (likely(wc)) { + ret = handle_wc(sdev, cq, cqe, wc); + if (ret < 0) + goto handle_failed; + wc++; + seqno = ++cq_sw->next_seq; + } else /* peek_cq semantics */ + ++seqno; + + cqe = get_cq_entry(cq, seqno); + } + + if (likely(wc)) { + if (cq_length(cq, cq_sw->cached_head, seqno) >= cq->high_watermark) { + /* Update CQ software pointer */ + set_psif_cq_sw__head_indx(&cq_sw->d, seqno); + cq_sw->cached_head = seqno; + } + } + +handle_failed: + spin_unlock_irqrestore(&cq->lock, flags); + + if (npolled) + sif_log(sdev, SIF_CQ, "done - %d completions - seq_no of next entry: %d", + npolled, polled_value); + else + sif_log_cq(cq, SIF_POLL, "no completions polled - seq_no of next entry: %d", + polled_value); + return !ret ? 
npolled : ret; +} + + +int sif_peek_cq(struct ib_cq *ibcq, int wc_cnt) +{ + return sif_poll_cq(ibcq, wc_cnt, NULL); +} + + +int sif_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct sif_cq *cq = to_scq(ibcq); + struct sif_dev *sdev = to_sdev(ibcq->device); + struct sif_cq_sw *cq_sw = get_sif_cq_sw(sdev, cq->index); + struct psif_wr wr; + int ret; + DECLARE_SIF_CQE_WITH_SAME_EQ(sdev, lcqe, cq->eq_idx); + + sif_log(sdev, SIF_NCQ, "cq_idx %d, flags 0x%x", cq->index, flags); + + memset(&wr, 0, sizeof(struct psif_wr)); + + if (flags & IB_CQ_SOLICITED) + wr.se = 1; + + /* We should never miss events in psif so we have no need for a separate + * handling of IB_CQ_REPORT_MISSED_EVENTS - ignore it. + */ + + wr.op = cq->rcn_sent ? PSIF_WR_REARM_CMPL_EVENT : PSIF_WR_REQ_CMPL_NOTIFY; + wr.completion = 1; + wr.details.su.u2.cq_id = cq->index; + + ret = sif_pqp_poll_wr(sdev, &wr, &lcqe); + + cq->rcn_sent = ret >= 0; + + if (lcqe.cqe.status != PSIF_WC_STATUS_SUCCESS) { + if (ret >= 0) + ret = -EINVAL; + sif_log(sdev, SIF_INFO, + " cq %d: last_hw_seq %u next_seq %u failed with status %s", + cq->index, cq_sw->last_hw_seq, cq_sw->next_seq, + string_enum_psif_wc_status(lcqe.cqe.status)); + } else + sif_log(sdev, SIF_NCQ, "cq %d: last_hw_seq %u next_seq %u status %s", + cq->index, cq_sw->last_hw_seq, cq_sw->next_seq, + string_enum_psif_wc_status(lcqe.cqe.status)); + + if ((ret > 0) && (flags & IB_CQ_REPORT_MISSED_EVENTS)) { + /* peek to see if there is any outstanding completion. + * By checking for this flag, the application + * does not required to call poll_cq again to + * avoid race condition. + */ + return sif_peek_cq(ibcq, 1); + } + + return ret > 0 ? 0 : ret; +} + + +int sif_req_ncomp_notif(struct ib_cq *ibcq, int wc_cnt) +{ + struct sif_dev *sdev = to_sdev(ibcq->device); + + sif_log(sdev, SIF_VERBS, "Not implemented"); + return -EOPNOTSUPP; +} + + +void sif_dfs_print_cq_hw(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + struct sif_cq *cq; + volatile struct psif_cq_hw *cq_hw_p; + volatile struct sif_cq_sw *cq_sw; + int qlen; + + if (unlikely(pos < 0)) { + seq_printf(s, "# Destroyed cq miss_cnt/occ %u/%u\n", + atomic_read(&sdev->cq_miss_cnt), + atomic_read(&sdev->cq_miss_occ)); + + seq_puts(s, "# Index actual_head cached_head hw_tail entries "); + seq_puts(s, "queue_len next_seq eq #events timeouts errors miss_cnt/occ\n"); + return; + } + + cq = get_sif_cq(sdev, pos); + cq_hw_p = &cq->d; + cq_sw = get_sif_cq_sw(sdev, cq->index); + + /* TBD: Must peek for new entries to report accurately, but it is unsafe + * unless we ref.cnt the cq + */ + qlen = 0; + + seq_printf(s, "%7llu %12u %12d %8u %8u %9u %8u %2u %8u %8u %8u %8u %4u", pos, + get_psif_cq_sw__head_indx(&cq_sw->d), cq_sw->cached_head, + get_psif_cq_hw__tail_indx(cq_hw_p), + cq->entries, qlen, cq_sw->next_seq, cq->eq_idx, atomic_read(&cq->event_cnt), + atomic_read(&cq->timeout_cnt), + atomic_read(&cq->error_cnt), + cq_sw->miss_cnt, cq_sw->miss_occ); + + if (get_psif_cq_hw__proxy_en(cq_hw_p)) + seq_printf(s, " [proxy to %s]", + string_enum_psif_eps_a_core(get_psif_cq_hw__eps_core(cq_hw_p))); + if (cq_sw->armed) + seq_puts(s, " [armed]\n"); + else + seq_puts(s, "\n"); +} + + +/* Poll wait for a cq descriptor to be written back in invalid state */ +int poll_wait_for_cq_writeback(struct sif_dev *sdev, struct sif_cq *cq) +{ + int ret = 0; + ulong timeout = jiffies + sdev->min_resp_ticks * 2; + u8 valid; + + while ((valid = get_psif_cq_hw__valid(&cq->d))) { + if (time_after(jiffies, timeout)) { + sif_log(sdev, 
SIF_INFO, + "timeout waiting for cq_hw write-back cq %d", cq->index); + atomic_inc(&cq->timeout_cnt); + return -ETIMEDOUT; + } + cpu_relax(); + } + sif_log(sdev, SIF_CQ, "exit - write-back observed on cq %d", cq->index); + return ret; +} diff --git a/drivers/infiniband/hw/sif/sif_cq.h b/drivers/infiniband/hw/sif/sif_cq.h new file mode 100644 index 0000000000000..402db2bd5b7f9 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_cq.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_cq.h: Internal interface to psif completion queue logic + */ + +#ifndef __SIF_CQ_H +#define __SIF_CQ_H +#include "psif_hw_data.h" +#include "sif_user.h" +#include "sif_mmu.h" + +struct sif_dev; +struct sif_cqe; +struct sif_compl; +struct sif_pd; +struct sif_qp; +struct sif_sq; + +struct sif_cq { + volatile struct psif_cq_hw d; /* Hardware descriptor */ + struct ib_cq ibcq ____cacheline_internodealigned_in_smp; + struct sif_pd *pd; /* Unlike the rest of ofed we tie a CQ to a PD */ + struct sif_mem *mem; /* Allocated queue memory */ + int index; + u32 entries; + u32 mask; /* entries - 1 for modulo using & */ + u32 extent; + atomic_t refcnt; /* refc.count on this object */ + struct completion cleanup_ok; /* Used to synchronize cleanup with event handling */ + u32 high_watermark; /* if < used entries (as seen by hw), update hw: head */ + struct psif_cq_hw cq_hw; /* Local copy of cq_hw, as initialized, in host endianness */ + struct sif_mmu_ctx mmu_ctx; + /* lock protects the below data structure and access/freeing of sq elems */ + spinlock_t lock ____cacheline_internodealigned_in_smp; + bool user_mode; /* Set if this is a CQ to be mapped to user space */ + bool pd_is_set; /* Whether or not this cq has a pd set in it's descriptor */ + bool rcn_sent; /* Set if ib_req_notify_cq() has been called on this cq */ + u8 eq_idx; /* Index of the event queue that gets completion events for this cq */ + atomic_t error_cnt; /* No. of error completions observed on this cq */ + atomic_t timeout_cnt; /* No. of completion timeouts observed on this cq */ + atomic_t event_cnt; /* No. of completion events observed for this cq (will wrap..) */ + u32 log_cnt; /* Number of suppressed log messages since last print */ + unsigned long next_logtime; /* timeout for when to print next message */ + struct sif_rq *xsrq; /* The XRC SRQ using this completion queue (see #3521) */ + struct sif_pqp *pqp; /* The PQP using this completion queue (for dfs reporting..) */ +}; + +static inline struct sif_cq *to_scq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct sif_cq, ibcq); +} + +/* Poll wait for a cq descriptor to be written back in invalid state */ +int poll_wait_for_cq_writeback(struct sif_dev *sdev, struct sif_cq *cq); + + +struct sif_cq *create_cq(struct sif_pd *pd, int cqe, + int comp_vector, + enum sif_proxy_type proxy, + bool user_mode); + + +/* internal poll/peek of completion queue: + * - Return value: 0 - @num_entries representing + * the number of ready completions on the queue. + * + * If @wc is set, @poll_cq processes entries and updates the local cq state. + * If @wc is NULL @poll_cq behaves as a peek, not modifying + * the local completion queue state. 
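+ *
+ * Illustrative use (a sketch only, not lifted from an actual caller):
+ *   ready = poll_cq(sdev, cq, max_entries, NULL);  - peek, no state change
+ *   if (ready > 0)
+ *           ready = poll_cq(sdev, cq, ready, cqe); - consume the entries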
+ * + * Note that @poll_cq does not modify any state shared with + * hardware except the head pointer + */ +int poll_cq(struct sif_dev *sdev, struct sif_cq *cq, int num_entries, + struct sif_cqe *cqe); + +int destroy_cq(struct sif_cq *cq); + + +/* Clean up resource usage associated with this cq + * If return value is -EIDRM it means that this cq was used with a privileged + * QP. In that case no more polls can be made at this point since the completion queue + * polled just self destructed.. + */ +int sif_release_cq(struct sif_dev *sdev, int index); + + +/* Printer for debugfs cq_hw file */ +void sif_dfs_print_cq_hw(struct seq_file *s, struct sif_dev *sdev, + loff_t pos); + +extern int sif_fixup_cqes(struct sif_cq *cq, struct sif_sq *sq, struct sif_qp *qp); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_debug.c b/drivers/infiniband/hw/sif/sif_debug.c new file mode 100644 index 0000000000000..763f6b4172635 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_debug.c @@ -0,0 +1,636 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_debug.c: Use of debugfs for dumping internal data structure info + */ + +#include +#include +#include +#include "sif_dev.h" +#include "sif_debug.h" +#include "sif_base.h" +#include "sif_query.h" +#include "sif_qp.h" +#include "sif_defs.h" + +/* A 'reference' element to identify each table type + */ +struct sif_dfs_ref { + struct sif_dev *sdev; + bool is_eq; + enum sif_tab_type type; + sif_dfs_printer dfs_print; +}; + + +/* Our private data within driver struct + */ +struct sif_dfs { + struct dentry *root; /* The root of the debugfs tree, if set up (pci id name) */ + struct dentry *root_link; /* A symlink from ib device name to pci id name */ + struct dentry *raw_qp; /* Ref to directory with raw qp info, if set up */ + struct sif_dfs_ref sd[sif_tab_init_max]; + struct sif_dfs_ref sd_eq; + struct sif_dfs_ref sd_irq_ch; + struct sif_dfs_ref sd_ipoffload; +}; + +/* A simple iterator */ + +struct sif_dfs_iter { + loff_t pos; /* Current "virtual" offset */ + bool started; /* If header has been printed */ +}; + + +static void *sif_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct sif_dfs_iter *it = (struct sif_dfs_iter *) v; + struct sif_dfs_ref *sd = (struct sif_dfs_ref *) s->private; + struct sif_table *tp = &sd->sdev->ba[sd->type]; + + ++(*pos); + *pos = sif_next_used(tp, *pos); + sif_log(sd->sdev, SIF_DFS, "%lld -> %lld", it->pos, *pos); + if (*pos < 0) { + kfree(it); + return NULL; + } + it->pos = *pos; + return it; +} + +static void *sif_seq_start(struct seq_file *s, loff_t *pos) +{ + struct sif_dfs_iter *it; + struct sif_dfs_ref *sd = (struct sif_dfs_ref *) s->private; + struct sif_table *tp = &sd->sdev->ba[sd->type]; + + sif_log(sd->sdev, SIF_DFS, " at %lld", *pos); + *pos = sif_next_used(tp, *pos); + if (*pos < 0) + return NULL; + it = kmalloc(sizeof(struct sif_dfs_iter), GFP_KERNEL); + if (!it) + return NULL; + it->pos = *pos; + it->started = false; + return it; +} + +static void sif_seq_stop(struct seq_file *s, void *v) +{ + struct sif_dfs_ref *sd = (struct sif_dfs_ref *) s->private; + + if (v) { + sif_log(sd->sdev, SIF_DFS, "sif_seq_stop at %p", v); + kfree(v); + } + sif_log(sd->sdev, SIF_DFS, " [at 
end]"); +} + +static int sif_seq_show(struct seq_file *s, void *v) +{ + struct sif_dfs_iter *it = (struct sif_dfs_iter *) v; + struct sif_dfs_ref *sd = (struct sif_dfs_ref *) s->private; + + sif_log(sd->sdev, SIF_DFS, "%lld", it->pos); + if (!it->pos || !it->started) { + seq_printf(s, "# %s state:\n", sif_table_name(sd->type)); + if (sd->dfs_print) + sd->dfs_print(s, sd->sdev, -1); + else + seq_puts(s, "# Index\tValues\n"); + it->started = true; + } + if (sd->dfs_print) + sd->dfs_print(s, sd->sdev, it->pos); + else + seq_printf(s, "%lld\n", it->pos); + return 0; +} + + +static const struct seq_operations seq_ops = { + .start = sif_seq_start, + .next = sif_seq_next, + .stop = sif_seq_stop, + .show = sif_seq_show +}; + + +/* Specific support for eq reporting which has slightly different logic: */ +static void *sif_eq_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct sif_dfs_iter *it = (struct sif_dfs_iter *) v; + struct sif_dfs_ref *sd = (struct sif_dfs_ref *) s->private; + struct sif_dev *sdev = sd->sdev; + u32 cnt = sdev->es[sdev->mbox_epsc].eqs.cnt; + + if (*pos > cnt - 2) + *pos = -1; + else + ++(*pos); + + sif_log(sdev, SIF_DFS, "%lld -> %lld", it->pos, *pos); + if (*pos < 0) { + kfree(it); + return NULL; + } + it->pos = *pos; + return it; +} + +static void *sif_eq_seq_start(struct seq_file *s, loff_t *pos) +{ + struct sif_dfs_iter *it; + struct sif_dfs_ref *sd = (struct sif_dfs_ref *) s->private; + struct sif_dev *sdev = sd->sdev; + u32 cnt = sdev->es[sdev->mbox_epsc].eqs.cnt; + + sif_log(sdev, SIF_DFS, " at %lld", *pos); + if (*pos > cnt - 2) { + *pos = -1; + return NULL; + } + it = kmalloc(sizeof(struct sif_dfs_iter), GFP_KERNEL); + if (!it) + return NULL; + it->pos = *pos; + it->started = false; + return it; +} + +static const struct seq_operations eq_seq_ops = { + .start = sif_eq_seq_start, + .next = sif_eq_seq_next, + .stop = sif_seq_stop, + .show = sif_seq_show +}; + +static int sif_seq_open(struct inode *inode, struct file *file) +{ + int ret; + struct sif_dfs_ref *sd = (struct sif_dfs_ref *)inode->i_private; + struct seq_file *seq; + + if (!try_module_get(THIS_MODULE)) + return -EIO; + + if (unlikely(sd->is_eq)) + ret = seq_open(file, &eq_seq_ops); + else + ret = seq_open(file, &seq_ops); + if (!ret) { + seq = file->private_data; + seq->private = inode->i_private; + } + return ret; +}; + +static int sif_seq_release(struct inode *inode, struct file *file) +{ + int stat = seq_release(inode, file); + + module_put(THIS_MODULE); + return stat; +} + + +static const struct file_operations table_fops = { + .owner = THIS_MODULE, + .open = sif_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = sif_seq_release +}; + +static ssize_t irq_ch_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + struct sif_dfs_ref *sd = (struct sif_dfs_ref *) seq->private; + struct sif_dev *sdev = sd->sdev; + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + u32 channels = es->eqs.cnt; + + struct sif_eq *eq = &es->eqs.eq[1]; + struct psif_epsc_csr_interrupt_channel *settings; + struct psif_epsc_csr_req req; /* local epsc wr copy */ + struct psif_epsc_csr_rsp resp; + + char buffer[256] = ""; /* make a writable copy of const buf*/ + char *str, *token, *param[2]; + int ret; + + if (!eps_version_ge(es, 0, 36)) + goto opcode_not_available; + + if (count >= sizeof(buffer)) + return -ENOSPC; + + ret = simple_write_to_buffer(buffer, sizeof(buffer), ppos, buf, count); + if (ret < 0) { + sif_log(sd->sdev, SIF_INFO, 
"Not able to read input parameters from userspace"); + return ret; + } + buffer[ret] = '\0'; + str = buffer; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_HOST_INT_CHANNEL_CTRL; + req.uf = 0; + settings = &req.u.int_channel; + + while ((token = strsep(&str, ";")) != NULL) { + param[0] = strsep(&token, "="); + if (param[0]) { + param[1] = strsep(&token, "="); + if (!param[1]) + continue; + } else { + continue; + } + + if (strcmp(param[0], "channel") == 0) { + u16 value; + + ret = kstrtou16(param[1], 10, &value); + if (ret == 0 && value > 0 && value < channels) { + settings->int_channel = value; + eq = &es->eqs.eq[value]; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid irq channel: %hu", + value); + goto sif_invalid_channel; + } + } else if (strcmp(param[0], "adaptive") == 0) { + u8 value; + + ret = kstrtou8(param[1], 10, &value); + if (ret == 0 && value == 0) { + settings->attributes.enable_adaptive = 1; + settings->enable_adaptive = 0; + } else if (ret == 0 && value > 0) { + settings->attributes.enable_adaptive = 1; + settings->enable_adaptive = 1; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_adaptive value: %hu", + value); + } + } else if (strcmp(param[0], "rx_scale") == 0) { + u16 value; + + ret = kstrtou16(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_rx_scale = 1; + settings->channel_rx_scale = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_rx_scale value: %hu", + value); + } + } else if (strcmp(param[0], "rate_low") == 0) { + u32 value; + + ret = kstrtou32(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_rate_low = 1; + settings->channel_rate_low = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_rate_low value: %u", + value); + } + } else if (strcmp(param[0], "rate_high") == 0) { + u32 value; + + ret = kstrtou32(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_rate_high = 1; + settings->channel_rate_high = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_rate_high value: %u", + value); + } + } else if (strcmp(param[0], "ausec") == 0) { + u16 value; + + ret = kstrtou16(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_ausec = 1; + settings->channel_ausec = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_ausec value: %hu", + value); + } + } else if (strcmp(param[0], "ausec_low") == 0) { + u16 value; + + ret = kstrtou16(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_ausec_low = 1; + settings->channel_ausec_low = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_ausec_low value: %hu", + value); + } + } else if (strcmp(param[0], "ausec_high") == 0) { + u16 value; + + ret = kstrtou16(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_ausec_high = 1; + settings->channel_ausec_high = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_ausec_high value: %hu", + value); + } + } else if (strcmp(param[0], "pusec") == 0) { + u16 value; + + ret = kstrtou16(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_pusec = 1; + settings->channel_pusec = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_pusec value: %hu", + value); + } + } else if (strcmp(param[0], "pusec_low") == 0) { + u16 value; + + ret = kstrtou16(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_pusec_low = 1; + settings->channel_pusec_low = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_pusec_low value: %hu", + value); + } + } else if 
(strcmp(param[0], "pusec_high") == 0) { + u16 value; + + ret = kstrtou16(param[1], 10, &value); + if (!ret) { + settings->attributes.channel_pusec_high = 1; + settings->channel_pusec_high = value; + } else { + sif_log(sd->sdev, SIF_INTR, "Invalid channel_pusec_high value: %hu", + value); + } + } else { + sif_log(sd->sdev, SIF_INTR, "Omitting invalid irq coalesce parameter %s", + param[0]); + } + } + + if (!settings->int_channel) { + sif_log(sd->sdev, SIF_INTR, "Missing irq channel"); + goto sif_invalid_channel; + } + + ret = sif_epsc_wr_poll(sd->sdev, &req, &resp); + if (ret) { + sif_log(sd->sdev, SIF_INFO, "Failed to configure the coalescing settings for irq channel %d", + settings->int_channel); + goto err_epsc_comm; + } + /* Update the driver device settings */ +#define UPDATE_DRIVER_INT_CTRL_SETTING(attr) { \ + if (settings->attributes.attr) \ + eq->irq_ch.attr = settings->attr; \ + } + UPDATE_DRIVER_INT_CTRL_SETTING(enable_adaptive); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_rx_scale); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_rate_low); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_rate_high); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_ausec); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_ausec_low); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_ausec_high); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_pusec); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_pusec_low); + UPDATE_DRIVER_INT_CTRL_SETTING(channel_pusec_high); + /* Update the irq_ch debug file*/ + sd->dfs_print(seq, sd->sdev, *ppos); + + return count; + +opcode_not_available: +sif_invalid_channel: + return -EINVAL; +err_epsc_comm: + return ret; +} + +static const struct file_operations table_fops_rw = { + .owner = THIS_MODULE, + .open = sif_seq_open, + .read = seq_read, + .write = irq_ch_write, + .llseek = seq_lseek, + .release = sif_seq_release +}; + + +/* Setup/teardown */ + +/* Called before sif_hw_init in main since needed by pqp setup */ +int sif_dfs_register(struct sif_dev *sdev) +{ + struct dentry *df; + struct sif_dfs_ref *sdr; + int i; + char name[100]; + + sprintf(name, "%s", dev_name(&sdev->pdev->dev)); + sdev->dfs = kzalloc(sizeof(struct sif_dfs), GFP_KERNEL); + if (sdev->dfs) + sdev->dfs->root = debugfs_create_dir(name, NULL); + if (!sdev->dfs || !sdev->dfs->root) { + sif_log(sdev, SIF_INFO, + "Unable to set up debugfs file system for %s", name); + goto sif_dfs_reg_failed; + } + + for (i = 0; i < sif_tab_init_max; i++) { + sdr = &sdev->dfs->sd[i]; + sdr->sdev = sdev; + sdr->is_eq = false; + sdr->type = i; + sdr->dfs_print = sif_table_dfs_printer(i); + df = debugfs_create_file(sif_table_name(i), S_IRUGO, sdev->dfs->root, + (void *)sdr, &table_fops); + if (!df) { + sif_log(sdev, SIF_INFO, "Unable to set up debugfs file %s", + sif_table_name(i)); + goto sif_dfs_reg_failed; + } + } + + /* Single file for the event queues */ + sdr = &sdev->dfs->sd_eq; + sdr->sdev = sdev; + sdr->is_eq = true; + sdr->dfs_print = sif_dfs_print_eq; + df = debugfs_create_file("eq", S_IRUGO, sdev->dfs->root, + (void *)sdr, &table_fops); + if (!df) { + sif_log(sdev, SIF_INFO, "Unable to set up debugfs file for event queues"); + return -ENOMEM; + } + /* Single file for the ipoffload qp-statistics */ + sdr = &sdev->dfs->sd_ipoffload; + sdr->sdev = sdev; + sdr->dfs_print = sif_dfs_print_ipoffload; + sdr->type = qp; + df = debugfs_create_file("ipoffload", S_IRUGO, sdev->dfs->root, + (void *)sdr, &table_fops); + if (!df) { + sif_log(sdev, SIF_INFO, "Unable to set up debugfs file for ipoffload qp stat"); + return -ENOMEM; + } + /* Single file for the int channel coalescing 
settings */ + sdr = &sdev->dfs->sd_irq_ch; + sdr->sdev = sdev; + sdr->is_eq = true; + sdr->dfs_print = sif_dfs_print_irq_ch; + df = debugfs_create_file("irq_ch", S_IWUSR | S_IRUGO, sdev->dfs->root, + (void *)sdr, &table_fops_rw); + if (!df) { + sif_log(sdev, SIF_INFO, + "Unable to set up debugfs file for interrupt channels coalescing settings"); + return -ENOMEM; + } + + /* Create a directory for raw qp dump info */ + sdev->dfs->raw_qp = debugfs_create_dir("raw_qp", sdev->dfs->root); + if (!sdev->dfs->raw_qp) { + sif_log(sdev, SIF_INFO, "Unable to set up debugfs directory for raw QP information"); + goto sif_dfs_reg_failed; + } + return 0; + +sif_dfs_reg_failed: + sif_dfs_unregister(sdev); + return -ENOMEM; +} + + +/* Symlink ib device name to debugfs root node - named by PCI id */ +void sif_dfs_link_to_ibdev(struct sif_dev *sdev) +{ + sdev->dfs->root_link = + debugfs_create_symlink(sdev->ib_dev.name, NULL, sdev->dfs->root->d_iname); + if (!sdev->dfs->root_link) + sif_log(sdev, SIF_INFO, "Failed to create link %s -> %s", + sdev->dfs->root->d_iname, sdev->ib_dev.name); +} + + +void sif_dfs_unregister(struct sif_dev *sdev) +{ + if (!sdev->dfs) + return; + debugfs_remove(sdev->dfs->root_link); + debugfs_remove_recursive(sdev->dfs->root); + kfree(sdev->dfs); + sdev->dfs = NULL; +} + + +/**** support for raw QP state dump */ + + +static int rqp_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EIO; + + file->private_data = inode->i_private; + return 0; +}; + + +static ssize_t rqp_read(struct file *file, char __user *buf, size_t sz, loff_t *off) +{ + struct sif_qp *qp = (struct sif_qp *)file->private_data; + struct psif_query_qp lqqp; + int ret; + size_t len = 0; + struct xchar xc; + size_t dump_size = 12000; /* enough space for allocating the qp dump*/ + char *dump; + + sif_log0(SIF_QP, "rqp_read idx %d, sz %ld offset 0x%llx", qp->qp_idx, sz, *off); + if (*off > 0) + return 0; + + dump = kmalloc(dump_size, GFP_KERNEL); + if (!dump) { + sif_log0(SIF_INFO, "Error allocating temp.storage for raw qp read"); + return -ENOMEM; + } + + memset(dump, 0, dump_size*sizeof(char)); + xc.buf = dump; + + ret = epsc_query_qp(qp, &lqqp); + if (ret) { + len = snprintf(xc.buf, sz, + "[query_qp failed with status %d - returning last cached state]\n", + ret); + xc.buf += len; + sz -= len; + } + /* TBD: Could cause buffer overflow in theory: see #2738 */ + write_struct_psif_query_qp(&xc, 0, &lqqp); + sprintf(xc.buf, "\n"); + len = simple_read_from_buffer(buf, sz, off, dump, strlen(dump)); + kfree(dump); + + return len; +} + + +static int rqp_release(struct inode *inode, struct file *file) +{ + module_put(THIS_MODULE); + return 0; +} + + +static const struct file_operations qp_fops = { + .owner = THIS_MODULE, + .open = rqp_open, + .read = rqp_read, + .release = rqp_release, +}; + + +/* TBD: Ref.cnt or other protection probably needed to protect agains "take down" while + * a query is in progress + */ +int sif_dfs_add_qp(struct sif_dev *sdev, struct sif_qp *qp) +{ + char tmp[20]; + + sprintf(tmp, "%d", qp->qp_idx); + qp->dfs_qp = debugfs_create_file(tmp, S_IRUGO, sdev->dfs->raw_qp, + (void *)qp, &qp_fops); + if (!qp->dfs_qp) + return -ENOMEM; + return 0; +} + + +void sif_dfs_remove_qp(struct sif_qp *qp) +{ + debugfs_remove(qp->dfs_qp); + qp->dfs_qp = NULL; +} diff --git a/drivers/infiniband/hw/sif/sif_debug.h b/drivers/infiniband/hw/sif/sif_debug.h new file mode 100644 index 0000000000000..b95ed7893c357 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_debug.h @@ -0,0 +1,36 
@@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_debug.h: Use of debugfs for dumping internal data structure info + */ + +#ifndef __SIF_DEBUG_H +#define __SIF_DEBUG_H + +struct sif_dev; + +/* Set up/tear down the debugfs structures */ +int sif_dfs_register(struct sif_dev *sdev); +void sif_dfs_unregister(struct sif_dev *sdev); + +/* Symlink to ib device name (to be called after ib_register_device */ +void sif_dfs_link_to_ibdev(struct sif_dev *sdev); + +int sif_dfs_add_qp(struct sif_dev *sdev, struct sif_qp *qp); +void sif_dfs_remove_qp(struct sif_qp *qp); + +/* A generic callback function for printing a table entry + * in a debug fs file: + */ +typedef void (*sif_dfs_printer)(struct seq_file *s, + struct sif_dev *, + loff_t pos); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_defs.c b/drivers/infiniband/hw/sif/sif_defs.c new file mode 100644 index 0000000000000..63a6ecd3d3ee0 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_defs.c @@ -0,0 +1,562 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_defs.c: IB-to-SIF Mapper. + */ +#include +#include +#include "sif_dev.h" +#include "sif_defs.h" +#include "psif_hw_setget.h" +#include "sif_qp.h" + +/* This is where we build and define kernel utilities for logging psif structures: */ +#define copy_convert copy_conv_to_sw +#define copy_convert_to_sw copy_conv_to_sw +#define copy_convert_to_hw copy_conv_to_hw +#define assert(x) BUG_ON(!(x)) +#include "psif_hw_print.c" + +enum psif_wr_type sif_invalidate_opcode(enum sif_tab_type type) +{ + switch (type) { + case rq_sw: + case rq_hw: + return PSIF_WR_INVALIDATE_RQ; + case cq_sw: + case cq_hw: + return PSIF_WR_INVALIDATE_CQ; + case key: + return PSIF_WR_INVALIDATE_BOTH_KEYS; + case qp: + return PSIF_WR_INVALIDATE_SGL_CACHE; + default: + /* This function is used to figure out if an invalidate + * request is needed so ending here is a normal case + */ + break; + } + return (enum psif_wr_type)-1; +} + + +enum psif_wr_type ib2sif_wr_op(enum ib_wr_opcode op, bool is_dr) +{ + switch (op) { + case IB_WR_RDMA_WRITE: + return PSIF_WR_RDMA_WR; + case IB_WR_RDMA_WRITE_WITH_IMM: + return PSIF_WR_RDMA_WR_IMM; + case IB_WR_SEND: + return !is_dr ? 
PSIF_WR_SEND : PSIF_WR_QP0_SEND_DR_LOOPBACK; + case IB_WR_SEND_WITH_IMM: + return PSIF_WR_SEND_IMM; + case IB_WR_RDMA_READ: + return PSIF_WR_RDMA_RD; + case IB_WR_ATOMIC_CMP_AND_SWP: + return PSIF_WR_CMP_SWAP; + case IB_WR_ATOMIC_FETCH_AND_ADD: + return PSIF_WR_FETCH_ADD; + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + return PSIF_WR_MASK_CMP_SWAP; + case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: + return PSIF_WR_MASK_FETCH_ADD; + case IB_WR_LSO: + return PSIF_WR_LSO; + case IB_WR_SEND_WITH_INV: + case IB_WR_RDMA_READ_WITH_INV: + case IB_WR_LOCAL_INV: + case IB_WR_FAST_REG_MR: + default: + break; + } + sif_log0(SIF_INFO, "Unsupported opcode %d", op); + return (enum psif_wr_type)-1; +} + +enum ib_wr_opcode sif2ib_wr_op(enum psif_wr_type op) +{ + switch (op) { + case PSIF_WR_SEND: + return IB_WR_SEND; + case PSIF_WR_SEND_IMM: + return IB_WR_SEND_WITH_IMM; + case PSIF_WR_RDMA_WR: + return IB_WR_RDMA_WRITE; + case PSIF_WR_RDMA_WR_IMM: + return IB_WR_RDMA_WRITE_WITH_IMM; + case PSIF_WR_RDMA_RD: + return IB_WR_RDMA_READ; + case PSIF_WR_CMP_SWAP: + return IB_WR_ATOMIC_CMP_AND_SWP; + case PSIF_WR_FETCH_ADD: + return IB_WR_ATOMIC_FETCH_AND_ADD; + case PSIF_WR_MASK_CMP_SWAP: + return IB_WR_MASKED_ATOMIC_CMP_AND_SWP; + case PSIF_WR_MASK_FETCH_ADD: + return IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; + case PSIF_WR_LSO: + return IB_WR_LSO; + case PSIF_WR_INVALIDATE_RKEY: + case PSIF_WR_INVALIDATE_LKEY: + case PSIF_WR_INVALIDATE_BOTH_KEYS: + case PSIF_WR_INVALIDATE_TLB: + case PSIF_WR_RESIZE_CQ: + case PSIF_WR_SET_SRQ_LIM: + case PSIF_WR_SET_XRCSRQ_LIM: + case PSIF_WR_INVALIDATE_RQ: + case PSIF_WR_INVALIDATE_CQ: + case PSIF_WR_INVALIDATE_XRCSRQ: + default: + break; + } + sif_log0(SIF_INFO, "Unable to convert opcode %d", op); + return (enum ib_wr_opcode)-1; +} + +/* TBD: These should map directly - must add test first */ +enum ib_wc_opcode sif2ib_wc_opcode(enum psif_wc_opcode opcode) +{ + switch (opcode) { + case PSIF_WC_OPCODE_SEND: + return IB_WC_SEND; + case PSIF_WC_OPCODE_RDMA_WR: + return IB_WC_RDMA_WRITE; + case PSIF_WC_OPCODE_RDMA_READ: + return IB_WC_RDMA_READ; + case PSIF_WC_OPCODE_CMP_SWAP: + return IB_WC_COMP_SWAP; + case PSIF_WC_OPCODE_FETCH_ADD: + return IB_WC_FETCH_ADD; + case PSIF_WC_OPCODE_LSO: + return IB_WC_LSO; + case PSIF_WC_OPCODE_MASKED_CMP_SWAP: + return IB_WC_MASKED_COMP_SWAP; + case PSIF_WC_OPCODE_MASKED_FETCH_ADD: + return IB_WC_MASKED_FETCH_ADD; + case PSIF_WC_OPCODE_RECEIVE_SEND: + return IB_WC_RECV; + case PSIF_WC_OPCODE_RECEIVE_RDMA_WR_IMM: + return IB_WC_RECV_RDMA_WITH_IMM; + case PSIF_WC_OPCODE_INVALIDATE_SGL_CACHE: + return PSIF_WR_INVALIDATE_SGL_CACHE; + case PSIF_WC_OPCODE_INVALIDATE_RKEY: + case PSIF_WC_OPCODE_INVALIDATE_LKEY: + case PSIF_WC_OPCODE_INVALIDATE_BOTH_KEYS: + case PSIF_WC_OPCODE_INVALIDATE_TLB: + case PSIF_WC_OPCODE_RESIZE_CQ: + case PSIF_WC_OPCODE_SET_SRQ_LIM: + case PSIF_WC_OPCODE_REQ_CMPL_NOTIFY: + case PSIF_WC_OPCODE_CMPL_NOTIFY_RCVD: + case PSIF_WC_OPCODE_REARM_CMPL_EVENT: + case PSIF_WC_OPCODE_SET_XRCSRQ_LIM: + case PSIF_WC_OPCODE_INVALIDATE_RQ: + case PSIF_WC_OPCODE_INVALIDATE_CQ: + case PSIF_WC_OPCODE_INVALIDATE_RB: + case PSIF_WC_OPCODE_INVALIDATE_XRCSRQ: + case PSIF_WC_OPCODE_GENERATE_COMPLETION: + case PSIF_WC_OPCODE_RECEIVE_CONDITIONAL_WR_IMM: + break; + } + return -1; +} + +enum psif_wc_opcode ib2sif_wc_opcode(enum ib_wc_opcode opcode) +{ + switch (opcode) { + case IB_WC_SEND: + return PSIF_WC_OPCODE_SEND; + case IB_WC_RDMA_WRITE: + return PSIF_WC_OPCODE_RDMA_WR; + case IB_WC_RDMA_READ: + return PSIF_WC_OPCODE_RDMA_READ; + case IB_WC_COMP_SWAP: + return 
PSIF_WC_OPCODE_CMP_SWAP; + case IB_WC_FETCH_ADD: + return PSIF_WC_OPCODE_FETCH_ADD; + case IB_WC_LSO: + return PSIF_WC_OPCODE_LSO; + case IB_WC_MASKED_COMP_SWAP: + return PSIF_WC_OPCODE_MASKED_CMP_SWAP; + case IB_WC_MASKED_FETCH_ADD: + return PSIF_WC_OPCODE_MASKED_FETCH_ADD; + case IB_WC_RECV: + return PSIF_WC_OPCODE_RECEIVE_SEND; + case IB_WC_RECV_RDMA_WITH_IMM: + return PSIF_WC_OPCODE_RECEIVE_RDMA_WR_IMM; + case IB_WC_BIND_MW: + case IB_WC_LOCAL_INV: + case IB_WC_FAST_REG_MR: + break; + } + sif_log0(SIF_INFO, "IB opcode %d not implemented", opcode); + return -1; +} + +enum ib_wc_status sif2ib_wc_status(enum psif_wc_status status) +{ + switch (status) { + case PSIF_WC_STATUS_SUCCESS: + return IB_WC_SUCCESS; + case PSIF_WC_STATUS_LOC_LEN_ERR: + return IB_WC_LOC_LEN_ERR; + case PSIF_WC_STATUS_LOC_QP_OP_ERR: + return IB_WC_LOC_QP_OP_ERR; + case PSIF_WC_STATUS_LOC_EEC_OP_ERR: + return IB_WC_LOC_EEC_OP_ERR; + case PSIF_WC_STATUS_LOC_PROT_ERR: + return IB_WC_LOC_PROT_ERR; + case PSIF_WC_STATUS_WR_FLUSH_ERR: + return IB_WC_WR_FLUSH_ERR; + case PSIF_WC_STATUS_MW_BIND_ERR: + return IB_WC_MW_BIND_ERR; + case PSIF_WC_STATUS_BAD_RESP_ERR: + return IB_WC_BAD_RESP_ERR; + case PSIF_WC_STATUS_LOC_ACCESS_ERR: + return IB_WC_LOC_ACCESS_ERR; + case PSIF_WC_STATUS_REM_INV_REQ_ERR: + return IB_WC_REM_INV_REQ_ERR; + case PSIF_WC_STATUS_REM_ACCESS_ERR: + return IB_WC_REM_ACCESS_ERR; + case PSIF_WC_STATUS_REM_OP_ERR: + return IB_WC_REM_OP_ERR; + case PSIF_WC_STATUS_RETRY_EXC_ERR: + return IB_WC_RETRY_EXC_ERR; + case PSIF_WC_STATUS_RNR_RETRY_EXC_ERR: + return IB_WC_RNR_RETRY_EXC_ERR; + case PSIF_WC_STATUS_LOC_RDD_VIOL_ERR: + return IB_WC_LOC_RDD_VIOL_ERR; + case PSIF_WC_STATUS_REM_INV_RD_REQ_ERR: + return IB_WC_REM_INV_RD_REQ_ERR; + case PSIF_WC_STATUS_REM_ABORT_ERR: + return IB_WC_REM_ABORT_ERR; + case PSIF_WC_STATUS_INV_EECN_ERR: + return IB_WC_INV_EECN_ERR; + case PSIF_WC_STATUS_INV_EEC_STATE_ERR: + return IB_WC_INV_EEC_STATE_ERR; + case PSIF_WC_STATUS_FATAL_ERR: + return IB_WC_FATAL_ERR; + case PSIF_WC_STATUS_RESP_TIMEOUT_ERR: + return IB_WC_RESP_TIMEOUT_ERR; + case PSIF_WC_STATUS_GENERAL_ERR: + return IB_WC_GENERAL_ERR; + case PSIF_WC_STATUS_FIELD_MAX: + return -1; + } + return -1; +} + +enum psif_wc_status ib2sif_wc_status(enum ib_wc_status status) +{ + switch (status) { + case IB_WC_SUCCESS: + return PSIF_WC_STATUS_LOC_LEN_ERR; + case IB_WC_LOC_LEN_ERR: + return PSIF_WC_STATUS_LOC_LEN_ERR; + case IB_WC_LOC_QP_OP_ERR: + return PSIF_WC_STATUS_LOC_QP_OP_ERR; + case IB_WC_LOC_EEC_OP_ERR: + return PSIF_WC_STATUS_LOC_EEC_OP_ERR; + case IB_WC_LOC_PROT_ERR: + return PSIF_WC_STATUS_LOC_PROT_ERR; + case IB_WC_WR_FLUSH_ERR: + return PSIF_WC_STATUS_WR_FLUSH_ERR; + case IB_WC_MW_BIND_ERR: + return PSIF_WC_STATUS_MW_BIND_ERR; + case IB_WC_BAD_RESP_ERR: + return PSIF_WC_STATUS_BAD_RESP_ERR; + case IB_WC_LOC_ACCESS_ERR: + return PSIF_WC_STATUS_LOC_ACCESS_ERR; + case IB_WC_REM_INV_REQ_ERR: + return PSIF_WC_STATUS_REM_INV_REQ_ERR; + case IB_WC_REM_ACCESS_ERR: + return PSIF_WC_STATUS_REM_ACCESS_ERR; + case IB_WC_REM_OP_ERR: + return PSIF_WC_STATUS_REM_OP_ERR; + case IB_WC_RETRY_EXC_ERR: + return PSIF_WC_STATUS_RETRY_EXC_ERR; + case IB_WC_RNR_RETRY_EXC_ERR: + return PSIF_WC_STATUS_RNR_RETRY_EXC_ERR; + case IB_WC_LOC_RDD_VIOL_ERR: + return PSIF_WC_STATUS_LOC_RDD_VIOL_ERR; + case IB_WC_REM_INV_RD_REQ_ERR: + return PSIF_WC_STATUS_REM_INV_RD_REQ_ERR; + case IB_WC_REM_ABORT_ERR: + return PSIF_WC_STATUS_REM_ABORT_ERR; + case IB_WC_INV_EECN_ERR: + return PSIF_WC_STATUS_INV_EECN_ERR; + case IB_WC_INV_EEC_STATE_ERR: + return 
PSIF_WC_STATUS_INV_EEC_STATE_ERR; + case IB_WC_FATAL_ERR: + return PSIF_WC_STATUS_FATAL_ERR; + case IB_WC_RESP_TIMEOUT_ERR: + return PSIF_WC_STATUS_RESP_TIMEOUT_ERR; + case IB_WC_GENERAL_ERR: + return PSIF_WC_STATUS_GENERAL_ERR; + } + return -1; +} + + +enum psif_qp_trans ib2sif_qp_type(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_RC: + return PSIF_QP_TRANSPORT_RC; + case IB_QPT_UC: + return PSIF_QP_TRANSPORT_UC; + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + return PSIF_QP_TRANSPORT_UD; + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + break; + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: + return PSIF_QP_TRANSPORT_XRC; + case IB_QPT_MAX: + case IB_QPT_RAW_PACKET: + /* IB_QPT_EPSA_TUNNELING = IB_QPT_RESERVED1; */ + break; + case IB_QPT_EPSA_TUNNELING: + return PSIF_QP_TRANSPORT_UD; + + case IB_QPT_RESERVED2: + case IB_QPT_RESERVED3: + case IB_QPT_RESERVED4: + case IB_QPT_RESERVED5: + case IB_QPT_RESERVED6: + case IB_QPT_RESERVED7: + case IB_QPT_RESERVED8: + case IB_QPT_RESERVED9: + case IB_QPT_RESERVED10: + break; + } + /* map to a value we don't support as the + * error status value for now.. + */ + return (enum psif_qp_trans)(-1); +} + + +enum psif_qp_state ib2sif_qp_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: + return PSIF_QP_STATE_RESET; + case IB_QPS_INIT: + return PSIF_QP_STATE_INIT; + case IB_QPS_RTR: + return PSIF_QP_STATE_RTR; + case IB_QPS_RTS: + return PSIF_QP_STATE_RTS; + case IB_QPS_ERR: + return PSIF_QP_STATE_ERROR; + case IB_QPS_SQE: + return PSIF_QP_STATE_SQERR; + case IB_QPS_SQD: /* TBD: Is this right? */ + break; + } + return PSIF_QP_STATE_INVALID; +} + + +enum ib_qp_state sif2ib_qp_state(enum psif_qp_state state) +{ + switch (state) { + case PSIF_QP_STATE_RESET: + return IB_QPS_RESET; + case PSIF_QP_STATE_INIT: + return IB_QPS_INIT; + case PSIF_QP_STATE_RTR: + return IB_QPS_RTR; + case PSIF_QP_STATE_RTS: + return IB_QPS_RTS; + case PSIF_QP_STATE_ERROR: + return IB_QPS_ERR; + case PSIF_QP_STATE_SQERR: + return IB_QPS_SQE; + case PSIF_QP_STATE_INVALID: + break; + } + return IB_QPS_ERR; +} + +enum psif_migration ib2sif_mig_state(enum ib_mig_state mstate) +{ + switch (mstate) { + case IB_MIG_MIGRATED: + return APM_MIGRATED; + case IB_MIG_REARM: + return APM_REARM; + case IB_MIG_ARMED: + return APM_ARMED; + } + return APM_OFF; +} + +enum ib_mig_state sif2ib_mig_state(enum psif_migration mstate) +{ + switch (mstate) { + case APM_MIGRATED: + return IB_MIG_MIGRATED; + case APM_REARM: + return IB_MIG_REARM; + case APM_ARMED: + return IB_MIG_ARMED; + default: + return (enum ib_mig_state)-1; + } +} + +enum psif_path_mtu ib2sif_path_mtu(enum ib_mtu mtu) +{ + switch (mtu) { + case IB_MTU_256: + return MTU_256B; + case IB_MTU_512: + return MTU_512B; + case IB_MTU_1024: + return MTU_1024B; + case IB_MTU_2048: + return MTU_2048B; + case IB_MTU_4096: + return MTU_4096B; + } + return MTU_INVALID; +} + +enum ib_mtu sif2ib_path_mtu(enum psif_path_mtu mtu) +{ + switch (mtu) { + case MTU_256B: + return IB_MTU_256; + case MTU_512B: + return IB_MTU_512; + case MTU_1024B: + return IB_MTU_1024; + case MTU_2048B: + return IB_MTU_2048; + case MTU_4096B: + return IB_MTU_4096; + default: + return (enum ib_mtu)0; + } +} + + +/* TBD: IB datastructure dump functions - remove/replace? 
*/ + +const char *ib_event2str(enum ib_event_type e) +{ + switch (e) { + case IB_EVENT_CQ_ERR: + return "IB_EVENT_CQ_ERR"; + case IB_EVENT_QP_FATAL: + return "IB_EVENT_QP_FATAL"; + case IB_EVENT_QP_REQ_ERR: + return "IB_EVENT_QP_REQ_ERR"; + case IB_EVENT_QP_ACCESS_ERR: + return "IB_EVENT_QP_ACCESS_ERR"; + case IB_EVENT_COMM_EST: + return "IB_EVENT_COMM_EST"; + case IB_EVENT_SQ_DRAINED: + return "IB_EVENT_SQ_DRAINED"; + case IB_EVENT_PATH_MIG: + return "IB_EVENT_PATH_MIG"; + case IB_EVENT_PATH_MIG_ERR: + return "IB_EVENT_PATH_MIG_ERR"; + case IB_EVENT_DEVICE_FATAL: + return "IB_EVENT_DEVICE_FATAL"; + case IB_EVENT_PORT_ACTIVE: + return "IB_EVENT_PORT_ACTIVE"; + case IB_EVENT_PORT_ERR: + return "IB_EVENT_PORT_ERR"; + case IB_EVENT_LID_CHANGE: + return "IB_EVENT_LID_CHANGE"; + case IB_EVENT_PKEY_CHANGE: + return "IB_EVENT_PKEY_CHANGE"; + case IB_EVENT_SM_CHANGE: + return "IB_EVENT_SM_CHANGE"; + case IB_EVENT_SRQ_ERR: + return "IB_EVENT_SRQ_ERR"; + case IB_EVENT_SRQ_LIMIT_REACHED: + return "IB_EVENT_SRQ_LIMIT_REACHED"; + case IB_EVENT_QP_LAST_WQE_REACHED: + return "IB_EVENT_QP_LAST_WQE_REACHED"; + case IB_EVENT_CLIENT_REREGISTER: + return "IB_EVENT_CLIENT_REREGISTER"; + case IB_EVENT_GID_CHANGE: + return "IB_EVENT_GID_CHANGE"; + default: + return "(Undefined event type)"; + } +} + +static inline enum kernel_ulp_type find_ulp_type_from_address(void *ptr) +{ + if (ptr) { +#if defined(__x86_64__) || defined(__sparc__) + char symbol_name[100]; + + snprintf(symbol_name, sizeof(symbol_name), "%ps", ptr); + if (strstr(symbol_name, "rds_")) + return RDS_ULP; + else if (strstr(symbol_name, "ipoib_cm_")) + return IPOIB_CM_ULP; + else if (strstr(symbol_name, "ipoib_")) + return IPOIB_ULP; +#endif + } + return OTHER_ULP; +} + +static inline enum kernel_ulp_type find_ulp_type_via_stack_unwind(const int level) +{ +/* __builtin_return_address argument must be a constant */ +#define STACK_UNWIND_CASE_LEVEL(n) \ + case (n): { \ + enum kernel_ulp_type type = OTHER_ULP; \ + void *ptr = __builtin_return_address(n);\ + type = find_ulp_type_from_address(ptr); \ + if (type != OTHER_ULP) \ + return type; \ + } + + switch (level) { + default: + STACK_UNWIND_CASE_LEVEL(7); + STACK_UNWIND_CASE_LEVEL(6); + STACK_UNWIND_CASE_LEVEL(5); + STACK_UNWIND_CASE_LEVEL(4); + STACK_UNWIND_CASE_LEVEL(3); + STACK_UNWIND_CASE_LEVEL(2); + STACK_UNWIND_CASE_LEVEL(1); + STACK_UNWIND_CASE_LEVEL(0); + } +#undef STACK_UNWIND_CASE_LEVEL + return OTHER_ULP; +} + +enum kernel_ulp_type sif_find_kernel_ulp_caller(void) +{ + enum kernel_ulp_type type = OTHER_ULP; + + if (!(__builtin_return_address(0))) { + /* if current function returns NULL, + * there is no reason to check further. + */ + goto error; + } + type = find_ulp_type_via_stack_unwind(STACK_UNWIND_LEVEL); +error: + return type; +} diff --git a/drivers/infiniband/hw/sif/sif_defs.h b/drivers/infiniband/hw/sif/sif_defs.h new file mode 100644 index 0000000000000..f0a06db3fe8b1 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_defs.h @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_defs.h: Div. 
utility definitions and auxiliary data structures + */ + +#ifndef __SIF_DEFS_H +#define __SIF_DEFS_H +#include "psif_hw_data.h" +#include "sif_mmu.h" +#include "sif_pd.h" +#include "sif_sq.h" +#include "sif_cq.h" +#include "sif_mem.h" +#include "sif_rq.h" +#include "sif_ireg.h" + +/* Needed by print funcs */ +#define xprintf(x, format, arg...) \ + do {\ + if (x) \ + (x)->buf += sprintf((x)->buf, format, ## arg); \ + else \ + printk(format, ## arg); \ + } while (0) + +struct xchar { + char *buf; +}; + +#define GREATER_16(a, b) ((s16)((s16)(a) - (s16)(b)) > 0) + + +#define XFILE struct xchar +#include "psif_hw_print.h" + +enum sif_tab_type; + +enum psif_wr_type sif_invalidate_opcode(enum sif_tab_type type); + +enum ib_wc_opcode sif2ib_wc_opcode(enum psif_wc_opcode opcode); +enum psif_wc_opcode ib2sif_wc_opcode(enum ib_wc_opcode opcode); + +enum ib_wc_status sif2ib_wc_status(enum psif_wc_status status); +enum psif_wc_status ib2sif_wc_status(enum ib_wc_status status); + +enum ib_wr_opcode sif2ib_wr_op(enum psif_wr_type op); +enum psif_wr_type ib2sif_wr_op(enum ib_wr_opcode op, bool is_dr); + +enum psif_qp_trans ib2sif_qp_type(enum ib_qp_type type); + +enum psif_qp_state ib2sif_qp_state(enum ib_qp_state state); +enum ib_qp_state sif2ib_qp_state(enum psif_qp_state state); + +enum ib_mig_state sif2ib_mig_state(enum psif_migration mstate); +enum psif_migration ib2sif_mig_state(enum ib_mig_state mstate); + +enum ib_mtu sif2ib_path_mtu(enum psif_path_mtu mtu); +enum psif_path_mtu ib2sif_path_mtu(enum ib_mtu mtu); +enum kernel_ulp_type sif_find_kernel_ulp_caller(void); + +/* TBD: IB datastructure dump functions - remove/replace? */ +const char *ib_event2str(enum ib_event_type e); + +static inline struct sif_pd *to_spd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct sif_pd, ibpd); +} + +static inline struct sif_shpd *to_sshpd(struct ib_shpd *ibshpd) +{ + return container_of(ibshpd, struct sif_shpd, ibshpd); +} + +/* Generic table handling functions: + * For xx in cq,rq,sq: + * + * Return element# @index in the xx queue referred by q: + * + * struct psif_xx_entry *get_xx_entry(struct sif_xx *q, int index); + * + * @ptr: Kernel virtual address offset into an entry in the xx queue @q + * Return value: The corresponding dma address. + * + * u64 xxe_to_dma(struct sif_xx *q, void* ptr); + + * TBD: Document the rest of the macro defined generic calls + */ + + +#define sif_define_entry_funcs(type, dtype) \ +static inline struct psif_##type##_entry \ + *get_##type##_entry(struct sif_##type *q, unsigned dtype seq)\ +{\ + return (struct psif_##type##_entry *) sif_mem_kaddr(q->mem, (seq & q->mask) * q->extent); \ +} \ +static inline u64 get_##type##e_dma(struct sif_##type *q, unsigned dtype seq) \ +{\ + return sif_mem_dma(q->mem, (seq & q->mask) * q->extent); \ +} \ +static inline int type##_is_empty(struct sif_##type *q, unsigned dtype head, unsigned dtype tail)\ +{\ + return (head == tail); \ +} \ +static inline dtype type##_length(struct sif_##type *q, dtype head, dtype tail)\ +{\ + return tail - head;\ +} \ + +sif_define_entry_funcs(cq, int) +sif_define_entry_funcs(rq, int) +sif_define_entry_funcs(sq, short) + +static inline void *sq_sgl_offset(struct sif_sq *sq, struct psif_sq_entry *sqe) +{ + return (u8 *)sqe + sq->sgl_offset; +} + +/* Define an architecture independent write combining flush: + * According to documentation, we should have been able to use + * mmiowb() but on x86_64 mmiowb does not contain the necessary sfence instruction. 
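+ * We therefore use an explicit fence on the architectures below to make
+ * sure write-combined stores have been drained to the device before any
+ * subsequent operation depends on them.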
+ */ + +#if defined(__i386__) +#define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") +#elif defined(__x86_64__) +#define wc_wmb() asm volatile("sfence" ::: "memory") +#elif defined(__ia64__) +#define wc_wmb() asm volatile("fwb" ::: "memory") +#else +#define wc_wmb() wmb() +#endif + +#endif diff --git a/drivers/infiniband/hw/sif/sif_dev.h b/drivers/infiniband/hw/sif/sif_dev.h new file mode 100644 index 0000000000000..2ea33817cffaf --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_dev.h @@ -0,0 +1,728 @@ +/* + * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_dev.h: Driver specific data structure definitions + */ + +#ifndef __SIF_DEV_H +#define __SIF_DEV_H + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "sif_idr.h" +#include "sif_fwa.h" +#include "sif_mmu.h" +#include "sif_pqp.h" +#include "sif_mem.h" + + +#include "sif_verbs.h" + +#define PCI_VENDOR_ID_SUN 0x108e +#define PCI_DEVICE_ID_PSIF_PF 0x2088 +#define PCI_DEVICE_ID_PSIF_VF 0x2089 +#define PCI_DEVICE_ID_SN1_PF 0x2188 +#define PCI_DEVICE_ID_SN1_VF 0x2189 +#define PCI_DEVICE_ID_SN2_PF 0x2198 +#define PCI_DEVICE_ID_SN2_VF 0x2199 +#define PCI_DEVICE_ID_SN3_PF 0x21A8 +#define PCI_DEVICE_ID_SN3_VF 0x21A9 + +#define PSIF_DEVICE(sdevice) ((sdevice)->pdev->device) +#define PSIF_SUBSYSTEM(sdevice) ((sdevice)->pdev->subsystem_device) + +#define IS_PSIF(sdevice) (PSIF_DEVICE(sdevice) == PCI_DEVICE_ID_PSIF_PF || \ + PSIF_DEVICE(sdevice) == PCI_DEVICE_ID_PSIF_VF) + +#define IS_SIBS(sdevice) (PSIF_DEVICE(sdevice) == PCI_DEVICE_ID_SN1_PF || \ + PSIF_DEVICE(sdevice) == PCI_DEVICE_ID_SN1_VF || \ + PSIF_DEVICE(sdevice) == PCI_DEVICE_ID_SN2_PF || \ + PSIF_DEVICE(sdevice) == PCI_DEVICE_ID_SN2_VF || \ + PSIF_DEVICE(sdevice) == PCI_DEVICE_ID_SN3_PF || \ + PSIF_DEVICE(sdevice) == PCI_DEVICE_ID_SN3_VF) + +/* Sonoma rev 1 most closely resembles PSIF rev 2 + * TBD: Need a more fine grained solution to feature/bug checking as we move on.. + */ +#define PSIF_REVISION(sdevice) \ + (IS_SIBS(sdevice) ? (sdevice)->pdev->revision + 1 : (sdevice)->pdev->revision) + + +/* Tested limit on #of CQEs - may support 2^30 but + * need machine with lots of memory to test it! + */ +#define SIF_SW_MAX_CQE_LOG2 0x18 /* = 16 MB - tested and should cover most use cases.. */ +#define SIF_SW_MAX_CQE (1 << SIF_SW_MAX_CQE_LOG2) + +#define SIF_SW_MAX_SQE_LOG2 0xf /* = 32K */ +#define SIF_SW_MAX_SQE (1 << SIF_SW_MAX_SQE_LOG2) + +/* Start offset of the special sq_cmpl mapping: + * each queue have at most 1 << SIF_SW_MAX_SQE_LOG2 entries + * Maximal extent of elements in a queue is 1 << 1f + * We then shift an additional bit to get to an unused upper bit + * to set just to avoid starting at vaddr 0: + */ +#define SIF_SQ_CMPL_SHIFT (SIF_SW_MAX_SQE_LOG2 + 0x1f + 1) +#define SIF_SQ_CMPL_START (1ULL << SIF_SQ_CMPL_SHIFT) + +/* Use easily identifiable high addresses to map descriptor arrays + * when GVA2GPA mapping is needed. These are virtual addresses + * that will only be used by sif. 
+ * For debug purposes, encode the sif_tab_type index in the address: + */ +#define SIF_BASE_ADDR_START(queue) \ + ((1ULL << (SIF_SQ_CMPL_SHIFT + 1)) + ((u64)(queue) << (SIF_SQ_CMPL_SHIFT - 6))) +#define SIF_BASE_ADDR_EQ_START(queue) \ + (SIF_BASE_ADDR_START(queue) + (1ULL << SIF_SQ_CMPL_SHIFT)) + +/* TBD: Software emulation of UD send SGEs - hardware is limited to 16 */ +#define SIF_SW_MAX_UD_SEND_SGE 32 +#define SIF_HW_MAX_SEND_SGE 16 + +/* This defines the defaults for implicit timers within the driver */ +#define SIF_HW_TIMEOUT 5000 + +/* BAR indices for SIF */ +#define SIF_MSIX_BAR 0 +#define SIF_CBU_BAR 2 +#define SIF_EPS_BAR 4 + +struct sif_mmu_ctx; /* See sif_mmu.h */ + +/* Hardware/firmware accessible tables in memory + * NB! If you change anything here (including order) + * remember to update + * - struct sif_table_layout in sif_base.c + * - define_funcs call list in sif_base.h + */ +#define sif_tab_init_max epsa0_csr_req + +enum sif_tab_type { + epsc_csr_req, /* EPSC request queue */ + epsc_csr_rsp, /* EPSC response queue (EPSC completions) */ + key, /* Key validation table */ + qp, /* QP descriptor table (hw owned) */ + rqsp, /* RQ scratch pad data */ + atsp, /* Atomic replay data */ + ah, /* Address handle table (sw owned) */ + cq_hw, /* Compl desc (read only for sw) */ + cq_sw, /* Compl desc (writable for sw) */ + rq_hw, /* Receive queue (read only for sw) */ + rq_sw, /* Receive queue (writable for sw) */ + sq_hw, /* Send queue (readable for sw) */ + sq_sw, /* Send queue (writable for sw)*/ + sq_cmpl, /* sqe cache for cq block (used by hw only) */ + sq_ring, /* Send queue scheduler ring buffer */ + sq_tvl, /* Send queue scheduler (TBD-what is this?) */ + sq_rspq, /* Send queue scheduler response queue */ + bw_cb, /* High bandwidth collect buffers (NB! Device addr space) */ + lat_cb, /* Low latency collect buffers (NB! 
Device addr space) */ + epsa0_csr_req, /* EPSA-n request queue */ + epsa0_csr_rsp, /* EPSA-n response queue (EPSC completions) */ + epsa1_csr_req, + epsa1_csr_rsp, + epsa2_csr_req, + epsa2_csr_rsp, + epsa3_csr_req, + epsa3_csr_rsp, + sif_tab_max +}; + +/* Depends on sif_tab_type: */ +#include "sif_epsc.h" + +/* Driver record of a block of entries associated with a particular PD + * Used for tables that have entry_per_block > 1: + */ +struct sif_table_block { + struct sif_pd *pd; /* Owning protection domain, if allocated */ + struct sif_table *table; /* Pointer back to table this is a block within */ + struct list_head pd_list; /* Used by pd to chain it's allocated blocks */ + u32 offset; /* Index offset that this block starts at */ + u32 last_used; /* Last alloc'ed entry - used to support round-robin alloc */ + ulong bitmap[0]; /* Used bitmap for entries, follows right after struct */ +}; + +/* Driver record of a sif in-memory table */ +struct sif_table { + bool is_eq; + union { + enum sif_tab_type type; /* Our type (and index within sdev->ba) */ + u32 index; /* index of this eq - valid iff @is_eq */ + }; + bool from_interrupt; /* If set, alloc/free must be permitted from intr.ctxt */ + bool alloc_rr; /* Set if round-robin allocation is to be used */ + spinlock_t lock; /* Protects bitmap */ + ulong *bitmap; /* Used bitmap for blocks of entries */ + struct sif_mem *mem; /* Allocated memory for the table */ + void *drv_ref; /* array of driver struct pointers for non-inline structs */ + union { + u64 sif_base; /* Virtual base address as seen from SIF */ + void __iomem *sif_off; /* Used for collect buffer mgmt */ + }; + size_t table_sz; /* Size in byte of the table */ + u32 ext_sz; /* Dist.in bytes between start of each entry */ + u32 entry_cnt; /* Number of entries in table */ + u32 block_cnt; /* No.of blocks (1st level alloc granularity) in table */ + u32 entry_per_block; /* entry_per_block = entry_cnt / block_cnt */ + u32 last_used; /* Last alloc'ed entry - used to support round-robin alloc */ + struct sif_mmu_ctx mmu_ctx; /* MMU context bookkeeping */ + void *block; /* Space for array with block_cnt elems + bitmap iff entry_per_block > 1 */ + u32 block_ext; /* Dist in bytes between sif_table_block elements in block */ + struct sif_dev *sdev; /* Pointer back to main driver struct */ +}; + +/* Driver management of event queues and interrupt channel coalescing settings*/ + +#define SIF_EQ_NAME_LEN 15 + +struct sif_irq_ch { + bool enable_adaptive; /* Adaptive coalescing */ + u16 channel_rx_scale; /* rx-to-tx timer scaling factor, 2-exponent value */ + u32 channel_rate_low; /* Message rate in messages per second. Low rate threshold. */ + u32 channel_rate_high; /* Message rate in messages per second. High rate threshold. */ + u16 channel_ausec; /* How many usecs to delay after first packet. */ + u16 channel_ausec_low; /* How many usecs to delay after first packet. Low rate value. */ + u16 channel_ausec_high; /* How many usecs to delay after first packet. High rate value. */ + u16 channel_pusec; /* How many usecs to delay after packet. */ + u16 channel_pusec_low; /* How many usecs to delay after packet. Low rate value. */ + u16 channel_pusec_high; /* How many usecs to delay after packet. High rate value. */ + u32 entries; + u32 mask; /* entries - 1 for modulo using & */ + u32 extent; + struct sif_mem *mem; /* Ref. 
to ba.mem to implement macro patterns */ +}; + +struct sif_eq { + struct sif_table ba; /* Layout of hardware exposed table */ + struct sif_eps *eps; /* Pointer back to controlling EPS */ + u32 index; /* EQ index - EPS is 0, hw starts at 1 */ + u32 next_seq; /* Next seq to look for in eq */ + u32 entries; + u32 extent; /* Size in byte of each entry */ + u32 mask; /* entries - 1 for modulo using & */ + struct sif_mem *mem; /* Ref. to ba.mem to implement macro patterns */ + int intr_vec; /* Index into s->entries[..] for the interrupt vector used */ + u32 sw_index_interval; /* No. of events we can receive before the sw index must be updated */ + u32 sw_index_next_update; /* Next scheduled update point */ + atomic_t intr_cnt; /* Number of interrupts for the interrupt vector for this eq */ + atomic_t work_cnt; /* No. of work queue elements processed */ + char name[SIF_EQ_NAME_LEN+1]; /* Storage for name visible from /proc/interrupts */ + struct sif_irq_ch irq_ch; /* Per channel interrupt coalescing settings */ + cpumask_var_t affinity_mask; /* cpu affinity_mask for set_irq_hints. */ +}; + +/* Driver specific per instance data */ + +struct sif_dfs; /* Declared in sif_debug.c */ +struct sif_compl; /* Declared in sif_cq.h */ + +struct sif_dev { + struct ib_device ib_dev; + struct sif_verbs sv; + struct pci_dev *pdev; + struct sif_dfs *dfs; /* Optional debugfs info, if enabled in kernel */ + struct sif_mem_info mi; /* Used by sif_mem.c - configured SIF page sizes etc */ + struct sif_fwa fwa; /* Used by sif_fwa.c - firmware access API */ + u8 __iomem *cb_base; /* Collect buffer space base address */ + u8 __iomem *msi_base; /* Base for the MSI-X vector table */ + u8 __iomem *eps_base; /* "Raw" pointer to EPSC BAR space */ + u32 num_vfs; /* #of virtual functions to enable */ + int fw_vfs; /* #of virtual functions enabled in firmware */ + bool is_vf; /* Set if this is a VF instance */ + u8 mbox_epsc; /* EPSC mailbox index (differs between SIBS and PSIF) */ + u8 eps_cnt; /* Number of EPSes on the chip */ + int cbu_mtrr; /* mtrr register for the cbu - save for cleanup */ + struct psif_pcie_mbox __iomem *eps; /* Pointer to EPS-* mailboxes */ + struct workqueue_struct *wq; /* Used a.o. 
for async event processing */ + struct sif_mr *dma_mr; /* Privileged kernel mem MR (bypass mode) used for local_lkey */ + struct sif_mr *dma_inv_mr; /* Invalid MR for key 0 */ + struct sif_pd *pd; /* PD used for driver private table resources */ + + /* BAR space sizes */ + size_t cb_sz; + size_t msi_sz; + size_t eps_sz; + + /* Interrupt allocation */ + size_t intr_req; /* Number of irqs requested */ + size_t intr_cnt; /* Number of irqs allocated */ + size_t bw_cb_cnt; /* No.of virtual collect buffers available */ + size_t lat_cb_cnt; /* No.of virtual collect buffers available */ + size_t msix_entries_sz; /* Size of the allocated msix_entries array */ + spinlock_t msix_lock; /* Protects intr_used */ + struct msix_entry *msix_entries; /* MSI-X vector info */ + ulong *intr_used; /* Bitmap for allocation of irqs */ + + atomic_t sqp_usecnt[4]; /* track if someone has created QP 0/1 for port 1/2 */ + atomic_t cq_count; /* Track #used CQs to better scale (internal debug) timeouts */ + atomic_t cq_miss_cnt; /* Historic #completions sif_poll_cq had to busy wait for */ + atomic_t cq_miss_occ; /* Global #times sif_poll_cq had to busy wait (upd.by destroy_cq) */ + struct sif_eps *es; /* State for the EPS comm (sif_epsc.h) */ + struct sif_table ba[sif_tab_max]; /* Base address setup structures */ + struct sif_pqp **pqp; /* PSIF management QPs */ + struct sif_cb **kernel_cb[2]; /* cb's for the kernel (bw and low latency per cpu) */ + int pqp_cnt; /* Number of PQPs set up */ + atomic_t next_pqp; /* Used for round robin assignment of pqp */ + int kernel_cb_cnt; /* Number of pairs of CBs set up for the kernel */ + struct sif_idr xrcd_refs; /* Mgmt of sif_xrcd allocations */ + struct sif_idr pd_refs; /* Mgmt of sif_pd allocations */ + struct sif_spqp_pool ki_spqp; /* Stencil PQPs for key invalidates */ + /* Misc settings */ + bool registered; /* Set when we are registered with the verbs layer */ + u64 min_resp_ticks; /* expected min. hw resp.time in ticks */ + + u16 jiffies_sampling_cnt; /* 1/N counter used to display performance measurement. */ + /* Support for workaround for #3552 - feature_mask create_do_not_evict_qp: */ + u32 dne_qp; + + /* Support for workaround for #3713 */ + u32 flush_qp; + struct mutex flush_lock; + + /* Support for PMA proxy QP (indexes for port 1 and 2) bug #3357 */ + u32 pma_qp_idxs[2]; + + /* Support for WA for bug #4096 */ + bool single_pte_pt; /* If set, use a level + 1 page table even for a single pte */ + + enum sif_mem_type mt_override; /* Special memory type override available from sysfs */ + /* TBD: Make sure it gets updated upon value changes (handle error events) */ + struct ib_port_attr port[2]; /* cached port info. */ + + /* SL to TSL map. 
Indexed by sl, port (0-1 range) and qosl */ + char sl2tsl[16][2][2]; + + /* qosl hint for regular qps, indexed by sl and port (0-1 range) */ + enum psif_tsu_qos qp_qosl_hint[16][2]; + + /* tsl for pqps, latency sensitive (RCN) and bulk (non-critical) per port */ + char pqp_rcn_tsl[2]; + char pqp_bulk_tsl[2]; + + /* pqp qosl hint per port */ + enum psif_tsu_qos pqp_qosl_rcn_hint[2]; + enum psif_tsu_qos pqp_qosl_bulk_hint[2]; + + /* tsl for qp 0 (per port) */ + char qp0_tsl[2]; + + /* qp 0 qosl hint (per port) */ + enum psif_tsu_qos qp0_qosl_hint[2]; + + /* limited mode for device, no IB traffic possible */ + bool limited_mode; + /* PSIF is degraded */ + bool degraded; + +}; + +/* TBD: These should probably come from common pci headers + */ +#ifndef PCI_MSIX_ENTRY_SIZE +#define PCI_MSIX_ENTRY_SIZE 16 +#endif +#ifndef PCI_MSIX_ENTRY_VECTOR_CTRL +#define PCI_MSIX_ENTRY_VECTOR_CTRL 12 +#endif + +/* SIF specific debugging facilities */ +extern ulong sif_debug_mask; +extern ulong sif_trace_mask; + +/* Defined classes */ +#define SIF_INFO 0x1L +#define SIF_INIT 0x2L +#define SIF_QPE 0x4L +#define SIF_INFO_V 0x8L +#define SIF_WCE 0x10L /* Log error completions */ +#define SIF_PQPT 0x20L /* Log WR upon PQP timeouts */ +#define SIF_NCQ 0x40L +#define SIF_XRC 0x80L +#define SIF_INTR 0x100L +#define SIF_VERBS 0x200L +#define SIF_PQP 0x400L +#define SIF_EPS 0x800L +#define SIF_PD 0x1000L +#define SIF_QP 0x2000L +#define SIF_CQ 0x4000L +#define SIF_MR 0x8000L +#define SIF_FMR 0x10000L +#define SIF_MEM 0x20000L +#define SIF_AH 0x40000L +#define SIF_SRQ 0x80000L +#define SIF_SND 0x100000L +#define SIF_RCV 0x200000L +#define SIF_DMA 0x400000L +#define SIF_RQ 0x800000L +#define SIF_WCE_V 0x1000000L +#define SIF_SQ 0x2000000L +#define SIF_POLL 0x4000000L +#define SIF_PT 0x8000000L +#define SIF_MMU 0x10000000L +#define SIF_IPOLL 0x20000000L +#define SIF_MMAP 0x40000000L +#define SIF_MC 0x80000000L +#define SIF_IDX 0x100000000L +#define SIF_IDX2 0x200000000L +#define SIF_MEM_SG 0x400000000L +#define SIF_DFS 0x800000000L +#define SIF_FWA 0x1000000000L +#define SIF_VERBS_V 0x2000000000L +#define SIF_DUMP 0x4000000000L +#define SIF_MMU_V 0x8000000000L +#define SIF_MEM_V 0x10000000000L +#define SIF_TSL 0x20000000000L +#define SIF_CSR 0x40000000000L +#define SIF_PT_V 0x80000000000L +#define SIF_PT_VV 0x100000000000L +#define SIF_QP_V 0x200000000000L +#define SIF_PERF_V 0x400000000000L + +#ifdef SIF_TRACE_MASK +#define sif_log_trace(class, format, arg...) \ + do { \ + if (unlikely((sif_trace_mask) & (class))) { \ + const char *cl = #class; \ + trace_printk("%5s " format "\n", &cl[4], ##arg); \ + } \ + } while (0) +#else +#define sif_log_trace(class, format, arg...) +#endif + +#define sif_log(sdev, class, format, arg...) \ + do { \ + sif_log_trace(class, format, ## arg); \ + if (unlikely((sif_debug_mask) & (class))) { \ + const char *cl = #class;\ + dev_info(&(sdev)->pdev->dev, \ + "[%d] %5s %s: " format "\n", \ + current->pid, &cl[4], __func__, \ + ## arg); \ + } \ + } while (0) + +#define sif_logi(ibdev, class, format, arg...) \ + do { \ + if (unlikely((sif_debug_mask) & (class))) { \ + const char *cl = #class;\ + dev_info((ibdev)->dma_device, \ + "[%d] %5s %s: " format "\n", \ + current->pid, &cl[4], __func__, \ + ## arg); \ + } \ + } while (0) + +#define sif_log0(class, format, arg...) 
\ + do { \ + if (unlikely((sif_debug_mask) & (class))) \ + pr_info("pid [%d] %s: " format "\n", \ + current->pid, __func__, \ + ## arg); \ + } while (0) + +#define sif_dump(class, txt, addr, len) \ + do { \ + if (unlikely((sif_debug_mask) & (class))) { \ + print_hex_dump(KERN_INFO, txt, \ + DUMP_PREFIX_ADDRESS, 8, 1, addr, len, 0); \ + } \ + } while (0) + +#define sif_logs(class, stmt_list) \ + do { \ + if (unlikely((sif_debug_mask) & (class))) { \ + stmt_list;\ + } \ + } while (0) + +#define sif_log_cq(cq, class, format, arg...) \ + do { \ + if (unlikely((sif_debug_mask) & (class))) { \ + struct sif_dev *sdev = \ + container_of(cq->ibcq.device, struct sif_dev, ib_dev); \ + if (time_before((cq)->next_logtime, jiffies)) { \ + (cq)->next_logtime = jiffies + max(1000ULL, sdev->min_resp_ticks); \ + } else { \ + (cq)->log_cnt++; \ + continue; \ + } \ + dev_info(&sdev->pdev->dev, \ + "pid [%d] %s (suppressed %d): " format "\n", \ + current->pid, __func__, (cq)->log_cnt, \ + ## arg); \ + (cq)->log_cnt = 0; \ + } \ + } while (0) + +#define sif_log_perf(sdev, class, format, arg...) \ + do { \ + if (unlikely((sif_debug_mask) & (class))) { \ + if ((sdev)->jiffies_sampling_cnt % sif_perf_sampling_threshold) { \ + (sdev)->jiffies_sampling_cnt++; \ + continue; \ + } \ + dev_info(&(sdev)->pdev->dev, \ + "pid [%d] %s: " format "\n", \ + current->pid, __func__, \ + ## arg); \ + } \ + } while (0) + + + +/* some convenience pointer conversion macros: */ +#define to_sdev(ibdev) container_of((ibdev), struct sif_dev, ib_dev) + +#include + +#define def_copy_conv(name, type1, type2) \ +static inline void copy_conv_to_##name(type1 void *dest, const type2 void *src, size_t n) \ +{ \ + int words = n / 8; \ + int i; \ + type1 u64 *dp = (type1 u64 *) dest; \ + type2 u64 *sp = (type2 u64 *) src; \ + for (i = 0; i < words; i++) \ + dp[i] = cpu_to_be64(sp[i]); \ + wmb(); \ +} + +/* make checkpatch happy */ +#define N + +def_copy_conv(hw, volatile, N) +def_copy_conv(sw, N, volatile) + +static inline void copy_conv_to_le(void *dest, const void *src, size_t n) +{ + int words = n / 8; + int i; + u64 *dp = (u64 *) dest; + u64 *sp = (u64 *) src; + + BUG_ON(n & 7); + for (i = 0; i < words; i++) + dp[i] = cpu_to_le64(sp[i]); + wmb(); +} + +static inline void copy_conv_to_mmio(void __iomem *dest, const void *src, size_t n) +{ + int words = n / 8; + int i; + u64 __iomem *dp = (u64 __iomem *) dest; + u64 *sp = (u64 *) src; + + BUG_ON(n & 7); + for (i = 0; i < words; i++) + __raw_writeq(cpu_to_be64(sp[i]), &dp[i]); +} + +/* Non-converting copy routines */ +#define def_copy_plain(name, type1, type2) \ +static inline void copy_to_##name(type1 void *dest, const type2 void *src, size_t n) \ +{ \ + int words = n / 8; \ + int i; \ + type1 u64 *dp = (type1 u64 *) dest; \ + type2 u64 *sp = (type2 u64 *) src; \ + for (i = 0; i < words; i++) \ + dp[i] = sp[i]; \ +} + +def_copy_plain(hw, volatile, N) +def_copy_plain(sw, N, volatile) + +static __always_inline void *sif_kmalloc(struct sif_dev *sdev, size_t size, gfp_t flags) +{ +#ifdef CONFIG_NUMA + void *m; + + m = kmalloc_node(size, flags, sdev->pdev->dev.numa_node); + if (m) + return m; + + sif_log(sdev, SIF_INFO, "Warning: unable to allocate memory on numa node %d", + sdev->pdev->dev.numa_node); +#endif + return kmalloc(size, flags); +} + +static inline const char *get_product_str(struct sif_dev *sdev) +{ + if (IS_PSIF(sdev)) + return + (PSIF_SUBSYSTEM(sdev) == 0x6278) ? "Oracle Dual-port QDR IB Adapter M4" : + (PSIF_SUBSYSTEM(sdev) == 0x6279) ? 
"Oracle Dual-port EDR IB Adapter" : + (PSIF_SUBSYSTEM(sdev) == 0x6280) ? "Oracle InfiniBand Switch IS2-46" : + (PSIF_SUBSYSTEM(sdev) == 0x6281) ? "Oracle InfiniBand Switch IS2-254" : + (PSIF_SUBSYSTEM(sdev) == 0x6282) ? "Oracle Fabric Interconnect F2-12" : + "Unknown PSIF based card"; + + switch (PSIF_DEVICE(sdev)) { + case PCI_DEVICE_ID_SN1_PF: + case PCI_DEVICE_ID_SN1_VF: + return "SPARC Integrated FDR IB M1"; + case PCI_DEVICE_ID_SN2_PF: + case PCI_DEVICE_ID_SN2_VF: + return "SPARC Integrated EDR IB M2"; + case PCI_DEVICE_ID_SN3_PF: + case PCI_DEVICE_ID_SN3_VF: + return "SPARC Integrated EDR IB M3"; + default: + return "Unknown Sonoma or PSIF based system"; + } +} + +/* Param feature_mask defines */ +extern ulong sif_feature_mask; + +/* Disable INVALIDATE_*KEY(S) */ +#define SIFF_disable_invalidate_key 0x1 + +/* Disable RQ flushing */ +#define SIFF_disable_rq_flush 0x2 + +/* Disable SRQ */ +#define SIFF_disable_srq 0x8 + +/* Disable INVALIDATE_CQ only: */ +#define SIFF_disable_invalidate_cq 0x10 + +/* Disable INVALIDATE_RQ only: */ +#define SIFF_disable_invalidate_rq 0x20 + +/* Disable INVALIDATE_TLB only: */ +#define SIFF_disable_invalidate_tlb 0x40 + +/* Disable support for use of huge pages + * This feature is necessary to avoid running into bugDB #21690736 + * on OVM: + */ +#define SIFF_no_huge_pages 0x80 + +/* Use stencil pqp for invalidation of FMR keys */ +#define SIFF_disable_stencil_invalidate 0x100 + +/* Force disable vpci iommu trapping (to operate as on real hardware..) */ +#define SIFF_disable_vpci_iommu 0x400 + +/* Toss all multipacket qp's instead of resetting and reusing, see #3334 */ +#define SIFF_no_multipacket_qp_reuse 0x800 + +/* Set PCI max payload size to the supported max payload size to avoid #2105 */ +#define SIFF_max_supported_payload 0x1000 + +/* Let driver do page table walk instead of EPSC for query QP - to avoid #3583 */ +#define SIFF_passthrough_query_qp 0x4000 + +/* Check all event queues on all interrupts */ +#define SIFF_check_all_eqs_on_intr 0x8000 + +/* Don't allocate vcbs in a round robin fashion */ +#define SIFF_alloc_cb_round_robin 0x20000 + +/* Don't allocate from all other queues (except cb and qp) in a round robin fashion */ +#define SIFF_disable_alloc_round_robin 0x40000 + +/* Default on rev1 is to force rnr_retry_init to 0 - this feature + * forces it to 7 (infinite retry) instead: + */ +#define SIFF_infinite_rnr 0x80000 + +/* Default is to allocate table entries + * from a two-level allocation where each pd reserves all entries + * within a page and allocates from within this. + * This disables the second level to revert to a + * flat 1-level allocation scheme: + */ +#define SIFF_flat_alloc 0x100000 + +/* SQS Atomics (only has effect for PSIF rev > 3) */ +#define SIFF_force_sqs_atomic_disable 0x200000 + +#define SIFF_force_ib_atomic_hca_mode 0x400000 + +/* Force link retraining upon some errors to ease PCIe triggering */ +#define SIFF_pcie_trigger 0x800000 + +/* Use 0 as magic value in qp setup to debug #3595 */ +#define SIFF_zero_magic 0x1000000 + +/* Use optimization of 2 sge_entries with the first being 48 */ +#define SIFF_disable_inline_first_sge 0x2000000 +/* disable Adaptive int coalescing */ +#define SIFF_dis_auto_int_coalesce 0x4000000 + +/* + * Bringup SIF a in limited mode, where no IB traffic and only + * limited mailbox traffic will be possible + */ +#define SIFF_force_limited_mode 0x8000000 + +/* + * Force WA for HW bug bug 3646, PSIF does not honor min_rnr_timer, + * assumes a homogenous PSIF cluster. 
+ */ +#define SIFF_force_wa_3646 0x10000000 + +#define SIFF_force_rc_2048_mtu 0x20000000 + +/* Configure PSIF to use the opposite base page size (e.g. 8K on x86 and 4K on sparc) */ +#define SIFF_toggle_page_size 0x40000000 + +#define SIFF_all_features 0x7ffeddfb + +#define sif_feature(x) (sif_feature_mask & (SIFF_##x)) + +extern ulong sif_vendor_flags; +#define sif_vendor_enable(x, uflags) ((sif_vendor_flags | uflags) & x) + +extern uint sif_vf_en; +extern uint sif_fwa_mr_en; + +extern uint sif_max_inline; + +extern uint sif_qp_size; +extern uint sif_mr_size; +extern uint sif_ah_size; +extern uint sif_cq_size; +extern uint sif_rq_size; + +extern ulong sif_eps_log_size; +extern ushort sif_eps_log_level; + +extern ushort sif_perf_sampling_threshold; +extern uint sif_fmr_cache_flush_threshold; + +/* Maximum number of outstanding privileged QP requests supported */ +extern uint sif_max_pqp_wr; + +/* Max number of stencil PQPs for (bulk) key invalidate to allocate */ +extern uint sif_ki_spqp_size; + +/* Max number of collect buffers supported */ +extern uint sif_cb_max; + +/* Initialized in init */ +extern struct kmem_cache *compl_cache; + +#endif diff --git a/drivers/infiniband/hw/sif/sif_dma.c b/drivers/infiniband/hw/sif/sif_dma.c new file mode 100644 index 0000000000000..18218d1e4a7e7 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_dma.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_dma.c: DMA memory mapping + */ +#include +#include "sif_dma.h" +#include "sif_dev.h" +#include "psif_hw_data.h" + +struct page *sif_alloc_pages(struct sif_dev *sdev, gfp_t gfp_mask, unsigned int order) +{ +#ifdef CONFIG_NUMA + if (sdev->pdev->dev.numa_node >= 0) { + struct page *page = alloc_pages_node(sdev->pdev->dev.numa_node, gfp_mask, order); + + if (page) + return page; + + sif_logi(&sdev->ib_dev, SIF_INFO, "Warning: unable to allocate order %d, on numa node %d", + order, sdev->pdev->dev.numa_node); + } +#endif + return alloc_pages(gfp_mask, order); +} + + + + +/* allocate/release aligned memory */ +void *sif_dma_alloc_aligned(struct ib_device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, + enum dma_data_direction dir) +{ + dma_addr_t ioaddr; + int ret; + void *cpu_addr; + struct sif_dev *sdev = to_sdev(dev); + struct page *page = sif_alloc_pages(sdev, flag, get_order(size)); + + if (!page) + return NULL; + + cpu_addr = page_address(page); + ioaddr = (dma_addr_t) ib_dma_map_single(dev, cpu_addr, size, dir); + ret = dma_mapping_error(dev->dma_device, ioaddr); + if (ret) { + sif_logi(dev, SIF_DMA, "DMA mapping %p sz %lx %sfailed", + cpu_addr, size, (dir == DMA_TO_DEVICE ? 
"read only " : "")); + free_pages((unsigned long)cpu_addr, get_order(size)); + return NULL; + } + *dma_handle = ioaddr; + return cpu_addr; +} + +void sif_dma_free_aligned(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle, + enum dma_data_direction dir) +{ + ib_dma_unmap_single(dev, dma_handle, size, dir); + free_pages((unsigned long)cpu_addr, get_order(size)); +} + + +void *sif_dma_alloc_readonly(struct ib_device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + return sif_dma_alloc_aligned(dev, size, dma_handle, flag, DMA_TO_DEVICE); +} + +void sif_dma_free_readonly(struct ib_device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle) +{ + sif_dma_free_aligned(dev, size, cpu_addr, dma_handle, DMA_TO_DEVICE); +} diff --git a/drivers/infiniband/hw/sif/sif_dma.h b/drivers/infiniband/hw/sif/sif_dma.h new file mode 100644 index 0000000000000..c9de27fd37ed9 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_dma.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_dma.h: DMA memory mapping + */ + +#ifndef __SIF_DMA_H +#define __SIF_DMA_H + +#include + +struct sif_dev; + +struct page *sif_alloc_pages(struct sif_dev *sdev, gfp_t gfp_mask, unsigned int order); + +void *sif_dma_alloc_coherent(struct ib_device *dev, size_t size, + u64 *dma_handle, gfp_t flag); +void sif_dma_free_coherent(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle); + +/* allocate/release readonly (and noncoherent?) memory */ +void *sif_dma_alloc_readonly(struct ib_device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag); + +void sif_dma_free_readonly(struct ib_device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle); + +/* Allocate/release memory that is naturally aligned according to size, + * eg. 2M gets 2M aligned etc: + */ +void *sif_dma_alloc_aligned(struct ib_device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, + enum dma_data_direction dir); + +void sif_dma_free_aligned(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle, + enum dma_data_direction dir); + + +struct sif_table; + +/* Largest single dma alloc we can get + * - if larger need, switch to vmalloc: + */ +#define SIF_MAX_CONT (PAGE_SIZE << (MAX_ORDER - 1)) + +#endif diff --git a/drivers/infiniband/hw/sif/sif_drvapi.h b/drivers/infiniband/hw/sif/sif_drvapi.h new file mode 100644 index 0000000000000..2e6ba7ac7bb14 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_drvapi.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_drvapi.h: Device specific operations available via the FWA access path + * + */ +#ifndef _SIF_DRVAPI_H +#define _SIF_DRVAPI_H + + +enum sif_drv_opcode { + SIF_DRV_CMD_EPSA_SETUP, /* Set up the standard communication link towards an EPS-A */ + SIF_DRV_CMD_EPSA_TEARDOWN, /* Terminate the communication link with an EPS-A */ +}; + +struct epsa_setup { + enum psif_eps_a_core epsa; /* Which EPS-A to operate on */ + u32 req_size; /* Size in number of reqs of the EPS-A req/rsp queues (only 2**n sizes supported) */ +}; + + +struct sif_drv_req { + enum sif_drv_opcode opcode; + union { + struct epsa_setup epsa; /* The EPS-A number for the operation */ + } u; +}; + +struct sif_drv_rsp { + enum sif_drv_opcode opcode; /* The opcode of the driver operation */ + struct psif_epsc_csr_rsp eps_rsp; /* If status != EPSC_SUCCESS an opt. err resp. from the EPSC */ +}; + + +static inline enum psif_mbox_type epsa_to_mbox(enum psif_eps_a_core epsa) +{ + switch (epsa) { + case PSIF_EPS_A_1: + return MBOX_EPSA0; + case PSIF_EPS_A_2: + return MBOX_EPSA1; + case PSIF_EPS_A_3: + return MBOX_EPSA2; + case PSIF_EPS_A_4: + return MBOX_EPSA3; + default: + break; + } + return (enum psif_mbox_type)-1; +} + + +#endif diff --git a/drivers/infiniband/hw/sif/sif_elog.c b/drivers/infiniband/hw/sif/sif_elog.c new file mode 100644 index 0000000000000..5547fd64e5595 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_elog.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_elog.c: Log over PCIe support for firmware + * TBD: Remove + */ + +#include +#include +#include +#include "sif_dev.h" +#include "sif_elog.h" +#include "sif_query.h" + +static int sif_elog_wait(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + int ret; + struct sif_eps *es = &sdev->es[eps_num]; + struct psif_epsc_csr_rsp resp; + struct psif_epsc_csr_req req; + + init_completion(&es->logdev_more_log); + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_LOG_REQ_NOTIFY; + ret = sif_eps_wr(sdev, eps_num, &req, &resp); + if (ret || resp.status != EPSC_SUCCESS) + return -EINVAL; + + /* data contains the last byte written by eps at the moment + * where the notify call was processed. 
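+ * If that byte is already beyond our current consume offset there is
+ * unread log data and we return immediately so the caller can re-read;
+ * otherwise we sleep interruptibly until sif_elog_intr() completes
+ * logdev_more_log to signal that the EPS has produced more log data.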
+ */ + + if (resp.data > be64_to_cpu(es->data->log.consume_offset)) + return 0; + + ret = wait_for_completion_interruptible(&es->logdev_more_log); + + return ret; +} + +void sif_elog_intr(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + complete(&sdev->es[eps_num].logdev_more_log); +} + +static int sif_elog_open(struct inode *inode, struct file *f) +{ + struct sif_eps *es = container_of(f->f_op, struct sif_eps, logdev_ops); + int ok = atomic_add_unless(&es->logdev_use, -1, 0); + + if (!ok) + return -EBUSY; + + return 0; +} + + +static int sif_elog_release(struct inode *inode, struct file *f) +{ + struct sif_eps *es = container_of(f->f_op, struct sif_eps, logdev_ops); + + atomic_inc(&es->logdev_use); + return 0; +} + + +static ssize_t sif_elog_read(struct file *f, char __user *user, size_t size, loff_t *offset) +{ + int stat; + struct sif_eps *es = container_of(f->f_op, struct sif_eps, logdev_ops); + struct sif_dev *sdev = es->sdev; + struct psif_epsc_log_stat ls; + u64 start_off, end_off, sz, len, start; +restart: + if (eps_version_ge(es, 0, 31)) + copy_conv_to_sw(&ls, &es->data->log, sizeof(ls)); + else + memcpy(&ls, &es->data->log, sizeof(ls)); + + start_off = ls.consume_offset; + end_off = ls.produce_offset; + sz = ls.size; + + len = min((u64)size, end_off - start_off); + start = start_off % sz; + + if (start + len > sz) + len = sz - start; + + if (len == 0) { + stat = sif_elog_wait(sdev, es->eps_num); + if (stat < 0) + return stat; + goto restart; + } + + sif_log(sdev, SIF_EPS, " requested sz %lx, off %llx. Queue: produce %llx, consume %llx - got %llx", + size, *offset, ls.produce_offset, ls.consume_offset, + len); + + if (copy_to_user(user, &es->data->log_data_area[start], len)) + return -EIO; + + ls.consume_offset += len; + es->data->log.consume_offset = cpu_to_be64(ls.consume_offset); + return len; +} + + + +int sif_elog_init(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + struct sif_eps *es = &sdev->es[eps_num]; + struct miscdevice *logdev = &es->logdev; + struct file_operations *logdev_ops = &es->logdev_ops; + struct pci_dev *pdev = sdev->pdev; + + snprintf(es->logdevname, MAX_LOGDEVNAME, "infiniband/sif_eps%s/%02x:%02x.%x", + eps_suffix(sdev, eps_num), pdev->bus->number, + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); + logdev_ops->read = sif_elog_read; + logdev_ops->open = sif_elog_open; + logdev_ops->release = sif_elog_release; + logdev_ops->owner = THIS_MODULE; + logdev->name = es->logdevname; + logdev->minor = MISC_DYNAMIC_MINOR; + logdev->fops = &es->logdev_ops; + atomic_set(&es->logdev_use, 1); + return misc_register(logdev); +} + +int sif_elog_deinit(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + return misc_deregister(&sdev->es[eps_num].logdev); +} diff --git a/drivers/infiniband/hw/sif/sif_elog.h b/drivers/infiniband/hw/sif/sif_elog.h new file mode 100644 index 0000000000000..8c0ecdaa7efe8 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_elog.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_elog.h: Misc device for capturing log from the EPSC + */ + +#ifndef _SIF_ELOG_H +#define _SIF_ELOG_H + +struct sif_dev; + +int sif_elog_init(struct sif_dev *sdev, enum psif_mbox_type eps_num); +int sif_elog_deinit(struct sif_dev *sdev, enum psif_mbox_type eps_num); + +void sif_elog_intr(struct sif_dev *sdev, enum psif_mbox_type eps_num); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_enl.h b/drivers/infiniband/hw/sif/sif_enl.h new file mode 100644 index 0000000000000..9fa605461cb8e --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_enl.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_enl.h: Protocol definitions for the netlink protocol for EPSC access from + * user space. Shared between kernel and user space. + */ + +#ifndef _SIF_ENL_H +#define _SIF_ENL_H + +/* Supported packet types */ +enum sif_enl_cmd_type { + SIF_ENL_CMD_NONE, + SIF_ENL_CMD_REQ, /* Request to an EPS */ + SIF_ENL_CMD_RSP, /* Response from an EPS */ + SIF_ENL_CMD_REQ_DRV, /* Driver requests */ + SIF_ENL_CMD_RSP_DRV, /* Driver response */ + SIF_ENL_CMD_MAX +}; + +/* Supported attributes */ +enum sif_test_attr { + SIF_ENL_A_CMD, + SIF_ENL_A_COMPLEX, + SIF_ENL_A_BUS, + SIF_ENL_A_DEVFN, + SIF_ENL_A_PAYLOAD, + SIF_ENL_A_DATA, + SIF_ENL_A_INDEX, + SIF_ENL_A_MAX +}; + + +/* attribute policy */ +static struct nla_policy sif_enl_policy[SIF_ENL_A_MAX] = { + [SIF_ENL_A_CMD] = { .type = NLA_U32 }, + [SIF_ENL_A_COMPLEX] = { .type = NLA_U16 }, + [SIF_ENL_A_BUS] = { .type = NLA_U16 }, + [SIF_ENL_A_DEVFN] = { .type = NLA_U16 }, + [SIF_ENL_A_PAYLOAD] = { .type = NLA_UNSPEC }, + [SIF_ENL_A_DATA] = { .type = NLA_UNSPEC }, + [SIF_ENL_A_INDEX] = { .type = NLA_U32 } +}; + + +#endif diff --git a/drivers/infiniband/hw/sif/sif_epsc.c b/drivers/infiniband/hw/sif/sif_epsc.c new file mode 100644 index 0000000000000..c38e9122e7a07 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_epsc.c @@ -0,0 +1,1739 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_epsc.c: Implementation of API for communication with the EPSC + * + * In general this module has to make sure that + * 1) we never have more packets outstanding with the EPS than hw_enties + * 2) we do not post more packets than we have completion entries for, + * eg. we must ensure that completions not yet forwarded as a result of + * a *waitfor* call is not overwritten by hw. 
+ */ + +#include "sif_epsc.h" +#include "sif_eq.h" +#include "sif_dev.h" +#include "sif_base.h" +#include "psif_hw_csr.h" +#include "psif_hw_data.h" +#include "psif_hw_setget.h" +#include "sif_dma.h" +#include "sif_query.h" +#include "sif_elog.h" +#include "sif_hwi.h" +#include "sif_spt.h" +#include "sif_defs.h" +#include +#include + +#define CSR_ONLINE_MASK 0x8000 + +#define EPSC_LOG_MODE_BUFFER EPSC_LOG_MODE_SCAT + + +static int write_csr(struct sif_dev *sdev, u32 addr, u64 val); +static u64 read_csr(struct sif_dev *sdev, u32 addr, bool local); + +union sif_mailbox { + u64 raw; + struct psif_epsc_csr_doorbell x; +}; + +static int __sif_eps_send_keep_alive(struct sif_dev *sdev, enum psif_mbox_type eps_num, + bool force); + +static enum psif_mbox_type sif_tab2mbox(struct sif_dev *sdev, enum sif_tab_type tab_type) +{ + return (tab_type & ~1) == epsc_csr_req ? sdev->mbox_epsc + : ((tab_type - epsa0_csr_req) >> 1); +} + + +static enum sif_tab_type sif_mbox2req_tab(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + return eps_num == sdev->mbox_epsc ? epsc_csr_req + : epsa0_csr_req + (eps_num << 1); +} + +static enum sif_tab_type sif_mbox2rsp_tab(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + return eps_num == sdev->mbox_epsc ? epsc_csr_rsp + : epsa0_csr_rsp + (eps_num << 1); +} + + +const char *eps_name(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + if (eps_num == sdev->mbox_epsc) + return "C"; + + switch (eps_num) { + case MBOX_EPSA0: + return "A-0"; + case MBOX_EPSA1: + return "A-1"; + case MBOX_EPSA2: + return "A-2"; + case MBOX_EPSA3: + return "A-3"; + default: + break; + } + return "(nonexisting eps)"; +} + + +const char *eps_suffix(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + if (eps_num == sdev->mbox_epsc) + return "c"; + + switch (eps_num) { + case MBOX_EPSA0: + return "a0"; + case MBOX_EPSA1: + return "a1"; + case MBOX_EPSA2: + return "a2"; + case MBOX_EPSA3: + return "a3"; + default: + break; + } + return "(nonexisting eps)"; +} + + +bool is_eps_rsp_tab(enum sif_tab_type type) +{ + switch (type) { + case epsc_csr_rsp: + case epsa0_csr_rsp: + case epsa1_csr_rsp: + case epsa2_csr_rsp: + case epsa3_csr_rsp: + return true; + default: + return false; + } +} + + +int eps_status_to_err(enum psif_epsc_csr_status status) +{ + switch (status) { + case EPSC_SUCCESS: + return 0; + case EPSC_EKEYREJECTED: + return -EKEYREJECTED; + case EPSC_EADDRNOTAVAIL: + return -EPERM; + case EPSC_EOPNOTSUPP: + return -EOPNOTSUPP; + case EPSC_ENOMEM: + return -ENOMEM; + case EPSC_ENODATA: /* ENODATA is not an error */ + return 0; + case EPSC_EAGAIN: + return -EAGAIN; + case EPSC_ECANCELED: + return -ECANCELED; + case EPSC_ECONNRESET: + return -ECONNRESET; + case EPSC_ECSR: + return -EACCES; + case EPSC_MODIFY_QP_OUT_OF_RANGE: + return -ERANGE; + case EPSC_MODIFY_QP_INVALID: + return -EINVAL; + case EPSC_MODIFY_CANNOT_CHANGE_QP_ATTR: + return -EBUSY; + case EPSC_MODIFY_INVALID_QP_STATE: + case EPSC_MODIFY_INVALID_MIG_STATE: + return -EINVAL; + case EPSC_MODIFY_TIMEOUT: + return -ETIMEDOUT; + case EPSC_ETEST_HEAD: + case EPSC_ETEST_TAIL: + case EPSC_ETEST_PATTERN: + return -EIO; + case EPSC_EADDRINUSE: + return -EADDRINUSE; + case EPSC_EINVALID_VHCA: + return -ECHRNG; + case EPSC_EINVALID_PORT: + return -ELNRNG; + case EPSC_EINVALID_ADDRESS: + return -EADDRNOTAVAIL; + case EPSC_EINVALID_PARAMETER: + return -EINVAL; + case EPSC_FAIL: + return -ENOTRECOVERABLE; + default: + return -EUCLEAN; /* If this is returned, this function needs corrections */ + } +} + + +struct 
psif_epsc_csr_req *get_eps_csr_req(struct sif_dev *sdev, + enum psif_mbox_type eps_num, int index) +{ + enum sif_tab_type type = sif_mbox2req_tab(sdev, eps_num); + + return (struct psif_epsc_csr_req *) + (sif_mem_kaddr(sdev->ba[type].mem, index * sdev->ba[type].ext_sz)); +} + +struct psif_epsc_csr_rsp *get_eps_csr_rsp(struct sif_dev *sdev, + enum psif_mbox_type eps_num, int index) +{ + enum sif_tab_type type = sif_mbox2rsp_tab(sdev, eps_num); + + return (struct psif_epsc_csr_rsp *) + (sif_mem_kaddr(sdev->ba[type].mem, index * sdev->ba[type].ext_sz)); +} + +static inline u16 get_eps_mailbox_seq_num(volatile struct psif_epsc_csr_rsp *rsp) +{ + return rsp->seq_num & (CSR_ONLINE_MASK - 1); +} + +/* Cond. call completion on an entry in the response queue + * Assumes the eps lock is held + */ +static inline void __epsc_complete(struct sif_dev *sdev, enum psif_mbox_type eps_num, int idx) +{ + struct sif_eps *es = &sdev->es[eps_num]; + struct sif_eps_cqe *cqe = es->cqe[idx]; + + if (cqe && cqe->need_complete) + complete(&cqe->cmpl); +} + +void epsc_complete(struct sif_dev *sdev, enum psif_mbox_type eps_num, int idx) +{ + unsigned long flags; + struct sif_eps *es = &sdev->es[eps_num]; + + spin_lock_irqsave(&es->lock, flags); + __epsc_complete(sdev, eps_num, idx); + spin_unlock_irqrestore(&es->lock, flags); +} + +static int sif_eps_api_version_ok(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + bool psif_version_ok; + bool epsc_version_ok; + struct sif_eps *es = &sdev->es[eps_num]; + + /* Validate that we have compatible versions */ + sif_log(sdev, SIF_INFO, "Connected to SIF version %d.%d, EPS%s version %d.%d", + es->ver.psif_major, es->ver.psif_minor, + eps_name(sdev, eps_num), + es->ver.epsc_major, es->ver.epsc_minor); + + psif_version_ok = + es->ver.psif_major == PSIF_MAJOR_VERSION && + es->ver.psif_minor == PSIF_MINOR_VERSION; + + if (!psif_version_ok) { + u32 ever, dver, rev1ver; + + sif_log(sdev, SIF_INFO, + " *** PSIF architecture version mismatch: driver expects v.%d.%d, fw supports v.%d.%d ***", + PSIF_MAJOR_VERSION, PSIF_MINOR_VERSION, + es->ver.psif_major, es->ver.psif_minor); + ever = PSIF_API_VERSION(es->ver.psif_major, es->ver.psif_minor); + rev1ver = PSIF_API_VERSION(4, 06); + dver = PSIF_VERSION; + if ((dver > rev1ver && ever <= rev1ver) || + (ever > rev1ver && dver <= rev1ver)) { + sif_log(sdev, SIF_INFO, "Wrong driver build for this chip revision!"); + return -ENOEXEC; + } + } + + epsc_version_ok = + es->ver.epsc_major == EPSC_MAJOR_VERSION && + es->ver.epsc_minor == EPSC_MINOR_VERSION; + + if (!epsc_version_ok) { + sif_log(sdev, SIF_INFO_V, + " *** EPS%s API version mismatch: driver expects v.%d.%d, firmware implements v.%d.%d ***", + eps_name(sdev, eps_num), + EPSC_MAJOR_VERSION, EPSC_MINOR_VERSION, + es->ver.epsc_major, es->ver.epsc_minor); + } + + /* PSIF version must match exactly, any EPSC version is ok */ + if (!psif_version_ok) + return -ENOEXEC; + return 0; +} + + +static int sif_eps_firmware_version_ok(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + int ret; + int i = 0, fi = 0; + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + char *p; + char *start; + char *vs; + struct sif_eps *es = &sdev->es[eps_num]; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_FW_VERSION; + req.u.fw_version.host_addr = + (u64)es->data_dma_hdl + offsetof(struct sif_epsc_data, fw_version); + + ret = sif_eps_wr_poll(sdev, eps_num, &req, &rsp); + if (ret) + return ret; + + /* Parse the string we got: */ + p = start = es->data->fw_version; + for (i = 0; i < 
MAX_FW_VERSION_INFO_SZ; i++) { + if (p[i] == '\0') { + sif_log(sdev, SIF_VERBS, "fw_version[%d]: %s", + fi, start); + es->ver.fw_version[fi++] = start; + /* skip 0 byte */ + start = p + i + 1; + if (fi >= FWV_MAX) + break; + } + } + sif_log(sdev, SIF_INFO, "EPSC firmware image revision string %s", + es->ver.fw_version[FWV_EPS_REV_STRING]); + sif_log(sdev, SIF_INFO, "EPSC firmware version tag:\n%s", + es->ver.fw_version[FWV_EPS_GIT_LAST_COMMIT]); + if (es->ver.fw_version[FWV_EPS_GIT_STATUS][0] != '\0') + sif_log(sdev, SIF_INFO, " *** epsfw git status at build time: ***\n%s", + es->ver.fw_version[FWV_EPS_GIT_STATUS]); + + vs = es->ver.fw_version[FWV_EPS_REV_STRING]; + if (sscanf(vs, "%hu.%hu", &es->ver.fw_major, &es->ver.fw_minor) != 2) + return -EINVAL; + + if (vs[0] == 'R' && es->ver.fw_minor == 0) + es->ver.fw_minor = 1; + + sif_log(sdev, SIF_INFO, "EPSC interpreted firmware revision: %hu.%hu", + es->ver.fw_major, es->ver.fw_minor); + return 0; +} + + +static int sif_eps_log_ctrl(struct sif_dev *sdev, enum psif_mbox_type eps_num, + enum psif_epsc_log_mode mode, + enum psif_epsc_log_level level) +{ + int ret; + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + struct sif_eps *es = &sdev->es[eps_num]; + + if (eps_num != sdev->mbox_epsc) { + /* TBD: Data area has not been allocated for EPSAs! */ + return -ENOMEM; + } + + if (!es->data->log.size) { + sif_log(sdev, SIF_INFO, "cannot redirect - no data buffer configured"); + return -ENOMEM; + } + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_LOG_CTRL; + /* TBD: Higher log levels than debug will give a feedback loop... */ + req.u.log_ctrl.level = level > EPS_LOG_DEBUG ? EPS_LOG_DEBUG : level; + req.u.log_ctrl.mode = mode; + req.u.log_ctrl.mmu_cntx = sdev->ba[epsc_csr_rsp].mmu_ctx.mctx; + + if (mode == EPSC_LOG_MODE_HOST) { + req.u.log_ctrl.stat_base = + (u64)es->data_dma_hdl + offsetof(struct sif_epsc_data, log); + req.u.log_ctrl.base = + (u64)es->data_dma_hdl + offsetof(struct sif_epsc_data, log_data_area); + req.u.log_ctrl.length = + es->data->log.size; + } + + ret = sif_eps_wr_poll(sdev, eps_num, &req, &rsp); + if (!ret) { + if (mode == EPSC_LOG_MODE_HOST) { + sif_log(sdev, SIF_INFO, + "Enabled EPS log redirect to buffer at %p (sz 0x%llx)", + es->data->log_data_area, + es->data->log.size); + ret = sif_elog_init(sdev, eps_num); + if (ret) + sif_log(sdev, SIF_INFO, "Failed to create eps logging device for EPS%s", + eps_name(sdev, eps_num)); + es->log_redir_en = true; + } else { + if (es->log_redir_en) { + sif_elog_deinit(sdev, eps_num); + es->log_redir_en = false; + } + sif_log(sdev, SIF_INFO, "Disabled EPS log redirect"); + } + } + return ret; +} + + +int epsc_set_mmu_upper(struct sif_dev *sdev, u16 value) +{ + int ret; + + if (eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 103)) { + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_SET; + req.u.set.data.op = EPSC_QUERY_TA_UPPER_TWELVE; + req.u.set.info.op = EPSC_QUERY_PA_UPPER_TWELVE; + req.u.set.data.value = value; + req.u.set.info.value = value; + ret = sif_epsc_wr_poll(sdev, &req, &rsp); + } else { + u64 v = read_csr(sdev, 0x200000, false); + + v &= ~((0xfffull << 48) | (0xfffull << 32)); + v |= ((u64)value << 48) | ((u64)value << 32); + ret = write_csr(sdev, 0x200000, v); + } + if (ret) + sif_log(sdev, SIF_INFO, "Failed to set mmu_upper bits!"); + + if (PSIF_REVISION(sdev) <= 3) + /* Enable WA for Bug #4096: TA/PA upper has no effect on level0 contexts */ + sdev->single_pte_pt = true; + return ret; +} + + 
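The legacy branch of epsc_set_mmu_upper() above programs the two upper-twelve-bit
fields by a read-modify-write of CSR 0x200000. Below is a minimal sketch of just
that bit manipulation, assuming the same field positions (bits 32-43 and 48-59)
as in the code above; the helper name merge_upper_twelve is illustrative only and
is not part of the driver.

/* Illustrative only: merge a 12-bit "upper" value into both upper-twelve
 * fields of the CSR word, exactly as the legacy path above does.
 */
static inline u64 merge_upper_twelve(u64 csr, u16 value)
{
	/* clear both 12-bit fields: bits 48..59 and bits 32..43 */
	csr &= ~((0xfffull << 48) | (0xfffull << 32));
	/* write the same value into both fields */
	return csr | ((u64)value << 48) | ((u64)value << 32);
}

With value = 0x7ff, for example, the result holds 0x7ff in bits 48-59 and in
bits 32-43 while leaving all other bits of the CSR word unchanged.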
+ +/* special epsc initialization */ +static void eps_struct_init(struct sif_dev *sdev) +{ + struct sif_eps *es; + u8 i; + + for (i = 0; i < sdev->eps_cnt; i++) { + memset(sdev->es, 0, sizeof(*sdev->es)); + es = &sdev->es[i]; + es->sdev = sdev; + es->eps_num = i; + spin_lock_init(&es->lock); + + if (i != sdev->mbox_epsc) + continue; + + /* EPSC is implicitly started at power on */ + if (es->state == ES_NOT_RUNNING) + es->state = ES_RUNNING; + } +} + + +static int eps_set_state(struct sif_dev *sdev, enum psif_mbox_type eps_num, + enum sif_eps_state new_state) +{ + unsigned long flags; + struct sif_eps *es = &sdev->es[eps_num]; + int ret = 0; + + spin_lock_irqsave(&es->lock, flags); + switch (es->state) { + case ES_NOT_RUNNING: + case ES_RUNNING: + if (new_state == ES_INIT || new_state == ES_NOT_RUNNING) + break; + ret = -EINVAL; + goto init_failed; + case ES_INIT: + if (new_state == ES_ACTIVE || new_state == ES_NOT_RUNNING) + break; + ret = -ENODEV; + goto init_failed; + case ES_ACTIVE: + if (new_state == ES_RUNNING) + break; + ret = -EBUSY; + goto init_failed; + } + es->state = new_state; + spin_unlock_irqrestore(&es->lock, flags); + return 0; +init_failed: + sif_log(sdev, SIF_INIT, "Invalid EPS%s state transition (%d -> %d)", + eps_name(sdev, eps_num), es->state, new_state); + spin_unlock_irqrestore(&es->lock, flags); + return ret; +} + + +/* Define the atomic op completer device capabilites and device control here as + * not able to find it in the pci_reg.h. This should be get into the pci_reg.h. + */ +#define ATOMIC_OP_32_BIT_COMPLETER_SUPPORTED (1ULL << 7) +#define ATOMIC_OP_64_BIT_COMPLETER_SUPPORTED (1ULL << 8) +#define CAS_OP_128_BIT_COMPLETER_SUPPORTED (1ULL << 9) +#define ATOMIC_OP_REQUESTER_ENABLE (1ULL << 6) +static enum psif_epsc_csr_atomic_op sif_get_atomic_config(struct sif_dev *sdev, + enum psif_mbox_type eps_num) +{ + struct pci_dev *parent; + int pcie_cap, pcie_parent_cap; + u16 pdevcap2, devctrl2; + int ret = 0; + enum psif_epsc_csr_atomic_op atomic_op_flags = PSIF_PCIE_ATOMIC_OP_NONE; + parent = pci_upstream_bridge(sdev->pdev); + + if (!parent) { + sif_log(sdev, SIF_INFO, + "No parent bridge device, cannot determine atomic capabilities!"); + return PSIF_PCIE_ATOMIC_OP_NONE; + } + + pcie_parent_cap = pci_find_capability(parent, PCI_CAP_ID_EXP); + + if (!pcie_parent_cap) { + sif_log(sdev, SIF_INFO, + "PCIe capability in parent device not found, cannot determine atomic capabilities!"); + return PSIF_PCIE_ATOMIC_OP_NONE; + } + + ret = pci_read_config_word(parent, pcie_parent_cap + PCI_EXP_DEVCAP2, &pdevcap2); + if (ret) { + /* set to PSIF_PCIE_ATOMIC_OP_NONE if pci read fails*/ + return atomic_op_flags; + } + if (pdevcap2 & (ATOMIC_OP_32_BIT_COMPLETER_SUPPORTED | + ATOMIC_OP_64_BIT_COMPLETER_SUPPORTED | + CAS_OP_128_BIT_COMPLETER_SUPPORTED)) { + pcie_cap = pci_find_capability(sdev->pdev, PCI_CAP_ID_EXP); + ret = pci_read_config_word(sdev->pdev, pcie_cap + PCI_EXP_DEVCTL2, &devctrl2); + /* check whether PSIF set the ATOMIC_OP_REQUESTER_ENABLE bit */ + if (!(devctrl2 & ATOMIC_OP_REQUESTER_ENABLE)) { + ret = pci_write_config_word(sdev->pdev, pcie_cap + PCI_EXP_DEVCTL2, + (devctrl2 | ATOMIC_OP_REQUESTER_ENABLE)); + if (ret) { + /* set to PSIF_PCIE_ATOMIC_OP_NONE if pci write fails*/ + return atomic_op_flags; + } + sif_log(sdev, SIF_INFO, + "Set atomic_op_requester_enable in devctrl2 (%x)\n", devctrl2); + } + + /* Always enable SQS atomic and IB global atomic if RC supports atomicOp */ + atomic_op_flags = PSIF_PCIE_ATOMIC_OP_BOTH; + /* EPS-A cores do not need to worry about 
different IB atomic mode, as they only + * need to know whether PSIF has atomic_op_requester_enable set. + */ + if (eps_num == sdev->mbox_epsc) { + /* SQS atomics does not work in these revisions: */ + bool disable_sqs_atomics = PSIF_REVISION(sdev) <= 3 ? + true : sif_feature(force_sqs_atomic_disable); + + if (disable_sqs_atomics && + sif_feature(force_ib_atomic_hca_mode)) { + atomic_op_flags = PSIF_PCIE_ATOMIC_OP_NONE; + } else if (disable_sqs_atomics) { + atomic_op_flags = PSIF_PCIE_ATOMIC_OP_IB; + } else if (sif_feature(force_ib_atomic_hca_mode)) { + atomic_op_flags = PSIF_PCIE_ATOMIC_OP_SQS; + } + } + } + return atomic_op_flags; +} + + +/* Helper function to handle the legacy cases of endianness conversion for the + * initial config request (see #3804) + */ +static struct psif_epsc_csr_config *eps_init_config(struct sif_eps *es, struct psif_epsc_csr_config *lcfg) +{ +#ifdef __LITTLE_ENDIAN + switch (es->ver.seq_set_proto) { + case 0: + return lcfg; + case 1: + case 2: + /* Use a config struct in network byte order */ + copy_conv_to_hw(&es->ver.nb_cfg, lcfg, sizeof(*lcfg)); + return &es->ver.nb_cfg; + } +#else + struct sif_dev *sdev = es->sdev; + + switch (es->ver.seq_set_proto) { + case 0: + /* Legacy mode: + * Handling not endian neutral and becomes different depending on + * EPSC platform endianness.. + */ + if (IS_SIBS(sdev)) { + sif_log(sdev, SIF_INFO, "Using straight through mode"); + return lcfg; + } + sif_log(sdev, SIF_INFO, "Converting config to LE (bw comp mode)"); + copy_conv_to_le(&es->ver.nb_cfg, lcfg, sizeof(*lcfg)); + return &es->ver.nb_cfg; + case 1: + case 2: + return lcfg; + } +#endif + return NULL; +} + + +/* Initial setup of communication with the EPSC: + * The initial phase consists of using the mailbox to communicate + * about about where the request and response queues of the EPSC + * should be placed in memory, and a few basic configuration options. + * This is done via + * 1) A reset cycle + * 2) An optional (supported by all new firmware) protocol version negotiation + * 3) Transfer of the psif_epsc_csr_config request which informs EPSC about where to find the + * req and resp queues, which is used for all following communication + * for the rest of the driver instance's lifetime. 
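+ *
+ * Each of these steps is carried out as 64 bit mailbox exchanges: the driver
+ * writes a union sif_mailbox value (head/tail sequence fields plus a 32 bit
+ * data word) to the in-register and polls the out-register until the EPS
+ * echoes the expected value, or the operation times out.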
+ */ + +/* This driver supports all initial mailbox exchange protocol versions up to and + * including this version: + */ +#define MAILBOX_SUPPORTED_PROTOCOL 2 + +int sif_eps_init(struct sif_dev *sdev, enum sif_tab_type type) +{ + /* We get called with the response queue type */ + enum psif_mbox_type eps_num = sif_tab2mbox(sdev, type); + struct sif_table *req_tp = &sdev->ba[type - 1]; + struct sif_table *rsp_tp = &sdev->ba[type]; + struct psif_epsc_csr_config lconfig; + struct psif_epsc_csr_config *config; + struct sif_eps_cqe lcqe; + struct psif_epsc_csr_rsp lrsp; + union sif_mailbox set, get; + struct psif_epsc_csr_rsp *cqe; + struct sif_eps *es = &sdev->es[eps_num]; + int ret = 0; + u16 seq_num = 0; /* Init runs in separate seq.numbers */ + int i; + ulong timeout = es->keepalive_interval = sdev->min_resp_ticks * 2; + ulong timeout_time = jiffies + timeout; + u64 tries = 0; + size_t bsz; + size_t config_cycle_count = sizeof(struct psif_epsc_csr_config)/sizeof(u32); + bool restarted_reset = false; + + /* Max mailbox exchange protocol version supported by this driver */ + u16 mailbox_seq_version_to_use = 2; + + if (eps_num == sdev->mbox_epsc) + eps_struct_init(sdev); + + es->last_seq = 0; + + ret = eps_set_state(sdev, eps_num, ES_INIT); + if (ret) + return ret; + + es->last_seq = 0; + atomic_set(&es->cur_reqs, 1); /* The initial request is not "posted" */ + es->max_reqs = 0; + es->mask = req_tp->entry_cnt - 1; + es->lowpri_lim = req_tp->entry_cnt - min_t(int, req_tp->entry_cnt/2, 2); + + if (rsp_tp->entry_cnt != req_tp->entry_cnt) { + sif_log(sdev, SIF_INFO, + "Illegal config - EPS queues must have the same length"); + return -EINVAL; + } + + bsz = sizeof(struct sif_eps_cqe *) * rsp_tp->entry_cnt; + es->cqe = kzalloc(bsz, GFP_KERNEL); + if (!es->cqe) { + sif_log(sdev, SIF_INFO, + "Failed to allocate %ld bytes for EPS%s completions", bsz, + eps_suffix(sdev, eps_num)); + return -ENOMEM; + } + + /* Use extra allocated space at the end of the completion array for the data area + * TBD: This code is not safe if any of the data elements cross a 2M page boundary + * - should move it out as a separate allocation. 
+ */ + es->data = sif_mem_kaddr(rsp_tp->mem, rsp_tp->table_sz); + es->data_dma_hdl = sif_mem_dma(rsp_tp->mem, rsp_tp->table_sz); + es->data->log.size = sif_eps_log_size; + + /* Initialize the first response status to != 0 */ + cqe = get_eps_csr_rsp(sdev, eps_num, 0); + set_psif_epsc_csr_rsp__seq_num(cqe, (u64)-1); + + sif_log(sdev, SIF_INIT, "Data area for EPSC queries: %p (dma %pad) len %ld", + es->data, &es->data_dma_hdl, sizeof(struct sif_epsc_data)); + memset(&lconfig, 0, sizeof(lconfig)); + config = &lconfig; + memset(&lrsp, 0x6a, sizeof(struct psif_epsc_csr_rsp)); + lcqe.rsp = &lrsp; + lcqe.need_complete = false; + + lconfig.hwapi_major_ver = PSIF_MAJOR_VERSION; + lconfig.hwapi_minor_ver = PSIF_MINOR_VERSION; + lconfig.epsapi_major_ver = EPSC_MAJOR_VERSION; + lconfig.epsapi_minor_ver = EPSC_MINOR_VERSION; + + lconfig.request = req_tp->sif_base; + lconfig.response = rsp_tp->sif_base; + lconfig.extent_req = req_tp->ext_sz; + lconfig.extent_rsp = rsp_tp->ext_sz; + lconfig.entries = rsp_tp->entry_cnt; + if (!sdev->is_vf) + lconfig.atomic_support = sif_get_atomic_config(sdev, eps_num); + else + lconfig.atomic_support = PSIF_PCIE_ATOMIC_OP_NONE; + /* Ask the EPSC to reset the function we are accessing - starting from a clean state */ + lconfig.clean_state = 1; + +#ifndef __LITTLE_ENDIAN + /* Tell the EPSC that host is big endian */ + sif_log(sdev, SIF_INFO, "Configure for big endian host"); + lconfig.big_endian = 1; +#endif + lconfig.sparc_pages = (sdev->mi.page_size == 0x2000) ? 1 : 0; + if (rsp_tp->mem->mem_type != SIFMT_BYPASS) { + sif_log(sdev, SIF_INFO, + "Failed EPSC mappings: GVA2GPA mode not supported yet, consider reducing epsc_size"); + ret = -ENOMEM; + goto err_map_ctx; + } + + /* Allocate bypass mmu context (for responses) with wr_access set */ + ret = sif_map_ctx(sdev, &rsp_tp->mmu_ctx, rsp_tp->mem, rsp_tp->sif_base, + rsp_tp->table_sz, true); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to set mmu context for epsc_rsp"); + goto err_map_ctx; + } + + /* Pass the populated context on to the EPS */ + lconfig.mmu_cntx = rsp_tp->mmu_ctx.mctx; + +eps_reset: + sif_log(sdev, SIF_INIT, "Resetting EPS%s..", eps_name(sdev, eps_num)); + + /* 1) EPSC reset cycles: + * Special write cycle to reset EPS communication + */ + set.raw = MAILBOX_RESTART; + do { + tries++; + eps_mailbox_write(sdev, eps_num, set.raw); + get.raw = eps_mailbox_read(sdev, eps_num); + } while (get.raw != 0 && time_is_after_jiffies(timeout_time)); + + if (get.raw != MAILBOX_RESTART) { + sif_log(sdev, SIF_INFO, + "Failed to reset EPS%s after %lld tries (%ld ticks) - last read 0x%llx", + eps_name(sdev, eps_num), tries, timeout, get.raw); + ret = -ENODEV; + goto epsc_failed; + } + + /* 2) Meta protocol version negotiation: + * This step is basically used to determine how the initial config request + * should look: + */ + timeout_time = jiffies + timeout; + tries = 0; + + if (restarted_reset && mailbox_seq_version_to_use > 1) { + /* 2nd attempt - very old firmware - skip the protocol probing algo.. */ + goto proto_probing_done; + } + set.x.head = set.x.tail = MAILBOX_SEQ_SET_PROTOCOL; + + if (!restarted_reset) { + /* Handle bug #4101: + * Some old firmware versions will respond with the same mailbox protocol version + * as the one requested by the driver, no matter what. We must check that we don't have + * this version by trying version 0xffff which does not exist. If we get v.0xffff back + * we know we have this old firmware and can retry with v.0. + * v.2 and later will respond with the negotiated version. 
+ */ + set.x.data = 0xffff; + } else { + /* The meta protocol number we request - if this fails, we are at the legacy firmware + * version which does not support this stage, and where config data is + * expected in LE order (See #3804) + */ + set.x.data = mailbox_seq_version_to_use; + } + + do { + tries++; + eps_mailbox_write(sdev, eps_num, set.raw); + get.raw = eps_mailbox_read(sdev, eps_num); + } while (get.x.head != MAILBOX_SEQ_SET_PROTOCOL && get.raw != MAILBOX_IN_ERROR + && time_is_after_jiffies(timeout_time)); + + if (time_is_before_eq_jiffies(timeout_time)) { + sif_log(sdev, SIF_INFO, + "Failed to get seq.protocol info from EPS%s after %lld tries (%ld ticks) - last read 0x%llx", + eps_name(sdev, eps_num), tries, timeout, get.raw); + if (!restarted_reset) { + restarted_reset = true; + sif_log(sdev, SIF_INFO, + "- assuming very old firmware without protocol version probing: restarting.."); + goto eps_reset; + } else { + ret = -ESRCH; + goto epsc_failed; + } + } + + if (!restarted_reset && get.x.data == 0xffff) { + /* We have identified bug #4101 in firmware: + * Firmware that responds wrongly on the mailbox exchange protocol, + * retry with version 0: + */ + sif_log(sdev, SIF_INFO, + "- found old firmware which responds wrongly to protocol version probing: restarting.."); + restarted_reset = true; + mailbox_seq_version_to_use = 0; + goto eps_reset; + } + + if (get.x.head != MAILBOX_SEQ_SET_PROTOCOL) { + sif_log(sdev, SIF_INFO, "Legacy firmware found - no SEQ_SET_PROTOCOL supported"); + es->ver.seq_set_proto = 0; + } else if (get.x.data > MAILBOX_SUPPORTED_PROTOCOL) { + mailbox_seq_version_to_use = MAILBOX_SUPPORTED_PROTOCOL; + restarted_reset = true; + goto eps_reset; + } else + es->ver.seq_set_proto = get.x.data; + +proto_probing_done: + sif_log(sdev, SIF_INFO, "In contact with EPS%s with initial mailbox negotiation protocol v.%d", + eps_name(sdev, eps_num), es->ver.seq_set_proto); + if (!es->ver.seq_set_proto) + sif_log(sdev, SIF_INFO, + "***** Warning: firmware update necessary, support for this version discontinued! *****"); + + /* Set up the config struct correctly for transfer */ + config = eps_init_config(es, &lconfig); + if (!config) + goto epsc_failed; + + /* At this point it is safe to enable bus master for PSIF + * Firmware guarantees that we do not get here until all state + * from any previous runs have been cleared out + */ + pci_set_master(sdev->pdev); + + /* 3) Transfer the psif_epsc_csr_config request via the mailbox. 
+ * The result is then expected as response in the first response queue + * element in the area pointed to by the request transferred here: + */ + tries = 0; + sif_log(sdev, SIF_INIT, + "Setting up EPS%s: req at %llx, rsp at %llx, entries %d cycles %ld", + eps_name(sdev, eps_num), lconfig.request, lconfig.response, + lconfig.entries, sizeof(lconfig)/sizeof(u32)); + + + seq_num = 0; + for (i = 0; i < config_cycle_count; i++) { + set.x.head = set.x.tail = ++seq_num; + set.x.data = ((u32 *)(config))[i]; + tries = 0; + timeout_time = jiffies + timeout; + do { + tries++; + eps_mailbox_write_data(sdev, eps_num, set.raw); + get.raw = eps_mailbox_read_data(sdev, eps_num); + } while (((get.x.head != seq_num) || (get.x.tail != seq_num)) && + get.raw != MAILBOX_IN_ERROR && + time_is_after_jiffies(timeout_time)); + if (get.raw == MAILBOX_IN_ERROR && time_is_after_jiffies(timeout_time)) { + sif_log(sdev, SIF_INFO, + "Writing config data failed before timeout - retrying..."); + goto eps_reset; + } else if (seq_num > 0xa && time_is_before_eq_jiffies(timeout_time)) { + config_cycle_count = i; + sif_log(sdev, SIF_INFO, + "Unable to get part %d (%lld tries) - old firmware? - retrying...", + i, tries); + goto eps_reset; + } else if (set.x.data != get.x.data || time_is_before_eq_jiffies(timeout_time)) { + sif_log(sdev, SIF_INFO, + "Failed during init sequence for EPS%s, part %d (%lld tries) set %llx get %llx, expected seq %x %s", + eps_name(sdev, eps_num), i, tries, set.raw, get.raw, seq_num, + (time_is_before_jiffies(timeout_time) ? "[timeout]" : "")); + ret = -EIO; + goto epsc_failed; + } + } + + sdev->es[eps_num].timeout = timeout_time; + + /* Set storage for this initial request manually before polling */ + es->cqe[0] = &lcqe; + + /* At this point we expect to have a valid response in the first position: */ + ret = sif_eps_poll_cqe(sdev, eps_num, 0, &lcqe); + if (ret) { + goto epsc_failed; + } + /* We are up and running with the EPSC, figure out what + * this firmware offers. + */ + + /* in protocol version 2 bits 16-31 of the response sequence number contain + * an ID the driver has to provide in requests + */ + es->mbox_id = (lrsp.seq_num >> 16) & 0xffff; + + memcpy(&es->ver, &lrsp.data, sizeof(lrsp.data)); + + /* The addr field now contains the number of available event queues from this EPS */ + es->eqs.max_cnt = lrsp.addr & 0xffff; + /* minimum number of async EPSC EQ entries per port is in the higher 16 bits + * and is an offset to 16 + */ + es->eqs.min_sw_entry_cnt = ((lrsp.addr >> 16) & 0xffff) + 16; + + /* PSIF has flagged that it is running in degraded mode */ + if (lrsp.info & PSIF_INFO_FLAG_DEGRADED) { + sif_log(sdev, SIF_INFO, "PSIF device is degraded"); + sdev->degraded = true; + } + + if (sif_cq_eq_max < 1) + sif_cq_eq_max = 1; /* Adjust - need at least 1 completion event queue */ + + /* We only allocate resources for these */ + es->eqs.cnt = min_t(ulong, es->eqs.max_cnt, sif_cq_eq_max + 2); + + ret = sif_eps_api_version_ok(sdev, eps_num); + if (ret) + goto epsc_failed; + + /* APIs are ok - now request, report and possibly + * validate epsc firmware (build) version info + */ + ret = sif_eps_firmware_version_ok(sdev, eps_num); + if (ret) + goto epsc_failed; + +#if defined(CONFIG_ARCH_DMA_ADDR_T_64BIT) && defined(__sparc__) + /* The kernel is currently using iommu bypass mode in the sparc iommu, and + * the PSIF MMU requires a fixed configuration of the upper 12 bits of the + * DMA addresses: we need bit 63 set in all GVA2GPA accesses. 
+ */ + { + u16 upper_12 = sif_mem_dma(rsp_tp->mem, 0) >> PSIF_TABLE_PTR_SHIFT; + + ret = epsc_set_mmu_upper(sdev, upper_12); + if (ret) + goto epsc_failed; + } +#endif + + /* Interrupt setup */ + if (eps_num == sdev->mbox_epsc) { + ret = sif_enable_msix(sdev); + if (ret) + goto epsc_failed; + } + + /* Set up the event queues as a special case here */ + ret = sif_eq_init(sdev, es, &lrsp); + if (ret) + goto epsc_eq_init_failed; + + if (sif_eps_log_size) + ret = sif_eps_log_ctrl(sdev, eps_num, EPSC_LOG_MODE_HOST, sif_eps_log_level); + if (ret) + goto epsc_log_ctrl_failed; + + eps_set_state(sdev, eps_num, ES_ACTIVE); + return ret; + + +epsc_log_ctrl_failed: + sif_eq_deinit(sdev, es); +epsc_eq_init_failed: + if (eps_num == sdev->mbox_epsc) + sif_disable_msix(sdev); +epsc_failed: + sif_unmap_ctx(sdev, &rsp_tp->mmu_ctx); +err_map_ctx: + kfree(es->cqe); + return ret; +} + + +int sif_eps_deinit(struct sif_dev *sdev, enum sif_tab_type rsp_type) +{ + enum psif_mbox_type eps_num = sif_tab2mbox(sdev, rsp_type); + struct sif_eps *es = &sdev->es[eps_num]; + struct sif_table *rsp_tp = &sdev->ba[rsp_type]; + struct psif_epsc_csr_req req; + struct psif_epsc_csr_rsp rsp; + + if (es->data->log.size) + sif_eps_log_ctrl(sdev, eps_num, EPSC_LOG_MODE_BUFFER, sif_eps_log_level); + sif_eq_deinit(sdev, es); + + if (eps_num == sdev->mbox_epsc) + sif_disable_msix(sdev); + + /* Note that beyond this point the EQs no longer exists so we need to use poll + * mode for the remaining epsc communication. + */ + + /* Flush TLB for old FW version. On current FW versions this is done + * automatically by FW. + * During takedown TLB invalidate is not generally possible since it requires + * working privileged QPs. Instead flush the whole TLB in one go. + */ + if (!eps_fw_version_ge(es, 0, 54) && !sdev->is_vf) + sif_flush_tlb(sdev); + + /* Tell the EPSC that we have terminated cleanly: */ + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_TEARDOWN; + sif_epsc_wr_poll(sdev, &req, &rsp); + + sif_unmap_ctx(sdev, &rsp_tp->mmu_ctx); + kfree(es->cqe); + + return 0; +} + + +#define epsc_seq(x) (x & 0x7fff) + +/* process any queued responses from the EPS + * Return the number processed, or -errno upon errors: + * assumes es->lock is held + */ +static inline int __eps_process_cqe(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + struct sif_eps *es = &sdev->es[eps_num]; + int ret = 0; + int rsp_cnt = 0; + u64 seq_num_expected, seq_num; + u32 idx; + u16 ql; + struct psif_epsc_csr_rsp *cqe; + struct sif_eps_cqe *lcqe; + + for (;;) { + seq_num_expected = es->first_seq | CSR_ONLINE_MASK; + idx = es->first_seq & es->mask; + cqe = get_eps_csr_rsp(sdev, eps_num, idx); + seq_num = be64_to_cpu((volatile u64)(cqe->seq_num)) & 0xffff; + + if (seq_num != seq_num_expected) + break; + lcqe = es->cqe[idx]; + if (lcqe) { + rmb(); + sif_log(sdev, SIF_EPS, "copying to caller rsp at %p", lcqe->rsp); + copy_conv_to_sw(lcqe->rsp, cqe, sizeof(struct psif_epsc_csr_rsp)); + if (lcqe->rsp->status != EPSC_SUCCESS && sif_feature(pcie_trigger)) + force_pcie_link_retrain(sdev); + rsp_cnt++; + __epsc_complete(sdev, eps_num, idx); + es->cqe[idx] = NULL; + } + ql = atomic_dec_return(&es->cur_reqs); + es->first_seq = (es->first_seq + 1) & ~CSR_ONLINE_MASK; + ret++; + } + if (ret < 0) + sif_log(sdev, SIF_INFO, "failed with status %d", ret); + else if (ret > 0) { + sif_log(sdev, SIF_EPS, + "processed %d (%d with resp) requests - first_seq 0x%x, oustanding %d", + ret, rsp_cnt, es->first_seq, atomic_read(&es->cur_reqs)); + mb(); + } + + __sif_eps_send_keep_alive(sdev, 
eps_num, false); + + return ret; +} + + +static int eps_process_cqe(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + int ret; + unsigned long flags; + struct sif_eps *es = &sdev->es[eps_num]; + + spin_lock_irqsave(&es->lock, flags); + ret = __eps_process_cqe(sdev, eps_num); + spin_unlock_irqrestore(&es->lock, flags); + return ret; +} + + +static void eps_reset_cmpl(struct sif_dev *sdev, u16 seq_num, enum psif_mbox_type eps_num) +{ + struct sif_eps *es = &sdev->es[eps_num]; + struct sif_table *t = &sdev->ba[sif_mbox2rsp_tab(sdev, eps_num)]; + u16 idx = seq_num % t->entry_cnt; + unsigned long flags; + + /* Protect against nil'ing it while anyone accessing cqe */ + spin_lock_irqsave(&es->lock, flags); + es->cqe[idx] = NULL; + spin_unlock_irqrestore(&es->lock, flags); +} + + +/* Asynchronous post of an EPS work request. + * returns nonzero if there is no more room + * in completion queue for a new entry. + * If seq_num is nonzero, the caller is expected to handle the + * completion using sif_epsc_poll_cqe, otherwise the entry is marked as + * "response ignored by the caller". + * If wait is set, post with flag EPSC_FL_NOTIFY to receive an interrupt from the eps: + * + */ +static int __sif_post_eps_wr(struct sif_dev *sdev, enum psif_mbox_type eps_num, + struct psif_epsc_csr_req *lreq, u16 *seq_num, + struct sif_eps_cqe *lcqe, bool wait) +{ + struct psif_epsc_csr_req *req; + struct sif_table *t = &sdev->ba[sif_mbox2rsp_tab(sdev, eps_num)]; + struct sif_eps *es = &sdev->es[eps_num]; + u32 idx; + union sif_mailbox lmbx; + u16 cur_reqs; + u16 limit = in_interrupt() ? t->entry_cnt : es->lowpri_lim; + unsigned long timeout = sdev->min_resp_ticks * 8; + int ret = 0; + bool waiting = false; + + es->timeout = jiffies + timeout; +restart: + + if (atomic_read(&es->cur_reqs)) { + /* Make sure emptying the queue takes preference over filling it up: */ + ret = __eps_process_cqe(sdev, eps_num); + + if (ret > 0) + ret = 0; /* Got some rsps */ + else if (ret < 0) + return ret; + } + + /* Allocate a new seq.number */ + cur_reqs = atomic_inc_return(&es->cur_reqs); + if (cur_reqs > limit) { + u16 tried_seq_num = (es->last_seq + 1) & ~CSR_ONLINE_MASK; + + atomic_dec(&es->cur_reqs); + if (!waiting) + atomic_inc(&es->waiters); + if (es->first_seq != es->last_full_seq) { + sif_log(sdev, SIF_INFO_V, + "req.queue full: seq %d, first %d, cur_reqs %d, %slimit %d, epsc_req_size is %d", + tried_seq_num, es->first_seq, cur_reqs, + (in_interrupt() ? "" : "(low pri) "), limit, t->entry_cnt); + es->last_full_seq = es->first_seq; + } + + + if (in_interrupt()) { + /* Only the EVENT_INDEX updates are sent from interrupt level and + * they are high pri, and should have reserved space: + */ + sif_log(sdev, SIF_INFO, + "Warning: Interrupt level EPSC req. 
while over limit (%d/%d), tried seq %d!", + cur_reqs, limit, tried_seq_num); + sif_logs(SIF_INFO, write_struct_psif_epsc_csr_req(NULL, 0, lreq)); + return -EFAULT; + } + + if (time_is_after_jiffies(es->timeout)) + goto restart; + else { + sif_log(sdev, SIF_INFO, + "Timeout waiting for previous response (seq %d) to complete", + es->first_seq); + return -EAGAIN; + } + } + if (waiting) + atomic_dec(&es->waiters); + + if (cur_reqs > es->max_reqs) + es->max_reqs = cur_reqs; + + es->last_seq = (es->last_seq + 1) & ~CSR_ONLINE_MASK; + idx = es->last_seq & es->mask; + req = get_eps_csr_req(sdev, eps_num, idx); + + lreq->seq_num = es->last_seq | CSR_ONLINE_MASK; + if (wait) { + /* Request interrupt upon completion */ + lreq->flags |= EPSC_FL_NOTIFY; + } + + /* Tell where to copy the completion upon arrival: */ + es->cqe[idx] = lcqe; + if (lcqe) { + sif_log(sdev, SIF_EPS, "set cqe[%d] = %p", idx, lcqe); + + /* set the software host order copy seq_num to something useful for comparison + * in the poll routines: + */ + lcqe->rsp->seq_num = get_psif_epsc_csr_req__seq_num(req); + lcqe->need_complete = wait; + } + wmb(); + sif_log(sdev, SIF_EPS, "opcode %s seq.%d to addr %p %s", + string_enum_psif_epsc_csr_opcode(lreq->opcode), + es->last_seq, req, (wait ? "wait" : "")); + + /* Update hw accessible req */ + copy_conv_to_hw(req, lreq, sizeof(struct psif_epsc_csr_req)); + + /* Doorbell - notify hw */ + lmbx.x.head = CSR_ONLINE_MASK | lreq->seq_num; + if (es->ver.seq_set_proto == 2) { + lmbx.x.tail = es->mbox_id; + lmbx.x.data = lreq->opcode; + } else { + lmbx.x.tail = lmbx.x.head; + lmbx.x.data = 0x5a5a5a5a; /* Not used - just an easy recognizable pattern */ + } + eps_mailbox_write(sdev, eps_num, lmbx.raw); + + if (seq_num) + *seq_num = es->last_seq; + return ret; +} + +/* Asynchronous post of an EPS work request. + * returns nonzero if there is no more room + * in completion queue for a new entry. + * If seq_num is nonzero, the caller is expected to handle the + * completion using sif_epsc_poll_cqe, otherwise the entry is marked as + * "response ignored by the caller". + * If wait is set, post with flag EPSC_FL_NOTIFY to receive an interrupt from the eps: + * + */ +int sif_post_eps_wr(struct sif_dev *sdev, enum psif_mbox_type eps_num, + struct psif_epsc_csr_req *lreq, u16 *seq_num, + struct sif_eps_cqe *lcqe, bool wait) +{ + struct sif_eps *es = &sdev->es[eps_num]; + unsigned long flags; + int ret; + + spin_lock_irqsave(&es->lock, flags); + ret = __sif_post_eps_wr(sdev, eps_num, lreq, seq_num, lcqe, wait); + spin_unlock_irqrestore(&es->lock, flags); + return ret; +} + +int sif_post_epsc_wr(struct sif_dev *sdev, struct psif_epsc_csr_req *lreq, + u16 *seq_num, struct sif_eps_cqe *lcqe, bool wait) +{ + return sif_post_eps_wr(sdev, sdev->mbox_epsc, lreq, seq_num, lcqe, wait); +} + + +/* Poll waiting for response on request seq_num. 
+ * Polls for different completions may be executing this code in parallel: + */ +int sif_eps_poll_cqe(struct sif_dev *sdev, enum psif_mbox_type eps_num, + u16 seq_num, struct sif_eps_cqe *lcqe) +{ + struct sif_eps *es = &sdev->es[eps_num]; + int ret = 0; + ulong timeout = sdev->min_resp_ticks * 8; + int npolled = 0; + + es->timeout = jiffies + timeout; + while (seq_num != get_eps_mailbox_seq_num(lcqe->rsp)) { + ret = eps_process_cqe(sdev, eps_num); + if (ret < 0) + goto out; + + if (time_is_before_eq_jiffies(es->timeout)) { + if (sif_feature(pcie_trigger)) + force_pcie_link_retrain(sdev); + + sif_log(sdev, SIF_INFO, + "No response for req %#x from EPS (rsp->seq_num 0x%x) in %ld ms - #reqs outstanding %d", + seq_num, get_eps_mailbox_seq_num(lcqe->rsp), timeout, + atomic_read(&es->cur_reqs)); + ret = -ETIMEDOUT; + goto out; + } + cpu_relax(); + npolled += ret; + } + + ret = eps_status_to_err(lcqe->rsp->status); + + /* We got something, reset the timeout for all waiters */ + es->timeout = jiffies + timeout; +out: + if (ret < 0) { + int log_level = lcqe->rsp->opcode == EPSC_MODIFY_QP ? SIF_QPE : SIF_INFO; + + if (sif_feature(pcie_trigger)) + force_pcie_link_retrain(sdev); + if (ret != -ETIMEDOUT) + sif_log(sdev, log_level, + "Error response (%s) for req 0x%x from EPS (errno %d)", + string_enum_psif_epsc_csr_status(lcqe->rsp->status), + get_eps_mailbox_seq_num(lcqe->rsp), ret); + eps_reset_cmpl(sdev, seq_num, eps_num); + } else + sif_log(sdev, SIF_EPS, "seq 0x%x polled", seq_num); + return ret; +} + + +int sif_epsc_poll_cqe(struct sif_dev *sdev, u16 seq_num, struct sif_eps_cqe *lcqe) +{ + return sif_eps_poll_cqe(sdev, sdev->mbox_epsc, seq_num, lcqe); +} + + +/* Wait up to @timeout ticks for an earlier posted event + * with ID @seq_num to complete + */ +static int eps_waitfor_timeout(struct sif_dev *sdev, enum psif_mbox_type eps_num, + u16 seq_num, unsigned long timeout, + struct sif_eps_cqe *lcqe) +{ + struct completion *cmpl = &lcqe->cmpl; + unsigned long rem_time, wait_time; + volatile struct psif_epsc_csr_rsp *rsp = lcqe->rsp; + int ret; + unsigned int attempts = 4; + + + rem_time = wait_time = timeout/attempts; + for (;;) { + ret = eps_process_cqe(sdev, eps_num); + if (ret < 0) + goto out; + + if (get_eps_mailbox_seq_num(rsp) != seq_num) { + rem_time = wait_for_completion_interruptible_timeout(cmpl, rem_time); + if (!rem_time) { + rem_time = wait_time; + if (!--attempts) { + sif_log(sdev, SIF_INFO, "req %u timed out after %ld ms", + seq_num, timeout); + ret = -ETIMEDOUT; + goto out; + } + } + continue; + } + break; + } + + ret = eps_status_to_err(rsp->status); +out: + if (ret < 0) { + if (ret != -ETIMEDOUT) { + sif_log(sdev, SIF_INFO, + "Error response (%s) for req 0x%x from EPS", + string_enum_psif_epsc_csr_status(rsp->status), + get_eps_mailbox_seq_num(rsp)); + } + eps_reset_cmpl(sdev, seq_num, eps_num); + } + return ret; +} + +/* Wait for an earlier posted request with ID @seq_num to complete + */ +static int eps_waitfor(struct sif_dev *sdev, enum psif_mbox_type eps_num, + u16 seq_num, struct sif_eps_cqe *cqe) +{ + ulong timeout = sdev->min_resp_ticks * (1 + atomic_read(&sdev->es[eps_num].cur_reqs)) * 8; + + return eps_waitfor_timeout(sdev, eps_num, seq_num, timeout, cqe); +} + +int sif_epsc_waitfor(struct sif_dev *sdev, u16 seq_num, + struct sif_eps_cqe *cqe) +{ + return eps_waitfor(sdev, sdev->mbox_epsc, seq_num, cqe); +} + +/* Synchronous post of an EPS work request. + * Will wait until request completes and return the completion + * notification. Uses EPSC interrupts for wakeup. 
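+ * Typical use: zero a struct psif_epsc_csr_req on the stack, fill in the
+ * opcode and any operands, and pass a struct psif_epsc_csr_rsp that receives
+ * a host order copy of the completion (see e.g. sif_eps_log_ctrl() above for
+ * the polling variant of this pattern).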
+ */ + +int sif_eps_wr(struct sif_dev *sdev, enum psif_mbox_type eps_num, + struct psif_epsc_csr_req *req, struct psif_epsc_csr_rsp *cqe) +{ + u16 seq_num; + int ret; + struct sif_eps_cqe lcqe; + + lcqe.rsp = cqe; + init_completion(&lcqe.cmpl); +restart: + ret = sif_post_eps_wr(sdev, eps_num, req, &seq_num, &lcqe, true); + if (ret) + return ret; + + ret = eps_waitfor(sdev, eps_num, seq_num, &lcqe); + if (ret == -EAGAIN) { + sif_log(sdev, SIF_EPS, "EPS%s requests retry for req# %d", + eps_name(sdev, eps_num), seq_num); + goto restart; + } + sif_log(sdev, SIF_EPS, "Received EPS%s completion for req# %d", + eps_name(sdev, eps_num), seq_num); + return ret; +} + + +int sif_epsc_wr(struct sif_dev *sdev, struct psif_epsc_csr_req *req, + struct psif_epsc_csr_rsp *cqe) +{ + return sif_eps_wr(sdev, sdev->mbox_epsc, req, cqe); +} + + +/* Same as sif_eps_wr but poll for completion */ +int sif_eps_wr_poll(struct sif_dev *sdev, enum psif_mbox_type eps_num, + struct psif_epsc_csr_req *req, struct psif_epsc_csr_rsp *cqe) +{ + u16 seq_num; + int ret; + struct sif_eps_cqe lcqe; + + lcqe.rsp = cqe; +restart: + ret = sif_post_eps_wr(sdev, eps_num, req, &seq_num, &lcqe, false); + if (ret) + return ret; + + ret = sif_eps_poll_cqe(sdev, eps_num, seq_num, &lcqe); + if (ret == -EAGAIN) { + sif_log(sdev, SIF_EPS, "EPS%s requests retry for req# %d", + eps_name(sdev, eps_num), seq_num); + goto restart; + } + if (!ret) + sif_log(sdev, SIF_EPS, "Received EPS%s completion for req# %d", + eps_name(sdev, eps_num), seq_num); + return ret; +} + +int sif_epsc_wr_poll(struct sif_dev *sdev, struct psif_epsc_csr_req *req, + struct psif_epsc_csr_rsp *rsp) +{ + return sif_eps_wr_poll(sdev, sdev->mbox_epsc, req, rsp); +} + + + +/* EPS-A support */ +int sif_activate_epsa(struct sif_dev *sdev, enum psif_mbox_type eps_num) +{ + enum sif_tab_type type = epsa0_csr_req + (eps_num * 2); + + /* First initiate communication protocol with the EPS# */ + int ret = sif_table_init(sdev, type); + + if (ret) + return ret; + ret = sif_table_init(sdev, type + 1); + if (ret) + return ret; + + /* The rest of the init operations does not involve any memory setup, + * it just communicates the table base pointers setup up with the EPSC + * on to the EPSA. 
+ */ + + /* Only key (DMA validation) is needed so far */ + ret = sif_table_update(sdev, eps_num, key); + return ret; +} + +inline bool sif_eps_keep_alive_timeout(struct sif_eps *es) +{ + return time_is_before_jiffies(es->last_req_posted + es->keepalive_interval); +} + + +static int __sif_eps_send_keep_alive(struct sif_dev *sdev, enum psif_mbox_type eps_num, + bool force) +{ + struct psif_epsc_csr_req req; + struct sif_eps *es = &sdev->es[eps_num]; + int ret = 0; + + if (sif_eps_keep_alive_timeout(es) || force) { + sif_log(sdev, SIF_INFO, "Sending keep-alive (force=%i)", force); + + /* prevent infinite loop with __sif_post_eps_wr */ + es->last_req_posted = jiffies; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_KEEP_ALIVE; + ret = __sif_post_eps_wr(sdev, eps_num, &req, NULL, NULL, false); + } + return ret; +} + +int sif_eps_send_keep_alive(struct sif_dev *sdev, enum psif_mbox_type eps_num, + int force) +{ + struct sif_eps *es = &sdev->es[eps_num]; + unsigned long flags; + int ret; + + spin_lock_irqsave(&es->lock, flags); + ret = __sif_eps_send_keep_alive(sdev, eps_num, force); + spin_unlock_irqrestore(&es->lock, flags); + return ret; +} + +/**** Low level mailbox handling ****/ + +u64 eps_mailbox_read(struct sif_dev *sdev, u8 epsno) +{ + return be64_to_cpu(__raw_readq(&sdev->eps->eps[epsno].out)); +} + +void eps_mailbox_write(struct sif_dev *sdev, u8 epsno, u64 value) +{ + sdev->es[epsno].last_req_posted = jiffies; + wmb(); + __raw_writeq(cpu_to_be64(value), &sdev->eps->eps[epsno].in); + wmb(); +} + +u64 eps_mailbox_read_data(struct sif_dev *sdev, u8 epsno) +{ + union sif_mailbox set; + + set.raw = eps_mailbox_read(sdev, epsno); + if (sdev->es[epsno].ver.seq_set_proto <= 1) + set.x.data = le32_to_cpu(set.x.data); + else + set.x.data = be32_to_cpu(set.x.data); + return set.raw; +} + +void eps_mailbox_write_data(struct sif_dev *sdev, u8 epsno, u64 value) +{ + union sif_mailbox set; + + set.raw = value; + if (sdev->es[epsno].ver.seq_set_proto <= 1) + set.x.data = cpu_to_le32(set.x.data); + else + set.x.data = cpu_to_be32(set.x.data); + value = set.raw; + eps_mailbox_write(sdev, epsno, value); +} + + +/**** High level synchronous CSR operations */ + +/* Read a 64 bit CSR register */ +static u64 read_csr(struct sif_dev *sdev, u32 addr, bool local) +{ + struct psif_epsc_csr_rsp resp; + struct psif_epsc_csr_req req; + int ret; + + memset(&req, 0, sizeof(req)); + req.opcode = local ? EPSC_GET_SINGLE : EPSC_GET_ONE_CSR; + req.addr = addr; + + ret = sif_epsc_wr_poll(sdev, &req, &resp); + if (ret) + return -1; + + sif_log(sdev, SIF_CSR, "%s address 0x%x value 0x%llx", + (local ? "UF local" : "global"), addr, resp.data); + return resp.data; +} + +/* Write a 64 bit EPS CSR register. Only valid for old FW. 
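+ * (newer firmware is driven via EPSC_SET requests instead - see
+ * epsc_set_mmu_upper(), which falls back to this path only on old firmware).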
*/ +static int write_csr(struct sif_dev *sdev, u32 addr, u64 val) +{ + struct psif_epsc_csr_rsp resp; + struct psif_epsc_csr_req req; + int ret; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_SET_ONE_CSR; + req.addr = addr; + req.u.single.data = val; + sif_log(sdev, SIF_CSR, "write address 0x%x value 0x%llx", + addr, val); + + ret = sif_epsc_wr_poll(sdev, &req, &resp); + if (ret) + return ret; + return ret; +} + + +/* Read a 64 bit CSR register (local UF mapping) */ +u64 sif_read_local_csr(struct sif_dev *sdev, u32 addr) +{ + return read_csr(sdev, addr, true); +} + +/* Read a 64 bit CSR register (global PSIF mapping - uf 0 only) */ +u64 sif_read_global_csr(struct sif_dev *sdev, u32 addr) +{ + return read_csr(sdev, addr, false); +} + +/* Write a 64 bit EPS CSR register (global PSIF mapping - uf 0 only) */ +int sif_write_global_csr(struct sif_dev *sdev, u32 addr, u64 val) +{ + return write_csr(sdev, addr, val); +} + + +/* Helper for dfs iteration */ +int sif_eps_next_used(struct sif_table *table, int index) +{ + struct sif_dev *sdev = table->sdev; + enum psif_mbox_type eps_num = sif_tab2mbox(sdev, table->type); + struct sif_eps *es = &sdev->es[eps_num]; + int first, last; + + first = es->first_seq & es->mask; + last = es->last_seq & es->mask; + + if (es->first_seq == es->last_seq + 1) + return -1; + if (first <= last) { + if (index <= first) + return first; + if (index > last) + return -1; + } else { + if (index >= table->entry_cnt) + return -1; + if (index > last && index < first) + return first; + } + return index; +} + + +static void sif_dfs_print_eps(struct seq_file *s, struct sif_dev *sdev, + loff_t pos, enum psif_mbox_type eps_num) +{ + struct psif_epsc_csr_req *req; + struct psif_epsc_csr_rsp *rsp; + struct sif_eps *es = &sdev->es[eps_num]; + u16 seq, rsp_seq; + + if (unlikely(pos < 0)) { + u32 sz = sdev->ba[epsc_csr_req].entry_cnt; + + seq_printf(s, + "# EPS%s Request queue, outstanding %d/%d max.%d waiters %d first/last seq. 
%d/%d\n" + "# %6s %15s %8s %15s %6s\n", + eps_suffix(sdev, eps_num), atomic_read(&es->cur_reqs), + sz, es->max_reqs, atomic_read(&es->waiters), + es->first_seq, es->last_seq, + "Entry", "req.opcode", "req.seq", "rsp.opcode", "rsp.seq"); + return; + } + + req = get_eps_csr_req(sdev, eps_num, pos); + seq = get_psif_epsc_csr_req__seq_num(req) & ~CSR_ONLINE_MASK; + + /* Correlate to response queue */ + rsp = get_eps_csr_rsp(sdev, eps_num, pos); + rsp_seq = get_psif_epsc_csr_rsp__seq_num(rsp) & ~CSR_ONLINE_MASK; + + seq_printf(s, "%8lld %15s %8d %15s %8d\n", pos, + string_enum_psif_epsc_csr_opcode(get_psif_epsc_csr_req__opcode(req)) + 5, + seq, + string_enum_psif_epsc_csr_opcode(get_psif_epsc_csr_rsp__opcode(rsp)) + 5, + rsp_seq); +} + + +void sif_dfs_print_epsc(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + sif_dfs_print_eps(s, sdev, pos, sdev->mbox_epsc); +} + +void sif_dfs_print_epsa0(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + sif_dfs_print_eps(s, sdev, pos, MBOX_EPSA0); +} + +void sif_dfs_print_epsa1(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + sif_dfs_print_eps(s, sdev, pos, MBOX_EPSA1); +} + +void sif_dfs_print_epsa2(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + sif_dfs_print_eps(s, sdev, pos, MBOX_EPSA2); +} + +void sif_dfs_print_epsa3(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + sif_dfs_print_eps(s, sdev, pos, MBOX_EPSA3); +} + +void epsc_report_degraded(struct sif_dev *sdev, u64 cause_mask) +{ + unsigned int cause; + + for (cause = 0; cause < 64; cause++) { + if ((1L << cause) & cause_mask) { + sif_log(sdev, SIF_INFO, "Device reports degraded cause %s", + string_enum_psif_epsc_degrade_cause((enum psif_epsc_degrade_cause)cause)); + } + } +} diff --git a/drivers/infiniband/hw/sif/sif_epsc.h b/drivers/infiniband/hw/sif/sif_epsc.h new file mode 100644 index 0000000000000..4a7c6682cbab5 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_epsc.h @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_epsc.h: API for communication with the EPSC (and EPS-A's) + */ + +#ifndef __SIF_EPSC_H +#define __SIF_EPSC_H +#include +#include +#include +#include "sif_eq.h" +#include "psif_hw_data.h" + +struct sif_dev; +struct sif_table; +struct psif_epsc_csr_req; +struct psif_epsc_csr_rsp; + +struct sif_epsc_data; /* sif_query.h */ +enum psif_mbox_type; /* psif_hw_data.h */ +enum sif_tab_type; /* sif_dev.h */ + +/* Max number of strings (including final NULL) + * we expect from the firmware version details: + */ +enum sif_eps_fw_info_idx { + FWV_EPS_REV_STRING, + FWV_EPS_GIT_REPO, + FWV_EPS_GIT_LAST_COMMIT, + FWV_EPS_GIT_STATUS, + FWV_EPS_BUILD_USER, + FWV_EPS_BUILD_GIT_TIME, + FWV_PSIF_GIT_REPO, + FWV_PSIF_GIT_COMMIT, + FWV_PSIF_GIT_STATUS, + FWV_MAX +}; + + +struct eps_version_data { +#ifdef __LITTLE_ENDIAN + u16 epsc_minor; + u16 epsc_major; + u16 psif_minor; + u16 psif_major; +#else + u16 psif_major; + u16 psif_minor; + u16 epsc_major; + u16 epsc_minor; +#endif + u16 fw_minor; + u16 fw_major; + int seq_set_proto; /* Protocol version of the initial setup meta protocol (0 == legacy) */ + struct psif_epsc_csr_config nb_cfg; /* "Network" byte order config storage (see #3804) */ + char *fw_version[FWV_MAX]; +}; + + +enum sif_eps_state { + ES_NOT_RUNNING, /* EPS core thread not started */ + ES_RUNNING, /* EPS core thread started but comm.protocol not initiated */ + ES_INIT, /* Driver is working to set up tables with this EPS */ + ES_ACTIVE /* Communication with this EPS is up and running */ +}; + + +struct sif_eps_cqe { + struct psif_epsc_csr_rsp *rsp; /* process_cqe places a host order copy of the response here */ + struct completion cmpl; /* a completion to wait on for response */ + bool need_complete; /* req was posted with EPSC_FL_NOTIFY */ +}; + + +#define EPS_TAG_FROM_HOST 0x8000 + +#define MAX_LOGDEVNAME 32 + +/* Internal bookkeeping for sif_epsc.c/h: */ +struct sif_eps { + struct sif_dev *sdev; + enum psif_mbox_type eps_num; /* Which EPS this is */ + enum sif_eps_state state; /* Current state of the EPS */ + struct eps_version_data ver; /* Minor/major version info of the epsc firmware */ + spinlock_t lock;/* Serializes CPU access to the epsc hw and sw resources */ + volatile u16 last_seq; /* Last used sequence number */ + volatile u16 first_seq; /* First sequence number not seen any completion on */ + u16 mask; /* req/rsp table sz - 1 */ + u16 max_reqs; /* Max outstanding reqs seen */ + u16 lowpri_lim; /* Max number of outstanding low priority reqs */ + u16 last_full_seq; /* notify when queue full was last logged to avoid repeating logs */ + u16 mbox_id; /* ID of the mailbox as provided by EPS */ + atomic_t cur_reqs; /* current outstanding req count */ + atomic_t waiters; /* Number of threads waiting for a slot in the queue */ + unsigned long timeout; /* EPSC resp timeout - rescheduled when new completions observed */ + unsigned long keepalive_interval; /* how long to wait before sending a keepalive */ + unsigned long last_req_posted; /* time the last request was posted */ + struct sif_eps_cqe **cqe; /* An of caller owned pointers indexed by req.index */ + struct sif_epsc_data *data; /* Ptr to data recv area for EPS/SMA queries */ + dma_addr_t data_dma_hdl; /* DMA address of data area for query device/port etc. 
*/ + struct sif_eq_base eqs; /* Setup of event queues */ + + /* log redirection support: */ + struct miscdevice logdev; /* Device for log rederect from the EPS, if enabled */ + struct file_operations logdev_ops; + char logdevname[MAX_LOGDEVNAME]; + bool log_redir_en; /* Set if log is currently redirected */ + atomic_t logdev_use; + struct completion logdev_more_log; /* elog reader will block on this one */ +}; + +/**** Low level mailbox handling ****/ +u64 eps_mailbox_read(struct sif_dev *sdev, u8 epsno); +void eps_mailbox_write(struct sif_dev *sdev, u8 epsno, u64 value); + +u64 eps_mailbox_read_data(struct sif_dev *sdev, u8 epsno); +void eps_mailbox_write_data(struct sif_dev *sdev, u8 epsno, u64 value); + +/* (De-)initialization necessary to communicate with the EPS */ +int sif_eps_init(struct sif_dev *sdev, enum sif_tab_type rsp_type); +int sif_eps_deinit(struct sif_dev *sdev, enum sif_tab_type rsp_type); + +const char *eps_name(struct sif_dev *sdev, enum psif_mbox_type eps_num); +const char *eps_suffix(struct sif_dev *sdev, enum psif_mbox_type eps_num); + +/* Convert EPSC status code to errno */ +int eps_status_to_err(enum psif_epsc_csr_status status); + +struct psif_epsc_csr_req *get_eps_csr_req(struct sif_dev *sdev, + enum psif_mbox_type eps_num, int index); + +struct psif_epsc_csr_rsp *get_eps_csr_rsp(struct sif_dev *sdev, + enum psif_mbox_type eps_num, int index); + +/* Returns true if this is the response table for any of the EPSes: */ +bool is_eps_rsp_tab(enum sif_tab_type type); + +/* Asynchronous post of an EPSC work request to psif. + * returns nonzero if #of outstanding requests + * exceed what the hardware offers or if there is no more room + * in completion queue for a new entry. + * if @seq_num is nonzero, the sequence number of the posted request will be placed there. + * If @lcqe is nonzero, a host endian copy of the response will be placed + * there when detected. + * + * If wait is set, it means that the epsc wr should be posted with + * flag EPSC_FL_NOTIFY to receive an interrupt from the epsc: + */ +int sif_post_eps_wr(struct sif_dev *sdev, enum psif_mbox_type eps_num, + struct psif_epsc_csr_req *lreq, u16 *seq_num, + struct sif_eps_cqe *lcqe, bool wait); + +int sif_post_epsc_wr(struct sif_dev *sdev, struct psif_epsc_csr_req *lreq, + u16 *seq_num, struct sif_eps_cqe *lcqe, bool wait); + +/* Get the seq.num from a epsc response in host order */ +u16 sif_epsc_get_seq(struct psif_epsc_csr_rsp *cqe); + +/* Wait up to @timeout ticks + * for an earlier posted request with ID @seq_num to complete + * return 0 if success, -errno else. @cqe will be populated with the response + * from the EPS. Uses EPSC interrupts for wakeup. + */ +int sif_epsc_waitfor_timeout(struct sif_dev *sdev, u16 seq_num, + unsigned long timeout, + struct sif_eps_cqe *cqe); + +/* Wait for an earlier posted request with ID @seq_num to complete + * return 0 if success, -errno else. @cqe will be populated with the response + * from the EPS. Uses EPSC interrupts for wakeup. + */ +int sif_epsc_waitfor(struct sif_dev *sdev, u16 seq_num, + struct sif_eps_cqe *cqe); + +/* Poll waiting for a response - in attach we cannot suspend or sleep.. + * return 0 if a successful operation, eg.EPSC_SUCCESS, + * otherwise a suitable -errno. @cqe will be populated with the response + * from the EPS + */ +int sif_epsc_poll_cqe(struct sif_dev *sdev, u16 seq_num, + struct sif_eps_cqe *cqe); + +/* Synchronous post of an EPSC work request. + * Will wait until request completes. 
@cqe will be populated with the response + * from the EPS. Return value: A suitable errno value that also captures the + * status code from the EPSC operation, if any. + */ +int sif_epsc_wr(struct sif_dev *sdev, struct psif_epsc_csr_req *req, + struct psif_epsc_csr_rsp *rsp); + +/* Same as sif_epsc_wr but poll for completion */ +int sif_epsc_wr_poll(struct sif_dev *sdev, struct psif_epsc_csr_req *req, + struct psif_epsc_csr_rsp *rsp); + +/* Generic EPS access (any EPS) */ +int sif_eps_wr(struct sif_dev *sdev, enum psif_mbox_type eps_num, + struct psif_epsc_csr_req *req, struct psif_epsc_csr_rsp *rsp); + +int sif_eps_wr_poll(struct sif_dev *sdev, enum psif_mbox_type eps_num, + struct psif_epsc_csr_req *req, struct psif_epsc_csr_rsp *rsp); + +int sif_eps_poll_cqe(struct sif_dev *sdev, enum psif_mbox_type eps_num, + u16 seq_num, struct sif_eps_cqe *lcqe); + +/* EPS-A support */ +int sif_activate_epsa(struct sif_dev *sdev, enum psif_mbox_type eps_num); + +/* Send a keep-alive request to an EPS */ +int sif_eps_send_keep_alive(struct sif_dev *sdev, enum psif_mbox_type eps_num, + int force); + +/**** High level synchronous CSR operations */ + +/* Read a 64 bit CSR register (local UF mapping) */ +u64 sif_read_local_csr(struct sif_dev *sdev, u32 addr); + +/* Read a 64 bit CSR register (global PSIF mapping - uf 0 only) */ +u64 sif_read_global_csr(struct sif_dev *sdev, u32 addr); + +/* Write a 64 bit EPS CSR register (global PSIF mapping - uf 0 only) */ +int sif_write_global_csr(struct sif_dev *sdev, u32 addr, u64 val); + +/* Helper for dfs iteration */ +int sif_eps_next_used(struct sif_table *table, int index); + +/* Sysfs entry printers */ +void sif_dfs_print_epsc(struct seq_file *s, struct sif_dev *sdev, + loff_t pos); +void sif_dfs_print_epsa0(struct seq_file *s, struct sif_dev *sdev, + loff_t pos); +void sif_dfs_print_epsa1(struct seq_file *s, struct sif_dev *sdev, + loff_t pos); +void sif_dfs_print_epsa2(struct seq_file *s, struct sif_dev *sdev, + loff_t pos); +void sif_dfs_print_epsa3(struct seq_file *s, struct sif_dev *sdev, + loff_t pos); + +/* completion invocation - called from sif_eq as result of epsc completion event processing */ +void epsc_complete(struct sif_dev *sdev, enum psif_mbox_type eps_num, int idx); + +/* Report cause for EPSC degraded mode */ +void epsc_report_degraded(struct sif_dev *sdev, u64 cause_mask); + +/* Set the SIF value to use for the 12 upper bits of a DMA address */ +int epsc_set_mmu_upper(struct sif_dev *sdev, u16 value); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_eq.c b/drivers/infiniband/hw/sif/sif_eq.c new file mode 100644 index 0000000000000..e52890dd27821 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_eq.c @@ -0,0 +1,1083 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_eq.c: Setup of event queues and interrupt handling + */ + +#include "sif_dev.h" +#include "sif_eq.h" +#include "sif_qp.h" +#include "sif_defs.h" +#include "sif_query.h" +#include "sif_base.h" +#include "sif_dma.h" +#include "sif_elog.h" +#include "sif_hwi.h" +#include "sif_ibqp.h" +#include "psif_hw_csr.h" +#include "psif_hw_setget.h" +#include + +static int sif_map_irq(struct sif_eq *eq); +static int sif_irq_coalesce(struct sif_eq *eq); + +static void sif_unmap_irq(struct sif_eq *eq); + +static int sif_eq_table_init(struct sif_dev *sdev, struct sif_eps *es, u16 eq_idx); +static void sif_eq_table_deinit(struct sif_dev *sdev, struct sif_eps *es, u16 eq_idx); + +static void sif_eq_deinit_tables(struct sif_dev *sdev, struct sif_eps *es); + +static int dispatch_eq(struct sif_eq *eq); + +static enum ib_event_type epsc2ib_event(struct psif_eq_entry *eqe); + +/* Work elements for dispatching events at non-interrupt level + */ +struct event_work { + struct work_struct ws; + struct ib_event ibe; + struct sif_eq *eq; +}; + +/* Define accessor functions - see sif_defs.h */ +sif_define_entry_funcs(eq, int) + +/* Set up the event queues using info about #of queues from the @cqe + * which contains a host byte order copy of the successful response + * to the configuration request to the EPS-C. + * The EPS-C event queue which receives the async events is always + * index 0 + */ +int sif_eq_init(struct sif_dev *sdev, struct sif_eps *es, struct psif_epsc_csr_rsp *cqe) +{ + int ret = 0; + int i; + int cnt; + struct sif_eq_base *eqb = &es->eqs; + struct sif_eq *eq; + + cnt = es->eqs.cnt; + sif_log(sdev, SIF_INIT, "setting up %d event queues for EPS%s", cnt, + eps_name(sdev, es->eps_num)); + + eq = (struct sif_eq *) + kzalloc(sizeof(struct sif_eq) * cnt, GFP_KERNEL); + if (!eq) + return -ENOMEM; + + eqb->eq = eq; + for (i = 0; i < cnt; i++) { + ret = sif_eq_table_init(sdev, es, i); + if (ret) { + eqb->cnt = i; + goto eqi_failed; + } + } + + eqb->cnt = cnt; + return 0; + +eqi_failed: + sif_eq_deinit_tables(sdev, es); + kfree(eqb->eq); + eqb->eq = NULL; + return ret; +} + + +static void sif_eq_deinit_tables(struct sif_dev *sdev, struct sif_eps *es) +{ + int i; + + for (i = es->eqs.cnt - 1; i >= 0; i--) + sif_eq_table_deinit(sdev, es, i); + es->eqs.cnt = 0; +} + + +void sif_eq_deinit(struct sif_dev *sdev, struct sif_eps *es) +{ + if (es->eqs.cnt > 0) + sif_eq_deinit_tables(sdev, es); + + kfree(es->eqs.eq); + es->eqs.eq = NULL; +} + +static int sif_set_affinity_mask_hint(struct sif_dev *sdev, struct sif_eq *eq) +{ + int numa_node = dev_to_node(&sdev->pdev->dev); + int cpu; + + if (!zalloc_cpumask_var(&eq->affinity_mask, GFP_KERNEL)) + return -ENOMEM; + + cpu = cpumask_local_spread(eq->index, numa_node); + cpumask_set_cpu(cpu, eq->affinity_mask); + return 0; +} + + +/* Bit field for #entries in hw is 5 bits wide */ +#define SIF_MAX_EQ_ENTRIES (1 << 0x1f) + +/* Set up of a single EQ requested by an EPS. + * This code is quite similar to base table setup in sif_base.c - sif_table_init + * but since we do not have the base_layout for each of these tables since + * we do not know the number of tables in advance, we cannot use the same code. 
+ * We also need separat accessor functions and use a dynamically allocated array + * of sif_eq objects with some more extra info in addition to the sif_table + */ +static int sif_eq_table_init(struct sif_dev *sdev, struct sif_eps *es, u16 eq_idx) +{ + struct sif_eq *eq = &es->eqs.eq[eq_idx]; + volatile struct psif_eq_entry *eqe; + struct sif_table *tp = &eq->ba; + int extent; /* As log2 */ + int ret = 0; + u32 min_entries, headroom; + + struct psif_epsc_csr_req req; /* local epsc wr copy */ + struct psif_epsc_csr_rsp resp; + + memset(eq, 0, sizeof(*eq)); + eq->eps = es; + eq->index = tp->type = eq_idx; /* We *reuse* type with a different meaning here */ + eq->next_seq = 0; + tp->sdev = sdev; + tp->ext_sz = roundup_pow_of_two(sizeof(struct psif_eq_entry)); + tp->is_eq = true; /* To distinguish namespace from other base tables */ + + /* Event queue sizes: It is critical that these are sized for worst case. + * The size of event queues used for completions must be large enough to + * receive at least one entry from each associated completion queue. + * The async event queue (queue 1) must be scaled to fit every possible event. + * See sec.36.2.3. Event Queue Sizing, page 361 in the PSIF PRM. + */ + + switch (eq_idx) { + case 0: /* Async + epsc events */ + headroom = sif_epsc_eq_headroom; + min_entries = es->eps_num == sdev->mbox_epsc ? + (sif_epsc_size + headroom + 2*es->eqs.min_sw_entry_cnt + 1) + : 64; + break; + case 1: + /* TSU - asynchronous events: */ + headroom = sif_tsu_eq_headroom; + min_entries = es->eps_num == sdev->mbox_epsc ? + 7 * sif_qp_size + 2 * sif_rq_size + sif_cq_size + 9 + headroom : 64; + break; + default: + /* completion notification events coming here + * TBD: We might want to scale the sizes of each of these queues and limit + * the number of CQs to handle by each of them instead: + */ + headroom = sif_tsu_eq_headroom; + min_entries = es->eps_num == sdev->mbox_epsc ? 
sif_cq_size + headroom : 64; + break; + } + + eq->entries = tp->entry_cnt = roundup_pow_of_two(min_entries); + eq->sw_index_interval = eq->entries - min_entries + headroom; + if (!eq->sw_index_interval) + eq->sw_index_interval = 1; /* Always update case */ + eq->sw_index_next_update = eq->sw_index_interval; + + if (eq->entries > SIF_MAX_EQ_ENTRIES) { + sif_log(sdev, SIF_INFO, + "requested %d entries but sif only supports %d", + eq->entries, SIF_MAX_EQ_ENTRIES); + return -ENFILE; /* 5 bit size_log2 field in eq descs in psif */ + } + + eq->mask = eq->entries - 1; + eq->extent = tp->ext_sz; + tp->table_sz = (size_t)tp->ext_sz * tp->entry_cnt; + extent = order_base_2(tp->ext_sz); + + sif_alloc_table(tp, tp->table_sz); + if (!tp->mem) { + sif_log(sdev, SIF_INIT, + "Failed to allocate 0x%lx bytes of memory for event queue table %d", + tp->table_sz, eq_idx); + return -ENOMEM; + } + + ret = sif_set_affinity_mask_hint(sdev, eq); + if (ret) + goto err_map_ctx; + + /* No MMU translations from EPS-C in PSIF Rev 2 or SIBS rev 1 */ + if (epsc_gva_permitted(sdev) && eq_idx == 0 && tp->mem->mem_type != SIFMT_BYPASS) { + sif_log(sdev, SIF_INFO, + "Rev 2.0 does not support MMU translations from EPS-C"); + ret = -EINVAL; + goto err_map_ctx; + } + + eq->mem = tp->mem; + + /* Make sure the initial value of entry 0's seq.no is is different from a real event */ + eqe = (struct psif_eq_entry *)get_eq_entry(eq, 0); + set_psif_eq_entry__seq_num(eqe, eq->entries); + + sif_log(sdev, SIF_INFO, + "Event queue %d: entry cnt %d (min.req.%d), ext sz %d, extent %d, sw_index_interval %d", + eq_idx, tp->entry_cnt, min_entries, tp->ext_sz, extent, eq->sw_index_interval); + sif_log(sdev, SIF_INIT, " - table sz 0x%lx %s sif_base 0x%llx", + tp->table_sz, sif_mem_type_str(tp->mem->mem_type), + tp->sif_base); + + spin_lock_init(&tp->lock); + + /* Set up HW descriptor */ + memset(&req, 0, sizeof(req)); + + req.opcode = EPSC_SET_BASEADDR_EQ; + req.u.base_addr.address = tp->sif_base; + req.u.base_addr.num_entries = tp->entry_cnt; + req.u.base_addr.extent_log2 = extent; + req.addr = eq_idx; /* The "CSR address" for this operation is the index of the queue */ + + /* Allocate mmu context with wr_access set */ + ret = sif_map_ctx(sdev, &tp->mmu_ctx, tp->mem, tp->sif_base, tp->table_sz, true); + if (ret) { + sif_log(sdev, SIF_INFO, "Failed to set mmu context for eq %d", + eq_idx); + goto err_map_ctx; + } + + /* Allocate an irq index */ + ret = sif_map_irq(eq); + if (ret) + goto err_map_irq; + + /* Pass the populated mmu context on to the EPS */ + req.u.base_addr.mmu_context = tp->mmu_ctx.mctx; + + req.u.base_addr.msix_index = eq->intr_vec; + + ret = sif_eps_wr_poll(sdev, es->eps_num, &req, &resp); + if (ret) + goto err_epsc_comm; + + /* Default interrupt channel coalescing settings */ + if (eq_idx != 0 && eps_version_ge(&sdev->es[sdev->mbox_epsc], 0, 36)) { + ret = sif_irq_coalesce(eq); + if (ret) + goto err_epsc_comm; + } + + return 0; + +err_epsc_comm: + sif_unmap_irq(eq); +err_map_irq: + sif_unmap_ctx(sdev, &tp->mmu_ctx); +err_map_ctx: + sif_free_table(tp); + return ret; +} + + +static void sif_eq_table_deinit(struct sif_dev *sdev, struct sif_eps *es, u16 eq_idx) +{ + struct sif_eq *eq = &es->eqs.eq[eq_idx]; + struct sif_table *tp = &eq->ba; + + sif_unmap_irq(eq); + + if (tp->mem) { + sif_unmap_ctx(sdev, &tp->mmu_ctx); + sif_free_table(tp); + tp->mem = NULL; + } +} + + +/* Interrupt routines for MSI-X */ + +static irqreturn_t sif_intr(int irq, void *d) +{ + u32 nreqs; + struct sif_eq *eq = (struct sif_eq *)d; + struct sif_dev *sdev = 
eq->ba.sdev; + nreqs = dispatch_eq(eq); + sif_log(sdev, SIF_INTR, + "done [irq %d (eq %d) - %d events dispatched]", + irq, eq->index, nreqs); + + if (sif_feature(check_all_eqs_on_intr)) { + int i; + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + + sif_log(sdev, SIF_INTR, "feature check_all_eqs_on_intr - dispatching:"); + for (i = 0; i < es->eqs.cnt; i++) + if (i != eq->index) + dispatch_eq(&es->eqs.eq[i]); + sif_log(sdev, SIF_INTR, "feature check_all_eqs_on_intr - dispatch done."); + /* Note: this feature does not check the EPSA* interrupt queues */ + } + + return IRQ_HANDLED; +} + +/* Interrupt coalescing settings for a single channel */ +static int sif_irq_coalesce(struct sif_eq *eq) +{ + int ret; + struct sif_dev *s = eq->ba.sdev; + struct psif_epsc_csr_req req; /* local epsc wr copy */ + struct psif_epsc_csr_rsp resp; + + if (!eps_version_ge(&s->es[s->mbox_epsc], 0, 36)) + goto opcode_not_available; + + sif_log(s, SIF_INTR, "Set default coalescing settings for the interrupt channel %d\n", + eq->index); + + memset(&req, 0, sizeof(req)); + + req.opcode = EPSC_HOST_INT_CHANNEL_CTRL; + req.uf = 0; + req.u.int_channel.int_channel = eq->index; +#define SET_DEFAULT_HOST_INT_CTRL_SETTING(attr, _value) { \ + int value = ((sif_feature(dis_auto_int_coalesce)) || \ + (eq->index < 2)) ? 0 : _value; \ + req.u.int_channel.attributes.attr = 1; \ + req.u.int_channel.attr = value; \ + eq->irq_ch.attr = value; \ + } + SET_DEFAULT_HOST_INT_CTRL_SETTING(enable_adaptive, 1); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_rx_scale, 1); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_rate_low, 0); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_rate_high, 200000); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_ausec, 0); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_ausec_low, 0); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_ausec_high, 190); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_pusec, 0); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_pusec_low, 0); + SET_DEFAULT_HOST_INT_CTRL_SETTING(channel_pusec_high, 10); + + ret = sif_epsc_wr_poll(s, &req, &resp); + if (ret) { + sif_log(s, SIF_INFO, + "Failed to initialize the coalescing settings for interrupt channel %d\n", + eq->index); + memset(&eq->irq_ch, 0, sizeof(eq->irq_ch)); + return ret; + } + + return 0; +opcode_not_available: + return -1; +} + +/* Interrupt handling for a single event queue */ +static int sif_map_irq(struct sif_eq *eq) +{ + int irq; + int ret; + int vector_num; + struct sif_dev *s = eq->ba.sdev; + int flags = (s->intr_cnt != s->intr_req) ? 
IRQF_SHARED : 0; + const char *en; + + spin_lock(&s->msix_lock); + vector_num = find_next_zero_bit(s->intr_used, s->msix_entries_sz, 0); + if (vector_num < s->msix_entries_sz) + set_bit(vector_num, s->intr_used); + else + vector_num = -1; + spin_unlock(&s->msix_lock); + + if (vector_num == -1) { + sif_log(s, SIF_INFO, "Failed to allocate an irq for eq %d", eq->index); + return -ENOMEM; + } + + irq = s->msix_entries[vector_num].vector; + en = eps_name(s, eq->eps->eps_num); + + if (eq->index) + snprintf(eq->name, SIF_EQ_NAME_LEN, "sif%d-%d", 0, eq->index); + else + snprintf(eq->name, SIF_EQ_NAME_LEN, "sif%d-EPS%s", 0, en); + + ret = request_irq(irq, &sif_intr, flags, eq->name, eq); + if (ret) + return ret; + sif_log(s, SIF_INFO_V, "Allocated irq %d for EPS%s, eq %d, name %s", irq, en, + eq->index, eq->name); + eq->intr_vec = vector_num; + + ret = irq_set_affinity_hint(irq, eq->affinity_mask); + if (ret) { + sif_log(s, SIF_INFO_V, "set affinity hint for irq %d, failed", irq); + return ret; + } + return 0; +} + +static void sif_unmap_irq(struct sif_eq *eq) +{ + struct sif_dev *s = eq->ba.sdev; + int irq = s->msix_entries[eq->intr_vec].vector; + + free_cpumask_var(eq->affinity_mask); + irq_set_affinity_hint(irq, NULL); + free_irq(irq, eq); + spin_lock(&s->msix_lock); + clear_bit(eq->intr_vec, s->intr_used); + spin_unlock(&s->msix_lock); + eq->intr_vec = -1; + sif_log(s, SIF_INTR, "Freed irq %d for EPS%s", irq, eps_name(s, eq->eps->eps_num)); +} + + +int sif_enable_msix(struct sif_dev *sdev) +{ + int err; + int i = -1; + int cnt = sdev->es[sdev->mbox_epsc].eqs.cnt + 4; + int array_alloc_cnt = cnt; + int bitmap_words = max(1, array_alloc_cnt + 63 / 64); + + sdev->msix_entries = kcalloc(array_alloc_cnt, sizeof(struct msix_entry), GFP_KERNEL); + if (!sdev->msix_entries) + return -ENOMEM; + + sdev->msix_entries_sz = array_alloc_cnt; + sdev->intr_used = kcalloc(bitmap_words, sizeof(ulong), GFP_KERNEL); + if (!sdev->intr_used) { + err = -ENOMEM; + goto iu_failed; + } + + sif_log(sdev, SIF_INIT, + "EPSC offers %ld event queues, need %ld + 4 for the EPSA's = %d vecs, array sz %d", + sdev->es[sdev->mbox_epsc].eqs.max_cnt, sdev->es[sdev->mbox_epsc].eqs.cnt, + cnt, array_alloc_cnt); + spin_lock_init(&sdev->msix_lock); + + for (i = 0; i < cnt; i++) + sdev->msix_entries[i].entry = i; + + err = pci_enable_msix_range(sdev->pdev, sdev->msix_entries, 1, cnt); + if (err < 0) { + sif_log(sdev, SIF_INFO, + "Failed to allocate %d MSI-X vectors", cnt); + goto vector_alloc_failed; + } + + if (err < cnt) + sif_log(sdev, SIF_INFO, + "Unable to allocate more than %d MSI-X vectors", err); + + sdev->intr_req = cnt; + sdev->intr_cnt = err; + return 0; + +vector_alloc_failed: + kfree(sdev->intr_used); +iu_failed: + kfree(sdev->msix_entries); + return err; +} + + +int sif_disable_msix(struct sif_dev *sdev) +{ + pci_disable_msix(sdev->pdev); + kfree(sdev->intr_used); + kfree(sdev->msix_entries); + return 0; +} + + +/* simple allocation of EPSC EQ channels for CQs: Just do round robin for now: */ +u32 sif_get_eq_channel(struct sif_dev *sdev, struct sif_cq *cq) +{ + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + u32 seq = atomic_inc_return(&es->eqs.eq_sel_seq); + + /* This is supposed to be a number between 0 and cnt - 2 as the EPSC EQ and the + * EQ for async events are not counted by hardware, so the first eilgible EQ + * is eq[2] which for hardware has index 0: + */ + u32 eqs_cnt = (u32) (es->eqs.cnt - 2); + + return seq % eqs_cnt; +} + +/* check a valid EQ channel */ +bool sif_check_valid_eq_channel(struct sif_dev *sdev, 
int comp_vector) +{ + struct sif_eps *es = &sdev->es[sdev->mbox_epsc]; + u32 eqs_cnt = (u32) (es->eqs.cnt - 2); + + return ((comp_vector >= 0) && (comp_vector <= eqs_cnt) ? true : false); +} + +/* @eqe contains little endian copy of event triggering the call + * - called from interrupt level + * Returns the number of events handled + */ +static u32 handle_completion_event(struct sif_eq *eq, struct psif_eq_entry *eqe) +{ + u32 ret = 1; + struct sif_dev *sdev = eq->ba.sdev; + struct sif_cq *cq = safe_get_sif_cq(sdev, eqe->cqd_id); + + if (!cq) { + sif_log(sdev, SIF_INTR, "eq %d: CQ Event seq %d: invalid or out-of-range cqd_id %d", + eq->index, eqe->seq_num, eqe->cqd_id); + return 0; + } + if (atomic_add_unless(&cq->refcnt, 1, 0)) { + u32 ec = atomic_inc_return(&cq->event_cnt); + + sif_log(sdev, SIF_INTR, "eq %d: Processing PSIF_EVENT_COMPLETION event #%d, seq %d - cq %d", + eq->index, ec, eqe->seq_num, eqe->cqd_id); + if (unlikely(!cq->ibcq.comp_handler)) { + /* This should not be possible - hw error? */ + sif_log(sdev, SIF_INFO, + "eq %d: No handler for PSIF_EVENT_COMPLETION event seq %d on cq %d", + eq->index, eqe->seq_num, eqe->cqd_id); + ret = 0; + } else + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + + if (atomic_dec_and_test(&cq->refcnt)) + complete(&cq->cleanup_ok); + + } else { + /* TBD: We end up here also if an event was processed after the cq was destroyed + * but before the cq was reallocated again. We may consequently also + * get "spurious" events on a new CQ that was a delayed event from the previous + * usage but that should be ok. + */ + sif_log(sdev, SIF_INFO, + "eq %d: PSIF_EVENT_COMPLETION event seq %d - cq %d for invalid cq", + eq->index, eqe->seq_num, eqe->cqd_id); + ret = 0; + } + return ret; +} + + +static void handle_event_work(struct work_struct *work) +{ + struct event_work *ew = container_of(work, struct event_work, ws); + struct sif_dev *sdev = to_sdev(ew->ibe.device); + + atomic_inc(&ew->eq->work_cnt); + + if (unlikely(!sdev->registered)) { + sif_log(sdev, SIF_INFO, + "Event of type %s received before verbs framework is up - ignoring", + ib_event2str(ew->ibe.event)); + + if ((ew->ibe.event == IB_EVENT_LID_CHANGE) + && (ew->ibe.element.port_num == 1) + && (PSIF_REVISION(sdev) <= 3)) + sif_r3_recreate_flush_qp(sdev); + goto out; + } + + switch (ew->ibe.event) { + case IB_EVENT_CQ_ERR: { + struct ib_cq *cq = ew->ibe.element.cq; + + if (cq->event_handler) + cq->event_handler(&ew->ibe, cq->cq_context); + else + sif_log(sdev, SIF_INFO, + "Unhandled event of type %s received", + ib_event2str(ew->ibe.event)); + break; + } + case IB_EVENT_SRQ_LIMIT_REACHED: + case IB_EVENT_SRQ_ERR: { + struct ib_srq *srq = ew->ibe.element.srq; + + if (ew->ibe.event == IB_EVENT_SRQ_LIMIT_REACHED) + to_srq(srq)->srq_limit = 0; + + if (srq->event_handler) + srq->event_handler(&ew->ibe, srq->srq_context); + else + sif_log(sdev, SIF_INFO, + "Unhandled event of type %s received, srq %d", + ib_event2str(ew->ibe.event), to_srq(srq)->index); + break; + } + case IB_EVENT_QP_FATAL: + case IB_EVENT_QP_REQ_ERR: + case IB_EVENT_QP_ACCESS_ERR: + case IB_EVENT_PATH_MIG_ERR: + case IB_EVENT_QP_LAST_WQE_REACHED: { + struct ib_qp *ibqp = ew->ibe.element.qp; + struct sif_qp *qp = to_sqp(ibqp); + + if (is_regular_qp(qp)) { + struct sif_rq *rq = get_sif_rq(sdev, qp->rq_idx); + struct sif_rq_sw *rq_sw = get_sif_rq_sw(sdev, rq->index); + + /* WA #3850:if SRQ, generate LAST_WQE event */ + if (rq->is_srq && ibqp->event_handler) { + struct ib_event ibe = { + .device = &sdev->ib_dev, + .event = 
IB_EVENT_QP_LAST_WQE_REACHED,
+ .element.qp = &qp->ibqp
+ };
+ ibqp->event_handler(&ibe, ibqp->qp_context);
+ } else {
+ /* WA #622: if regular RQ, flush */
+ if (sif_flush_rq(sdev, rq, qp, atomic_read(&rq_sw->length)))
+ sif_log(sdev, SIF_INFO, "failed to flush RQ %d",
+ rq->index);
+ }
+ }
+ if (!ibqp->event_handler)
+ sif_log(sdev, SIF_INFO,
+ "Unhandled event of type %s received, qp %d",
+ ib_event2str(ew->ibe.event), qp->qp_idx);
+ /* fall through */
+ }
+ case IB_EVENT_PATH_MIG:
+ case IB_EVENT_COMM_EST: {
+ struct ib_qp *ibqp = ew->ibe.element.qp;
+ struct sif_qp *qp = to_sqp(ibqp);
+
+ if (ibqp->event_handler)
+ ibqp->event_handler(&ew->ibe, ibqp->qp_context);
+
+ if (atomic_dec_and_test(&qp->refcnt))
+ complete(&qp->can_destroy);
+
+ if (!ibqp->event_handler)
+ sif_log(sdev, SIF_INFO,
+ "Unhandled event of type %s received, qp %d",
+ ib_event2str(ew->ibe.event), qp->qp_idx);
+ break;
+ }
+ case IB_EVENT_LID_CHANGE:
+ if (ew->ibe.element.port_num == 1 && PSIF_REVISION(sdev) <= 3)
+ sif_r3_recreate_flush_qp(sdev);
+ /* fall through */
+ case IB_EVENT_PORT_ERR:
+ case IB_EVENT_CLIENT_REREGISTER:
+ case IB_EVENT_PORT_ACTIVE:
+ case IB_EVENT_DEVICE_FATAL:
+ case IB_EVENT_PKEY_CHANGE:
+ case IB_EVENT_GID_CHANGE:
+ case IB_EVENT_SM_CHANGE:
+ ib_dispatch_event(&ew->ibe);
+ break;
+ default:
+ sif_log(sdev, SIF_INFO, "Unhandled event type %d", ew->ibe.event);
+ break;
+ }
+out:
+ kfree(ew);
+ }
+
+/* Generic event handler - @eqe contains little endian copy of event triggering the call
+ * ib_dispatch_event dispatches directly so we have to defer the actual dispatch
+ * to a better priority level via sdev->wq:
+ */
+
+static u32 handle_event(struct sif_eq *eq, void *element, enum ib_event_type ev_type)
+{
+ struct sif_dev *sdev = eq->ba.sdev;
+ struct event_work *ew = kmalloc(sizeof(struct event_work), GFP_ATOMIC);
+
+ if (!ew) {
+ /* TBD: kmem_cache_alloc or fallback static necessary? 
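+ * The allocation is GFP_ATOMIC since we are called from interrupt context;
+ * on failure the event is dropped and 0 (no events handled) is returned.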
*/ + sif_log(sdev, SIF_INFO, "FATAL: Failed to allocate work struct"); + return 0; + } + memset(&ew->ibe, 0, sizeof(struct ib_event)); + ew->ibe.device = &sdev->ib_dev; + ew->ibe.event = ev_type; + ew->eq = eq; + + /* Assume ibe.element is a union and that our caller has + * set up the right value for us (port, cq, qp or srq): + */ + ew->ibe.element.cq = element; + INIT_WORK(&ew->ws, handle_event_work); + + sif_log(sdev, SIF_INTR, "Processing IB event type %s", + ib_event2str(ew->ibe.event)); + queue_work(sdev->wq, &ew->ws); + return 1; +} + +static u32 handle_psif_event(struct sif_eq *eq, struct psif_eq_entry *eqe, + const char *type_str) +{ + struct sif_dev *sdev = eq->ba.sdev; + + sif_log(sdev, SIF_INFO, "Received (unhandled) psif event of type %s, port flags %s", + type_str, + string_enum_psif_event(eqe->port_flags)); + return 1; +} + +static u32 handle_epsc_event(struct sif_eq *eq, struct psif_eq_entry *eqe) +{ + struct sif_dev *sdev = eq->ba.sdev; + struct sif_eps *es = &sdev->es[eq->eps->eps_num]; + u32 ret = 1; + enum psif_event event_type; + + if (eqe->port_flags == PSIF_EVENT_EXTENSION) + event_type = eqe->extension_type; + else + event_type = eqe->port_flags; + + switch (event_type) { + case PSIF_EVENT_MAILBOX: + sif_log(sdev, SIF_INTR, "epsc completion event for seq.%d eps_num %d", + eqe->cq_sequence_number, eq->eps->eps_num); + epsc_complete(sdev, eq->eps->eps_num, eqe->cq_sequence_number & es->mask); + break; + case PSIF_EVENT_LOG: + sif_log(sdev, SIF_INTR, "epsc log event"); + sif_elog_intr(sdev, sdev->mbox_epsc); + break; + case PSIF_EVENT_EPSC_KEEP_ALIVE: + sif_log(sdev, SIF_INTR, "epsc keep-alive event"); + sif_eps_send_keep_alive(sdev, eq->eps->eps_num, true); + break; + default: + { + enum ib_event_type ibe = epsc2ib_event(eqe); + + if (ibe != (enum ib_event_type)-1) { + void *element = (void *)((u64) eqe->port + 1); + + return handle_event(eq, element, ibe); + } + sif_log(sdev, SIF_INFO, "Unhandled epsc event of type %s::%s (%d::%u)", + string_enum_psif_event(eqe->port_flags), + string_enum_psif_event(eqe->extension_type), + eqe->port_flags, eqe->extension_type); + if (eqe->extension_type == PSIF_EVENT_DEGRADED_MODE) { + sdev->degraded = true; + epsc_report_degraded(sdev, eqe->event_data); + } + ret = 0; + break; + } + } + return ret; +} + + +static u32 handle_epsa_event(struct sif_eq *eq, struct psif_eq_entry *eqe) +{ + struct sif_dev *sdev = eq->ba.sdev; + + sif_log(sdev, SIF_INFO, "Received (unhandled) epsa event of type %s", + string_enum_psif_event(eqe->port_flags)); + return 1; +} + +#define check_for_psif_event(__event__)\ + if (leqe.__event__)\ + nevents += handle_psif_event(eq, &leqe, #__event__) + +/* Bug #3952 - WA for HW bug #3523 (leqe.rqd_id is not valid) + * If QP transport is different from XRC + * and the QP is not already destroyed + * then retrieve the rq_idx from the QP + * Note: For SRQ_LIM event due to modify_srq, QP points to pQP. + */ +static u32 handle_srq_event(struct sif_eq *eq, void *element, enum ib_event_type ev_type) +{ + if (element != NULL) { + struct sif_dev *sdev = eq->ba.sdev; + struct sif_qp *qp = to_sqp(element); + enum psif_qp_trans type = qp->type; + struct sif_rq *rq = (ev_type == IB_EVENT_SRQ_LIMIT_REACHED && + type == PSIF_QP_TRANSPORT_MANSP1) ? 
+ get_sif_rq(sdev, qp->srq_idx) : get_sif_rq(sdev, qp->rq_idx); + + /* release the qp lock */ + if (atomic_dec_and_test(&qp->refcnt)) + complete(&qp->can_destroy); + + return handle_event(eq, (void *)&rq->ibsrq, ev_type); + } + sif_log(eq->ba.sdev, SIF_INFO, "eq %d: Discarding %s event: QP destroyed", eq->index, + ev_type == IB_EVENT_SRQ_ERR ? "IB_EVENT_SRQ_ERR" : "IB_EVENT_SRQ_LIMIT_REACHED"); + return 1; +} + + +#define dump_eq_entry(level, _s, _eqe) \ + sif_logs(level, printk("%s: ", _s); \ + write_struct_psif_eq_entry(NULL, 0, &leqe); printk("\n")) + + +/* Called from interrupt threads */ +static int dispatch_eq(struct sif_eq *eq) +{ + volatile struct psif_eq_entry *eqe; + struct psif_eq_entry leqe; + struct psif_epsc_csr_req req; + struct sif_dev *sdev = eq->ba.sdev; + + u32 seqno; + u32 nreqs = 0; + ulong flags; + void *port_elem; + void *qp_elem = NULL; + + /* Serialize event queue processing: */ + spin_lock_irqsave(&eq->ba.lock, flags); + seqno = eq->next_seq; + eqe = (struct psif_eq_entry *)get_eq_entry(eq, seqno); + sif_log(sdev, SIF_INTR, "eqe at %p next seq.no %x", eqe, seqno); + while (get_psif_eq_entry__seq_num(eqe) == seqno) { + u32 nevents = 0; + + eq->next_seq++; + + /* Update eq_sw::index if necessary */ + if (eq->next_seq == eq->sw_index_next_update) { + u32 old_nu = eq->sw_index_next_update; + + memset(&req, 0, sizeof(req)); + req.opcode = EPSC_EVENT_INDEX; + req.addr = eq->index; + req.u.single.data = eq->next_seq; + eq->sw_index_next_update += eq->sw_index_interval; + + spin_unlock_irqrestore(&eq->ba.lock, flags); + + sif_log(eq->ba.sdev, SIF_INFO_V, + "Updating EQ_SW_INDEX for eq %d to %x. Interval %x, lim %x, next lim %x", + eq->index, eq->next_seq, eq->sw_index_interval, old_nu, + eq->sw_index_next_update); + + /* We ignore the response by providing NULL for seq_num and lcqe */ + sif_post_eps_wr(eq->ba.sdev, eq->eps->eps_num, &req, NULL, NULL, false); + } else { + /* Avoid callbacks while interrupts off */ + spin_unlock_irqrestore(&eq->ba.lock, flags); + } + + copy_conv_to_sw(&leqe, eqe, sizeof(leqe)); + + port_elem = (void *)((u64) leqe.port + 1); + + if (likely(leqe.event_status_cmpl_notify)) { + nevents += handle_completion_event(eq, &leqe); + + /* No other event type bits will be set on a CNE */ + goto only_cne; + } + + dump_eq_entry(SIF_DUMP, " ", &leqe); + + /* TBD: Handle this check with a mask... */ + if (unlikely(leqe.event_status_local_work_queue_catastrophic_error || + leqe.event_status_path_migration_request_error || + leqe.event_status_invalid_request_local_wq_error || + leqe.event_status_local_access_violation_wq_error || + leqe.event_status_last_wqe_reached || + leqe.event_status_communication_established || + leqe.event_status_path_migrated || + leqe.event_status_srq_limit_reached || + leqe.event_status_srq_catastrophic_error)) { + struct sif_qp *sif_qp_elem = safe_get_sif_qp(sdev, leqe.qp); + bool is_srq_event = (leqe.event_status_srq_limit_reached || + leqe.event_status_srq_catastrophic_error); + + /* silently drop the event if qp is no longer there. */ + if (!sif_qp_elem) { + sif_log(eq->ba.sdev, SIF_INFO, "QP context is NULL!"); + goto only_cne; + } + + /* silently drop the event if it is a PQP. 
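+ * (i.e. a QP with transport PSIF_QP_TRANSPORT_MANSP1) - except for SRQ limit
+ * events, which may be reported via the PQP and are handled below (see handle_srq_event).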
*/ + if (unlikely(sif_qp_elem->type == PSIF_QP_TRANSPORT_MANSP1) && + !leqe.event_status_srq_limit_reached) { + sif_log(eq->ba.sdev, SIF_INFO, "Received async event on PQP!"); + goto only_cne; + } + + if (unlikely(sif_qp_elem->type == PSIF_QP_TRANSPORT_XRC) && is_srq_event) { + sif_log(sdev, SIF_INTR, + "eq %d: Discarding %s event: QP transport XRC", + eq->index, leqe.event_status_srq_catastrophic_error ? + "IB_EVENT_SRQ_ERR" : "IB_EVENT_SRQ_LIMIT_REACHED"); + goto only_cne; + } + + /* check whether a qp context is required */ + if (PSIF_REVISION(sdev) <= 3 || !is_srq_event) { + /* silently drop the event if qp has been destroyed at this point. */ + if (!atomic_add_unless(&sif_qp_elem->refcnt, 1, 0)) { + sif_log(sdev, SIF_INTR, + "eq %d: qp %d has been destroyed for event seq %d", + eq->index, sif_qp_elem->qp_idx, eqe->seq_num); + goto only_cne; + } + qp_elem = (void *) &sif_qp_elem->ibqp; + } + } + + if (leqe.event_status_eps_c) + nevents += handle_epsc_event(eq, &leqe); + if (leqe.event_status_eps_a) + nevents += handle_epsa_event(eq, &leqe); + if (leqe.event_status_port_error) + nevents += handle_event(eq, port_elem, IB_EVENT_PORT_ERR); + if (leqe.event_status_client_registration) + nevents += handle_event(eq, port_elem, IB_EVENT_CLIENT_REREGISTER); + if (leqe.event_status_port_active) + nevents += handle_event(eq, port_elem, IB_EVENT_PORT_ACTIVE); + if (leqe.event_status_local_work_queue_catastrophic_error) { + nevents += handle_event(eq, qp_elem, IB_EVENT_QP_FATAL); + dump_eq_entry(SIF_INFO, "Got Fatal error", &leqe); + } + if (leqe.event_status_srq_catastrophic_error) + nevents += PSIF_REVISION(sdev) <= 3 ? + handle_srq_event(eq, qp_elem, IB_EVENT_SRQ_ERR) : + handle_event(eq, &get_sif_rq(sdev, leqe.rqd_id)->ibsrq, IB_EVENT_SRQ_ERR); + if (leqe.event_status_path_migration_request_error) + nevents += handle_event(eq, qp_elem, IB_EVENT_PATH_MIG_ERR); + if (leqe.event_status_local_access_violation_wq_error) + nevents += handle_event(eq, qp_elem, IB_EVENT_QP_ACCESS_ERR); + if (leqe.event_status_invalid_request_local_wq_error) + nevents += handle_event(eq, qp_elem, IB_EVENT_QP_REQ_ERR); + if (leqe.event_status_last_wqe_reached) + nevents += handle_event(eq, qp_elem, + IB_EVENT_QP_LAST_WQE_REACHED); + if (leqe.event_status_srq_limit_reached) + nevents += PSIF_REVISION(sdev) <= 3 ? 
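+ /* on rev <= 3 leqe.rqd_id is not valid (HW bug #3523), so the
+ * SRQ must be derived from the QP context via handle_srq_event():
+ */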
+ handle_srq_event(eq, qp_elem, IB_EVENT_SRQ_LIMIT_REACHED) : + handle_event(eq, &get_sif_rq(sdev, leqe.rqd_id)->ibsrq, + IB_EVENT_SRQ_LIMIT_REACHED); + if (leqe.event_status_communication_established) + nevents += handle_event(eq, qp_elem, IB_EVENT_COMM_EST); + if (leqe.event_status_path_migrated) + nevents += handle_event(eq, qp_elem, IB_EVENT_PATH_MIG); + if (leqe.event_status_cq_error) { + nevents += handle_event(eq, &get_sif_cq(sdev, leqe.cqd_id)->ibcq, + IB_EVENT_CQ_ERR); + dump_eq_entry(SIF_INFO, "Got cq_error", &leqe); + } + if (leqe.event_status_local_catastrophic_error) + nevents += handle_event(eq, port_elem, IB_EVENT_DEVICE_FATAL); + + + /* TBD: These are the ones that do not map directly to IB errors */ + check_for_psif_event(event_status_port_changed); + check_for_psif_event(event_status_invalid_xrceth); + check_for_psif_event(event_status_xrc_domain_violation); + + if (!nevents) { + sif_log(eq->ba.sdev, SIF_INTR, "eq %d: Warning: No events found for seq 0x%x", + eq->index, seqno); + dump_eq_entry(SIF_INFO, "(no event processed)", &leqe); + } else + sif_log(eq->ba.sdev, SIF_INTR, "Handled %d set event bits", nevents); + +only_cne: + spin_lock_irqsave(&eq->ba.lock, flags); + seqno = eq->next_seq; + eqe = (struct psif_eq_entry *)get_eq_entry(eq, seqno); + nreqs++; + } + spin_unlock_irqrestore(&eq->ba.lock, flags); + atomic_add(nreqs, &eq->intr_cnt); + return nreqs; +} + + +static enum ib_event_type epsc2ib_event(struct psif_eq_entry *eqe) +{ + switch (eqe->port_flags) { + case PSIF_EVENT_SGID_TABLE_CHANGED: + return IB_EVENT_GID_CHANGE; + case PSIF_EVENT_PKEY_TABLE_CHANGED: + return IB_EVENT_PKEY_CHANGE; + case PSIF_EVENT_MASTER_SM_LID_CHANGED: + case PSIF_EVENT_MASTER_SM_SL_CHANGED: + case PSIF_EVENT_IS_SM_DISABLED_CHANGED: + return IB_EVENT_SM_CHANGE; + case PSIF_EVENT_LID_TABLE_CHANGED: + return IB_EVENT_LID_CHANGE; + case PSIF_EVENT_SUBNET_TIMEOUT_CHANGED: + case PSIF_EVENT_CLIENT_REREGISTER: + return IB_EVENT_CLIENT_REREGISTER; + case PSIF_EVENT_PORT_ACTIVE: + return IB_EVENT_PORT_ACTIVE; + case PSIF_EVENT_PORT_ERR: + return IB_EVENT_PORT_ERR; + default: + return (enum ib_event_type)-1; + } +} + + +void sif_dfs_print_eq(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + struct sif_eq *eq; + + if (unlikely(pos < 0)) { + seq_printf(s, "# sii = software index update interval\n" + "# niu = (index of) next software index update\n#\n" + "# ni = Number of events seen\n" + "# wi = Number of events handled in work queue\n" + "# Name\tindex\tentries\textent\tn.seq\tvector#\tIRQ#\t" + "#ni\t#wi\tsii\tniu\n"); + return; + } + + eq = &sdev->es[sdev->mbox_epsc].eqs.eq[pos]; + + seq_printf(s, "%-12s%u\t%u\t%u\t%u\t%d\t%d\t%u\t%u\t%u\t%u\n", + eq->name, eq->index, eq->entries, eq->extent, eq->next_seq, eq->intr_vec, + sdev->msix_entries[eq->intr_vec].vector, + atomic_read(&eq->intr_cnt), atomic_read(&eq->work_cnt), + eq->sw_index_interval, eq->sw_index_next_update); +} + +void sif_dfs_print_irq_ch(struct seq_file *s, struct sif_dev *sdev, + loff_t pos) +{ + struct sif_eq *eq; + + if (unlikely(pos < 0)) { + seq_printf(s, "# Interrupt channel coalescing settings\n#\n" + "# echo \"channel=1;adaptive=0;rx_scale=0;rate_low=0;" + "rate_high=0;ausec=0;ausec_low=0;ausec_high=0;pusec=0;" + "pusec_low=0;pusec_high=0\" > irq_ch\n#\n\n" + "# Channel adaptive rx_scale rate_low rate_high ausec ausec_low ausec_high pusec pusec_low pusec_high\n"); + return; + } + + eq = &sdev->es[sdev->mbox_epsc].eqs.eq[pos]; + seq_printf(s, "%-11s%-10u%-10u%-10u%-11u%-7d%-11d%-12u%-7u%-11u%-12u\n", + eq->name, 
eq->irq_ch.enable_adaptive, eq->irq_ch.channel_rx_scale, + eq->irq_ch.channel_rate_low, eq->irq_ch.channel_rate_high, + eq->irq_ch.channel_ausec, eq->irq_ch.channel_ausec_low, + eq->irq_ch.channel_ausec_high, eq->irq_ch.channel_pusec, + eq->irq_ch.channel_pusec_low, eq->irq_ch.channel_pusec_high); +} diff --git a/drivers/infiniband/hw/sif/sif_eq.h b/drivers/infiniband/hw/sif/sif_eq.h new file mode 100644 index 0000000000000..0b7c114a63577 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_eq.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_eq.h: Event queues and interrupt handling + */ + +#ifndef _SIF_EQ_H +#define _SIF_EQ_H +#include "psif_hw_csr.h" + +extern uint sif_cq_eq_max; + +struct sif_dev; +struct psif_epsc_csr_rsp; +struct sif_eq; +struct sif_cq; +struct sif_eps; + +struct sif_eq_base { + size_t max_cnt; /* Number of available event queues in hw */ + size_t min_sw_entry_cnt; /* Number of required event queue entries per port for EPSC EQ */ + size_t cnt; /* Number of configured hardware event queues */ + u16 irq_moderation; /* Interrupt total moderation */ + atomic_t eq_sel_seq; /* A "sequence number" used to select EQ for CQs (EPSC only) */ + struct sif_eq *eq; /* Dyn.alloc'ed array of sz cnt of eq.desc setup */ +}; + + +/* Set up the event queues for an EPS using info about #of queues from the @cqe + * which contains a host byte order copy of the successful response + * to the configuration request to the EPS in question + */ +int sif_eq_init(struct sif_dev *sdev, struct sif_eps *es, struct psif_epsc_csr_rsp *cqe); + +void sif_eq_deinit(struct sif_dev *sdev, struct sif_eps *es); + +int sif_enable_msix(struct sif_dev *s); +int sif_disable_msix(struct sif_dev *sdev); + +/* Printer for debugfs eq file */ +void sif_dfs_print_eq(struct seq_file *s, struct sif_dev *sdev, loff_t pos); + +/* Printer for debugfs int channel file */ +void sif_dfs_print_irq_ch(struct seq_file *s, struct sif_dev *sdev, loff_t pos); + +/* simple allocation of EQ channel for CQs: */ +u32 sif_get_eq_channel(struct sif_dev *sdev, struct sif_cq *cq); +bool sif_check_valid_eq_channel(struct sif_dev *sdev, int comp_vector); + +#endif diff --git a/drivers/infiniband/hw/sif/sif_fmr.c b/drivers/infiniband/hw/sif/sif_fmr.c new file mode 100644 index 0000000000000..e2fc65229b4d2 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_fmr.c @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_fmr.c: Implementation of fast memory registration for SIF + */ + +#include +#include +#include "sif_fmr.h" +#include "sif_dev.h" +#include "sif_defs.h" +#include "sif_mr.h" +#include "sif_base.h" +#include "psif_hw_setget.h" + +struct ib_fmr *sif_alloc_fmr(struct ib_pd *ibpd, + int mr_access_flags, struct ib_fmr_attr *fmr_attr) +{ + struct sif_dev *sdev = to_sdev(ibpd->device); + struct sif_pd *pd = to_spd(ibpd); + struct sif_fmr *fmr = kmalloc(sizeof(struct sif_fmr), GFP_KERNEL); + struct sif_mem *mem; + struct ib_fmr *ibfmr; + void *ret; + + if (!fmr) { + sif_log(sdev, SIF_INFO, "Unable to allocate memory for the fmr"); + return ERR_PTR(-ENOMEM); + } + + mem = sif_mem_create_fmr(sdev, fmr_attr->max_pages, fmr_attr->page_shift, GFP_KERNEL); + if (!mem) { + ret = ERR_PTR(-ENOMEM); + goto mem_create_failed; + } + + memset(fmr, 0, sizeof(struct sif_fmr)); + fmr->mr = alloc_mr(sdev, pd, mem, 0, + IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_ATOMIC); + if (IS_ERR(fmr->mr)) { + ret = fmr->mr; + goto mr_alloc_failed; + } + + ibfmr = &fmr->ibfmr; + ibfmr->lkey = fmr->mr->index; + ibfmr->rkey = fmr->mr->index; + + sif_log(sdev, SIF_FMR, "max_pages %d, page_shift %d, max_maps %d", + fmr_attr->max_pages, fmr_attr->page_shift, fmr_attr->max_maps); + return &fmr->ibfmr; + +mr_alloc_failed: + sif_mem_free(mem); +mem_create_failed: + kfree(fmr); + return ret; +} + + +int sif_map_phys_fmr(struct ib_fmr *ibfmr, + u64 *page_list, int list_len, u64 iova) +{ + struct sif_dev *sdev = to_sdev(ibfmr->device); + struct sif_fmr *fmr = to_sfmr(ibfmr); + struct sif_mem *mem = fmr->mr->mem; + int ret = 0; + + if (mem->mem_type != SIFMT_PTONLY) { + sif_log(sdev, SIF_FMR, "Attempt to map an already mapped fmr - must unmap first"); + ret = sif_unmap_phys_fmr(ibfmr); + if (ret) + return ret; + } + + ret = sif_mem_map_fmr(mem, iova, page_list, list_len); + if (ret) + return ret; + + ret = sif_map_fmr_ctx(sdev, &fmr->mr->mmu_ctx, mem); + return ret; +} + + +int sif_unmap_phys_fmr(struct ib_fmr *ibfmr) +{ + struct sif_fmr *fmr = to_sfmr(ibfmr); + struct sif_dev *sdev = to_sdev(ibfmr->device); + struct sif_mmu_ctx *ctx = &fmr->mr->mmu_ctx; + int index = fmr->mr->index; + struct psif_key *key = get_key(sdev, index); + + /* See sif_mr.c for details on invalidation of DMA validation keys */ + + /* First set key to a state where memory accesses are invalid: */ + set_psif_key__lkey_state(key, PSIF_DMA_KEY_MMU_VALID); + set_psif_key__rkey_state(key, PSIF_DMA_KEY_MMU_VALID); + sif_invalidate_key(sdev, index, PCM_WAIT); + + /* Synchronous TLB invalidation to avoid invalidating the key too early: */ + sif_unmap_fmr_ctx(sdev, ctx, PCM_WAIT); + + /* Invalidate the keys */ + set_psif_key__lkey_state(key, PSIF_DMA_KEY_INVALID); + set_psif_key__rkey_state(key, PSIF_DMA_KEY_INVALID); + sif_invalidate_key(sdev, index, PCM_WAIT); + + /* TBD: We could add code here to nil the ptes + * for debugging purposes, for now they are left behind.. 
+ * (can leave stale PTE data behind, but never for pages we allow access to) + */ + + /* Reset the memory object - remove stale refs to pages + * (for sanity checking purposes, could be eliminated) + */ + sif_mem_unmap_fmr(fmr->mr->mem); + return 0; +} + + +static int invalidate_fmr_key(struct sif_st_pqp *spqp, struct ib_fmr *ibfmr, + enum psif_dma_vt_key_states state, enum wr_mode mode) +{ + struct sif_fmr *fmr = to_sfmr(ibfmr); + struct sif_dev *sdev = to_sdev(ibfmr->device); + int index = fmr->mr->index; + struct psif_key *key = get_key(sdev, index); + + set_psif_key__lkey_state(key, state); + set_psif_key__rkey_state(key, state); + if (spqp) + return sif_inv_key_update_st(spqp, index, mode); + else + return sif_invalidate_key(sdev, index, mode); +} + + +int sif_unmap_phys_fmr_list(struct list_head *fmr_list) +{ + struct ib_fmr *ib_fmr; + struct sif_dev *sdev = NULL; + enum wr_mode mode; + int ret; + int cnt = 0; + bool flush_all = false; + struct sif_st_pqp *spqp = NULL; + u16 ms = 0; + ulong start_time = jiffies; + + if (!list_empty(fmr_list)) { + ib_fmr = list_first_entry(fmr_list, struct ib_fmr, list); + sdev = to_sdev(ib_fmr->device); + } else + return 0; + + if (!sif_feature(disable_stencil_invalidate)) { + spqp = sif_alloc_ki_spqp(sdev); + if (!spqp) + sif_log(sdev, SIF_PQPT, + "All %u configured stencil pqps busy, consider increasing ki_spqp_size", + sdev->ki_spqp.pool_sz); + } + + if (!sdev->is_vf && sdev->num_vfs == 0) { + /* Check if we should do a brute force whole MMU caches flush (PF only) */ + list_for_each_entry(ib_fmr, fmr_list, list) { + cnt++; + if (cnt >= sif_fmr_cache_flush_threshold) { + ret = sif_post_flush_tlb(sdev, false); + flush_all = true; + goto key_to_invalid; + } + } + } + + cnt = 0; + list_for_each_entry(ib_fmr, fmr_list, list) { + mode = list_is_last(&ib_fmr->list, fmr_list) ? PCM_WAIT + : (!(cnt & 0x1f) ? PCM_POST_COMPL : PCM_POST); + ret = invalidate_fmr_key(spqp, ib_fmr, PSIF_DMA_KEY_MMU_VALID, mode); + if (ret) + goto out; + cnt++; + } + sif_log(sdev, SIF_INFO_V, "done with %d invalidates to MMU_VALID", cnt); + + cnt = 0; + list_for_each_entry(ib_fmr, fmr_list, list) { + mode = list_is_last(&ib_fmr->list, fmr_list) ? PCM_WAIT + : (!(cnt & 0x1f) ? PCM_POST_COMPL : PCM_POST); + sif_unmap_fmr_ctx(to_sdev(ib_fmr->device), + &(to_sfmr(ib_fmr))->mr->mmu_ctx, mode); + cnt++; + } + sif_log(sdev, SIF_INFO_V, "done with %d unmap_fmr_ctxs", cnt); +key_to_invalid: + cnt = 0; + + list_for_each_entry(ib_fmr, fmr_list, list) { + mode = list_is_last(&ib_fmr->list, fmr_list) ? PCM_WAIT + : (!(cnt & 0x1f) ? PCM_POST_COMPL : PCM_POST); + ret = invalidate_fmr_key(spqp, ib_fmr, PSIF_DMA_KEY_INVALID, mode); + if (ret) + goto out; + cnt++; + } + sif_log(sdev, SIF_INFO_V, "done invalidating %d fmr keys%s", + cnt, (spqp ? 
" (stencil)" : "")); + + if (flush_all) { + ret = sif_complete_flush_tlb(sdev); + if (ret) + goto out; + } + + cnt = 0; + list_for_each_entry(ib_fmr, fmr_list, list) { + sif_mem_unmap_fmr((to_sfmr(ib_fmr))->mr->mem); + cnt++; + } + ms = jiffies_to_msecs(jiffies - start_time); + sif_log_perf(sdev, SIF_PERF_V, "done unmapping %d fmrs in %u ms", cnt, ms); +out: + if (spqp) + sif_release_ki_spqp(spqp); + + return ret; +} + + +int sif_dealloc_fmr(struct ib_fmr *ibfmr) +{ + struct sif_dev *sdev = to_sdev(ibfmr->device); + struct sif_fmr *fmr = to_sfmr(ibfmr); + + if (fmr->mr->mem->mem_type != SIFMT_PTONLY) { + sif_log(sdev, SIF_FMR, "Attempt to deallocate a mapped fmr (key %d) - must unmap first", + fmr->mr->index); + return -EBUSY; + } + sif_dealloc_mr(sdev, fmr->mr); + kfree(fmr); + return 0; +} diff --git a/drivers/infiniband/hw/sif/sif_fmr.h b/drivers/infiniband/hw/sif/sif_fmr.h new file mode 100644 index 0000000000000..15625631f1c91 --- /dev/null +++ b/drivers/infiniband/hw/sif/sif_fmr.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * Author: Knut Omang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Driver for Oracle Scalable Infiniband Fabric (SIF) Host Channel Adapters + * + * sif_fmr.h: Interface to internal IB Fast Memory Registration (FMR) + * logic for SIF + */ + +#ifndef __SIF_FMR_H +#define __SIF_FMR_H + +struct sif_fmr { + struct ib_fmr ibfmr; + struct sif_mr *mr; +}; + +static inline struct sif_fmr *to_sfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct sif_fmr, ibfmr); +} + +struct ib_fmr *sif_alloc_fmr(struct ib_pd *ibpd, + int mr_access_flags, struct ib_fmr_attr *fmr_attr); +int sif_map_phys_fmr(struct ib_fmr *ibfmr, + u64 *page_list, int list_len, u64 iova); + +int sif_unmap_phys_fmr(struct ib_fmr *ibfmr); +int sif_unmap_phys_fmr_list(struct list_head *fmr_list); + +int sif_dealloc_fmr(struct ib_fmr *ibfmr); + +#endif