From: Saeed Mahameed Date: Wed, 17 Apr 2013 20:21:12 +0000 (+0300) Subject: mlx4_vnic: add mlx4_vnic X-Git-Tag: v4.1.12-92~319^2^2~6^2~1 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=b6f907ed293536ef2a5b910bb8bc736e697c8e79;p=users%2Fjedix%2Flinux-maple.git mlx4_vnic: add mlx4_vnic Add mlx4_vnic code Also squash the following porting commits for compilation of the integrated commit (without squashing they won't compile): mlx4_vnic: adapt vnic to ofed2 mlx4 implementation mlx4_vnic: align with OFED2 upstream 3.7 kernel mlx4_vnic: Fix reference path to hw/mlx4 header files mlx4_vnic: remove mlx4_vnic_helper module mlx4_vnic: use ib_modify_cq() in upstream kernel We modify the code to use the upstream kernel's ib_modify_cq() (and not a modified Mellanox version) mlx4_vnic: removed reference to mlx4_ib_qp->rules_list in vnic_qp.c Remove a field introduced by the Mellanox OFED 2.4 flow steering patches, which are not in the upstream kernel. mlx4_vnic: used an older version of mlx4_qp_reserve_range() Use mlx4_qp_reserve_range() aligned with the version in Linux 3.18 (we can use the new API when it is available upstream) mlx4_vnic: port to Linux 3.18* The mlx4_vnic code is based on the original port of mlx4_vnic in UEK3. Make changes to compile on UEK4 (based on Linux 3.18): use upstream APIs (not Mellanox-specific ones) where they conflict, plus other changes needed to make it compile on Linux 3.18 Signed-off-by: Saeed Mahameed Signed-off-by: Ajaykumar Hotchandani Signed-off-by: Qing Huang (Ported from UEK3 and Mellanox OFED 2.4) Signed-off-by: Mukesh Kacker --- diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/Makefile b/drivers/net/ethernet/mellanox/mlx4_vnic/Makefile new file mode 100644 index 0000000000000..09d022a5f56a1 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/Makefile @@ -0,0 +1,9 @@ +obj-$(CONFIG_MLX4_VNIC) += mlx4_vnic.o + +mlx4_vnic-y := vnic_data_main.o vnic_data_ib.o vnic_data_netdev.o vnic_data_neigh.o \ + vnic_data_fs.o vnic_data_tx.o vnic_data_ethtool.o vnic_data_rx.o \ + vnic_fip_main.o vnic_fip_ib.o vnic_fip_discover.o vnic_fip_pkt.o \ + vnic_fip_login.o vnic_fip_vhub.o vnic_mcast.o vnic_port.o \ + vnic_param.o vnic_qp.o vnic_main.o fip_parser.o \ + vnic_data_mac.o + diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/doc/gw_state.dot b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/gw_state.dot new file mode 100644 index 0000000000000..44f59565f31ba --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/gw_state.dot @@ -0,0 +1,5 @@ +digraph { + FIP_GW_HOST_ADMIN; + FIP_GW_MCAST_RCVD; + FIP_GW_CONNECTED; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/doc/regndev.dot b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/regndev.dot new file mode 100644 index 0000000000000..ea10aba3add12 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/regndev.dot @@ -0,0 +1,54 @@ +digraph { + + vnic_login_create_1 -> register_netdev; // + __vnic_login_create -> vnic_login_create_1; // + vnic_new_intf_store -> __vnic_login_create; // + vnic_port_data_init -> __vnic_login_create; // + vnic_ib_dev_add_one -> vnic_port_data_init; // + fip_vnic_login_create -> vnic_login_create_1; // + fip_vnic_test_login -> fip_vnic_login_create [label="login_wq", color=blue]; // + fip_vnic_destroy -> fip_vnic_test_login; // + fip_purge_vnics -> fip_vnic_destroy; // + fip_purge_vnics -> fip_purge_vnics [label="fip_wq", color=blue]; // + fip_vnic_close -> fip_purge_vnics [label="fip_wq", color=blue]; + fip_vnic_hadmin_init -> fip_vnic_test_login; // + fip_gw_update_hadmin_gw -> 
fip_vnic_hadmin_init; // + fip_discover_hadmin_update -> fip_gw_update_hadmin_gw; // + fip_hadmin_sysfs_update -> fip_discover_hadmin_update [label="fip_wq", color=blue]; // + fip_vnic_fsm -> fip_vnic_test_login; // + fip_gw_create_vnics -> fip_vnic_fsm; // + + + fip_gw_update_hadmin_gw -> fip_vnic_fsm; + fip_vnic_login_ack_recv -> fip_vnic_fsm; // + fip_discover_rx_packet_bh -> fip_vnic_login_ack_recv; + fip_vnic_tbl_done -> fip_vnic_fsm; // + vhub_handle_tbl -> fip_vnic_tbl_done; // + fip_vnic_recv_bh -> vhub_handle_tbl; // + fip_vnic_recv -> fip_vnic_recv_bh [label="fip_wq", color=blue]; // + fip_vnic_comp -> fip_vnic_recv; + + fip_discover_rx_advertise_bh -> fip_discover_gw_fsm; + + fip_hadmin_vnic_refresh -> fip_vnic_fsm; // + fip_gw_create_vnics -> fip_hadmin_vnic_refresh // + fip_gw_modified -> fip_gw_create_vnics; // + fip_discover_rx_advertise_bh -> fip_gw_modified; // + fip_discover_rx_packet_bh -> fip_discover_rx_advertise_bh; // + fip_discover_process_rx_bh -> fip_discover_rx_packet_bh; // + fip_discover_process_rx -> fip_discover_process_rx_bh [label="fip_wq", color=blue]; // + fip_discover_comp -> fip_discover_process_rx; + + + + fip_discover_rx_advertise_bh -> fip_gw_create_vnics; + fip_discover_gw_fsm -> fip_gw_create_vnics; + + vnic_login_pre_create_1 -> vnic_alloc_netdev; // + __vnic_login_create -> vnic_login_pre_create_1; + fip_vnic_hadmin_init -> vnic_login_pre_create_1; + fip_vnic_login_init -> vnic_login_pre_create_1; + fip_vnic_fsm -> fip_vnic_login_init; + + +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_flush.dot b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_flush.dot new file mode 100644 index 0000000000000..fc2a8fd2560c5 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_flush.dot @@ -0,0 +1,5 @@ +digraph { + -> FIP_NO_FLUSH [label="fip_vnic_alloc"]; + FIP_PARTIAL_FLUSH; + FIP_FULL_FLUSH; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_state.dot b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_state.dot new file mode 100644 index 0000000000000..6adcd8996e52d --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_state.dot @@ -0,0 +1,15 @@ +digraph { + FIP_VNIC_CLOSED; + fip_vnic_alloc [shape=regular]; + fip_vnic_alloc -> FIP_VNIC_HADMIN_IDLE [label="hadmin"]; + fip_vnic_alloc -> FIP_VNIC_LOGIN [label="none hadmin"]; + FIP_VNIC_WAIT_4_ACK; + FIP_VNIC_RINGS_INIT; + FIP_VNIC_MCAST_INIT; + FIP_VNIC_MCAST_INIT_DONE; + FIP_VNIC_VHUB_INIT; + FIP_VNIC_VHUB_INIT_DONE; + FIP_VNIC_VHUB_DONE; + FIP_VNIC_VHUB_WRITE; + FIP_VNIC_CONNECTED; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/fip_parser.c b/drivers/net/ethernet/mellanox/mlx4_vnic/fip_parser.c new file mode 100644 index 0000000000000..e1782998467c0 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/fip_parser.c @@ -0,0 +1,510 @@ +/* + * Copyright (c) 2010 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_fip.h" +#include "vnic_fip_pkt.h" + +static const struct subcode_rules { + u64 req_mask; + u64 opt_mask; +} subcodes_array[FIP_MAX_SUBCODES] = { + [FIP_HOST_SOL_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(ADDRESS), + .opt_mask = FIP_MASK(EXT_DESC), + }, + [FIP_GW_ADV_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(ADDRESS) | + FIP_MASK(GW_INFORMATION) | + FIP_MASK(GW_IDENTIFIER) | + FIP_MASK(KA_PARAMS), + .opt_mask = FIP_MASK(EXT_DESC), + }, + [FIP_HOST_LOGIN_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(ADDRESS) | + FIP_MASK(LOGIN) | + FIP_MASK(PARTITION), + .opt_mask = FIP_MASK(EXT_DESC), + }, + [FIP_GW_LOGIN_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(ADDRESS) | + FIP_MASK(LOGIN) | + FIP_MASK(PARTITION), + .opt_mask = FIP_MASK(EXT_DESC), + }, + [FIP_HOST_LOGOUT_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(VNIC_IDENTITY), + }, + [FIP_GW_UPDATE_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(VHUB_UPDATE), + .opt_mask = FIP_MASK(EXT_DESC), + }, + [FIP_GW_TABLE_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(VHUB_TABLE), + }, + [FIP_HOST_ALIVE_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(VNIC_IDENTITY), + }, +}; + +static int type2idx(struct fip_content *fc, struct fip_fip_type *ft) +{ + void *p = ft; + + switch (ft->type) { + case FIP_TYPE(VENDOR_ID): + fc->fvend = p; + return FIP_TYPE_IDX(VENDOR_ID); + case FIP_TYPE(ADDRESS): + fc->fa.fa[fc->fa.num++] = p; + return FIP_TYPE_IDX(ADDRESS); + case FIP_TYPE(GW_INFORMATION): + fc->fgwi = p; + return FIP_TYPE_IDX(GW_INFORMATION); + case FIP_TYPE(LOGIN): + fc->fl = p; + return FIP_TYPE_IDX(LOGIN); + case FIP_TYPE(VHUB_UPDATE): + fc->fvu = p; + return FIP_TYPE_IDX(VHUB_UPDATE); + case FIP_TYPE(VHUB_TABLE): + fc->fvt = p; + return FIP_TYPE_IDX(VHUB_TABLE); + case FIP_TYPE(VNIC_IDENTITY): + fc->fvi = p; + return FIP_TYPE_IDX(VNIC_IDENTITY); + case FIP_TYPE(PARTITION): + fc->fp = p; + return FIP_TYPE_IDX(PARTITION); + case FIP_TYPE(GW_IDENTIFIER): + fc->fgid = p; + return FIP_TYPE_IDX(GW_IDENTIFIER); + case FIP_TYPE(KA_PARAMS): + fc->fka = p; + return FIP_TYPE_IDX(KA_PARAMS); + case FIP_TYPE(EXT_DESC): + fc->fed.fed[fc->fed.num++] = p; + return FIP_TYPE_IDX(EXT_DESC); + default: + return -1; + } +} + +#ifdef CONFIG_MLX4_VNIC_DEBUG +static const char *fip_type_str(int type) +{ + switch (type) { + FIP_CASE_STR(VENDOR_ID); + FIP_CASE_STR(ADDRESS); + FIP_CASE_STR(GW_INFORMATION); + FIP_CASE_STR(LOGIN); + FIP_CASE_STR(VHUB_UPDATE); + FIP_CASE_STR(VHUB_TABLE); + FIP_CASE_STR(VNIC_IDENTITY); + FIP_CASE_STR(PARTITION); + FIP_CASE_STR(GW_IDENTIFIER); + FIP_CASE_STR(KA_PARAMS); + FIP_CASE_STR(EXT_DESC); + default: + return "Unknown"; + } +} + +static const char 
*fip_subcode_str(int subcode) +{ + switch (subcode) { + FIP_SUBCODE_CASE_STR(FIP_HOST_SOL_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_GW_ADV_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_HOST_LOGIN_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_GW_LOGIN_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_HOST_LOGOUT_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_GW_UPDATE_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_GW_TABLE_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_HOST_ALIVE_SUB_OPCODE); + default: + return "Unknown"; + } +} +#endif + +static int verify_mlx_sig(void *p) +{ + static const char *mlx4_str = "mellanox"; + __be64 mlx_str_64 = *(__be64 *)mlx4_str; + __be64 *sig = p; + + return *sig != mlx_str_64; +} + +static int next_type(struct vnic_port *port, void *tlv, int len, + struct fip_content *fc, int *sz, int *idx) +{ + struct fip_fip_type *ft; + + if (sizeof *ft > len) { + vnic_dbg_parse(port->name, "message too short\n"); + return -1; + } + ft = tlv; + vnic_dbg_parse(port->name, "TLV: type %s(%d)\n", fip_type_str(ft->type), + ft->type); + + if (!ft->length || (ft->length << 2 > len)) { + vnic_dbg_parse(port->name, "TLV does not fit in message: %s(%d) " + "tlv->len %d, remaining %d\n", fip_type_str(ft->type), + ft->type, ft->length << 2, len); + return -1; + } + + *sz = (ft->length << 2); + + *idx = type2idx(fc, ft); + if (*idx < 0) { + vnic_dbg_parse(port->name, "unknown type %d\n", ft->type); + return -1; + } + + if (ft->type == FIP_TYPE(VENDOR_ID) && verify_mlx_sig(fc->fvend->vendor_id)) { + vnic_dbg_parse(port->name, "mellanox signature check failed\n"); + return -1; + } + + if (ft->type == FIP_TYPE(VHUB_TABLE) || ft->type == FIP_TYPE(VHUB_UPDATE)) { + int cte_list_sz; + struct context_table_entry *cte_start; + + if (ft->type == FIP_TYPE(VHUB_TABLE)) { + unsigned hdr = be16_to_cpu(fc->fvt->hdr) >> 14; + + if (hdr > FIP_TABLE_HDR_ONLY) { + vnic_dbg_parse(port->name, "invalid table header %d\n", hdr); + return -1; + } + cte_list_sz = *sz - sizeof(struct fip_vhub_table_tlv); + /* TODO: the next 2 lines are commented out because the size of the tbl tlv is + miscomputed in BXM versions 1.3.6-5 and it causes tables to be discarded. + In reality the size should be used with the lines intact. 
*/ + /*if (hdr == FIP_TABLE_HDR_LAST) + cte_list_sz -= 4; + */ + + cte_start = (struct context_table_entry *)(fc->fvt + 1); + } else { + cte_list_sz = *sz - sizeof(struct fip_vhub_update_tlv); + cte_start = (struct context_table_entry *)(fc->fvu + 1); + } + + + fc->cte.num = cte_list_sz / sizeof(struct context_table_entry); + fc->cte.cte = cte_start; + } + + + return 0; +} + +static inline int check_eoib_ver(struct vnic_port *port, + struct fip_eoib_ver *eoib_ver, int sz, int *len) +{ + if (unlikely(sz < sizeof *eoib_ver)) { + vnic_dbg_parse(port->name, "message too short\n"); + *len = sz; + return -ENOMEM; + } + *len = sizeof *eoib_ver; + if (unlikely(eoib_ver->version >> 4)) { + vnic_dbg_parse(port->name, "eoib version check failed: %d\n", eoib_ver->version >> 4); + return -EINVAL; + } + return 0; +} + +static void dump_raw(struct vnic_port *port, void *buf, int len) +{ + int i; + + for (i = 0; i < len / 4; ++i) + vnic_dbg_parse(port->name, "0x%08x\n", be32_to_cpu(((__be32 *)(buf))[i])); +} + +static inline int check_fip_hdr(struct vnic_port *port, + struct fip_header_simple *fh, int sz, int *len) +{ + if (unlikely(sizeof *fh > sz)) { + vnic_dbg_parse(port->name, "message too short\n"); + return -1; + } + + if (unlikely(fh->opcode != cpu_to_be16(EOIB_FIP_OPCODE))) { + vnic_dbg_parse(port->name, "not fip opcode\n"); + return -1; + } + + if (unlikely((be16_to_cpu(fh->list_length) << 2) > (sz - sizeof *fh))) { + vnic_dbg_parse(port->name, "message too short: header length = %u, " + "left length = %lu\n", + be16_to_cpu(fh->list_length) << 2, sz - sizeof *fh); + return -1; + } + + *len = sizeof *fh; + + return 0; +} + +static int check_fip_mask(struct vnic_port *port, struct fip_content *fc) +{ + u64 req_mask = subcodes_array[fc->fh->subcode].req_mask; + u64 opt_mask = subcodes_array[fc->fh->subcode].opt_mask; + + if (((fc->mask & req_mask) != req_mask) || + ((fc->mask & ~opt_mask) & ~req_mask)) { + vnic_dbg_parse(port->name, "%s: mask check failed: mask 0x%llx," + "req_mask 0x%llx, opt_mask 0x%llx\n", + fip_subcode_str(fc->fh->subcode), fc->mask, req_mask, opt_mask); + return -1; + } + + return 0; +} + +static void dump_cte(struct vnic_port *port, struct context_table_entry *cte) +{ + vnic_dbg_parse(port->name, "CTE: V(%d) RSS(%d) type(%d) MAC(%pM) QPN(0x%06x) SL(%d) LID(0x%04x)\n", + (0x1 & (cte->v_rss_type >> 7)), + (0x1 & (cte->v_rss_type >> 6)), + (cte->v_rss_type & 0xf), + cte->mac, be32_to_cpu(cte->qpn) & 0xffffff, + (cte->sl & 0xf), be16_to_cpu(cte->lid)); +} + +static void dump_vnic_identity(struct vnic_port *port, + struct fip_vnic_identity_tlv *fvi) +{ +#define VHUB_ID be32_to_cpu(fvi->flags_vhub_id) + + vnic_dbg_parse(port->name, "%s: U(%d) R(%d) VP(%d) VHUBID(x%x) TUSN(0x%x) VNIC_ID(0x%x)" + "MAC(%pM) GUID("GUID_FORMAT") VNIC NAME (%s)\n", + fip_type_str(fvi->ft.type), (VHUB_ID >> 31), (0x01 & (VHUB_ID >> 30)), + (0x01 & (VHUB_ID >> 24)), VHUB_ID & 0xffffff, be32_to_cpu(fvi->tusn), + be16_to_cpu(fvi->vnic_id), fvi->mac, GUID_ARG(fvi->port_guid), fvi->vnic_name); +} + +static void dump_vnic_partition(struct vnic_port *port, struct fip_partition_tlv *fp) +{ + vnic_dbg_parse(port->name, "%s: PKEY(0x%x)\n", fip_type_str(fp->ft.type), + be16_to_cpu(fp->pkey)); +} + + +static void dump_gw_identifier(struct vnic_port *port, struct fip_gw_identifier_tlv *fgid) +{ + vnic_dbg_parse(port->name, "%s: SYS GUID("GUID_FORMAT") SYS NAME(%s) GW PORT NAME(%s)\n", + fip_type_str(fgid->ft.type), GUID_ARG(fgid->sys_guid), fgid->sys_name, fgid->sys_name); +} + +static void dump_ka_params(struct 
vnic_port *port, struct fip_ka_params_tlv *fka) +{ + vnic_dbg_parse(port->name, "%s: GW_ADV_PERIOD(%d) GW_KA_PERIOD(%d) VNIC_KA_PERIOD(%d)\n", + fip_type_str(fka->ft.type), be32_to_cpu(fka->adv_period), + be32_to_cpu(fka->ka_period), be32_to_cpu(fka->vnic_ka_period)); +} + +static void dump_vhub_table(struct vnic_port *port, struct fip_content *fc) +{ + int i; + + vnic_dbg_parse(port->name, "%s: VP(%d) vhub id(0x%x) TUSN(0x%x) HDR(%d) table size (%d)\n", + fip_type_str(fc->fvt->ft.type), be32_to_cpu(fc->fvt->vp_vhub_id) >> 24 & 1, + be32_to_cpu(fc->fvt->vp_vhub_id) & 0xffffff, be32_to_cpu(fc->fvt->tusn), + be16_to_cpu(fc->fvt->hdr) >> 14, be16_to_cpu(fc->fvt->table_size)); + for (i = 0; i < fc->cte.num; ++i) + dump_cte(port, &fc->cte.cte[i]); +} + +static void dump_fip_login(struct vnic_port *port, struct fip_login_tlv *p) +{ + vnic_dbg_parse(port->name, "%s: mtu(%d) vnic_id(0x%x) v_m_vp_h(0x%x) vlan(0x%x) mac(%pM)" + "mgid_prefix("MGID_PREFIX_FMT") vfields(0x%0x) syndrom(%d) QPN(0x%x)" + " vnic_name(%s)\n", fip_type_str(p->ft.type), be16_to_cpu(p->mtu), + be16_to_cpu(p->vnic_id), be16_to_cpu(p->flags_vlan) >> 12, + be16_to_cpu(p->flags_vlan) & 0xfff, p->mac, MGID_PRE_ARG(p->eth_gid_prefix), + be16_to_cpu(p->vfields), be32_to_cpu(p->syndrom_ctrl_qpn) >> 24, + be32_to_cpu(p->syndrom_ctrl_qpn) & 0xffffff, p->vnic_name); +} + +static void dump_fip_address(struct vnic_port *port, struct fip_address_tlv *fa) +{ + vnic_dbg_parse(port->name, "%s: GW_TYPE(%d) QPN(0x%x) SL(%d), GW_PORT_ID(0x%x)," + " LID(0x%x) GUID(" GUID_FORMAT ")\n", fip_type_str(fa->ft.type), + be32_to_cpu(fa->gwtype_qpn) >> 24, be32_to_cpu(fa->gwtype_qpn) & 0xffffff, + be16_to_cpu(fa->sl_gwportid) >> 12, be16_to_cpu(fa->sl_gwportid) & 0xfff, + be16_to_cpu(fa->lid), GUID_ARG(fa->guid)); +} + +static void dump_vhub_update(struct vnic_port *port, struct fip_content *fc) +{ +#define VHUB_ID_1 be32_to_cpu(fc->fvu->state_vhub_id) + int i; + + vnic_dbg_parse((port->name), "%s: eport_state(%s) vp(%d) vhub_id(0x%x) tusn(0x%x)\n", + fip_type_str(fc->fvu->ft.type), eport_state_str(VHUB_ID_1 >> 28 & 3), + VHUB_ID_1 >> 24 & 1, VHUB_ID_1 & 0xffffff, be32_to_cpu(fc->fvu->tusn)); + for (i = 0; i < fc->cte.num; ++i) + dump_cte(port, &fc->cte.cte[i]); +} + +static void dump_gateway_information(struct vnic_port *port, + struct fip_gw_information_tlv *fgwi) +{ + vnic_dbg_parse(port->name, "%s: accept host administered(%s) nmac_mgid(%d) " + "nrss_mgid(%d) ntss_qpn(%d), n_rss(%d), num_net_vnics(%d)\n", + fip_type_str(fgwi->ft.type), (fgwi->h_nmac_mgid >> 7) ? 
"Yes" : "No", + fgwi->h_nmac_mgid & 0x3f, fgwi->n_rss_mgid_tss_qpn >> 4, + fgwi->n_rss_mgid_tss_qpn & 0xf, be16_to_cpu(fgwi->n_rss_qpn_vnics) >> 12, + be16_to_cpu(fgwi->n_rss_qpn_vnics) & 0xfff); +} + +static void dump_fip_packet(struct vnic_port *port, struct fip_content *fc) +{ + int i; + + for (i = 0; i < fc->fa.num; ++i) + dump_fip_address(port, fc->fa.fa[i]); + + if (fc->fgwi) + dump_gateway_information(port, fc->fgwi); + + if (fc->fvu) + dump_vhub_update(port, fc); + + if (fc->fl) + dump_fip_login(port, fc->fl); + + if (fc->fvt) + dump_vhub_table(port, fc); + + if (fc->fvi) + dump_vnic_identity(port, fc->fvi); + + if (fc->fp) + dump_vnic_partition(port, fc->fp); + + if (fc->fgid) + dump_gw_identifier(port, fc->fgid); + + if (fc->fka) + dump_ka_params(port, fc->fka); +} + +int fip_packet_parse(struct vnic_port *port, void *packet, int pkt_size, struct fip_content *fc) +{ + void *ptr = packet; + int len; + int err; + int idx; + u16 offset = 0; + int size = pkt_size; + + vnic_dbg_parse(port->name, "size = %d\n", size); + err = check_eoib_ver(port, ptr, size, &len); + if (err) { + if (err != -EINVAL) + goto out_err; + else + vnic_dbg_parse(port->name, "version check failed\n"); + } + + fc->eoib_ver = ptr; + size -= len; + ptr += len; + offset += len; + fc->fh = ptr; + + err = check_fip_hdr(port, ptr, size, &len); + if (err) + goto out_err; + + ptr += len; + offset += len; + + fc->fa.num = 0; + fc->num = 0; + fc->mask = 0; + + /* workaround a BXM bug not reporting the correct descriptor length */ + if (fc->fh->subcode != FIP_GW_ADV_SUB_OPCODE) + size = be16_to_cpu(fc->fh->list_length) << 2; + else + size -= len; + + vnic_dbg_parse(port->name, "subcode = %s, size %d\n", + fip_subcode_str(fc->fh->subcode), size); + while (size > 0) { + err = next_type(port, ptr, size, fc, &len, &idx); + if (err) + break; + + fc->offsets[fc->num] = offset; + fc->mask |= ((u64)1 << idx); + ptr += len; + size -= len; + offset += len; + fc->num++; + } + + if (err) + goto out_err; + + err = check_fip_mask(port, fc); + if (err) { + vnic_dbg_parse(port->name, "check mask: failed\n"); + goto out_err; + } + + dump_fip_packet(port, fc); + + return 0; + +out_err: + dump_raw(port, packet, pkt_size); + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic.h new file mode 100644 index 0000000000000..04a5e832d4222 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic.h @@ -0,0 +1,1437 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef VNIC_H +#define VNIC_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* for mlx4_ib dev attr, used also in vnic_qp.c */ +#include "../../../../infiniband/hw/mlx4/mlx4_ib.h" +#include "../../../../infiniband/hw/mlx4/user.h" + +#include "vnic_utils.h" + +/* driver info definition */ +#define DRV_NAME "mlx4_vnic" +#define DRV_VER "1.4.0" +#define DRV_LIC "Dual BSD/GPL" +#define DRV_DESC "Mellanox BridgeX Virtual NIC Driver" +#define DRV_AUTH "Ali Ayoub & Gabi Liron" + +/* backports */ + +/* for kernel >= 3.17 */ +#define alloc_netdev_mqs(a, b, c, d, e) alloc_netdev_mqs(a, b, NET_NAME_UNKNOWN, c, d, e) + +#ifdef alloc_netdev_mq +#undef alloc_netdev_mq +#define alloc_netdev_mq(sizeof_priv, name, setup, count) \ + alloc_netdev_mqs(sizeof_priv, name, setup, count, count) +#endif + +#ifndef SET_ETHTOOL_OPS +#define SET_ETHTOOL_OPS(netdev,ops) \ + ( (netdev)->ethtool_ops = (ops) ) +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35)) +#define _BP_NO_MC_LIST + +// Not sure this should be here at least this is ok for 2.6.39 +#define _BP_NO_ATT_OWNER +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)) +#define _BP_NO_GRO +#endif + +#ifndef NETIF_F_HW_VLAN_FILTER +#define NETIF_F_HW_VLAN_FILTER NETIF_F_HW_VLAN_CTAG_FILTER +#endif + +/* externs */ +extern u32 vnic_msglvl; +extern u32 vnic_max_tx_outs; +extern u32 vnic_lro_num; +extern u32 vnic_mcast_create; +extern u32 vnic_net_admin; +extern u32 vnic_child_max; +extern u32 vnic_napi_weight; +extern u32 vnic_linear_small_pkt; +extern u32 vnic_tx_rings_num; +extern u32 vnic_rx_rings_num; +extern u32 vnic_tx_rings_len; +extern u32 vnic_rx_rings_len; +extern u32 vnic_mgid_data_type; +extern u32 vnic_encap_headroom; +extern u32 vnic_tx_polling; +extern u32 vnic_rx_linear; +extern u32 vnic_change_mac; +extern u32 vnic_learn_mac_enabled; +extern u32 vnic_synd_backlog; +extern u32 vnic_eport_state_enforce; +extern u32 vnic_src_mac_enforce; +extern u32 vnic_inline_tshold; + +#define MAX_NUM_PKEYS_DISCOVERY (24) +#define ILLEGAL_PKEY_INDEX (0xFFFF) +extern u32 vnic_discovery_pkeys[MAX_NUM_PKEYS_DISCOVERY]; +extern u32 vnic_discovery_pkeys_count; +extern u32 vnic_sa_query; + + +extern u32 no_bxm; + +extern struct workqueue_struct *port_wq; +extern struct workqueue_struct *fip_wq; +extern struct workqueue_struct *mcast_wq; +extern struct workqueue_struct *login_wq; + +extern struct ib_sa_client vnic_sa_client; + +/* definitions */ +#define EOIB_SERVICE_ID ((0x10ULL << 56) | (0x0002C9E01B0000ULL)) +#define EOIB_CTRL_SERVICE_ID (EOIB_SERVICE_ID | 0x00FFULL) +#define VNIC_SKB_QUEUE_LEN 32 +#define VNIC_CNT_MAX 32 +#define VNIC_DESC_LEN (64 + 4) +#define VNIC_NAME_LEN 16 /* by spec, use IFNAMSIZ for OS */ +#define VNIC_SYSFS_FLEN (VNIC_NAME_LEN * 2) /* SYSFS file name len, allow pre/suffix (32)*/ +#define VNIC_SYSFS_LLEN 64 
+#define VNIC_VENDOR_LEN 8 +#define GID_LEN 16 +#define GUID_LEN 8 +#define IPV4_LEN 4 +#define IPV6_LEN 16 +#define VNIC_SYSTEM_NAME_LEN 32 +#define VNIC_GW_PORT_NAME_LEN 8 +#define GID_PREFIX_LEN 5 +#define VNIC_MAX_DENTRIES 16 +#define VNIC_ID_LEN 16 +#define VNIC_CHILD_MAX 128 +#define VNIC_MAX_RETRIES 0 /* zero = unlimited */ +#define VNIC_WATCHDOG_TIMEOUT (25 * HZ) /* 25 sec */ +#define VNIC_NAPI_SCHED_TIMEOUT (5) +#define FIP_MAX_VNICS_PER_GW (1 << 9) +#define NOT_AVAILABLE_NUM (-1) +#define NOT_AVAILABLE_STRING "N/A" +#define is_valid_str(str) (strcmp(str, NOT_AVAILABLE_STRING)) +#define is_valid_num(num) (num != NOT_AVAILABLE_NUM) +#define is_valid_guid(arr) (!!(*((u64 *)(arr)))) +#define is_valid_ipv4(arr) (!!(*((u32 *)(arr)))) +#define is_mcast_promisc(login) (!(login->n_mac_mcgid)) +#define is_ucast_promisc(login) (!!(login->dev->flags & IFF_PROMISC)) +#define ARRAY_LEN(_x) (sizeof(_x)/sizeof(_x[0])) + +/* TODO: cleanup VNIC_GID_RAW_ARG and friends */ +#define VNIC_GID_RAW_ARG(gid) ((u8 *)(gid))[0], \ + ((u8 *)(gid))[1], \ + ((u8 *)(gid))[2], \ + ((u8 *)(gid))[3], \ + ((u8 *)(gid))[4], \ + ((u8 *)(gid))[5], \ + ((u8 *)(gid))[6], \ + ((u8 *)(gid))[7], \ + ((u8 *)(gid))[8], \ + ((u8 *)(gid))[9], \ + ((u8 *)(gid))[10],\ + ((u8 *)(gid))[11],\ + ((u8 *)(gid))[12],\ + ((u8 *)(gid))[13],\ + ((u8 *)(gid))[14],\ + ((u8 *)(gid))[15] +#define VNIC_GUID_RAW_ARG(gid) ((u8 *)(gid))[0], \ + ((u8 *)(gid))[1], \ + ((u8 *)(gid))[2], \ + ((u8 *)(gid))[3], \ + ((u8 *)(gid))[4], \ + ((u8 *)(gid))[5], \ + ((u8 *)(gid))[6], \ + ((u8 *)(gid))[7] + +#define VNIC_GID_ARG(gid) VNIC_GID_RAW_ARG((gid).raw) +#define VNIC_GID_FMT "%.2x:%.2x:%.2x:%.2x:" \ + "%.2x:%.2x:%.2x:%.2x:" \ + "%.2x:%.2x:%.2x:%.2x:" \ + "%.2x:%.2x:%.2x:%.2x" +#define VNIC_GUID_FMT "%.2x:%.2x:%.2x:%.2x:" \ + "%.2x:%.2x:%.2x:%.2x" + +#define MAC_6_PRINT_FMT "%.2x:%.2x:%.2x:%.2x:" \ + "%.2x:%.2x" +#define MAC_6_PRINT_ARG(mac) (mac)[0], (mac)[1], (mac)[2], \ + (mac)[3], (mac)[4], (mac)[5] + +#define IP_4_PRINT_FMT "%d.%d.%d.%d" +#define IP_4_PRINT_ARG(ip) (ip)[0], (ip)[1], (ip)[2], (ip)[3] + +#define CREATE_VHUB_ID(be_vlan, port_id) \ + ((be16_to_cpu(be_vlan) & 0xFFF) | (((port_id) & 0xFFF) << 12)) +#define CREATE_VHUB_ID_BE(vlan, port_id) \ + cpu_to_be32(CREATE_VHUB_ID(vlan, port_id)) +#define ROUNDUP_LOG2(x) ilog2(roundup_pow_of_two(x)) + +#define VNIC_RX_COAL_TARGET 0x20000 +#define VNIC_RX_COAL_TIME 0x10 +#define VNIC_TX_COAL_PKTS 64 +#define VNIC_TX_COAL_TIME 0x80 +#define VNIC_RX_RATE_LOW 400000 +#define VNIC_RX_COAL_TIME_LOW 0 +#define VNIC_RX_RATE_HIGH 450000 +#define VNIC_RX_COAL_TIME_HIGH 128 +#define VNIC_RX_SIZE_THRESH 1024 +#define VNIC_RX_RATE_THRESH (1000000 / VNIC_RX_COAL_TIME_HIGH) +#define VNIC_SAMPLE_INTERVAL 0 +#define VNIC_AVG_PKT_SMALL 256 +#define VNIC_AUTO_CONF 0xffff +#define VNIC_MCAST_MAX_RETRY 60 +#define VNIC_MCAST_ULIMIT_RETRY 0 +#define VNIC_MCAST_BACKOF_FAC 2 +#define MLX4_DEV_CAP_FLAG_UD_SWP (1 << 28) +#define VNIC_ETHTOOL_LINE_MAX 32 +#define VNIC_ENCAP_LEN 4 +#define VNIC_MAX_TX_SIZE 2048 +#define VNIC_MAX_RX_SIZE 4096 +#define ETH_LLC_SNAP_SIZE 8 + +#define VNIC_SM_HEADSTART 250 /* msecs to actually start handling SM events */ +#define VNIC_MCAST_BACKOFF_MSEC 1000 +#define VNIC_MCAST_BACKOFF_MAX_MSEC 16000 + +#define SYSFS_VLAN_ID_NO_VLAN (-1) + +#define VNIC_MAX_PAYLOAD_SIZE 4096 +#define VNIC_BUF_SIZE(_port) (min(_port->max_mtu_enum + \ + IB_GRH_BYTES, VNIC_MAX_PAYLOAD_SIZE)) + +#define VNIC_TX_QUEUE_LEN 1024 /* default, tuneable */ +#define VNIC_TX_QUEUE_LEN_MIN 64 +#define VNIC_TX_QUEUE_LEN_MAX (8 * 
1024) + +#define VNIC_RX_QUEUE_LEN 2048 /* default, tuneable */ +#define VNIC_RX_QUEUE_LEN_MIN 64 +#define VNIC_RX_QUEUE_LEN_MAX (8 * 1024) + + +#define VNIC_MODER_DELAY (HZ / 4) +#define VNIC_STATS_DELAY VNIC_MODER_DELAY + +#define VNIC_AH_SL_DEFAULT 0x0 + +#define VNIC_DATA_QKEY 0x80020003 +#define VNIC_FIP_QKEY 0x80020002 +#define VNIC_VLAN_OFFSET(login) (login->vlan_used ? VLAN_HLEN : 0) +#define VNIC_VLAN_ENABLED(login) (login->vlan_used ? 1 : 0) +#define VNIC_MAX_TX_CQE 32 /* default, tuneable */ +#define VNIC_MAX_RX_CQE 64 /* default, tuneable */ +#define VNIC_MAX_NUM_CPUS 32 +#define VNIC_MAX_INLINE_TSHOLD 512 + +#define VNIC_EOIB_HDR_VER 0x0 +#define VNIC_EOIB_HDR_SIG 0x3 +#define VNIC_EOIB_HDR_UDP_CHK_OK 0x2 +#define VNIC_EOIB_HDR_TCP_CHK_OK 0x1 +#define VNIC_EOIB_HDR_IP_CHK_OK 0x1 + +#define VNIC_EOIB_HDR_GET_IP_CHK(eoib_hdr) (eoib_hdr->encap_data & 0x3) +#define VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr) ((eoib_hdr->encap_data >> 2) & 0x3) +#define VNIC_EOIB_HDR_GET_VER(eoib_hdr) ((eoib_hdr->encap_data >> 4) & 0x3) +#define VNIC_EOIB_HDR_GET_SIG(eoib_hdr) ((eoib_hdr->encap_data >> 6) & 0x3) + +#define VNIC_EOIB_HDR_SET_IP_CHK_OK(eoib_hdr) (eoib_hdr->encap_data = \ + (eoib_hdr->encap_data & 0xFC) | VNIC_EOIB_HDR_IP_CHK_OK) +#define VNIC_EOIB_HDR_SET_TCP_CHK_OK(eoib_hdr) (eoib_hdr->encap_data = \ + (eoib_hdr->encap_data & 0xF3) | (VNIC_EOIB_HDR_TCP_CHK_OK << 2)) +#define VNIC_EOIB_HDR_SET_UDP_CHK_OK(eoib_hdr) (eoib_hdr->encap_data = \ + (eoib_hdr->encap_data & 0xF3) | (VNIC_EOIB_HDR_UDP_CHK_OK << 2)) + +#define VNIC_IP_CSUM_OK(eoib_hdr) ((VNIC_EOIB_HDR_GET_IP_CHK(eoib_hdr)) == VNIC_EOIB_HDR_IP_CHK_OK) +#define VNIC_TCP_CSUM_OK(eoib_hdr) ((VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr)) == VNIC_EOIB_HDR_TCP_CHK_OK) +#define VNIC_UDP_CSUM_OK(eoib_hdr) ((VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr)) == VNIC_EOIB_HDR_UDP_CHK_OK) +#define VNIC_CSUM_OK(eoib_hdr) (VNIC_IP_CSUM_OK(eoib_hdr) && \ + (VNIC_TCP_CSUM_OK(eoib_hdr) || \ + VNIC_UDP_CSUM_OK(eoib_hdr))) +#define VNIC_EOIB_ZLEN_MAX (ETH_ZLEN + VNIC_ENCAP_LEN + VLAN_HLEN) + +#define VNIC_SKB_GET_HASH(_skb, _max) ((*(u32 *)(_skb->cb + sizeof _skb->cb - 4)) % _max) +#define VNIC_SKB_SET_HASH(_skb, _hash) ((*(u32 *)(_skb->cb + sizeof _skb->cb - 4)) = _hash) +#define VNIC_SKB_GET_ENCAP_CB(_skb) ((struct eoibhdr *)(_skb->cb + sizeof _skb->cb - 12)) +#define VNIC_SKB_GET_ENCAP(_skb) (vnic_encap_headroom ? (struct eoibhdr *)(_skb->data) : VNIC_SKB_GET_ENCAP_CB(_skb)) +#define VNIC_SKB_GET_ENCAP_OFFSET (vnic_encap_headroom ? VNIC_ENCAP_LEN :0) + +#define VNIC_NEIGH_GET_DQPN(_skb, _neighe) ((_neighe->rss) ? 
(_neighe->qpn + \ + VNIC_SKB_GET_HASH(_skb, _neighe->login->qps_num)) : (_neighe->qpn)) + +#define vnic_netdev_priv(netdev) (((struct vnic_login_info *)netdev_priv(netdev))->login) +#ifndef _BP_NETDEV_NO_TMQ /* >= 2.6.27 */ +#define VNIC_TXQ_GET_HASH(_skb, _max) (skb_get_queue_mapping(_skb)) +#define VNIC_TXQ_ALLOC_NETDEV(sz, nm, sp, qm) alloc_netdev_mq(sz, nm, sp, qm) +#define VNIC_TXQ_SET_ACTIVE(login, num) (login->dev->real_num_tx_queues = \ + login->real_tx_rings_num = \ + login->ndo_tx_rings_num = num) +#define VNIC_TXQ_GET_ACTIVE(login) (login->real_tx_rings_num) +#define VNIC_TXQ_GET(tx_res) netdev_get_tx_queue(tx_res->login->dev, tx_res->index) +#define VNIC_TXQ_STOP(tx_res) netif_tx_stop_queue(VNIC_TXQ_GET(tx_res)) +#define VNIC_TXQ_STOP_ALL(login) netif_tx_stop_all_queues(login->dev) +#define VNIC_TXQ_START(tx_res) netif_tx_start_queue(VNIC_TXQ_GET(tx_res)) +#define VNIC_TXQ_START_ALL(login) netif_tx_start_all_queues(login->dev) +#define VNIC_TXQ_STOPPED(tx_res) netif_tx_queue_stopped(VNIC_TXQ_GET(tx_res)) +#define VNIC_TXQ_WAKE(tx_res) netif_tx_wake_queue(VNIC_TXQ_GET(tx_res)) +#else +#define VNIC_TXQ_GET_HASH(skb, _max) VNIC_SKB_GET_HASH(skb, _max) +#define VNIC_TXQ_ALLOC_NETDEV(sz, nm, sp, qm) alloc_netdev(sz, nm, sp) +#define VNIC_TXQ_SET_ACTIVE(login, num) do { login->real_tx_rings_num = num; \ + login->ndo_tx_rings_num = 1; \ + } while (0) +#define VNIC_TXQ_GET_ACTIVE(login) (login->real_tx_rings_num) +#define VNIC_TXQ_STOP(tx_res) netif_stop_queue(tx_res->login->dev) +#define VNIC_TXQ_STOP_ALL(login) netif_stop_queue(login->dev) +#define VNIC_TXQ_START(tx_res) netif_start_queue(tx_res->login->dev) +#define VNIC_TXQ_START_ALL(login) netif_start_queue(login->dev) +#define VNIC_TXQ_STOPPED(tx_res) netif_queue_stopped(tx_res->login->dev) +#define VNIC_TXQ_WAKE(tx_res) netif_wake_queue(tx_res->login->dev) +#endif + +#define VNIC_ALLOC_ORDER 2 +#define VNIC_ALLOC_SIZE (PAGE_SIZE << VNIC_ALLOC_ORDER) +#define VNIC_MAX_LRO_AGGR 64 +#define VNIC_MAX_RX_FRAGS 4 +#define VNIC_MAX_TX_FRAGS (MAX_SKB_FRAGS + 2) +#define VNIC_MGID_PREFIX_LEN 5 + +/* TODO, when set VNIC_MAX_TX_OUTS to 16, + * noticed that the last CQE overwrites the first one + */ +#define VNIC_MAX_TX_OUTS 8 /* default, tuneable */ +#define VNIC_MAX_LRO_DESCS 32 /* default, tuneable */ +#define VNIC_EOIB_HDR_SIZE (IB_GRH_BYTES + VNIC_ENCAP_LEN) +#define SMALL_PACKET_SIZE (256 - NET_IP_ALIGN) +#define HEADER_COPY_SIZE (128 - NET_IP_ALIGN) +#define MAX_HEADER_SIZE 64 + +#define LAG_MAP_TABLE_SIZE 32 +#define MAX_LAG_MEMBERS 16 + +#define VNIC_FW_STR_MAX VNIC_ETHTOOL_LINE_MAX +#define VNIC_FW_STR(u64_fw_ver, str) \ +do { \ + snprintf(str, VNIC_FW_STR_MAX, "%d.%d.%d", \ + (int)(u64_fw_ver >> 32), \ + (int)(u64_fw_ver >> 16) & 0xffff, \ + (int)(u64_fw_ver & 0xffff)); \ +} while (0); +#define VNIC_STR_STRIP(str) \ +do { \ + int i; \ + for (i = 0; i < strlen(str); ++i) \ + str[i] = str[i] == '\n' ? 
' ' : str[i]; \ +} while (0); + +/* well known addresses */ +static const u8 ETH_BCAST_MAC[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + +static const u8 ETH_ZERO_MAC[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +/* this used in no_bxm mode only */ +static const u8 NO_BXM_MGID_PREFIX[] = { + 0xff, 0x13, 0xe0, 0x1b, 0x00 +}; + +#define IS_ZERO_MAC(mac) (!memcmp((mac), ETH_ZERO_MAC, ETH_ALEN)) +#define IS_BCAST_MAC(mac) (!memcmp((mac), ETH_BCAST_MAC, ETH_ALEN)) +#define IS_MCAST_MAC(mac) (((unsigned char *)(mac))[0] & 0x01) +#define IS_UCAST_MAC(mac) (!(IS_MCAST_MAC(mac))) +#define IS_NEIGH_QUERY_RUNNING(neigh) \ + (neigh->query_id >= 0 && !IS_ERR(neigh->pquery) && neigh->pquery) + +struct mcast_root { + struct rb_root mcast_tree; + spinlock_t mcast_rb_lock; + struct list_head reattach_list; +}; + +/* structs */ +struct vnic_port_stats { + unsigned long gro_held; + unsigned long gro_merged; + unsigned long gro_normal; + unsigned long gro_drop; + unsigned long lro_aggregated; + unsigned long lro_flushed; + unsigned long lro_no_desc; + unsigned long tso_packets; + unsigned long queue_stopped; + unsigned long wake_queue; + unsigned long tx_timeout; + unsigned long rx_chksum_good; + unsigned long rx_chksum_none; + unsigned long tx_chksum_offload; + unsigned long sig_ver_err; + unsigned long vlan_err; + unsigned long shared_packets; + unsigned long runt_packets; + unsigned long realloc_packets; + unsigned long gw_tx_packets; + unsigned long gw_tx_bytes; +}; + +#define VNIC_STATS_DO_ADD(var, val) ((var) += (unsigned long)(val)) +#define VNIC_STATS_DO_INC(var) (++(var)) +#ifdef VNIC_EXTRA_STATS /* for performance */ +#define VNIC_STATS_ADD(var, val) ((var) += (unsigned long)(val)) +#define VNIC_STATS_INC(var) (++(var)) +#else +#define VNIC_STATS_ADD(var, val) do { } while (0) +#define VNIC_STATS_INC(var) do { } while (0) +#endif + +enum { + MCAST_ATTACHED, + MCAST_JOINED, + MCAST_JOIN_STARTED, + MCAST_JOIN_RUNNING, + MCAST_ATTACH_RUNNING, +}; + +struct vnic_port_mcast { + struct rb_node rb_node; + struct list_head list; + union ib_gid gid; + struct vnic_port *port; + struct completion leave_complete; + struct completion join_event_complete; + struct ib_sa_multicast *sa_mcast; + struct ib_sa_mcmember_rec rec; + + atomic_t ref_cnt; + struct delayed_work join_task; + struct work_struct leave_task; + unsigned long join_task_cnt; + long int state; + spinlock_t lock; + u8 join_state; + /* IN */ + unsigned long backoff; + unsigned long backoff_init; + unsigned long backoff_factor; + unsigned long retry; + u16 pkey; + u32 qkey; + u8 create; +}; + +struct vnic_mcast { + struct vnic_port_mcast *port_mcaste; + u32 qkey; + u16 pkey; + struct ib_qp *qp; + struct vnic_port *port; + struct ib_ah *ah; + struct completion attach_complete; + struct delayed_work attach_task; + struct delayed_work detach_task; + unsigned long attach_task_cnt; + struct rb_node rb_node; + struct list_head list; /* used when delete all */ + /* IN */ + u8 mac[ETH_ALEN]; + union ib_gid gid; + union ib_gid port_gid; + unsigned long backoff; + unsigned long backoff_init; + unsigned backoff_factor; + unsigned long retry; + unsigned long state; + u8 blocking; + void *attach_cb_ctx; + void *detach_cb_ctx; + void (*attach_cb) (struct vnic_mcast *mcaste, void *ctx); + void (*detach_cb) (struct vnic_mcast *mcaste, void *ctx); + u8 create; + u8 join_state; + void *priv_data; + spinlock_t lock; + int attach_bit_nr; + unsigned long *req_attach; + unsigned long *cur_attached; + int sender_only; +}; + +struct vnic_mac { + struct rb_node rb_node; 
/* list or RB tree */ + struct list_head list; + u16 vnic_id; /* needed for vnic child removal */ + u8 mac[ETH_ALEN]; /* key */ + unsigned long created; + unsigned long last_tx; // use jiffies_to_timeval +}; + +struct lag_properties { + u16 hash_mask; + u8 weights_policy; + u8 ca; /* congestion aware */ + u8 ca_thresh; +}; + +struct vnic_neigh { + struct neighbour *neighbour; + struct ib_ah *ah; + struct vnic_login *login; + struct rb_node rb_node; + struct ib_sa_query *pquery; + struct completion query_comp; + int query_id; + struct sk_buff_head pkt_queue; + struct delayed_work destroy_task; + u8 valid; + u32 qpn; + u16 lid; + u8 sl; /* only for debug */ + u8 mac[ETH_ALEN]; + u8 rss; + u16 info; +}; + +enum lag_gw_state { + GW_MEMBER_INFO_CREATED = 1 << 0, + GW_MEMBER_INFO_EPORT_UP = 1 << 1, + GW_MEMBER_INFO_MCAST = 1 << 2, + GW_MEMBER_INFO_MAPPED = 1 << 3, +}; + +struct vnic_gw_info { + enum lag_gw_state info; + int member_id; + u16 gw_id; + struct vnic_neigh neigh; +}; + +struct vnic_sysfs_attr { + void *ctx; + struct kobject *kobj; + unsigned long data; + char name[VNIC_SYSFS_FLEN]; + struct module_attribute dentry; + struct device *dev; +}; + +enum gw_ext_lag_hash_policy { + GW_LAG_HASH_DMAC = 1 << 0, + GW_LAG_HASH_SMAC = 1 << 1, + GW_LAG_HASH_TPID = 1 << 2, /* ethertype */ + GW_LAG_HASH_VID = 1 << 3, + GW_LAG_HASH_SIP = 1 << 4, + GW_LAG_HASH_DIP = 1 << 5, + GW_LAG_HASH_IP_NEXT = 1 << 6, + GW_LAG_HASH_SPORT = 1 << 7, + GW_LAG_HASH_DPORT = 1 << 8, + GW_LAG_LAYER_2_3 = 0x1f0 +}; + +struct vnic_tx_buf { + struct sk_buff *skb; + u64 mapping[VNIC_MAX_TX_FRAGS]; + u8 ip_off; + u8 ip6_off; + u8 tcp_off; + u8 udp_off; + void *phead; + int hlen; +}; + +enum { +#if 1 + FRAG_SZ0 = 536 - NET_IP_ALIGN, /* so 1500 mtu fits in first 2 frags */ + FRAG_SZ1 = 1024, + FRAG_SZ2 = 2048, + FRAG_SZ3 = 4096 - FRAG_SZ2 - FRAG_SZ1 - FRAG_SZ0 +#else + FRAG_SZ0 = 512 - NET_IP_ALIGN, + FRAG_SZ1 = 1024, + FRAG_SZ2 = 2048, + FRAG_SZ3 = 4096 << VNIC_ALLOC_ORDER +#endif +}; + +struct vnic_frag_info { + u16 frag_size; + u16 frag_prefix_size; + u16 frag_stride; + u16 frag_align; + u16 last_offset; +}; + +struct vnic_rx_alloc { + struct page *page; + u16 offset; +}; + +struct vnic_frag_data { + struct skb_frag_struct frags[VNIC_MAX_RX_FRAGS]; + u64 dma_addr[VNIC_MAX_RX_FRAGS]; + struct sk_buff *skb; /* used only for linear buffers mode */ +}; + +struct vnic_rx_ring { + struct vnic_port *port; + int index; + struct vnic_rx_alloc page_alloc[VNIC_MAX_RX_FRAGS]; + + u32 size; /* number of RX descs */ + spinlock_t lock; + struct vnic_frag_data *rx_info; + + struct vnic_frag_info frag_info[VNIC_MAX_RX_FRAGS]; + u32 rx_skb_size; + u16 log_rx_info; + u16 num_frags; + + struct ib_recv_wr wr; + struct ib_sge sge[VNIC_MAX_RX_FRAGS]; + + struct ib_srq *srq; + struct net_device_stats stats; +}; + +/* vnic states + these values can be used only in struct fip_vnic_data.login_state */ +enum { + VNIC_STATE_LOGIN_OFF = 0, + VNIC_STATE_LOGIN_PRECREATE_1, + VNIC_STATE_LOGIN_PRECREATE_2, + VNIC_STATE_LOGIN_CREATE_1, + VNIC_STATE_LOGIN_CREATE_2, + VNIC_STATE_LOGIN_BCAST_ATTACH = 31 +}; + +/* netdevice open state, depends on calls to open/stop + these values can be used only in struct vnic_login.netdev_state */ +enum { + VNIC_STATE_NETDEV_OFF = 0, + VNIC_STATE_NETDEV_OPEN_REQ, + VNIC_STATE_NETDEV_OPEN, + VNIC_STATE_NETDEV_CARRIER_ON, + VNIC_STATE_NETDEV_NO_TX_ENABLE = 31 +}; + +struct vnic_rx_res { + struct vnic_login *login; + struct ib_cq *cq; + struct net_lro_mgr lro; + struct net_lro_desc lro_desc[VNIC_MAX_LRO_DESCS]; + struct ib_wc 
recv_wc[VNIC_MAX_RX_CQE]; + int index; + int stopped; +#ifndef _BP_NAPI_POLL + struct napi_struct napi; +#else + struct net_device *poll_dev; +#endif +}; + +struct vnic_tx_res { + struct vnic_tx_buf *tx_ring; + struct ib_sge tx_sge[VNIC_MAX_TX_FRAGS]; + struct ib_wc send_wc[VNIC_MAX_TX_CQE]; + struct ib_send_wr tx_wr; + struct vnic_login *login; + struct ib_cq *cq; + unsigned tx_head; + unsigned tx_tail; + unsigned tx_outstanding; + unsigned tx_stopped_cnt; + struct net_device_stats stats; + struct ib_ah_attr mcast_av; + u8 lso_hdr[VNIC_MAX_PAYLOAD_SIZE]; + int index; + int stopped; + spinlock_t lock; +}; + +#ifdef VNIC_PROFILLNG +#define VNIC_PROFILLNG_SKB_MAX 100 +struct vnic_prof_skb_entry { + struct sk_buff skb; + struct timespec tstamp; + unsigned long jiffies; + int cnt; + u8 nr_frags; +}; +#endif + +struct vnic_qp_res { + struct vnic_login *login; + struct ib_qp *qp; + struct completion last_wqe_complete; + int tx_index; + int rx_index; +}; + +/* + * Wrapper struct for vnic_login, used as netdev private data. + * Some kernels (such as 2.6.18-194.26.1) don't allow a private + * data struct longer than 64KB (NETDEV_PRIV_LEN_MAX), so + * we allocate the private data separately to work around this limit. + */ +struct vnic_login_info { + struct vnic_login *login; +}; + +struct vnic_login { + spinlock_t lock; + spinlock_t stats_lock; + struct net_device *dev; + struct ethtool_drvinfo drvinfo; + struct vnic_port *port; + char desc[VNIC_DESC_LEN]; + struct fip_vnic_data *fip_vnic; /* for ethtool/sysfs */ + int queue_stopped; + unsigned long netdev_state; + char name[VNIC_NAME_LEN]; + char vnic_name[VNIC_NAME_LEN]; + char vendor_id[VNIC_VENDOR_LEN]; + struct vnic_neigh *gw_neigh; + struct vnic_gw_info lag_gw_neigh[MAX_LAG_MEMBERS]; + struct lag_properties lag_prop; + int is_lag; + int lag_gw_map[LAG_MAP_TABLE_SIZE]; + int lag_member_count; + int lag_member_active_count; + union ib_gid gw_mgid; + int promisc; + union ib_gid gid; + __be16 vid; + u8 vlan_used; + u32 qkey; + u16 pkey; + u16 pkey_index; + u64 gw_guid; + u8 mgid_prefix[VNIC_MGID_PREFIX_LEN]; + u8 n_mac_mcgid; + u8 sl; + u16 gw_port_id; + u16 vnic_id; + unsigned int max_mtu; + int zlen; + int cnt; + unsigned qps_num; + u32 qp_base_num; + u8 dev_addr[ETH_ALEN]; + u8 all_vlan_gw; + + /* statistics */ + struct net_device_stats stats; + struct vnic_port_stats port_stats; + + /* tasks */ + struct work_struct mcast_restart; + struct delayed_work stats_task; + struct delayed_work mcast_task; + struct delayed_work restart_task; + struct mutex moder_lock; + struct mutex state_lock; + + /* data structures */ + struct workqueue_struct *neigh_wq; + struct rb_root neigh_tree; + struct rb_root mac_tree; + atomic_t vnic_child_cnt; + rwlock_t mac_rwlock; + struct mcast_root mcast_tree; + struct vnic_sysfs_attr dentries[VNIC_MAX_DENTRIES]; + struct list_head list; + + /* QP resources */ + struct vnic_qp_res qp_res[VNIC_MAX_NUM_CPUS]; + + /* RX resources */ + struct vnic_rx_res rx_res[VNIC_MAX_NUM_CPUS]; + struct ib_recv_wr rx_wr; + u32 lro_num; + unsigned lro_mng_num; + int rx_csum; + unsigned napi_num; + unsigned rx_rings_num; + + /* TX resources */ + struct vnic_tx_res tx_res[VNIC_MAX_NUM_CPUS]; + unsigned tx_rings_num; + unsigned real_tx_rings_num; + unsigned ndo_tx_rings_num; + u8 *pad_va; + u64 pad_dma; + + /* for profiling */ +#ifdef VNIC_PROFILLNG + struct vnic_prof_skb_entry prof_arr[VNIC_PROFILLNG_SKB_MAX]; + int prof_arr_it; +#endif + /* interrupt coalescence */ + u16 rx_usecs; + u16 rx_frames; + u32 pkt_rate_low; + u16 rx_usecs_low; + u32 
pkt_rate_high; + u16 rx_usecs_high; + u16 sample_interval; + u16 adaptive_rx_coal; + unsigned long last_moder_packets; + unsigned long last_moder_tx_packets; + unsigned long last_moder_bytes; + unsigned long last_moder_jiffies; + unsigned long last_moder_time; + u16 tx_usecs; + u16 tx_frames; + u8 shared_vnic; + u8 shared_mac[ETH_ALEN]; +}; + +struct eoibhdr { + __u8 encap_data; + __u8 seg_off; + __be16 seg_id; +}; + +struct vnic_ib_dev { + char name[VNIC_DESC_LEN]; + struct mutex mlock; + struct list_head list; + struct list_head port_list; + struct ib_device *ca; + struct mlx4_ib_dev *mdev; + struct ib_device_attr attr; + char fw_ver_str[VNIC_FW_STR_MAX]; +}; + +struct fip_ring_entry { + void *mem; + u64 bus_addr; + int length; + int entry_posted; +}; + +struct fip_ring { + int size; + struct fip_ring_entry *ring; + unsigned long head; + unsigned long tail; + spinlock_t ring_lock; + spinlock_t head_tail_lock; +}; + +enum fip_discover_state { + FIP_DISCOVER_OFF, + FIP_DISCOVER_INIT, + FIP_DISCOVER_SOLICIT, + FIP_DISCOVER_CLEAR +}; + +#define MAX_INPUT_LEN 64 +#define MAX_INPUT_ARG 12 +struct fip_hadmin_cmd { + u8 c_name [MAX_INPUT_LEN]; + u8 c_mac [MAX_INPUT_LEN]; + u8 c_vnic_id [MAX_INPUT_LEN]; + u8 c_vid [MAX_INPUT_LEN]; + u8 c_bxname [MAX_INPUT_LEN]; + u8 c_bxguid [MAX_INPUT_LEN]; + u8 c_eport [MAX_INPUT_LEN]; + u8 c_ipv4 [MAX_INPUT_LEN]; + u8 c_ipv6 [MAX_INPUT_LEN]; + u8 c_emac [MAX_INPUT_LEN]; + u8 c_pkey [MAX_INPUT_LEN]; + u8 c_parent [MAX_INPUT_LEN]; +}; + +struct fip_hadmin_cache { + struct fip_hadmin_cmd cmd; + u8 system_guid[GUID_LEN]; + u8 system_name[VNIC_SYSTEM_NAME_LEN]; + u8 eport_name[VNIC_GW_PORT_NAME_LEN]; + u8 mac[ETH_ALEN]; + u16 vnic_id; + u16 gw_port_id; + u16 vlan; + u8 vlan_used; + u8 all_vlan_gw; + u8 interface_name[VNIC_NAME_LEN]; + u8 parent_name[VNIC_NAME_LEN]; + int parent_used; + int remove; + struct list_head next; + u32 qp_base_num; + u8 shared_vnic_ip[IPV4_LEN]; + u8 shared_vnic_mac[ETH_ALEN]; +}; + +struct pkt_rcv_list { + struct list_head list; + spinlock_t lock; +}; + +struct fip_discover { + char name[VNIC_NAME_LEN]; + struct vnic_port *port; + struct list_head discover_list; + spinlock_t lock; + struct list_head gw_list; + struct rw_semaphore l_rwsem; /* gw list rw semaphore */ + int hadmin_update; + struct list_head hadmin_cache; + enum fip_discover_state state; + int flush; + struct completion flush_complete; + struct ib_cq *cq; + struct ib_qp *qp; + struct fip_ring rx_ring; + struct fip_ring tx_ring; + struct mcast_root mcast_tree; + struct delayed_work fsm_task; + struct delayed_work cleanup_task; + struct delayed_work hadmin_update_task; + struct work_struct pkt_rcv_task_bh; + struct pkt_rcv_list rcv_list; + + int mcast_dest_mask; + unsigned long discover_mcast_attached_jiffies; + unsigned long discover_mcast_detached_jiffies; + unsigned long discover_mcast_state; + u16 pkey; + u16 pkey_index; + unsigned long req_attach; + unsigned long cur_attached; + unsigned new_prot_gws; + unsigned old_prot_gws; +}; + +struct fip_root { + struct list_head discover_list; +}; + +struct port_fs_dentry { + struct module_attribute fs_entry; + struct vnic_port *port; +}; + +struct vnic_port { + char name[VNIC_DESC_LEN]; + u8 num; + int rx_rings_num; + int tx_rings_num; + struct vnic_ib_dev *dev; + struct mcast_root mcast_tree; + struct list_head list; + struct list_head login_list; + struct delayed_work event_task; + struct delayed_work event_task_light; + struct delayed_work discover_restart_task; + struct ib_event_handler event_handler; + struct ib_port_attr 
attr; + union ib_gid gid; + int rate; + u8 rate_enum; + atomic_t vnic_child_ids; + + /* IB resources per port */ + struct vnic_rx_ring *rx_ring[VNIC_MAX_NUM_CPUS]; + struct ib_pd *pd; + struct ib_mr *mr; + + /* for FIP */ + struct mutex mlock; + struct mutex start_stop_lock; + u16 pkey_index; + u16 pkey; + int max_mtu_enum; + struct fip_root fip; + struct vnic_sysfs_attr dentries[VNIC_MAX_DENTRIES]; +}; + +enum fip_vnic_state { + FIP_VNIC_CLOSED = 0, + FIP_VNIC_HADMIN_IDLE = 1<<0, + FIP_VNIC_LOGIN = 1<<1, + FIP_VNIC_WAIT_4_ACK = 1<<2, + FIP_VNIC_RINGS_INIT = 1<<3, /* temporary, create rings */ + FIP_VNIC_MCAST_INIT = 1<<4, /* temporary, start mcast attach */ + FIP_VNIC_MCAST_INIT_DONE= 1<<5, /* wait for mcast cb */ + FIP_VNIC_VHUB_INIT = 1<<6, + FIP_VNIC_VHUB_INIT_DONE = 1<<7, /* wait for vhub table */ + FIP_VNIC_VHUB_DONE = 1<<8, + FIP_VNIC_VHUB_WRITE = 1<<9, + FIP_VNIC_CONNECTED = 1<<10 +}; + +enum vhub_table_state { + VHUB_TBL_INIT, + VHUB_TBL_UP2DATE, + VHUB_TBL_UPDATED +}; + +struct vhub_elist { + u32 tusn; + int count; + int total_count; + struct list_head vnic_list; /* chain vnics */ +}; + +struct vnic_table_entry { + u32 qpn; + u16 lid; + u8 mac[ETH_ALEN]; + u8 sl; + + struct list_head list; + u8 rss; + u8 valid; +}; + +struct vhub_table { + enum vhub_table_state state; + u32 checksum; + u32 tusn; + struct vhub_elist main_list; + struct vhub_elist update_list; +}; + +struct fip_shared_vnic_data { + u8 ip[IPV4_LEN]; + u8 emac[ETH_ALEN]; + u8 enabled; + u8 arp_proxy; +}; + +struct lag_member { + u32 qpn; + u8 sl; + u16 gw_port_id; + u16 lid; + u8 guid[GUID_LEN]; + u8 eport_state; + u8 weight; + u8 link_utilization; +}; + +struct lag_members { + int num; + long used_bitmask; + struct lag_properties prop; + struct lag_member memb[MAX_LAG_MEMBERS]; +}; + +struct fip_login_data { + u32 qpn; + u32 ctl_qpn; + u16 port_id; /* must always be uptodate */ + u16 lid; /* must always be uptodate */ + u16 vlan; + u16 pkey; + u16 pkey_index; + u16 vnic_id; /* must always be uptodate */ + u32 vhub_id; + u16 mtu; + + u8 sl; /* service level -- 4 bits */ + u8 guid[GUID_LEN]; + u8 mac[ETH_ALEN]; + u8 mgid_prefix[VNIC_MGID_PREFIX_LEN]; + u8 vnic_name[VNIC_NAME_LEN]; + u8 vendor_id[VNIC_VENDOR_LEN]; + u8 n_mac_mcgid; + u8 n_rss_mgid; + u8 syndrome; /* must always be uptodate */ + + u8 vp; /* 1 bit: do we use vlan */ + u8 all_vlan_gw; /* 1 bit. 
+ is promisc vlan supported on this vnic */ + struct lag_members lagm; +}; + +enum fip_flush { + FIP_NO_FLUSH, + FIP_PARTIAL_FLUSH, /* use this for events caused by vnic/gw logic */ + FIP_FULL_FLUSH /* use this for events caused by unload, host admin destroy */ +}; + +struct fip_vnic_send_info { + u32 gw_qpn; + u32 qkey; + u16 gw_lid; + u8 gw_sl; +}; + +/* + * This struct holds informative info about the GW that can change without + * implications on GW or vnic logic (only reported to user) + */ +struct fip_gw_volatile_info { + u8 system_guid[GUID_LEN]; + u8 system_name[VNIC_SYSTEM_NAME_LEN+1]; + u8 gw_port_name[VNIC_GW_PORT_NAME_LEN+1]; +}; + +struct fip_vnic_data { + char name[VNIC_NAME_LEN]; + enum fip_vnic_state state; + enum fip_flush flush; + spinlock_t lock; + spinlock_t ka_lock; + struct vnic_sysfs_attr dentry; + unsigned long login_state; + + /* data structures maintenance */ + struct fip_gw_data *gw; + struct vnic_port *port; + struct list_head gw_vnics; + struct vhub_table vhub_table; + + /* execution maintenance */ + unsigned long update_jiffs; + unsigned long keep_alive_jiffs; + unsigned long detached_ka_jiffs; + unsigned long vnic_mcaste_state; + struct delayed_work vnic_task; + struct hrtimer keepalive_timer; + struct list_head timer; + struct delayed_work vnic_gw_alive_task; + struct work_struct vnic_pkt_rcv_task_bh; + struct work_struct vnic_login_destroy_task; + struct work_struct vnic_login_create_task; + struct pkt_rcv_list vnic_rcv_list; + struct fip_vnic_send_info gw_address; + + /* vnic driver API */ + struct vnic_login *login; + unsigned long login_status; + int qps_num; + u32 qp_base_num; + int parent_used; + u8 parent_name[VNIC_NAME_LEN]; + + /* rx + tx data structures */ + struct ib_cq *cq; + struct ib_qp *qp; + struct fip_ring rx_ring; + struct fip_ring tx_ring; + struct ib_ah *ah; + + /* data domain */ + union ib_gid mgid; + + /* vHub context update mcast groups */ + struct mcast_root mcast_tree; + struct fip_login_data login_data; + struct fip_shared_vnic_data shared_vnic; + u16 mlid; + /* u16 pkey_index; not used for now */ + + u16 vnic_id; /* unique id for GW */ + u16 vlan; + u8 vlan_used; + u8 all_vlan_gw; + u16 pkey; + u16 pkey_index; + u8 hadmined; /* todo, use the state for this */ + u8 interface_name[VNIC_NAME_LEN]; + u8 mac_cache[ETH_ALEN]; + atomic_t eport_state; + unsigned long last_send_jiffs; + int retry_count; + int synd_backlog; + struct fip_hadmin_cmd cmd; + struct fip_gw_volatile_info gw_info; + struct lag_members lm; + unsigned long req_attach; + unsigned long cur_attached; + union ib_gid ka_mcast_gid; +}; + +enum vhub_mgid_type { + VHUB_MGID_DATA = 0, + VHUB_MGID_UPDATE = 2, + VHUB_MGID_TABLE = 3, + VHUB_MGID_KA = 5, +}; + +enum fip_all_mgids { + FIP_MCAST_DISCOVER, + FIP_MCAST_SOLICIT, + FIP_MCAST_VHUB_DATA, + FIP_MCAST_VHUB_UPDATE, + FIP_MCAST_TABLE, + FIP_MCAST_VHUB_KA, +}; + +union vhub_mgid { + struct mgid { + u8 mgid_prefix[VNIC_MGID_PREFIX_LEN]; + u8 type; + u8 dmac[ETH_ALEN]; + u8 rss_hash; + u8 vhub_id[3]; + } mgid; + union ib_gid ib_gid; +}; + +void vnic_carrier_update(struct vnic_login *login); +int vnic_param_check(void); + +/* mac table funcs */ +void vnic_learn_mac(struct net_device *dev, u8 *mac, int remove); +void vnic_child_flush(struct vnic_login *login, int all); +int vnic_child_update(struct vnic_login *login, u8 *mac, int remove); +int vnic_mace_update(struct vnic_login *login, u8 *mac, u16 vnic_id, int remove); +int vnic_parent_update(struct vnic_port *port, char *name, u16 vnic_id, + u8 *mac, u32 *qp_base_num_ptr, 
char *parent_name, + int remove); + +/* mcast funcs */ +int vnic_mcast_init(void); +void vnic_mcast_cleanup(void); + +/* + * A helper function to prevent code duplication. Receives a multicast mac + * and a gw_id and attaches it (join + attach). The function also receives + * a default_mcaste (used for the MGID over default MLID hack) and a user list. + * Returns 0 on success and non-zero on failure. + * + * in: mmac - to be used in creating the MGID address + * in: default_mcaste - mcaste entry of the default MGID. Can be NULL + * in: private_data - A user pointer that can be used to identify the owner + * in: gw_id - to be used in creating the MGID address + */ +int _vnic_mcast_attach_mgid(struct vnic_login *login, + char *mmac, + struct vnic_mcast *default_mcaste, + void *private_data, + u16 gw_id); + +struct vnic_mcast *vnic_mcast_alloc(struct vnic_port *port, + unsigned long *req_attach, + unsigned long *cur_attach); +/* + * A helper function to prevent code duplication. Fills the vnic_mcast struct with + * common values. + * + * in: mcaste - mcaste to fill + * in: gw_id - to be used in creating the MGID address + * in: mac - to be used in creating the MGID address + * in: rss_hash - to be used in creating the MGID address (usually 0) + * in: create - value of the create field in mcaste + */ +void __vnic_mcaste_fill(struct vnic_login *login, struct vnic_mcast *mcaste, + u16 gw_id, const u8 *mac, u8 rss_hash, int create); + +void vnic_mcast_dealloc(struct vnic_mcast *mcaste); + +int vnic_mcast_attach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste); +int vnic_mcast_detach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste); + +/* + * This function grabs the mcast_tree->mcast_rb_lock +*/ +int vnic_mcast_add(struct mcast_root *mcast_tree, + struct vnic_mcast *mcaste); +int vnic_mcast_del_all(struct mcast_root *mcast_tree); +int vnic_mcast_del_user(struct mcast_root *mcast_tree, void *owner); + +void vnic_tree_mcast_detach(struct mcast_root *mcast_tree); +void vnic_tree_mcast_attach(struct mcast_root *mcast_tree); + +/*void vnic_port_mcast_del_all(struct mcast_root *port); */ +static inline void vnic_mcast_root_init(struct mcast_root *mcast_tree) +{ + spin_lock_init(&mcast_tree->mcast_rb_lock); + INIT_LIST_HEAD(&mcast_tree->reattach_list); +} + +/* port funcs */ +int vnic_ports_init(void); +void vnic_ports_cleanup(void); + +/* + * The caller must hold the mcast_tree->mcast_rb_lock lock before calling +*/ +void vnic_mcast_del(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste); +struct vnic_mcast *vnic_mcast_search(struct mcast_root *mcast_tree, + union ib_gid *gid); +void port_fip_discover_restart(struct work_struct *work); +int vnic_port_fip_init(struct vnic_port *port); +void vnic_port_fip_cleanup(struct vnic_port *port, int lock); + +/* others */ +void fip_refresh_mcasts(struct fip_discover *discover); +void vnic_login_refresh_mcasts(struct vnic_port *port); + +/* There are 2 different create flows, for host admin and net admin. + * In net admin we always create the vnic after connecting with the GW, but we do not + * yet know the vnic details (mac, vlan etc). We know the ring parameters and + * will need to create the RX/TX rings (before login). + * To accomplish this we call vnic_login_pre_create_1, vnic_login_pre_create_2 + * and after login ACK we will call vnic_login_register_netdev and vnic_login_complete_ack. + * In Host admin, we know the vnic info but not the GW info when we create the + * vnic. 
+int vnic_login_register_netdev(struct fip_vnic_data *vnic,
+			       const char *mac,
+			       const char *name);
+int vnic_login_complete_ack(struct fip_vnic_data *vnic,
+			    struct fip_login_data *login_data,
+			    struct fip_shared_vnic_data *shared_vnic);
+int vnic_login_pre_create_1(struct vnic_port *port,
+			    struct fip_vnic_data *vnic);
+int vnic_login_pre_create_2(struct fip_vnic_data *vnic, int qps_num, int is_lag);
+
+/*
+ * When destroying the login, call this to stop the login wq tasks. Do not
+ * call from login_wq context.
+ */
+void vnic_login_destroy_stop_wq(struct fip_vnic_data *vnic, enum fip_flush flush);
+/*
+ * Destroys the login data struct. Assumes all login wq tasks are stopped.
+ * Can be called from any context, might block for a few secs.
+ */
+void vnic_login_destroy_wq_stopped(struct fip_vnic_data *vnic, enum fip_flush flush);
+
+/*
+ * Destroy a login data structure.
+ * This function cannot be called from login_wq context. If you need to run
+ * from login_wq, use the split functions vnic_login_destroy_stop_wq/wq_stopped
+ * instead.
+ */
+static inline
+void vnic_login_destroy(struct fip_vnic_data *vnic, enum fip_flush flush)
+{
+	vnic_login_destroy_stop_wq(vnic, flush);
+	vnic_login_destroy_wq_stopped(vnic, flush);
+}
+
+/* add / remove member eports from a LAG GW */
+void vnic_member_prop(struct vnic_login *login, struct lag_properties *prop);
+int vnic_member_add(struct vnic_login *login, int member_id,
+		    struct lag_member *emember);
+int vnic_member_remove(struct vnic_login *login, int member_id);
+int vnic_member_modify(struct vnic_login *login, int member_id,
+		       struct lag_member *emember);
+void vnic_member_remove_all(struct vnic_login *login);
+
+int vnic_vhube_add(struct fip_vnic_data *vnic, struct vnic_table_entry *vhube);
+void vnic_vhube_flush(struct fip_vnic_data *vnic);
+void vnic_vhube_del(struct fip_vnic_data *vnic, u8 *mac);
+int vnic_neighe_path_query(struct vnic_neigh *neighe);
+
+void vhub_mgid_create(const char *mgid_prefix,
+		      const char *mmac,	/* mcast mac for bcast 0xFF.. */
+		      u64 n_mac,	/* bits to take from mmac */
+		      u32 vhub_id,
+		      enum vhub_mgid_type type,
+		      u8 rss_hash,
+		      union vhub_mgid *mgid);
+/*
+ * Read the state of the gw eport. Can be called from any context.
+ */
+int fip_vnic_get_eport_state(struct fip_vnic_data *vnic);
+/*
+ * Get GW info funcs.
+ */
+int fip_vnic_get_eport_name(struct fip_vnic_data *vnic, char *buff);
+int fip_vnic_get_bx_name(struct fip_vnic_data *vnic, char *buff);
+int fip_vnic_get_bx_guid(struct fip_vnic_data *vnic, char *buff);
+u8 fip_vnic_get_bx_sl(struct fip_vnic_data *vnic);
+int fip_vnic_get_gw_type(struct fip_vnic_data *vnic);
+int fip_vnic_get_lag_eports(struct fip_vnic_data *vnic, char *buf);
+int fip_vnic_get_all_vlan_mode(struct fip_vnic_data *vnic, char *buff);
+
+
+/*
+ * Return a short-format string of GW info. Can be called from any context.
+ */
+int fip_vnic_get_short_gw_info(struct fip_vnic_data *vnic, char *buff);
+
+void vnic_data_cleanup(void);
+
+/*
+ * This function is called from the sysfs update callback function.
+ * It parses the request and adds it to a list, then queues a
+ * work request to process the list from the fip_wq context.
+*/ +int fip_hadmin_sysfs_update(struct vnic_port *port, + const char *buffer, int count, int remove); +int fip_gw_sysfs_show(struct vnic_port *port, char *buffer); +int vnic_login_cmd_set(char *buf, struct fip_hadmin_cmd *cmd); +void vnic_login_cmd_init(struct fip_hadmin_cmd *cmd); + +int fip_hadmin_vnic_refresh(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address); +void fip_vnic_set_gw_param(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address); +void fip_vnic_create_gw_param(struct fip_vnic_send_info *gw_address, u32 gw_qpn, + u32 qkey, u16 gw_lid, u8 gw_sl); + +int fip_vnic_hadmin_init(struct vnic_port *port, struct fip_vnic_data *vnic); + +int port_fs_init(struct vnic_port *port); +void port_fs_exit(struct vnic_port *port); + +int vnic_port_query(struct vnic_port *port); + +#endif /* VNIC_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data.h new file mode 100644 index 0000000000000..d21517f916bbd --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _VNIC_DATA_H +#define _VNIC_DATA_H + +#include "vnic.h" + +enum { + VNIC_SEND_INLINE_FLAG_POS = 63, +}; + +#define VNIC_SEND_INLINE_FLAG ((u64)1 << VNIC_SEND_INLINE_FLAG_POS) + +/* main funcs */ +int vnic_port_data_init(struct vnic_port *port); +void vnic_port_data_cleanup(struct vnic_port *port); + +/* ib funcs */ +struct sk_buff *vnic_alloc_rx_skb(struct vnic_rx_ring *ring, int buf_ind, + gfp_t gfp_flag); +int vnic_post_recv(struct vnic_rx_ring *ring, u64 wr_id); +int vnic_post_recvs(struct vnic_rx_ring *ring); +int vnic_ib_create_qp_range(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, int nqps, + int align, struct ib_qp *list[]); +int vnic_ib_destroy_qp(struct ib_qp *qp); +int vnic_ib_post_send(struct ib_qp *ibqp, + struct ib_send_wr *wr, + struct ib_send_wr **bad_wr, + u8 ip_off, u8 ip6_off, + u8 tcp_off, u8 udp_off); +struct vnic_rx_ring *vnic_create_rx_ring(struct vnic_port *port, int index); +void vnic_destroy_rx_ring(struct vnic_rx_ring *ring); +int vnic_init_qp(struct vnic_login *login, int qp_index); +int vnic_create_qp(struct vnic_login *login, int qp_index); +int vnic_create_qp_range(struct vnic_login *login); +void vnic_destroy_qp(struct vnic_login *login, int qp_index); +int vnic_create_tx_res(struct vnic_login *login, int tx_res_index); +int vnic_create_rx_res(struct vnic_login *login, int rx_res_index); +void vnic_destroy_tx_res(struct vnic_login *login, int tx_res_index); +void vnic_destroy_rx_res(struct vnic_login *login, int rx_res_index); + +int vnic_ib_up(struct net_device *dev); +int vnic_ib_down(struct net_device *dev); +int vnic_ib_open(struct net_device *dev); +int vnic_ib_stop(struct net_device *dev); + +int vnic_ib_set_moder(struct vnic_login *login, + u16 rx_usecs, u16 rx_frames, u16 tx_usecs, u16 tx_frames); +int vnic_port_ib_init(struct vnic_port *port); +void vnic_port_ib_cleanup(struct vnic_port *port); +void vnic_ib_dispatch_event(struct ib_event *event); +#ifndef _BP_NAPI_POLL +int vnic_poll_cq_rx(struct napi_struct *napi, int budget); +#else +int vnic_poll_cq_rx(struct net_device *poll_dev, int *budget); +#endif +void vnic_send(struct vnic_login *login, struct sk_buff *skb, + struct ib_ah *ah, u32 dqpn, int tx_res_index); +void vnic_ib_free_ring(struct vnic_rx_ring *ring); +int vnic_ib_init_ring(struct vnic_rx_ring *ring); + +/* netdev funcs */ +struct net_device *vnic_alloc_netdev(struct vnic_port *port); +void vnic_free_netdev(struct vnic_login *login); +int vnic_restart(struct net_device *dev); +void __bcast_attach_cb(struct vnic_mcast *mcaste, void *login_ptr); +void __bcast_detach_cb(struct vnic_mcast *mcaste, void *login_ptr); + +/* rx funcs */ +int vnic_rx(struct vnic_login *login, struct sk_buff *skb, struct ib_wc *wc); +int vnic_unmap_and_replace_rx(struct vnic_rx_ring *ring, struct ib_device *dev, + struct skb_frag_struct *skb_frags_rx, + u64 wr_id, int length); +int vnic_rx_skb(struct vnic_login *login, struct vnic_rx_ring *ring, + struct ib_wc *wc, int ip_summed, char *eth_hdr_va); + +/* tx funcs */ +int vnic_tx(struct sk_buff *skb, struct net_device *dev); + +/* sysfs funcs */ +int vnic_create_dentry(struct vnic_login *login); +void vnic_delete_dentry(struct vnic_login *login); + +/* ethtool funcs */ +void vnic_set_ethtool_ops(struct net_device *dev); + +/* neigh funcs */ +void vnic_neigh_del_all(struct vnic_login *login); +struct vnic_neigh *vnic_neighe_search(struct vnic_login *login, u8 *mac); +void vnic_neighe_dealloc_task(struct work_struct *work); +void vnic_neighe_dealloc(struct 
vnic_neigh *neighe); +struct vnic_neigh *vnic_neighe_alloc(struct vnic_login *login, + const u8 *mac, u16 dlid, u32 dqpn, u8 rss); +void vnic_neighe_del(struct vnic_login *login, struct vnic_neigh *neighe); +int vnic_neighe_add(struct vnic_login *login, struct vnic_neigh *neighe); +struct ib_ah *vnic_ah_alloc(struct vnic_login *login, u16 dlid); +void vnic_neigh_invalidate(struct vnic_login *login); + + + +struct vnic_login *__vnic_login_create(struct vnic_port *port, int index); +u32 vnic_hash(struct net_device *dev, struct sk_buff *skb); +#endif /* _VNIC_DATA_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ethtool.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ethtool.c new file mode 100644 index 0000000000000..16ff551dd95c3 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ethtool.c @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+static struct ethtool_ops vnic_ethtool_ops;
+
+static const char vnic_strings[][ETH_GSTRING_LEN] = {
+	/* public statistics */
+	"rx_packets", "tx_packets", "rx_bytes",
+	"tx_bytes", "rx_errors", "tx_errors",
+	"rx_dropped", "tx_dropped", "multicast",
+	"collisions", "rx_length_errors", "rx_over_errors",
+	"rx_crc_errors", "rx_frame_errors", "rx_fifo_errors",
+	"rx_missed_errors", "tx_aborted_errors", "tx_carrier_errors",
+	"tx_fifo_errors", "tx_heartbeat_errors", "tx_window_errors",
+#define VNIC_PUB_STATS_LEN	21
+
+	/* private statistics */
+	"gro_held", "gro_merged", "gro_normal", "gro_drop",
+	"lro_aggregated", "lro_flushed", "lro_no_desc",
+	"tso_packets", "queue_stopped", "wake_queue",
+	"tx_timeout", "rx_chksum_good", "rx_chksum_none",
+	"tx_chksum_offload", "sig_ver_err", "vlan_err",
+	"shared_packets", "runt_packets", "realloc_packets",
+	"gw_tx_packets", "gw_tx_bytes",
+#define VNIC_PORT_STATS_LEN	21
+
+	/* packet statistics rx_prio_X (TODO) */
+#define VNIC_PKT_STATS_LEN	0
+};
+
+#define VNIC_STATS_LEN	(sizeof(vnic_strings) / ETH_GSTRING_LEN)
+
+static void vnic_get_drvinfo(struct net_device *dev,
+			     struct ethtool_drvinfo *drvinfo)
+{
+	struct vnic_login *login = vnic_netdev_priv(dev);
+
+	*drvinfo = login->drvinfo;
+}
+
+static u32 vnic_get_msglevel(struct net_device *dev)
+{
+	return vnic_msglvl;
+}
+
+static void vnic_set_msglevel(struct net_device *dev, u32 mlevel)
+{
+	vnic_msglvl = mlevel;
+}
+
+static int vnic_get_coalesce(struct net_device *dev,
+			     struct ethtool_coalesce *coal)
+{
+	struct vnic_login *login = vnic_netdev_priv(dev);
+
+	vnic_dbg_ethtool(login->name, "get coalescing params for mtu:%d "
+			 "rx_frames:%d rx_usecs:%d, "
+			 "tx_frames:%d tx_usecs:%d, "
+			 "adaptive_rx_coal:%d, "
+			 "adaptive_tx_coal:%d\n",
+			 login->dev->mtu,
+			 login->rx_frames, login->rx_usecs,
+			 login->tx_frames, login->tx_usecs,
+			 login->adaptive_rx_coal, 0);
+
+	coal->tx_coalesce_usecs = login->tx_usecs;
+	coal->tx_max_coalesced_frames = login->tx_frames;
+	coal->rx_coalesce_usecs = login->rx_usecs;
+	coal->rx_max_coalesced_frames = login->rx_frames;
+
+	coal->pkt_rate_low = login->pkt_rate_low;
+	coal->rx_coalesce_usecs_low = login->rx_usecs_low;
+	coal->pkt_rate_high = login->pkt_rate_high;
+	coal->rx_coalesce_usecs_high = login->rx_usecs_high;
+	coal->rate_sample_interval = login->sample_interval;
+	coal->use_adaptive_rx_coalesce = login->adaptive_rx_coal;
+
+	return 0;
+}
+
+static int vnic_set_coalesce(struct net_device *dev,
+			     struct ethtool_coalesce *coal)
+{
+	struct vnic_login *login = vnic_netdev_priv(dev);
+
+	login->rx_frames = (coal->rx_max_coalesced_frames ==
+			    VNIC_AUTO_CONF) ?
+			    VNIC_RX_COAL_TARGET /
+			    login->dev->mtu + 1 : coal->rx_max_coalesced_frames;
+	login->rx_usecs = (coal->rx_coalesce_usecs ==
+			   VNIC_AUTO_CONF) ?
+ VNIC_RX_COAL_TIME : coal->rx_coalesce_usecs; + login->tx_frames = coal->tx_max_coalesced_frames; + login->tx_usecs = coal->tx_coalesce_usecs; + + /* Set adaptive coalescing params */ + login->pkt_rate_low = coal->pkt_rate_low; + login->rx_usecs_low = coal->rx_coalesce_usecs_low; + login->pkt_rate_high = coal->pkt_rate_high; + login->rx_usecs_high = coal->rx_coalesce_usecs_high; + login->sample_interval = coal->rate_sample_interval; + login->adaptive_rx_coal = coal->use_adaptive_rx_coalesce; + login->last_moder_time = VNIC_AUTO_CONF; + + if (login->adaptive_rx_coal) + return 0; + + vnic_ib_set_moder(login, + login->rx_usecs, login->rx_frames, + login->tx_usecs, login->tx_frames); + + return 0; +} + +static int vnic_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + cmd->autoneg = AUTONEG_DISABLE; + cmd->supported = SUPPORTED_10000baseT_Full; + cmd->advertising = SUPPORTED_10000baseT_Full; + if (netif_carrier_ok(dev)) { + cmd->speed = SPEED_10000; + cmd->duplex = DUPLEX_FULL; + } else { + cmd->speed = -1; + cmd->duplex = -1; + } + return 0; +} + +static int vnic_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + if ((cmd->autoneg == AUTONEG_ENABLE) || + (cmd->speed != SPEED_10000) || (cmd->duplex != DUPLEX_FULL)) + return -EINVAL; + + /* Nothing to change */ + return 0; +} + +static void vnic_get_strings(struct net_device *dev, + uint32_t stringset, uint8_t *data) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int index = 0, stats_off = 0, i; + + if (stringset != ETH_SS_STATS) + return; + + /* Add main counters */ + for (i = 0; i < VNIC_PUB_STATS_LEN; i++) + strcpy(data + (index++) * ETH_GSTRING_LEN, + vnic_strings[i + stats_off]); + stats_off += VNIC_PUB_STATS_LEN; + + for (i = 0; i < VNIC_PORT_STATS_LEN; i++) + strcpy(data + (index++) * ETH_GSTRING_LEN, + vnic_strings[i + stats_off]); + stats_off += VNIC_PORT_STATS_LEN; + + for (i = 0; i < VNIC_PKT_STATS_LEN; i++) + strcpy(data + (index++) * ETH_GSTRING_LEN, + vnic_strings[i + stats_off]); + stats_off += VNIC_PKT_STATS_LEN; + + for (i = 0; i < login->tx_rings_num; i++) { + sprintf(data + (index++) * ETH_GSTRING_LEN, + "tx%d_packets", i); + sprintf(data + (index++) * ETH_GSTRING_LEN, + "tx%d_bytes", i); + } + for (i = 0; i < login->rx_rings_num; i++) { + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_packets", i); + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_bytes", i); + } +} + +static void vnic_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, uint64_t *data) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int index = 0, i; + + spin_lock_bh(&login->stats_lock); + + for (i = 0; i < VNIC_PUB_STATS_LEN; i++) + data[index++] = ((unsigned long *) &login->stats)[i]; + for (i = 0; i < VNIC_PORT_STATS_LEN; i++) + data[index++] = ((unsigned long *) &login->port_stats)[i]; + for (i = 0; i < VNIC_PKT_STATS_LEN; i++) + data[index++] = 0; + for (i = 0; i < login->tx_rings_num; i++) { + data[index++] = login->tx_res[i].stats.tx_packets; + data[index++] = login->tx_res[i].stats.tx_bytes; + } + for (i = 0; i < login->rx_rings_num; i++) { + data[index++] = login->port->rx_ring[i]->stats.rx_packets; + data[index++] = login->port->rx_ring[i]->stats.rx_bytes; + } + spin_unlock_bh(&login->stats_lock); +} + +#ifndef _BP_ETHTOOL_NO_SSETC +static int vnic_get_sset_count(struct net_device *dev, int sset) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + switch (sset) { + case ETH_SS_STATS: + return VNIC_STATS_LEN + /* static stats + stats per ring */ + 
(login->tx_rings_num + login->rx_rings_num) * 2; + default: + return -EOPNOTSUPP; + } +} + +#else +static int vnic_get_stats_count(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + return VNIC_STATS_LEN + + (login->tx_rings_num + login->rx_rings_num) * 2; +} +#endif + +static void vnic_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) +{ + wol->supported = wol->wolopts = 0; + + return; +} + +void vnic_get_ringparam(struct net_device *dev, struct ethtool_ringparam *param) +{ + memset(param, 0, sizeof *param); + param->rx_max_pending = VNIC_MAX_RX_SIZE; + param->tx_max_pending = VNIC_MAX_TX_SIZE; + param->rx_pending = vnic_rx_rings_len; + param->tx_pending = vnic_tx_rings_len; +} + +void vnic_set_ethtool_ops(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + struct mlx4_ib_dev *mlx4_ibdev = login->port->dev->mdev; + + ASSERT(login); + ASSERT(login->port->dev->ca); + ASSERT(login->port->dev->ca->dma_device); + + SET_ETHTOOL_OPS(dev, &vnic_ethtool_ops); + strncpy(login->drvinfo.driver, DRV_NAME, VNIC_ETHTOOL_LINE_MAX); + strncpy(login->drvinfo.version, DRV_VER, VNIC_ETHTOOL_LINE_MAX); + login->drvinfo.n_stats = 0; + login->drvinfo.regdump_len = 0; + login->drvinfo.eedump_len = 0; + + sprintf(login->drvinfo.bus_info, "%s [%s:%d]", + pci_name(to_pci_dev(login->port->dev->ca->dma_device)), + login->port->dev->ca->name, login->port->num); + sprintf(login->drvinfo.fw_version, "%s [%.*s]", + login->port->dev->fw_ver_str, MLX4_BOARD_ID_LEN, + mlx4_ibdev->dev->board_id); + vnic_dbg_ethtool(login->name, "bus %s, port %d, fw_ver %s\n", + login->drvinfo.bus_info, login->port->num, + login->drvinfo.fw_version); + + return; +} + +static struct ethtool_ops vnic_ethtool_ops = { + .get_link = ethtool_op_get_link, + .get_drvinfo = vnic_get_drvinfo, + .get_msglevel = vnic_get_msglevel, + .set_msglevel = vnic_set_msglevel, + .get_coalesce = vnic_get_coalesce, + .set_coalesce = vnic_set_coalesce, + .get_strings = vnic_get_strings, + .get_ethtool_stats = vnic_get_ethtool_stats, +#ifndef _BP_ETHTOOL_NO_SSETC + .get_sset_count = vnic_get_sset_count, +#else + .get_stats_count = vnic_get_stats_count, +#endif + .get_settings = vnic_get_settings, + .set_settings = vnic_set_settings, + .get_wol = vnic_get_wol, + .get_ringparam = vnic_get_ringparam, + .set_ringparam = NULL, +}; + diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_fs.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_fs.c new file mode 100644 index 0000000000000..95d7ef796fc18 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_fs.c @@ -0,0 +1,993 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+
+#include "vnic.h"
+#include "vnic_data.h"
+#include "vnic_fip_discover.h"
+
+#define ALL_VLAN_GW_VID "all"
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0))
+#define __MODULE_KOBJ_TYPE struct module_kobject
+#else
+#define __MODULE_KOBJ_TYPE struct module
+#endif
+
+char *login_dentry_name(char *buf, struct vnic_login *login, char *str)
+{
+	snprintf(buf, VNIC_SYSFS_FLEN, "%s%d-%s", "vnic",
+		 login->cnt, str);
+	return buf;
+}
+
+char *port_dentry_name(char *buf, struct vnic_port *port, char *str)
+{
+	snprintf(buf, VNIC_SYSFS_FLEN, "%s_%s_%d",
+		 str, port->dev->name, port->num);
+	return buf;
+}
+
+char *vnic_dentry_name(char *buf, struct fip_vnic_data *vnic, char *str)
+{
+	snprintf(buf, VNIC_SYSFS_FLEN, "%s-%s-%s", "vnic",
+		 vnic->interface_name, str);
+	return buf;
+}
+
+#ifndef _BP_NO_ATT_OWNER
+#define DENTRY_OWNER(_vdentry) \
+		(_vdentry)->dentry.attr.owner = THIS_MODULE; \
+		(_vdentry)->kobj = &vdentry->dentry.attr.owner->mkobj.kobj;
+#else
+#define DENTRY_OWNER(_vdentry) \
+		(_vdentry)->kobj = &(THIS_MODULE)->mkobj.kobj;
+#endif
+
+#define DENTRY_REMOVE(_dentry) \
+do { \
+	vnic_dbg_sysfs((_dentry)->name, "deleted\n"); \
+	sysfs_remove_file((_dentry)->kobj, &(_dentry)->dentry.attr); \
+	(_dentry)->ctx = NULL; \
+} while (0);
+
+#define DENTRY_CREATE(_ctx, _dentry, _name, _show, _store) \
+do { \
+	struct vnic_sysfs_attr *vdentry = _dentry; \
+	vdentry->ctx = _ctx; \
+	vdentry->dentry.show = _show; \
+	vdentry->dentry.store = _store; \
+	vdentry->dentry.attr.name = vdentry->name; \
+	vdentry->dentry.attr.mode = 0; \
+	DENTRY_OWNER(vdentry); \
+	snprintf(vdentry->name, VNIC_SYSFS_FLEN, "%s", _name); \
+	if (vdentry->dentry.store) \
+		vdentry->dentry.attr.mode |= S_IWUSR; \
+	if (vdentry->dentry.show) \
+		vdentry->dentry.attr.mode |= S_IRUGO; \
+	vnic_dbg_sysfs(_ctx->name, "creating %s\n", \
+		       vdentry->name); \
+	if (strlen(_name) > VNIC_SYSFS_FLEN) { \
+		vnic_err(_ctx->name, "name too long %d > %d\n", \
+			 (int)strlen(_name), VNIC_SYSFS_FLEN); \
+		vdentry->ctx = NULL; \
+		break; \
+	} \
+	if (sysfs_create_file(vdentry->kobj, &vdentry->dentry.attr)) { \
+		vnic_err(_ctx->name, "failed to create %s\n", \
+			 vdentry->dentry.attr.name); \
+		vdentry->ctx = NULL; \
+		break; \
+	} \
+	vnic_dbg_sysfs(_ctx->name, "created %s\n", vdentry->name); \
+} while (0);
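+/*
+ * Illustrative example (not part of the driver): with the helpers and macros
+ * above, a login with cnt == 3 gets its read-only "info" attribute via
+ *
+ *	DENTRY_CREATE(login, &login->dentries[0],
+ *		      login_dentry_name(name, login, "info"),
+ *		      vnic_login_show, NULL);
+ *
+ * which shows up as a file named "vnic3-info" under the module kobject
+ * (/sys/module/mlx4_vnic/ on a typical build).
+ */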
return "nop"; + case IB_PORT_ACTIVE_DEFER: + return "defer"; + default: + return "invalid_state"; + } +} + +/* store/show functions */ +static ssize_t vnic_neigh_show(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + char *p = buf; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_login *login = vnic_dentry->ctx; + struct vnic_neigh *neighe; + struct vnic_mcast *mcaste; + struct rb_node *n; + unsigned long flags; + + /* check if GW entry is ready */ + if (!login->gw_neigh) + goto out; + ASSERT(login->gw_neigh); + + /* print GW entry */ + neighe = login->gw_neigh; + p += _sprintf(p, buf, "G:MAC["MAC_6_PRINT_FMT"] VID[0x%04x] " + "VID_USED[%d] QPN[0x%06x] LID[0x%04x] RSS[%d] SL[%d] VALID[%d]\n", + MAC_6_PRINT_ARG(neighe->mac), + be16_to_cpu(login->vid), login->vlan_used, neighe->qpn, + neighe->lid, neighe->rss, neighe->sl, neighe->valid); + + /* print neigh tree entries */ + n = rb_first(&login->neigh_tree); + while (n) { + neighe = rb_entry(n, struct vnic_neigh, rb_node); + p += _sprintf(p, buf, "U:MAC["MAC_6_PRINT_FMT"] VID[0x%04x] " + "VID_USED[%d] QPN[0x%06x] LID[0x%04x] RSS[%d] SL[%d] VALID[%d]\n", + MAC_6_PRINT_ARG(neighe->mac), + be16_to_cpu(login->vid), login->vlan_used, + neighe->qpn, neighe->lid, neighe->rss, neighe->sl, neighe->valid); + n = rb_next(n); + } + + /* print mcast tree entries */ + spin_lock_irqsave(&login->mcast_tree.mcast_rb_lock, flags); + n = rb_first(&login->mcast_tree.mcast_tree); + while (n) { + u16 lid = 0xFFFF; + mcaste = rb_entry(n, struct vnic_mcast, rb_node); + n = rb_next(n); + if (test_bit(MCAST_ATTACHED, &mcaste->state)) + lid = mcaste->port_mcaste->rec.mlid; + p += _sprintf(p, buf, "M:MAC["MAC_6_PRINT_FMT"] VID[0x%04x] " + "VID_USED[%d] QPN[0x%06x] LID[0x%04x] RSS[%d] SL[%d]\n", + MAC_6_PRINT_ARG(mcaste->mac), + 0, login->vlan_used, IB_MULTICAST_QPN, lid, 0, mcaste->port_mcaste->sa_mcast->rec.sl); + } + spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags); + +out: + return (ssize_t)(p - buf); +} + +/* store/show functions */ +static ssize_t vnic_member_show(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + char *p = buf; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_login *login = vnic_dentry->ctx; + int i; + + if (!login->is_lag) + goto out; + + netif_tx_lock_bh(login->dev); + p += _sprintf(p, buf, "GW member count=%d active count=%d hash bitmask=0x%X\n", + login->lag_member_count, login->lag_member_active_count, login->lag_prop.hash_mask); + + p += _sprintf(p, buf, "GW hash mapping table:\n"); + + for (i=0; ilag_gw_map[i], login->lag_gw_map[i+1], login->lag_gw_map[i+2], login->lag_gw_map[i+3], + login->lag_gw_map[i+4], login->lag_gw_map[i+5], login->lag_gw_map[i+6], login->lag_gw_map[i+7]); + } + + p += _sprintf(p, buf, "\nGW member state info: (0x1-created, 0x2-eport up, 0x4-mcast join complete, 0x8-member in use)\n"); + + for (i=0; ilag_gw_neigh[i].gw_id, + login->lag_gw_neigh[i].info, + login->lag_gw_neigh[i].neigh.lid, + login->lag_gw_neigh[i].neigh.qpn, + login->lag_gw_neigh[i].neigh.sl, + login->lag_gw_neigh[i].neigh.valid); + } + netif_tx_unlock_bh(login->dev); + +out: + return (ssize_t)(p - buf); +} + +static ssize_t vnic_login_show(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + char *p = buf, tmp_line[VNIC_SYSFS_LLEN]; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_login *login = 
vnic_dentry->ctx; + struct fip_vnic_data *vnic_fip = login->fip_vnic; + int rc, eport_connected = test_bit(VNIC_STATE_LOGIN_CREATE_2, &vnic_fip->login_state); + u16 pkey_used = 0; + int lag_gw; + int ret; + + ASSERT(login->dev); + ASSERT(login->port->dev->ca); + + /* NETDEV attributes */ + p += _sprintf(p, buf, "NETDEV_NAME %s\n", login->dev->name); + p += _sprintf(p, buf, "NETDEV_LINK %s\n", + netif_carrier_ok(login->dev) ? "up" : "down"); + p += _sprintf(p, buf, "NETDEV_OPEN %s\n", + (login->dev->flags & IFF_UP) ? "yes" : "no"); + p += _sprintf(p, buf, "NETDEV_QSTOP %s\n", + netif_queue_stopped(login->dev) ? "yes" : "no"); + p += _sprintf(p, buf, "NETDEV_MTU %d/%d\n", + (int)login->dev->mtu, + (int)login->max_mtu); + + /* IOA attributes */ + p += _sprintf(p, buf, "IOA_PORT %s:%d\n", + login->port->dev->ca->name, + login->port->num); + p += _sprintf(p, buf, "IOA_NAME %s\n", + login->desc); + p += _sprintf(p, buf, "IOA_LID 0x%04x\n", login->port->attr.lid); + p += _sprintf(p, buf, "IOA_GUID "VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(login->port->gid.raw + 8)); + p += _sprintf(p, buf, "IOA_LOG_LINK %s\n", + port_phys_state_str(login->port->attr.phys_state)); + p += _sprintf(p, buf, "IOA_PHY_LINK %s\n", + port_state_str(login->port->attr.state)); + p += _sprintf(p, buf, "IOA_MTU %d\n", login->port->max_mtu_enum); + + + /* EPORT and BX attributes */ + if (no_bxm) { + p += _sprintf(p, buf, "EPORT_STATE %s\n", "bridgeless"); + } else if (vnic_fip) { + p += _sprintf(p, buf, "EPORT_STATE %s\n", + !eport_connected ? "disconnected" : + (fip_vnic_get_eport_state(vnic_fip) ? + "up" : "down")); + p += _sprintf(p, buf, "EPORT_NAME %s\n", + fip_vnic_get_eport_name(vnic_fip, tmp_line) ? + NOT_AVAILABLE_STRING : tmp_line); + p += _sprintf(p, buf, "EPORT_QPN 0x%06x\n", + login->gw_neigh ? login->gw_neigh->qpn : 0); + p += _sprintf(p, buf, "EPORT_LID 0x%04x\n", + login->gw_neigh ? login->gw_neigh->lid : 0); + p += _sprintf(p, buf, "EPORT_ID %u\n", login->gw_port_id); + + p += _sprintf(p, buf, "BX_NAME %s\n", + fip_vnic_get_bx_name(vnic_fip, tmp_line) ? + NOT_AVAILABLE_STRING : tmp_line); + fip_vnic_get_bx_guid(vnic_fip, tmp_line); + if (*((u64 *)tmp_line) == 0) + p += _sprintf(p, buf, "BX_GUID %s\n", NOT_AVAILABLE_STRING); + else + p += _sprintf(p, buf, "BX_GUID "VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(tmp_line)); + + lag_gw = fip_vnic_get_gw_type(vnic_fip); + if (lag_gw) { + p += _sprintf(p, buf, "GW_TYPE LAG\n"); + ret = fip_vnic_get_lag_eports(vnic_fip, p); + p += (ret > 0) ? ret : 0; + } else + p += _sprintf(p, buf, "GW_TYPE LEGACY\n"); + + rc = fip_vnic_get_all_vlan_mode(vnic_fip, tmp_line); + p += _sprintf(p, buf, "ALL_VLAN %s\n", + rc < 0 ? NOT_AVAILABLE_STRING : tmp_line); + + } else { + p += _sprintf(p, buf, "EPORT_STATE %s\n", "error"); + } + + /* misc attributes*/ + p += _sprintf(p, buf, "SW_RSS %s\n", + !eport_connected ? NOT_AVAILABLE_STRING : + ((login->qps_num > 1) ? "yes" : "no")); + p += _sprintf(p, buf, "SW_RSS_SIZE %u\n", login->qps_num); + p += _sprintf(p, buf, "RX_RINGS_NUM %d\n", login->rx_rings_num); + p += _sprintf(p, buf, "RX_RINGS_LIN %s\n", + login->port->rx_ring[0]->log_rx_info ? "no" : "yes"); + p += _sprintf(p, buf, "TX_RINGS_NUM %d\n", login->tx_rings_num); + p += _sprintf(p, buf, "TX_RINGS_ACT %d\n", + VNIC_TXQ_GET_ACTIVE(login)); + p += _sprintf(p, buf, "NDO_TSS %s\n", + (login->ndo_tx_rings_num > 1) ? "yes" : "no"); + p += _sprintf(p, buf, "NDO_TSS_SIZE %u\n", login->ndo_tx_rings_num); + p += _sprintf(p, buf, "MCAST_PROMISC %s\n", + !eport_connected ? 
NOT_AVAILABLE_STRING : + (is_mcast_promisc(login) ? "yes" : "no")); + p += _sprintf(p, buf, "UCAST_PROMISC %s\n", + (is_ucast_promisc(login) ? "yes" : "no")); + p += _sprintf(p, buf, "MCAST_MASK %d\n", login->n_mac_mcgid); + p += _sprintf(p, buf, "CHILD_VNICS %d/%d\n", + atomic_read(&login->vnic_child_cnt), + vnic_child_max); + p += _sprintf(p, buf, "PKEY 0x%04x\n", login->pkey); + p += _sprintf(p, buf, "PKEY_INDEX 0x%04x\n", login->pkey_index); + rc = ib_query_pkey(login->port->dev->ca, login->port->num, + login->pkey_index, &pkey_used); + p += _sprintf(p, buf, "PKEY_MEMBER %s\n", + (rc || !eport_connected) ? NOT_AVAILABLE_STRING : + ((pkey_used & 0x8000) ? "full" : "partial")); + p += _sprintf(p, buf, "SL_DATA %u\n", login->sl); + p += _sprintf(p, buf, "SL_CONTROL %u\n", + vnic_fip ? fip_vnic_get_bx_sl(vnic_fip) : 0); +#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO) + p += _sprintf(p, buf, "GRO %s\n", + login->dev->features & NETIF_F_GRO ? "yes" : "no"); +#elif defined(NETIF_F_LRO) + p += _sprintf(p, buf, "LRO %s\n", + login->dev->features & NETIF_F_LRO ? "yes" : "no"); + p += _sprintf(p, buf, "LRO_NUM %d\n", login->lro_num); +#endif + p += _sprintf(p, buf, "NAPI %s\n", + login->napi_num ? "yes" : "no"); + p += _sprintf(p, buf, "NAPI_WEIGHT %u\n", + login->napi_num ? vnic_napi_weight : 0); + p += _sprintf(p, buf, "QPN 0x%x\n", + login->qp_base_num); + p += _sprintf(p, buf, "MAC "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(login->dev_addr)); + p += _sprintf(p, buf, "VNIC_ID %d\n", + vnic_fip ? vnic_fip->vnic_id : 0); + p += _sprintf(p, buf, "ADMIN_MODE %s\n", + !vnic_fip ? NOT_AVAILABLE_STRING : + (vnic_fip->hadmined ? "host" : "network")); + + if (vnic_fip && vnic_fip->vlan_used) + p += _sprintf(p, buf, "VLAN 0x%03x\n", vnic_fip->vlan); + else + p += _sprintf(p, buf, "VLAN %s\n", NOT_AVAILABLE_STRING); + + if (vnic_fip && vnic_fip->shared_vnic.enabled) { + p += _sprintf(p, buf, "SHARED_MAC "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(vnic_fip->shared_vnic.emac)); + p += _sprintf(p, buf, "SHARED_IP "IP_4_PRINT_FMT"\n", + IP_4_PRINT_ARG(vnic_fip->shared_vnic.ip)); + } else { + p += _sprintf(p, buf, "SHARED_MAC %s\n", NOT_AVAILABLE_STRING); + p += _sprintf(p, buf, "SHARED_IP %s\n", NOT_AVAILABLE_STRING); + } + + return (ssize_t)(p - buf); +} + +static ssize_t vnic_qps_show(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + char *p = buf; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_login *login = vnic_dentry->ctx; + struct ib_qp *qp; + struct ib_qp_attr query_attr; + struct ib_qp_init_attr query_init_attr; + int i, mask = -1; + + for (i = 0; i < login->qps_num; ++i) { + qp = login->qp_res[i].qp; + if (ib_query_qp(qp, &query_attr, mask, &query_init_attr)) + continue; + p += _sprintf(p, buf, "QP_INDEX %d\n", i); + p += _sprintf(p, buf, "QP_NUM 0x%06x\n", qp->qp_num); + p += _sprintf(p, buf, "QP_QKEY 0x%08x\n", query_attr.qkey); + p += _sprintf(p, buf, "QP_STATE 0x%02x\n", query_attr.qp_state); + p += _sprintf(p, buf, "QP_RX_RING %d\n", i % login->rx_rings_num); + p += _sprintf(p, buf, "QP_PTR %p\n", qp); + p += _sprintf(p, buf, "QP_RX_SRQ_PTR %p\n", qp->srq); + p += _sprintf(p, buf, "QP_RX_CQ_PTR %p\n", qp->recv_cq); + p += _sprintf(p, buf, "QP_TX_CQ_PTR %p\n", qp->send_cq); + p += _sprintf(p, buf, "\n"); + } + + return (ssize_t)(p - buf); +} +static char* vnic_state_2str(enum fip_vnic_state state) +{ + switch(state) { + case FIP_VNIC_CLOSED: return "CLOSED"; + case FIP_VNIC_CONNECTED: return "CONNECTED"; + case 
FIP_VNIC_HADMIN_IDLE: return "HADMIN_IDLE"; + case FIP_VNIC_LOGIN: return "LOGIN"; + case FIP_VNIC_MCAST_INIT: return "MCAST_INIT"; + case FIP_VNIC_MCAST_INIT_DONE: return "MCAST_INIT_DONE"; + case FIP_VNIC_RINGS_INIT: return "RINGS_INIT"; + case FIP_VNIC_VHUB_DONE: return "VHUB_DONE"; + case FIP_VNIC_VHUB_INIT: return "VHUB_INIT"; + case FIP_VNIC_VHUB_INIT_DONE: return "VHUB_INIT_DONE"; + case FIP_VNIC_VHUB_WRITE: return "VHUB_WRITE"; + case FIP_VNIC_WAIT_4_ACK: return "WAIT_4_ACK"; + } + return "UNKNOWN"; + + +} + +int port_vnics_sysfs_show(struct vnic_port *port, char *buf) +{ + struct fip_gw_data *gw; + char *p = buf; + struct fip_discover *discover; + struct fip_vnic_data *vnic; + + mutex_lock(&port->start_stop_lock); + list_for_each_entry(discover, &port->fip.discover_list, discover_list) { + + down_read(&discover->l_rwsem); + + list_for_each_entry(gw, &discover->gw_list, list) { + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) { + p += _sprintf(p, buf, "%-15s\t%-10s\t%10s:%d %-10s\t%.7d\t%-10s\t%s\n", + gw->info.vol_info.system_name, + gw->info.vol_info.gw_port_name, + gw->discover->port->dev->ca->name, + gw->discover->port->num, + vnic->name, + vnic->vnic_id, + vnic->hadmined?"HOSTADMIN":"NETADMIN", + vnic_state_2str(vnic->state)); + } + } + + up_read(&discover->l_rwsem); + } + + mutex_unlock(&port->start_stop_lock); + return (p - buf); +} + + +#ifdef VNIC_PROFILLNG +static ssize_t vnic_dentry_prof_skb_show(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + char *p = buf; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_login *login = vnic_dentry->ctx; + struct sk_buff *skb; + int i; + + for (i = 0; i < VNIC_PROFILLNG_SKB_MAX; ++i) { + if (!login->prof_arr[i].cnt) + continue; + skb = &login->prof_arr[i].skb; + p += _sprintf(p, buf, "==============\n"); + p += _sprintf(p, buf, "SKB[%d] CNT %d\n", i, login->prof_arr[i].cnt); + p += _sprintf(p, buf, "len %d\n", skb->len); + p += _sprintf(p, buf, "data_len %d\n", skb->data_len); + p += _sprintf(p, buf, "head_len %d\n", skb_headlen(skb)); + p += _sprintf(p, buf, "gso %d\n", skb_is_gso(skb)); + p += _sprintf(p, buf, "nr_frags %d\n", login->prof_arr[i].nr_frags); + p += _sprintf(p, buf, "jiffies %lu\n", login->prof_arr[i].jiffies); + p += _sprintf(p, buf, "msecs %u\n", + jiffies_to_msecs(login->prof_arr[i].jiffies)); + p += _sprintf(p, buf, "msecs_diff %u\n", + jiffies_to_msecs(login->prof_arr[i].jiffies) - + jiffies_to_msecs(login->prof_arr[i ? i -1 : 0].jiffies)); + } + + return (ssize_t)(p - buf); +} + +#endif + +static int get_guid(u8 *guid, char *s) +{ + if (sscanf(s, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + guid + 0, guid + 1, guid + 2, guid + 3, guid + 4, + guid + 5, guid + 6, guid + 7) != 8) + return -1; + + return 0; +} + +static int get_mac(u8 *mac, char *s) +{ + if (sscanf(s, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + mac + 0, mac + 1, mac + 2, mac + 3, mac + 4, + mac + 5) != 6) + return -1; + + return 0; +} + +static int get_ipv4(short unsigned int *ip, char *s) +{ + if (sscanf(s, "%hu.%hu.%hu.%hu", ip + 0, ip + 1, ip + 2, ip + 3) != 4) + return -1; + + return 0; +} + +static int get_parent(struct vnic_port *port, char *parent) +{ + struct net_device *parent_netdev; + + /* check parent syntax */ + if (!dev_valid_name(parent)) + return -EINVAL; + + parent_netdev = dev_get_by_name(&init_net, parent); + if (parent_netdev) + dev_put(parent_netdev); + + return parent_netdev ? 
0 : -ENODATA;
+}
+
+static struct fip_hadmin_cache *get_hadmin_entry(void)
+{
+	struct fip_hadmin_cache *hadmin_entry;
+
+	hadmin_entry = kzalloc(sizeof *hadmin_entry, GFP_ATOMIC);
+	if (!hadmin_entry)
+		return NULL;
+
+	hadmin_entry->vnic_id = NOT_AVAILABLE_NUM;
+	hadmin_entry->gw_port_id = NOT_AVAILABLE_NUM;
+
+	return hadmin_entry;
+}
+
+void vnic_login_cmd_init(struct fip_hadmin_cmd *cmd)
+{
+	char *buf = (char *)cmd;
+	u8 i;
+
+	for (i = 0; i < MAX_INPUT_ARG; ++i)
+		sprintf(buf + (i * MAX_INPUT_LEN), NOT_AVAILABLE_STRING);
+}
+
+int vnic_login_cmd_set(char *buf, struct fip_hadmin_cmd *cmd)
+{
+	int count;
+
+	if (cmd) {
+		count = sprintf(buf, "name=%s mac=%s vnic_id=%s vid=%s "
+				"bxname=%s bxguid=%s eport=%s ipv4=%s ipv6=%s "
+				"emac=%s pkey=%s parent=%s\n",
+				cmd->c_name, cmd->c_mac, cmd->c_vnic_id,
+				cmd->c_vid, cmd->c_bxname, cmd->c_bxguid,
+				cmd->c_eport, cmd->c_ipv4, cmd->c_ipv6,
+				cmd->c_emac, cmd->c_pkey, cmd->c_parent);
+		vnic_dbg_sysfs((char *)(cmd->c_name), "cmd: %s", buf);
+	} else /* print the cmd syntax */
+		count = sprintf(buf, "name=%%s mac=%%s vnic_id=%%s vid=%%s "
+				"bxname=%%s bxguid=%%s eport=%%s ipv4=%%s "
+				"ipv6=%%s emac=%%s pkey=%%s parent=%%s\n");
+
+	return count;
+}
+
+/* create/destroy a child vNic; syntax example ('+' adds, '-' removes):
+ * +00:11:22:33:44:55
+ */
+static ssize_t vnic_child_write(struct module_attribute *attr,
+				__MODULE_KOBJ_TYPE *mod,
+				const char *buf, size_t count)
+{
+	struct vnic_sysfs_attr *vnic_dentry =
+		container_of(attr, struct vnic_sysfs_attr, dentry);
+	struct vnic_login *login = vnic_dentry->ctx;
+	char action = buf[0];
+	char *buf_mac = (char *)buf + 1;
+	int remove = -1;
+	u8 mac[ETH_ALEN];
+
+	if (action == '-')
+		remove = 1;
+	if (action == '+')
+		remove = 0;
+
+	if (remove < 0 || get_mac(mac, buf_mac) || !is_valid_ether_addr(mac))
+		return -EINVAL;
+
+	vnic_learn_mac(login->dev, mac, remove);
+	return count;
+}
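+/*
+ * Illustrative host-admin request (not taken from the driver sources): the
+ * string written to the per-port "host_add_<ca>_<port>" sysfs file must carry
+ * all MAX_INPUT_ARG fields of the syntax printed by vnic_login_cmd_set(),
+ * e.g. (values are made up; the exact placeholder for unused fields depends
+ * on NOT_AVAILABLE_STRING):
+ *
+ *	name=evnic0 mac=00:11:22:33:44:55 vnic_id=5 vid=0 bxname=bridge1
+ *	bxguid=00:02:c9:00:00:00:00:01 eport=A1 ipv4=N/A ipv6=N/A
+ *	emac=N/A pkey=N/A parent=N/A
+ */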
vnic_warn(name, "invalid vNic MAC %s\n", cmd->c_vnic_id); + rc = -EINVAL; + goto err; + } + + /* get interface name (must) */ + if ((!dev_valid_name(cmd->c_name) && !hadmin_entry->parent_used) || + ((strlen(cmd->c_name) > VNIC_NAME_LEN) && hadmin_entry->parent_used)) { + vnic_warn(name, "invalid vNic name %s\n", cmd->c_name); + rc = -EINVAL; + goto err; + } + + strncpy(hadmin_entry->interface_name, cmd->c_name, + sizeof(hadmin_entry->interface_name)); + + /* get BX GUID, if fails, get BX NAME */ + if (get_guid(hadmin_entry->system_guid, cmd->c_bxguid)) { + strncpy(hadmin_entry->system_name, cmd->c_bxname, + sizeof(hadmin_entry->system_name)); + vnic_dbg_sysfs(name, "use BX NAME %s\n", cmd->c_bxname); + } + + /* get shared emac/ip */ + if (!get_ipv4((short unsigned int *)hadmin_entry->shared_vnic_ip, + cmd->c_ipv4)) { + /* TODO, add IPv6 support for shared vNic */ + get_mac(hadmin_entry->shared_vnic_mac, cmd->c_emac); + vnic_dbg_sysfs(name, "use shared ip/mac\n"); + } + +#ifndef VLAN_GROUP_ARRAY_LEN +#define VLAN_GROUP_ARRAY_LEN VLAN_N_VID +#endif + + /* get VLAN field (dec) */ + if ((sscanf(cmd->c_vid, "%d", &num) == 1) && + num < VLAN_GROUP_ARRAY_LEN && num >= 0) { + /* set other fields on success, skip on failure */ + vnic_dbg_sysfs(name, "vlan set 0x%x\n", hadmin_entry->vlan); + hadmin_entry->vlan_used = 1; + hadmin_entry->vlan = (u16)num; + } else if (!strcmp(cmd->c_vid, ALL_VLAN_GW_VID)) { + /* Dont set 'vlan_used'. the code counts on it being NULL for + * host admin vnics in all_vlan mode, when Vlans are used */ + hadmin_entry->vlan = 0; + hadmin_entry->all_vlan_gw = 1; + } + + /* get eport name */ + if (!strlen(cmd->c_eport)) { + vnic_warn(name, "invalid eport name %s\n", cmd->c_eport); + rc = -EINVAL; + goto err; + } + strncpy(hadmin_entry->eport_name, cmd->c_eport, + sizeof(hadmin_entry->eport_name)); + + /* set remove/add flag */ + vnic_dbg_sysfs(name, "%s hadmin vNic\n", remove ? "remove" : "add"); + hadmin_entry->remove = remove; + + /* set pkey (hex) */ + if ((sscanf(cmd->c_pkey, "%x", &num) != 1) || !num) + pkey = 0xffff; /* default */ + else + pkey = (u16)num | 0x8000; + vnic_dbg_sysfs(name, "pkey 0x%x\n", pkey); + + /* cannot sleep in this functions for child vnics flow + * (avoid schedule while atomic oops) + * TODO: check if holding start_stop_lock is needed here + */ + //mutex_lock(&port->start_stop_lock); + + list_for_each_entry(discover, &port->fip.discover_list, discover_list) { + if (discover->pkey == pkey) { + spin_lock_irq(&discover->lock); + + if (discover->flush != FIP_NO_FLUSH) { + rc = -EBUSY; + spin_unlock_irq(&discover->lock); + goto skip; + } + + /* check that this mac/vlan is not in the cache list + * (saves redundant queue_delayed_work call during + * vnic_learn_mac bursts) + */ + list_for_each_entry_reverse(hadmin_it, &discover->hadmin_cache, next) { + if (!memcmp(hadmin_entry->mac, hadmin_it->mac, ETH_ALEN) && + hadmin_entry->vlan == hadmin_it->vlan && + hadmin_entry->remove == hadmin_it->remove) { + rc = -EEXIST; + spin_unlock_irq(&discover->lock); + goto skip; + } + } + list_add_tail(&hadmin_entry->next, &discover->hadmin_cache); + /* calls fip_discover_hadmin_update() */ + queue_delayed_work(fip_wq, &discover->hadmin_update_task, HZ/10); + spin_unlock_irq(&discover->lock); + goto updated_discover; + } + } + + //mutex_unlock(&port->start_stop_lock); + vnic_dbg_sysfs(name, "Requested PKEY=0x%x is not configured\n", pkey); + goto skip; + +err: + vnic_dbg_sysfs(name, "Invalid host admin request format string. 
Request rejected\n"); +skip: + kfree(hadmin_entry); + return rc; + +updated_discover: + //mutex_unlock(&port->start_stop_lock); + return count; +} + +static ssize_t vnic_login_cmd(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + char *p = buf; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct fip_vnic_data *vnic_fip = vnic_dentry->ctx; + struct fip_hadmin_cmd *cmd; + + if (!vnic_fip || !vnic_fip->hadmined) + goto out; + + cmd = &vnic_fip->cmd; + p += _sprintf(p, buf, "name=%s mac=%s vnic_id=%s vid=%s bxname=%s bxguid=%s " + "eport=%s ipv4=%s ipv6=%s emac=%s pkey=%s parent=%s ", + cmd->c_name, cmd->c_mac, cmd->c_vnic_id, cmd->c_vid, + cmd->c_bxname, cmd->c_bxguid, cmd->c_eport, cmd->c_ipv4, + cmd->c_ipv6, cmd->c_emac, cmd->c_pkey, cmd->c_parent); + p += _sprintf(p, buf, "ib_port=%s", vnic_fip->port->name); + p += _sprintf(p, buf, "\n"); + +out: + return (ssize_t)(p - buf); +} + +int vnic_create_hadmin_dentry(struct fip_vnic_data *vnic) +{ + char name[VNIC_SYSFS_FLEN]; + + DENTRY_CREATE(vnic, &vnic->dentry, + vnic_dentry_name(name, vnic, "cmd"), + vnic_login_cmd, NULL); + return 0; +} + +void vnic_delete_hadmin_dentry(struct fip_vnic_data *vnic) +{ + if (vnic->dentry.ctx) + DENTRY_REMOVE(&vnic->dentry); +} + +int vnic_create_dentry(struct vnic_login *login) +{ + int i = 0; + char name[VNIC_SYSFS_FLEN]; + + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "info"), + vnic_login_show, NULL); + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "child"), + NULL, vnic_child_write); + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "neigh"), + vnic_neigh_show, NULL); + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "qps"), + vnic_qps_show, NULL); + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "member"), + vnic_member_show, NULL); + +#ifdef VNIC_PROFILLNG + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "prof_skb"), + vnic_dentry_prof_skb_show, NULL); +#endif + return 0; +} + +void vnic_delete_dentry(struct vnic_login *login) +{ + int i; + + for (i = 0; i < VNIC_MAX_DENTRIES; ++i) { + if (login->dentries[i].ctx) + DENTRY_REMOVE(&login->dentries[i]); + } +} + +static ssize_t port_gw_fs_show(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_port *port = vnic_dentry->ctx; + + return fip_gw_sysfs_show(port, buf); +} + + +static ssize_t port_vnics_fs_show(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_port *port = vnic_dentry->ctx; + return port_vnics_sysfs_show(port, buf); +} + +static ssize_t port_hadmin_syntax(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + /* print cmd syntax only (for usage) */ + return vnic_login_cmd_set(buf, NULL); +} + +static ssize_t port_hadmin_add_write(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, + const char *buf, size_t count) +{ + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_port *port = vnic_dentry->ctx; + + return fip_hadmin_sysfs_update(port, buf, count, 0); +} + +static ssize_t port_hadmin_del_write(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, + const 
char *buf, size_t count)
+{
+	struct vnic_sysfs_attr *vnic_dentry =
+		container_of(attr, struct vnic_sysfs_attr, dentry);
+	struct vnic_port *port = vnic_dentry->ctx;
+
+	return fip_hadmin_sysfs_update(port, buf, count, 1);
+}
+
+int port_fs_init(struct vnic_port *port)
+{
+	int i = 0;
+	char name[VNIC_SYSFS_FLEN];
+
+	DENTRY_CREATE(port, &port->dentries[i++],
+		      port_dentry_name(name, port, "host_add"),
+		      port_hadmin_syntax, port_hadmin_add_write);
+
+	DENTRY_CREATE(port, &port->dentries[i++],
+		      port_dentry_name(name, port, "host_del"),
+		      port_hadmin_syntax, port_hadmin_del_write);
+
+	DENTRY_CREATE(port, &port->dentries[i++],
+		      port_dentry_name(name, port, "gws"),
+		      port_gw_fs_show, NULL);
+
+	DENTRY_CREATE(port, &port->dentries[i++],
+		      port_dentry_name(name, port, "vnics"),
+		      port_vnics_fs_show, NULL);
+	return 0;
+}
+
+void port_fs_exit(struct vnic_port *port)
+{
+	int i;
+
+	for (i = 0; i < VNIC_MAX_DENTRIES; ++i) {
+		if (port->dentries[i].ctx)
+			DENTRY_REMOVE(&port->dentries[i]);
+	}
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ib.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ib.c
new file mode 100644
index 0000000000000..ba6e93bb85ce9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ib.c
@@ -0,0 +1,1649 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <linux/if_vlan.h>
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+int vnic_post_recv(struct vnic_rx_ring *ring, u64 wr_id)
+{
+	struct ib_recv_wr *bad_wr;
+	int i, rc;
+
+	ring->wr.wr_id = wr_id;
+
+	for (i = 0; i < ring->num_frags; i++)
+		ring->sge[i].addr = ring->rx_info[wr_id].dma_addr[i];
+
+	rc = ib_post_srq_recv(ring->srq, &ring->wr, &bad_wr);
+	if (unlikely(rc)) {
+		/* we will not use a lock here. In the worst case we will have
+		 * an incorrect value of need_refill. Not a biggie
+		 */
+
+		/*ring->rx_info[wr_id].info = VNIC_FRAG_NOT_POSTED;
+		ring->need_refill = 1;
+		*/
+		vnic_dbg_data(ring->port->name, "receive failed for buf %llu (%d)\n",
+			      wr_id, rc);
+	}
+
+	return rc;
+}
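+/*
+ * RX posting flow, as a sketch (the exact bring-up order is an assumption
+ * based on the declarations in vnic_data.h, not taken verbatim from the
+ * driver): a ring is created with vnic_create_rx_ring(), its buffers are
+ * allocated and DMA-mapped, and then every WQE slot is posted once via
+ * vnic_post_recvs() below; afterwards each completion reposts its own slot
+ * with vnic_post_recv(ring, wr_id).
+ */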
+static void vnic_dealloc_tx_skb(struct vnic_login *login, unsigned cq_index,
+				u64 wr_id)
+{
+	struct vnic_tx_res *tx_res = &login->tx_res[cq_index];
+	int is_inline = !!(wr_id & VNIC_SEND_INLINE_FLAG);
+	struct sk_buff *skb;
+	u64 *mapping;
+	int i, off = 0;
+
+	wr_id &= ~VNIC_SEND_INLINE_FLAG;
+	skb = tx_res->tx_ring[wr_id].skb;
+	ASSERT(skb);
+	mapping = tx_res->tx_ring[wr_id].mapping;
+
+	if (!is_inline) {
+		if (!vnic_encap_headroom && !skb_is_gso(skb)) {
+			ib_dma_unmap_single(login->port->dev->ca, mapping[off],
+					    VNIC_ENCAP_LEN, DMA_TO_DEVICE);
+			off++;
+		}
+		if (skb_headlen(skb)) {
+			ib_dma_unmap_single(login->port->dev->ca, mapping[off],
+					    skb_headlen(skb), DMA_TO_DEVICE);
+			off++;
+		}
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			ib_dma_unmap_page(login->port->dev->ca,
+					  mapping[i + off], frag->size,
+					  DMA_TO_DEVICE);
+		}
+	}
+
+	/* dealloc skb */
+	dev_kfree_skb_any(skb);
+	tx_res->tx_ring[wr_id].skb = NULL;
+}
+
+static void vnic_ib_handle_tx_wc(struct vnic_login *login,
+				 int tx_res_index, struct ib_wc *wc)
+{
+	struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+	u64 wr_id = wc->wr_id & ~VNIC_SEND_INLINE_FLAG;
+
+	vnic_dbg_data(login->name, "send completion: wr_id %llu, status: %d "
+		      "[head %d - tail %d]\n", wr_id, wc->status,
+		      tx_res->tx_head, tx_res->tx_tail);
+
+	ASSERT(wr_id < vnic_tx_rings_len);
+	vnic_dealloc_tx_skb(login, tx_res_index, wc->wr_id);
+
+	++tx_res->tx_tail;
+	--tx_res->tx_outstanding;
+
+	if (unlikely(wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR)) {
+		vnic_warn(login->name, "failed send event "
+			  "(status %d, wr_id %llu, vend_err 0x%x)\n",
+			  wc->status, wr_id, wc->vendor_err);
+		vnic_warn(login->name, "TX CQE error, queueing rings restart\n");
+		if (!login->queue_stopped)
+			queue_delayed_work(login_wq, &login->restart_task, HZ / 100);
+	}
+}
+
+int vnic_post_recvs(struct vnic_rx_ring *ring)
+{
+	int i, rc;
+
+	for (i = 0; i < ring->size; i++) {
+		rc = vnic_post_recv(ring, i);
+		if (rc) {
+			vnic_err(ring->port->name, "Failed post receive %d\n", rc);
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
+static int vnic_vlan_is_valid(struct vnic_login *login,
+			      struct vlan_ethhdr *veth)
+{
+	ASSERT(veth->h_vlan_proto == htons(ETH_P_8021Q));
+	if ((be16_to_cpu(veth->h_vlan_TCI) & 0xfff) !=
+	    be16_to_cpu(login->vid)) {
+		vnic_dbg_data(login->name, "invalid vlan, ingress vid "
+			      "0x%x, login: vid 0x%x vlan_used %d\n",
+			      be16_to_cpu(veth->h_vlan_TCI),
+			      be16_to_cpu(login->vid),
+			      login->vlan_used);
+		return 0;
+	}
+
+	return 1;
+}
+
+/* If a vlan tag should exist in the eth_hdr - validate it.
+   is_vlan_proto is set if the vlan protocol is present in the eth header.
+   Return values: 0 on success, 1 on error.
+   For an all-VLAN gateway (promisc vlan):
+	0 - there is no vlan, or there is a vlan and it is valid.
+	1 - a vlan is present and it is not valid.
+   For all other vNics:
+	0 - there shouldn't be a vlan, or a vlan should be present and is valid.
+	1 - a vlan should be present and it is not, or it is not valid.
*/ +static int validate_vnic_vlan(struct vnic_login *login, + struct vlan_ethhdr *veth, + int *is_vlan_proto) +{ + int is_vlan = !!(veth->h_vlan_proto == htons(ETH_P_8021Q)); + + *is_vlan_proto = is_vlan; + + if (login->all_vlan_gw) + return 0; + + if (VNIC_VLAN_ENABLED(login) && login->vid && !is_vlan) { + vnic_dbg_data(login->name, "missing vlan tag\n"); + VNIC_STATS_INC(login->port_stats.vlan_err); + return 1; + } + + if (is_vlan && unlikely(!vnic_vlan_is_valid(login, veth))) { + vnic_dbg_data(login->name, "invalid vlan tag\n"); + VNIC_STATS_INC(login->port_stats.vlan_err); + return 1; + } + + return 0; +} + +static void vnic_ib_handle_rx_wc_linear(struct vnic_login *login, + struct ib_wc *wc, int rx_ring_index) +{ + struct vnic_rx_ring *ring = login->port->rx_ring[rx_ring_index]; + struct eoibhdr *eoib_hdr; + struct sk_buff *skb; + struct vlan_ethhdr *veth; + int rc, wr_id = wc->wr_id, checksum_ok, ip_summed, + buf_size = VNIC_BUF_SIZE(ring->port); + int is_vlan_proto; + u64 mapping; + u16 eth_type; + u8 *va, *eth_hdr; + + spin_lock_bh(&ring->lock); + ASSERT(wr_id < ring->size); + + skb = ring->rx_info[wr_id].skb; + mapping = ring->rx_info[wr_id].dma_addr[0]; + + /* termination with error */ + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if(wc->status != IB_WC_REM_ABORT_ERR && + wc->status != IB_WC_LOC_LEN_ERR) { + vnic_dbg_data(login->name, "RX CQE error " + "(status %d, vend_err 0x%x), " + "queueing rings restart\n", + wc->status, wc->vendor_err); + if (!login->queue_stopped) + queue_delayed_work(login_wq, + &login->restart_task, + HZ / 10); + } + goto repost; + } + + ASSERT(skb); + ASSERT(mapping); + + /* If we can't allocate a new RX buffer, dump + * this packet and reuse the old buffer. + */ + if (unlikely(!vnic_alloc_rx_skb(ring, wr_id, GFP_ATOMIC))) { + VNIC_STATS_DO_INC(login->stats.rx_dropped); + goto repost; + } + + ib_dma_unmap_single(login->port->dev->ca, mapping, + buf_size, DMA_FROM_DEVICE); + skb_put(skb, wc->byte_len); + skb_pull(skb, IB_GRH_BYTES); + + /* check EoIB header signature and version */ + va = skb->data; + eoib_hdr = (struct eoibhdr *)va; + if (unlikely(VNIC_EOIB_HDR_GET_SIG(eoib_hdr) != VNIC_EOIB_HDR_SIG || + VNIC_EOIB_HDR_GET_VER(eoib_hdr) != VNIC_EOIB_HDR_VER)) { + vnic_dbg_data(login->name, "bad sig (0x%x) or ver (0x%x)\n", + VNIC_EOIB_HDR_GET_SIG(eoib_hdr), + VNIC_EOIB_HDR_GET_VER(eoib_hdr)); + VNIC_STATS_INC(login->port_stats.sig_ver_err); + goto repost; + } + + /* check EoIB CSUM */ + checksum_ok = login->rx_csum && VNIC_CSUM_OK(eoib_hdr); + ip_summed = checksum_ok ? 
CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + if (likely((checksum_ok))) + VNIC_STATS_INC(login->port_stats.rx_chksum_good); + else + VNIC_STATS_INC(login->port_stats.rx_chksum_none); + + /* Ethernet header */ + skb_pull(skb, VNIC_ENCAP_LEN); + va += VNIC_ENCAP_LEN; + veth = (struct vlan_ethhdr *)(va); + + eth_hdr = va; + eth_type = be16_to_cpu(((struct ethhdr *)(va))->h_proto); + + /* validate VLAN tag, strip it if valid */ + if (validate_vnic_vlan(login, veth, &is_vlan_proto)) + goto repost; + + /* for all_vlan_gw - we don't strip the packet but send it as is*/ + if (!login->all_vlan_gw && is_vlan_proto) { + eth_type = be16_to_cpu(veth->h_vlan_encapsulated_proto); + eth_hdr += VLAN_HLEN; + skb_pull(skb, VLAN_HLEN); + memmove(eth_hdr, va, ETH_ALEN * 2); + } + + /* update skb fields, keep this before LRO/GRO funcs */ + skb->dev = login->dev; + skb->protocol = eth_type_trans(skb, skb->dev); + skb->ip_summed = ip_summed; + +#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO) + if ((login->dev->features & NETIF_F_GRO) && checksum_ok) { + struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index]; + int ret; + + ret = napi_gro_receive(&rx_res->napi, skb); + if (ret == GRO_HELD) + VNIC_STATS_INC(login->port_stats.gro_held); + else if (ret == GRO_NORMAL) + VNIC_STATS_INC(login->port_stats.gro_normal); + else if (ret == GRO_MERGED || ret == GRO_MERGED_FREE) + VNIC_STATS_INC(login->port_stats.gro_merged); + else + VNIC_STATS_INC(login->port_stats.gro_drop); + + goto rx_repost; + } +#elif defined(NETIF_F_LRO) + if (login->dev->features & NETIF_F_LRO && checksum_ok) { + struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index]; + + /* processed for LRO */ + lro_receive_skb(&rx_res->lro, skb, NULL); + VNIC_STATS_INC(login->port_stats.lro_aggregated); + + goto rx_repost; + } +#endif + + rc = vnic_rx(login, skb, wc); + if (unlikely(rc)) { + vnic_dbg_data(login->name, "vnic_rx failed, rc %d\n", rc); + goto repost; + } + +rx_repost: + VNIC_STATS_INC(ring->stats.rx_packets); + VNIC_STATS_ADD(ring->stats.rx_bytes, wc->byte_len); + + VNIC_STATS_DO_INC(login->stats.rx_packets); + VNIC_STATS_DO_ADD(login->stats.rx_bytes, wc->byte_len); + + if (unlikely(vnic_post_recv(ring, wr_id))) + vnic_dbg_data(login->name, "failed to post RX WQE id %d\n", + (int)wr_id); + spin_unlock_bh(&ring->lock); + + return; + +repost: + login->dev->last_rx = jiffies; + if (unlikely(vnic_post_recv(ring, wr_id))) + vnic_dbg_data(login->name, "failed to post RX WQE id %d\n", + (int)wr_id); + + VNIC_STATS_INC(ring->stats.rx_dropped); + VNIC_STATS_DO_INC(login->stats.rx_dropped); + spin_unlock_bh(&ring->lock); + + return; +} + +static void vnic_ib_handle_rx_wc(struct vnic_login *login, + struct ib_wc *wc, int rx_ring_index) +{ + struct vnic_rx_ring *ring = login->port->rx_ring[rx_ring_index]; + struct ib_device *ib_device = login->port->dev->ca; + struct vnic_frag_data *frags_entry; + struct skb_frag_struct frags[VNIC_MAX_RX_FRAGS] = {}; + struct eoibhdr *eoib_hdr; + struct vlan_ethhdr *veth; + struct iphdr *ip_hdr; + u64 wr_id = wc->wr_id; + u16 eth_type; + u8 *va, *eth_hdr, ip_type; + int rc, checksum_ok, ip_offset = ETH_HLEN, + packet_length = wc->byte_len - VNIC_EOIB_HDR_SIZE, + page_offset = VNIC_EOIB_HDR_SIZE, ip_summed; + int is_vlan_proto; + + spin_lock_bh(&ring->lock); + ASSERT(wr_id < ring->size); + + /* termination with error */ + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if(wc->status != IB_WC_REM_ABORT_ERR && + wc->status != IB_WC_LOC_LEN_ERR) { + vnic_dbg_data(login->name, "RX CQE error " + "(status %d, vend_err 0x%x), " + 
"queueing rings restart\n", + wc->status, wc->vendor_err); + if (!login->queue_stopped) + queue_delayed_work(login_wq, &login->restart_task, HZ / 10); + goto out; + } + goto drop_repost; + } + + frags_entry = &ring->rx_info[wr_id]; + + /* ensure cache coherency for packet headers and get vq */ + ib_dma_sync_single_for_cpu(ib_device, + ring->rx_info[wr_id].dma_addr[0] + IB_GRH_BYTES, + MAX_HEADER_SIZE, DMA_FROM_DEVICE); + + va = page_address(ring->rx_info[wr_id].frags[0].page.p) + + ring->rx_info[wr_id].frags[0].page_offset + IB_GRH_BYTES; + + /* check EoIB header signature and version */ + eoib_hdr = (struct eoibhdr *)va; + if (unlikely(VNIC_EOIB_HDR_GET_SIG(eoib_hdr) != VNIC_EOIB_HDR_SIG || + VNIC_EOIB_HDR_GET_VER(eoib_hdr) != VNIC_EOIB_HDR_VER)) { + vnic_dbg_data(login->name, "bad sig (0x%x) or ver (0x%x)\n", + VNIC_EOIB_HDR_GET_SIG(eoib_hdr), + VNIC_EOIB_HDR_GET_VER(eoib_hdr)); + VNIC_STATS_INC(login->port_stats.sig_ver_err); + goto unmap_repost; + } + + /* check EoIB CSUM */ + checksum_ok = login->rx_csum && VNIC_CSUM_OK(eoib_hdr); + ip_summed = checksum_ok ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + if (likely((checksum_ok))) + VNIC_STATS_INC(login->port_stats.rx_chksum_good); + else + VNIC_STATS_INC(login->port_stats.rx_chksum_none); + + /* Ethernet header */ + va += VNIC_ENCAP_LEN; + veth = (struct vlan_ethhdr *)(va); + + eth_hdr = va; + eth_type = be16_to_cpu(((struct ethhdr *)(va))->h_proto); + + /* validate VLAN tag, strip it if valid + * - if VID is set and !0, then VLAN tag must exist + * note: VID zero can accept untagged packets + * - if ingress VID exists: validate it, and update the packet + * note: rx user prio is ignored + * - else; it's valid untagged packet + */ + if (validate_vnic_vlan(login, veth, &is_vlan_proto)) + goto unmap_repost; + + /* for all_vlan_gw - we don't strip the packet but send it as is*/ + if (!login->all_vlan_gw && is_vlan_proto) { + ip_offset += VLAN_HLEN; + page_offset += VLAN_HLEN; + packet_length -= VLAN_HLEN; + eth_hdr += VLAN_HLEN; + eth_type = be16_to_cpu(veth->h_vlan_encapsulated_proto); + memmove(eth_hdr, va, ETH_ALEN * 2); + } + + /* IP header */ + va += ip_offset; + ip_hdr = (struct iphdr *)va; + ip_type = ip_hdr->protocol; + + ib_dma_sync_single_for_device(ib_device, + frags_entry->dma_addr[0] + IB_GRH_BYTES, + MAX_HEADER_SIZE, DMA_FROM_DEVICE); + +#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO) + if ((login->dev->features & NETIF_F_GRO) && checksum_ok) { + struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index]; + struct sk_buff *gro_skb; + struct skb_frag_struct *gro_frags; + int nr_frags, ret; + + gro_skb = napi_get_frags(&rx_res->napi); + if (!gro_skb) + goto drop_repost; + + gro_frags = skb_shinfo(gro_skb)->frags; + nr_frags = vnic_unmap_and_replace_rx(ring, ib_device, + gro_frags, wr_id, + wc->byte_len); + if (unlikely(!nr_frags)) + goto drop_repost; + + /* disregard GRH and eoib headers */ + gro_frags[0].page_offset += page_offset; + gro_frags[0].size -= page_offset; + + skb_shinfo(gro_skb)->nr_frags = nr_frags; + gro_skb->len = packet_length; + gro_skb->data_len = packet_length; + gro_skb->truesize += packet_length; + gro_skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* processed for GRO */ + skb_record_rx_queue(gro_skb, rx_res->index); + ret = napi_gro_frags(&rx_res->napi); + if (ret == GRO_HELD) + VNIC_STATS_INC(login->port_stats.gro_held); + else if (ret == GRO_NORMAL) + VNIC_STATS_INC(login->port_stats.gro_normal); + else if (ret == GRO_MERGED || ret == GRO_MERGED_FREE) + VNIC_STATS_INC(login->port_stats.gro_merged); + else + 
VNIC_STATS_INC(login->port_stats.gro_drop); + + goto rx_repost; + } +#elif defined(NETIF_F_LRO) + if (login->dev->features & NETIF_F_LRO && checksum_ok && + eth_type == ETH_P_IP && ip_type == IPPROTO_TCP) { + struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index]; + int nr_frags; + + /* unmap the needed fragments and reallocate them. + * Fragments that were not used will be reused as is. */ + nr_frags = vnic_unmap_and_replace_rx(ring, ib_device, frags, + wr_id, wc->byte_len); + if (unlikely(!nr_frags)) + goto drop_repost; + + /* disregard GRH and eoib headers */ + frags[0].page_offset += page_offset; + frags[0].size -= page_offset; + + /* processed for LRO */ +#if defined(CONFIG_COMPAT_LRO_ENABLED) + lro_receive_frags(&rx_res->lro, frags, packet_length, + packet_length, NULL, 0); +#endif + VNIC_STATS_INC(login->port_stats.lro_aggregated); + + goto rx_repost; + } +#endif + + rc = vnic_rx_skb(login, ring, wc, ip_summed, eth_hdr); + if (unlikely(rc)) { + vnic_dbg_data(login->name, "vnic_rx_skb failed, rc %d\n", rc); + goto drop_repost; + } + +rx_repost: + /* must hold lock when touching login->stats so the stats + * task won't read invalid values + */ + spin_lock(&login->stats_lock); + VNIC_STATS_INC(ring->stats.rx_packets); + VNIC_STATS_ADD(ring->stats.rx_bytes, packet_length); + + VNIC_STATS_DO_INC(login->stats.rx_packets); + VNIC_STATS_DO_ADD(login->stats.rx_bytes, packet_length); + spin_unlock(&login->stats_lock); + + login->dev->last_rx = jiffies; + if (vnic_post_recv(ring, wr_id)) + vnic_dbg_data(login->name, "vnic_post_recv failed, " + "wr_id %llu\n", wr_id); + spin_unlock_bh(&ring->lock); + + return; + +unmap_repost: + /* ignore rc of vnic_unmap_and_replace_rx() */ + vnic_unmap_and_replace_rx(ring, ib_device, frags, + wr_id, wc->byte_len); +drop_repost: + VNIC_STATS_INC(ring->stats.rx_dropped); + + spin_lock(&login->stats_lock); + VNIC_STATS_DO_INC(login->stats.rx_dropped); + spin_unlock(&login->stats_lock); + + if (vnic_post_recv(ring, wr_id)) + vnic_dbg_data(login->name, "vnic_post_recv failed, " + "wr_id %llu\n", wr_id); +out: + spin_unlock_bh(&ring->lock); + return; +} + +static inline void vnic_drain_tx_cq(struct vnic_login *login, + int tx_res_index) +{ + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + int n, i; + + do { + n = ib_poll_cq(tx_res->cq, VNIC_MAX_TX_CQE, tx_res->send_wc); + for (i = 0; i < n; ++i) + vnic_ib_handle_tx_wc(login, tx_res_index, + tx_res->send_wc + i); + } while (n == VNIC_MAX_TX_CQE); +} + +static void vnic_drain_arm_tx_cq(struct vnic_login *login, int tx_res_index) +{ + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + + ASSERT(login); + ASSERT(login->dev); + + /* drain CQ then [arm] it */ + vnic_drain_tx_cq(login, tx_res_index); + + /* in tx interrupt mode, arm TX CQ after every interrupt */ + if (!vnic_tx_polling && ib_req_notify_cq(tx_res->cq, IB_CQ_NEXT_COMP)) + vnic_dbg(login->name, "ib_req_notify_cq failed\n"); + else if (unlikely(VNIC_TXQ_STOPPED(tx_res) && + test_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state))) { + if ((tx_res->tx_outstanding <= vnic_tx_rings_len >> 1)) { + if (!test_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state)) { + VNIC_STATS_DO_INC(login->port_stats.wake_queue); + VNIC_TXQ_WAKE(tx_res); + } + /* make sure that after arming the cq, there is no access to + * login fields to avoid conflict with cq event handler. 
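+ * (once the CQ is armed, vnic_comp_handler_tx() may run concurrently + * on another CPU and touch the same tx_res) 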
+ * i.e., ib_req_notify_cq() must come at the end of this func + */ + } else if (ib_req_notify_cq(tx_res->cq, IB_CQ_NEXT_COMP)) { + vnic_dbg(login->name, "ib_req_notify_cq failed\n"); + /* TODO: have to reset the device here */ + } + } +} + +static inline void vnic_comp_handler_tx(struct ib_cq *cq, void *ctx) +{ + struct vnic_tx_res *tx_res = ctx; + + if (!vnic_tx_polling) { + spin_lock(&tx_res->lock); + vnic_drain_arm_tx_cq(tx_res->login, tx_res->index); + spin_unlock(&tx_res->lock); + } else + vnic_drain_arm_tx_cq(tx_res->login, tx_res->index); + +} + +static int vnic_drain_rx_cq(struct vnic_login *login, int max_poll, + int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + int polled, i; + + ASSERT(max_poll <= vnic_napi_weight); + polled = ib_poll_cq(rx_res->cq, max_poll, rx_res->recv_wc); + + for (i = 0; vnic_rx_linear && i < polled; ++i) + vnic_ib_handle_rx_wc_linear(login, &rx_res->recv_wc[i], + rx_res_index); + + for (i = 0; !vnic_rx_linear && i < polled; ++i) + vnic_ib_handle_rx_wc(login, &rx_res->recv_wc[i], + rx_res_index); + +#ifdef NETIF_F_LRO + /* Done CQ handling: flush all LRO sessions unconditionally */ + if (login->dev->features & NETIF_F_LRO) { + VNIC_STATS_INC(login->port_stats.lro_flushed); + lro_flush_all(&rx_res->lro); + } +#endif + + return polled; +} + +/* RX CQ polling - called by NAPI */ +#ifndef _BP_NAPI_POLL +int vnic_poll_cq_rx(struct napi_struct *napi, int budget) +{ + struct vnic_rx_res *rx_res = container_of(napi, struct vnic_rx_res, napi); + struct vnic_login *login = rx_res->login; + struct ib_cq *cq_rx = rx_res->cq; + int rx_res_index = rx_res->index, polled; + + /* shouldn't happen, since when stopped=1 NAPI is disabled */ + if (unlikely(rx_res->stopped)) { +#ifndef _BP_NAPI_NETIFRX + napi_complete(napi); +#else + netif_rx_complete(login->dev, napi); +#endif + return 0; + } + + polled = vnic_drain_rx_cq(login, min(budget, VNIC_MAX_RX_CQE), rx_res_index); + vnic_dbg_data(login->name, "after vnic_drain_rx_cq budget %d," + " done %d, index %d\n", budget, polled, rx_res_index); + + /* If we used up all the quota - we're probably not done yet... 
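+ NAPI contract: returning the full budget keeps this ring on the poll + list; we complete and re-arm the CQ only when it was not exhausted. 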
*/ + ASSERT(polled <= budget); + if (polled < budget) { + /* ATTENTION: ARM CQ must come after napi_complete() */ +#ifndef _BP_NAPI_NETIFRX + napi_complete(napi); +#else + netif_rx_complete(login->dev, napi); +#endif + /* Eventually calls vnic_comp_handler_rx() */ + if (ib_req_notify_cq(cq_rx, IB_CQ_NEXT_COMP)) + vnic_err(login->name, "ib_req_notify_cq failed\n"); + } + + return polled; +} +#else +int vnic_poll_cq_rx(struct net_device *poll_dev, int *budget) +{ + struct vnic_rx_res *rx_res = poll_dev->priv; + struct vnic_login *login = rx_res->login; + struct ib_cq *cq_rx = rx_res->cq; + int rx_res_index = rx_res->index, polled, max_poll = min(*budget, poll_dev->quota); + + /* shouldn't happen, since when stopped=1 NAPI is disabled */ + if (unlikely(rx_res->stopped)) { + netif_rx_complete(poll_dev); + return 0; + } + + while (max_poll >= 0) { + polled = vnic_drain_rx_cq(login, min(max_poll, VNIC_MAX_RX_CQE), rx_res_index); + if (polled <= 0) + break; + else { + poll_dev->quota -= polled; + *budget -= polled; + } + max_poll -= polled; + } + + if (!max_poll) + return 1; + + netif_rx_complete(poll_dev); + ib_req_notify_cq(cq_rx, IB_CQ_NEXT_COMP); + + return 0; +} +#endif + +static void vnic_comp_handler_rx(struct ib_cq *cq, void *rx_res_ptr) +{ + struct vnic_rx_res *rx_res = rx_res_ptr; + struct vnic_login *login = rx_res->login; + + ASSERT(rx_res->cq == cq); + ASSERT(login->dev); + + /* if this happens, we will re-arm later in vnic_open */ + if (unlikely(rx_res->stopped)) + return; + +#ifndef _BP_NAPI_POLL + /* calls vnic_poll_cq_rx() */ +#ifndef _BP_NAPI_NETIFRX + napi_schedule(&rx_res->napi); +#else + netif_rx_schedule(login->dev, &rx_res->napi); +#endif +#else + netif_rx_schedule(rx_res->poll_dev); +#endif /* _BP_NAPI_POLL */ + +} + +static void vnic_stop_qp(struct vnic_login *login, int qp_index) +{ + struct ib_qp_attr qp_attr = { .qp_state = IB_QPS_ERR }; + struct vnic_qp_res *qp_res = &login->qp_res[qp_index]; + struct vnic_rx_res *rx_res = &login->rx_res[qp_res->rx_index]; + struct vnic_tx_res *tx_res = &login->tx_res[qp_res->tx_index]; + struct vnic_rx_ring *ring = login->port->rx_ring[rx_res->index]; + unsigned long flags; + int polled, attr_mask, rc, i; + + /* move QP to ERR, wait for last WQE async event to drain the SRQ */ + rc = ib_modify_qp(qp_res->qp, &qp_attr, IB_QP_STATE); + if (rc) { + /* calls vnic_qp_event_handler() */ + vnic_warn(login->name, "failed to modify QP 0x%x to ERR state" + " (err = %d)\n", qp_res->qp->qp_num, rc); + /* continue anyway, but don't wait for completion */ + } else { + wait_for_completion(&qp_res->last_wqe_complete); + } + + /* === at this point, no NAPI/RX comps === */ + + /* drain TX CQ before moving to RESET, must hold tx_res->lock to + * protect from vnic_comp_handler_tx() after this call, all CQEs + * are polled (either by this direct call, or by CQ handlers) + */ + spin_lock_irqsave(&tx_res->lock, flags); + vnic_drain_tx_cq(login, tx_res->index); + spin_unlock_irqrestore(&tx_res->lock, flags); + + /* drain RX CQ before moving to RESET, drop and re-post all comps */ + spin_lock_bh(&ring->lock); + do { + polled = ib_poll_cq(rx_res->cq, VNIC_MAX_RX_CQE, rx_res->recv_wc); + for (i = 0; i < polled; ++i) + if (vnic_post_recv(ring, rx_res->recv_wc[i].wr_id)) + vnic_dbg_data(login->name, "vnic_post_recv failed, " + "wr_id %llu\n", rx_res->recv_wc[i].wr_id); + } while (polled == VNIC_MAX_RX_CQE); + spin_unlock_bh(&ring->lock); + + /* move QP to RESET */ + qp_attr.qp_state = IB_QPS_RESET; + rc = ib_modify_qp(qp_res->qp, &qp_attr, IB_QP_STATE); + if (rc) + 
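+ /* non-fatal: warn and keep tearing the QP down */ 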
vnic_warn(login->name, "failed to modify QP 0x%x to RESET" + " state (err = %d)\n", qp_res->qp->qp_num, rc); + + /* move QP to INIT to avoid multicast qp cache misses */ + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qkey = login->qkey; + qp_attr.port_num = login->port->num; + qp_attr.pkey_index = login->pkey_index; + attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE; + + rc = ib_modify_qp(qp_res->qp, &qp_attr, attr_mask); + if (rc) + vnic_warn(login->name, "failed to modify QP 0x%x to INIT state" + " (err = %d)\n", qp_res->qp->qp_num, rc); +} + +int vnic_ib_stop(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + struct vnic_tx_res *tx_res; + unsigned long begin = jiffies; + int wr_id, i; + + /* flush tx and rx comps */ + for (i = 0; i < login->qps_num; ++i) + vnic_stop_qp(login, i); + + /* check any pending tx comps */ + for (i = 0; i < login->tx_rings_num; i++) { + tx_res = &login->tx_res[i]; + /* if tx_outstanding is non-zero, give it a chance to complete */ + if (!tx_res->tx_outstanding) + continue; + msleep(10); + + /* else, drain tx cq. This indicates that something is + * wrong, thus we won't protect vnic_comp_handler_tx() here + */ + while (tx_res->tx_outstanding && + time_before(jiffies, begin + 5 * HZ)) { + vnic_drain_tx_cq(login, i); + msleep(1); + } + + /* if they're still not complete, force skb deallocation */ + if (!tx_res->tx_outstanding) + continue; + vnic_warn(login->name, "timing out: %d sends not completed\n", + tx_res->tx_outstanding); + while (tx_res->tx_outstanding) { + wr_id = tx_res->tx_tail & (vnic_tx_rings_len - 1); + vnic_dealloc_tx_skb(login, i, wr_id); + ++tx_res->tx_tail; + --tx_res->tx_outstanding; + } + } + + return 0; +} + +int vnic_ib_open(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int i; + + /* move QP to RTS and attach to bcast group */ + for (i = 0; i < login->qps_num; ++i) { + if (vnic_init_qp(login, i)) { + vnic_err(login->name, "vnic_init_qp failed\n"); + goto stop_qps; + } + } + + return 0; + +stop_qps: + for (--i; i >= 0; --i) + vnic_stop_qp(login, i); + + return -EINVAL; +} + +void vnic_destroy_qp(struct vnic_login *login, int qp_index) +{ + struct ib_qp *qp = login->qp_res[qp_index].qp; + + if (!qp) + return; + if (ib_destroy_qp(qp)) + vnic_warn(login->name, "ib_destroy_qp failed\n"); + return; +} + +void vnic_qp_to_reset(struct vnic_login *login, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int rc; + + qp_attr.qp_state = IB_QPS_RESET; + rc = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); + if (rc) + vnic_err(login->name, "ib_modify_qp 0x%06x to RESET err %d\n", + qp->qp_num, rc); +} + +int vnic_qp_to_init(struct vnic_login *login, struct ib_qp *qp, u32 qkey) +{ + struct ib_qp_attr qp_attr; + int attr_mask, rc; + + /* move QP to INIT */ + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qkey = qkey; + qp_attr.port_num = login->port->num; + /* pkey will be overwritten later by login->pkey_index */ + qp_attr.pkey_index = login->port->pkey_index; + attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE; + + rc = ib_modify_qp(qp, &qp_attr, attr_mask); + if (rc) { + vnic_err(login->name, "ib_modify_qp 0x%06x to INIT err %d\n", + qp->qp_num, rc); + goto out_qp_reset; + } + + return 0; + +out_qp_reset: + vnic_qp_to_reset(login, qp); + return rc; +} + +int vnic_init_qp(struct vnic_login *login, int qp_index) +{ + struct ib_qp_attr qp_attr; + int attr_mask, rc, rc1; + struct ib_qp *qp = login->qp_res[qp_index].qp; + 
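+ /* standard UD QP bring-up: RESET -> INIT (qkey/port/pkey) -> RTR -> + * RTS (sq_psn); on any failure the QP is moved back to RESET + */ 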
init_completion(&login->qp_res[qp_index].last_wqe_complete); + /* move QP to INIT */ + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qkey = login->qkey; + qp_attr.port_num = login->port->num; + qp_attr.pkey_index = login->pkey_index; + attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE; + + rc = ib_modify_qp(qp, &qp_attr, attr_mask); + if (rc) { + vnic_err(login->name, "ib_modify_qp to INIT err %d\n", rc); + goto out_qp_reset; + } + + /* move QP to RTR */ + qp_attr.qp_state = IB_QPS_RTR; + attr_mask &= ~IB_QP_PORT; + rc = ib_modify_qp(qp, &qp_attr, attr_mask); + if (rc) { + vnic_err(login->name, "ib_modify_qp to RTR err %d\n", rc); + goto out_qp_reset; + } + + /* move QP to RTS */ + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.sq_psn = 0; + attr_mask |= IB_QP_SQ_PSN; + attr_mask &= ~IB_QP_PKEY_INDEX; + rc = ib_modify_qp(qp, &qp_attr, attr_mask); + if (rc) { + vnic_err(login->name, "ib_modify_qp to RTS err, rc %d\n", rc); + goto out_qp_reset; + } + + /* What a Good QP! */ + vnic_dbg_data(login->name, "qpn 0x%06x moved to RTS\n", + qp->qp_num); + + return 0; + +out_qp_reset: + qp_attr.qp_state = IB_QPS_RESET; + rc1 = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); + if (rc1) + vnic_err(login->name, "ib_modify_qp to RESET err %d\n", rc1); + + return rc; +} + +static void vnic_qp_event_handler(struct ib_event *event, void *ctx) +{ + struct vnic_qp_res *qp_res = ctx; + struct vnic_login *login = qp_res->login; + + ASSERT(login); + vnic_dbg_data(login->name, "[%s] qpn %d got event %d\n", + event->device->name, event->element.qp->qp_num, + event->event); + if (event->event == IB_EVENT_QP_LAST_WQE_REACHED) + complete(&qp_res->last_wqe_complete); +} + +void vnic_destroy_rx_res(struct vnic_login *login, int rx_res_index) +{ + struct ib_cq *cq = login->rx_res[rx_res_index].cq; + int rc = 0; + + if (cq) + rc = ib_destroy_cq(cq); + if (rc) + vnic_warn(login->name, "ib_destroy_cq() index %d failed\n", + rx_res_index); +} + +void vnic_destroy_tx_res(struct vnic_login *login, int tx_res_index) +{ + struct ib_cq *cq = login->tx_res[tx_res_index].cq; + struct vnic_tx_buf *tx_ring = login->tx_res[tx_res_index].tx_ring; + int rc = 0; + + if (tx_ring) + vfree(tx_ring); + if (cq) + rc = ib_destroy_cq(cq); + if (rc) + vnic_warn(login->name, "ib_destroy_cq() index %d failed\n", + tx_res_index); +} + +#if 0 +static inline int get_comp_vector(int index, struct vnic_port *port) +{ + int vector; + int num_cpus = roundup_pow_of_two(num_online_cpus()); + int port_for_eq; + + port_for_eq = (((index / port->dev->mdev->eq_per_port) % + port->dev->mdev->dev->caps.num_ports) + 1); + vector = (index % port->dev->mdev->eq_per_port) + + (port_for_eq * num_cpus); + + return vector; +} +#endif + +int vnic_create_rx_res(struct vnic_login *login, int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + int comp_vector = rx_res_index % login->port->dev->ca->num_comp_vectors; + struct ib_cq *cq = + ib_create_cq(login->port->dev->ca, + vnic_comp_handler_rx, + NULL, &login->rx_res[rx_res_index], + vnic_rx_rings_len, comp_vector); + if (IS_ERR(cq)) { + vnic_err(login->name, "ib_create_cq failed, index %d, " + "comp_vector %d, rc %d\n", + rx_res_index, comp_vector, (int)PTR_ERR(cq)); + return -EINVAL; + } + + rx_res->cq = cq; + rx_res->index = rx_res_index; + rx_res->login = login; + + return 0; +} + +int vnic_create_tx_res(struct vnic_login *login, int tx_res_index) +{ + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + struct ib_cq *cq; + struct vnic_tx_buf *tx_ring; + int i, comp_vector; + + 
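+ /* the TX descriptor ring has vnic_tx_rings_len entries, which can be + * too large for kmalloc, hence vmalloc + */ 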
tx_ring = vmalloc(vnic_tx_rings_len * sizeof *tx_res->tx_ring); + if (!tx_ring) { + vnic_err(login->name, "vmalloc failed to allocate %u * %lu\n", + vnic_tx_rings_len, + (long unsigned int) (sizeof *tx_res->tx_ring)); + return -ENOMEM; + } + memset(tx_ring, 0, vnic_tx_rings_len * sizeof *tx_res->tx_ring); + + /* create TX CQ and set WQE drafts */ + tx_res->tx_wr.sg_list = tx_res->tx_sge; + tx_res->tx_wr.send_flags = IB_SEND_SIGNALED; + tx_res->tx_wr.wr.ud.remote_qkey = login->qkey; + + for (i = 0; i < VNIC_MAX_TX_FRAGS; ++i) + tx_res->tx_sge[i].lkey = login->port->mr->lkey; + + /* set mcast av draft*/ + memset(&tx_res->mcast_av, 0, sizeof(struct ib_ah_attr)); + tx_res->mcast_av.port_num = login->port->num; + tx_res->mcast_av.ah_flags = IB_AH_GRH; + + /* create tx cq */ + comp_vector = tx_res_index % login->port->dev->ca->num_comp_vectors; + cq = ib_create_cq(login->port->dev->ca, + vnic_comp_handler_tx, + NULL, &login->tx_res[tx_res_index], + vnic_tx_rings_len, comp_vector); + if (IS_ERR(cq)) { + vnic_err(login->name, "ib_create_cq failed, index %d, " + "comp_vector %d, rc %d\n", + tx_res_index, comp_vector, (int)PTR_ERR(cq)); + vfree(tx_ring); + return -EINVAL; + } + + tx_res->tx_ring = tx_ring; + tx_res->cq = cq; + tx_res->index = tx_res_index; + tx_res->login = login; + + return 0; +} + +int vnic_create_qp_range(struct vnic_login *login) +{ + int qp_index, create_flags = 0, rc; + struct ib_qp_init_attr *attr; + struct ib_qp *qps[VNIC_MAX_NUM_CPUS]; + struct vnic_qp_res *qp_res; + + attr = kzalloc(VNIC_MAX_NUM_CPUS * sizeof *attr, GFP_KERNEL); + if (!attr) + return -ENOMEM; + + create_flags |= login->port->dev->attr.device_cap_flags & + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK ? + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK : 0; + + /* TODO: rename IB_QP_CREATE_IPOIB_UD_LSO */ + create_flags |= login->port->dev->attr.device_cap_flags & + IB_DEVICE_UD_TSO ? 
+ IB_QP_CREATE_IPOIB_UD_LSO : 0; + + for (qp_index = 0; qp_index < login->qps_num; ++qp_index) { + qp_res = &login->qp_res[qp_index]; + qp_res->tx_index = qp_index % login->tx_rings_num; + qp_res->rx_index = qp_index % login->rx_rings_num; + memset(&attr[qp_index], 0, sizeof(struct ib_qp_init_attr)); + attr[qp_index].cap.max_send_wr = vnic_tx_rings_len; + attr[qp_index].cap.max_send_sge = VNIC_MAX_TX_FRAGS; + attr[qp_index].cap.max_recv_wr = 0; /* we use SRQ */ + attr[qp_index].cap.max_recv_sge = 0; + attr[qp_index].sq_sig_type = IB_SIGNAL_ALL_WR; + attr[qp_index].qp_type = IB_QPT_UD; + attr[qp_index].send_cq = login->tx_res[qp_res->tx_index].cq; + attr[qp_index].recv_cq = login->rx_res[qp_res->rx_index].cq; + attr[qp_index].srq = login->port->rx_ring[qp_res->rx_index]->srq; + attr[qp_index].event_handler = vnic_qp_event_handler; + attr[qp_index].qp_context = &login->qp_res[qp_index]; + attr[qp_index].create_flags = create_flags; + attr[qp_index].cap.max_inline_data = vnic_inline_tshold; + } + + + rc = vnic_ib_create_qp_range(login->port->pd, attr, NULL, + login->qps_num, login->qps_num, qps); + if (rc) { + vnic_err(login->name, "vnic_ib_create_qp_range failed, rc %d\n", rc); + goto err; + } + + for (qp_index = 0; qp_index < login->qps_num; ++qp_index) { + qp_res = &login->qp_res[qp_index]; + qp_res->qp = qps[qp_index]; + qp_res->login = login; + } + + for (qp_index = 0; qp_index < login->qps_num; ++qp_index) { + rc = vnic_qp_to_init(login, qps[qp_index], login->qkey); + if (rc) { + vnic_err(login->name, "vnic_qp_to_init failed, rc %d\n", rc); + goto destroy_qps; + } + } + + kfree(attr); + return 0; + +destroy_qps: + for (qp_index--; qp_index>=0; qp_index--) + vnic_qp_to_reset(login, qps[qp_index]); + + for (qp_index = 0; qp_index < login->qps_num; ++qp_index) + vnic_destroy_qp(login, qp_index); + +err: + kfree(attr); + return rc; +} + +static inline int use_inline(struct sk_buff *skb) +{ + return skb->len <= vnic_inline_tshold && !skb_shinfo(skb)->nr_frags; +} + +int vnic_post_send(struct vnic_login *login, int tx_res_index, + u64 wr_id, struct ib_ah *ah, u32 dqpn) +{ + struct ib_send_wr *bad_wr; + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + struct vnic_qp_res *qp_res = &login->qp_res[tx_res_index % login->qps_num]; + struct vnic_tx_buf *tx_req = &tx_res->tx_ring[wr_id]; + skb_frag_t *frags = skb_shinfo(tx_req->skb)->frags; + int nr_frags = skb_shinfo(tx_req->skb)->nr_frags, i, off = 0; + + ASSERT(qp_res); + ASSERT(tx_res); + ASSERT(qp_res->tx_index == tx_res->index); + ASSERT(qp_res->qp->send_cq == tx_res->cq); + + if (!vnic_encap_headroom && !skb_is_gso(tx_req->skb)) { + tx_res->tx_sge[off].addr = tx_req->mapping[off]; + tx_res->tx_sge[off].length = VNIC_ENCAP_LEN; + off++; + } + + if (likely(skb_headlen(tx_req->skb))) { + if (vnic_encap_headroom && use_inline(tx_req->skb)) { + tx_res->tx_wr.send_flags |= IB_SEND_INLINE; + wr_id |= VNIC_SEND_INLINE_FLAG; + tx_res->tx_sge[off].addr = (unsigned long)tx_req->skb->data; + } else { + tx_res->tx_wr.send_flags &= ~IB_SEND_INLINE; + tx_res->tx_sge[off].addr = tx_req->mapping[off]; + } + tx_res->tx_sge[off].length = skb_headlen(tx_req->skb); + off++; + } + + for (i = 0; i < nr_frags; ++i) { + tx_res->tx_sge[i + off].addr = tx_req->mapping[i + off]; + tx_res->tx_sge[i + off].length = frags[i].size; + } + + /* handle runt packets using additional SG */ + if (unlikely(tx_req->skb->len < login->zlen)) { + /* Note: always extend runt packets (for both + * internal & external) for virtualization, some emulators + * drop runt packets, 
so we need to avoid runt packets even + * if the traffic is not passing the bridge + */ + vnic_dbg_data(login->name, "runt packet, skb %p len %d => %d\n", + tx_req->skb, tx_req->skb->len, login->zlen); + /* If there are frags, then the packet is longer than 60B */ + if (use_inline(tx_req->skb)) + tx_res->tx_sge[i + off].addr = (u64)(unsigned long)login->pad_va; + else + tx_res->tx_sge[i + off].addr = login->pad_dma; + + tx_res->tx_sge[i + off].length = login->zlen - tx_req->skb->len; + ++nr_frags; + VNIC_STATS_INC(login->port_stats.runt_packets); + } + + tx_res->tx_wr.num_sge = nr_frags + off; + tx_res->tx_wr.wr_id = wr_id; + tx_res->tx_wr.wr.ud.remote_qpn = dqpn; + tx_res->tx_wr.wr.ud.ah = ah; + + /* check if we need to calc csum */ + if (tx_req->skb->ip_summed == CHECKSUM_PARTIAL) { + u16 csum_pseudo; + + /* calc pseudo header csum without the length + * and put in the transport's header checksum field. + * The HW will calculate the rest of it (SWP) + */ + if (tx_req->ip_off) + csum_pseudo = ~csum_tcpudp_magic(ip_hdr(tx_req->skb)->saddr, + ip_hdr(tx_req->skb)->daddr, + 0, /* length */ + ip_hdr(tx_req->skb)->protocol, + 0); + else + csum_pseudo = ~csum_ipv6_magic(&ipv6_hdr(tx_req->skb)->saddr, + &ipv6_hdr(tx_req->skb)->daddr, + 0, /* length */ + ipv6_hdr(tx_req->skb)->nexthdr, + 0); + + /* place the calculated csum in the checksum field in + * tcp/udp header + */ + if (tx_req->tcp_off) + tcp_hdr(tx_req->skb)->check = csum_pseudo; + else + udp_hdr(tx_req->skb)->check = csum_pseudo; + + /* set CSUM flag in ib_send_wr */ + tx_res->tx_wr.send_flags |= IB_SEND_IP_CSUM; + } else { + /* csum already calculated in SW */ + tx_res->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + } + + /* prepare TSO header */ + if (skb_is_gso(tx_req->skb)) { + tx_res->tx_wr.wr.ud.mss = skb_shinfo(tx_req->skb)->gso_size + tx_req->hlen; + tx_res->tx_wr.wr.ud.header = tx_req->phead; + tx_res->tx_wr.wr.ud.hlen = tx_req->hlen; + tx_res->tx_wr.opcode = IB_WR_LSO; + } else { + tx_res->tx_wr.opcode = IB_WR_SEND; + } + + vnic_dbg_data(login->name, + "skb %p wr_id %llu sqpn 0x%06x dqpn 0x%06x num_sge " + "%d phead %p was sent\n", tx_req->skb, wr_id, qp_res->qp->qp_num, + dqpn, tx_res->tx_wr.num_sge, tx_req->phead); + + /* if EoIB encap is OOB, copy the LSO header to the linear part */ + if (!vnic_encap_headroom && skb_is_gso(tx_req->skb)) { + memcpy(tx_res->lso_hdr, VNIC_SKB_GET_ENCAP(tx_req->skb), + VNIC_ENCAP_LEN); + memcpy((u8 *)(tx_res->lso_hdr) + VNIC_ENCAP_LEN, + tx_res->tx_wr.wr.ud.header, + tx_res->tx_wr.wr.ud.hlen); + tx_res->tx_wr.wr.ud.header = tx_res->lso_hdr; + tx_res->tx_wr.wr.ud.mss += VNIC_ENCAP_LEN; + tx_res->tx_wr.wr.ud.hlen += VNIC_ENCAP_LEN; + } + + return vnic_ib_post_send(qp_res->qp, &tx_res->tx_wr, &bad_wr, + tx_req->ip_off, + tx_req->ip6_off, + tx_req->tcp_off, + tx_req->udp_off); +} + +static int vnic_dma_map_tx(struct ib_device *ca, struct vnic_tx_buf *tx_req) +{ + struct sk_buff *skb = tx_req->skb; + struct skb_shared_info *shinfo = skb_shinfo(skb); + u64 *mapping = tx_req->mapping; + int i = 0, off = 0, headlen = skb_headlen(skb); + + if (vnic_encap_headroom && use_inline(skb)) + return 0; + + if (!vnic_encap_headroom && !skb_is_gso(tx_req->skb)) { + mapping[off] = ib_dma_map_single(ca, VNIC_SKB_GET_ENCAP(skb), + VNIC_ENCAP_LEN, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[off]))) + return -EIO; + off++; + } + + if (likely(headlen)) { + mapping[off] = ib_dma_map_single(ca, skb->data, + headlen, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[off]))) + goto partial_error; + off++; + 
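+ /* linear part mapped; the page fragments are mapped below */ 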
} + + for (i = 0; i < shinfo->nr_frags; ++i) { + skb_frag_t *frag = &shinfo->frags[i]; + mapping[i + off] = ib_dma_map_page(ca, frag->page.p, + frag->page_offset, + frag->size, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[i + off]))) + goto partial_error; + } + + return 0; + +partial_error: + for (--i; i >= 0; i--) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + ib_dma_unmap_page(ca, mapping[i + off], frag->size, + DMA_TO_DEVICE); + } + + if (headlen) + ib_dma_unmap_single(ca, mapping[--off], skb_headlen(skb), + DMA_TO_DEVICE); + + if (!vnic_encap_headroom && !skb_is_gso(tx_req->skb)) + ib_dma_unmap_single(ca, mapping[--off], VNIC_ENCAP_LEN, + DMA_TO_DEVICE); + + return -EIO; +} + +void vnic_send(struct vnic_login *login, struct sk_buff *skb, + struct ib_ah *ah, u32 dqpn, int tx_res_index) +{ + struct eoibhdr *_eoib_hdr = VNIC_SKB_GET_ENCAP(skb); + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + struct vnic_tx_buf *tx_req; + unsigned long flags = 0; + u64 wr_id; + int tx_pkt_num = 1; + u8 ip_off; + + if (!vnic_tx_polling) + spin_lock_irqsave(&tx_res->lock, flags); + + ASSERT(tx_res_index < login->tx_rings_num); + wr_id = tx_res->tx_head & (vnic_tx_rings_len - 1); + tx_req = &tx_res->tx_ring[wr_id]; + tx_req->skb = skb; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tx_req->ip_off = tx_req->ip6_off = tx_req->tcp_off = tx_req->udp_off = 0; + if (VNIC_IP_CSUM_OK(_eoib_hdr)) { + ip_off = vnic_encap_headroom ? + ((skb_network_header(skb) - skb->data) >> 1) : + /* skb_network_header doesn't count the encap since it's OOB */ + ((skb_network_header(skb) - skb->data + VNIC_ENCAP_LEN) >> 1); + switch (ntohs(skb->protocol)) { + case ETH_P_IP: + tx_req->ip_off = ip_off; + break; + case ETH_P_IPV6: + tx_req->ip6_off = ip_off; + } + } + if (VNIC_TCP_CSUM_OK(_eoib_hdr)) + tx_req->tcp_off = + (skb_transport_header(skb) - skb_network_header(skb)) >> 2; + else if (VNIC_UDP_CSUM_OK(_eoib_hdr)) + tx_req->udp_off = + (skb_transport_header(skb) - skb_network_header(skb)) >> 2; + ASSERT(!tx_req->udp_off || !tx_req->tcp_off); + vnic_dbg_data(login->name, "ip_off = %d, tcp_off = %d, udp_off = %d\n", + tx_req->ip_off, tx_req->tcp_off, tx_req->udp_off); + VNIC_STATS_INC(login->port_stats.tx_chksum_offload); + } + + /* TSO skb */ + if (skb_is_gso(skb)) { + tx_req->hlen = skb_transport_offset(skb) + tcp_hdrlen(skb); + tx_req->phead = skb->data; + ASSERT(skb_pull(skb, tx_req->hlen)); + VNIC_STATS_INC(login->port_stats.tso_packets); + tx_pkt_num = skb_shinfo(tx_req->skb)->gso_segs; + } + + /* map tx skb */ + if (unlikely(vnic_dma_map_tx(login->port->dev->ca, tx_req))) + goto err; + + /* send.. unmap.. free skb.. drain tx cq.. 
[pray] */ + if (unlikely(++tx_res->tx_outstanding == vnic_tx_rings_len)) { + if (++tx_res->tx_stopped_cnt % 100 == 0) + vnic_dbg(login->name, "tx queue %d stopped cnt %d, outs %d\n", + tx_res->index, + tx_res->tx_stopped_cnt, + tx_res->tx_outstanding); + ASSERT(!VNIC_TXQ_STOPPED(tx_res)); + VNIC_TXQ_STOP(tx_res); + /* vnic_drain_arm_tx_cq() will arm the cq OR resume the ring */ + VNIC_STATS_DO_INC(login->port_stats.queue_stopped); + } + + ASSERT(tx_res->tx_outstanding <= vnic_tx_rings_len); + + if (unlikely(vnic_post_send(login, tx_res_index, wr_id, ah, dqpn))) { + vnic_warn(login->name, "vnic_post_send failed\n"); + VNIC_STATS_DO_INC(tx_res->stats.tx_errors); + VNIC_STATS_DO_INC(tx_res->stats.tx_dropped); + --tx_res->tx_outstanding; + vnic_dealloc_tx_skb(login, tx_res->index, wr_id); + /* no need to netif_wake_queue() here, because + * vnic_comp_handler_tx() will eventually be called + * for the armed cq, and it will wake up the queue when it's ready + */ + } else { + VNIC_STATS_DO_ADD(tx_res->stats.tx_packets, tx_pkt_num); + VNIC_STATS_DO_ADD(tx_res->stats.tx_bytes, skb->len); + login->dev->trans_start = jiffies; + ++tx_res->tx_head; + + + if (vnic_tx_polling) { + if (likely(!skb_shared(skb))) + skb_orphan(skb); + else + VNIC_STATS_DO_INC(login->port_stats.shared_packets); + } + } + + /* poll every vnic_max_tx_outs packets */ + if (vnic_tx_polling) { + if (tx_res->tx_outstanding > vnic_max_tx_outs || + VNIC_TXQ_STOPPED(tx_res)) + vnic_drain_arm_tx_cq(login, tx_res_index); + } else + spin_unlock_irqrestore(&tx_res->lock, flags); + + return; + +err: + VNIC_STATS_DO_INC(tx_res->stats.tx_dropped); + VNIC_STATS_DO_INC(tx_res->stats.tx_errors); + dev_kfree_skb_any(skb); + + if (!vnic_tx_polling) + spin_unlock_irqrestore(&tx_res->lock, flags); + + return; +} + +void vnic_ib_free_ring(struct vnic_rx_ring *ring) +{ + ASSERT(ring->srq); + ib_destroy_srq(ring->srq); +} + +int vnic_ib_init_ring(struct vnic_rx_ring *ring) +{ + struct ib_srq_init_attr srq_attr; + struct vnic_port *port = ring->port; + int rc = 0, headroom = 10; + + /* alloc SRQ */ + memset(&srq_attr, 0, sizeof(struct ib_srq_init_attr)); + srq_attr.attr.max_sge = VNIC_MAX_RX_FRAGS; + srq_attr.attr.max_wr = vnic_rx_rings_len + headroom; + srq_attr.attr.srq_limit = vnic_rx_rings_len + headroom; + ring->srq = ib_create_srq(port->pd, &srq_attr); + if (IS_ERR(ring->srq)) { + vnic_err(ring->port->name, "ib_create_srq failed, index %d, rc %d\n", + ring->index, (int)PTR_ERR(ring->srq)); + rc = (int)PTR_ERR(ring->srq); + } + + return rc; +} + +int vnic_port_ib_init(struct vnic_port *port) +{ + int i; + + /* alloc PD */ + port->pd = ib_alloc_pd(port->dev->ca); + if (IS_ERR(port->pd)) { + vnic_err(port->name, "failed to allocate PD\n"); + goto err; + } + vnic_dbg_data(port->name, "port->pd %p\n", port->pd); + + /* alloc MR */ + port->mr = ib_get_dma_mr(port->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(port->mr)) { + vnic_err(port->name, "failed to allocate MR\n"); + goto free_pd; + } + vnic_dbg_data(port->name, "port->mr %p\n", port->mr); + + /* alloc RX RING */ + for (i = 0; i < port->rx_rings_num; ++i) { + port->rx_ring[i] = vnic_create_rx_ring(port, i); + if (IS_ERR(port->rx_ring[i])) { + vnic_err(port->name, "failed to allocate rx_ring %d\n", i); + port->rx_ring[i] = NULL; + goto free_rx_ring; + } + } + vnic_dbg_data(port->name, "allocated %d RX rings\n", port->rx_rings_num); + + return 0; + +free_rx_ring: + for (i = 0; i < port->rx_rings_num; ++i) + vnic_destroy_rx_ring(port->rx_ring[i]); +/* free_mr: */ + ib_dereg_mr(port->mr); +free_pd: + 
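+ /* unwind in reverse order of allocation */ 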
ib_dealloc_pd(port->pd); +err: + return -EINVAL; + +} + +void vnic_port_ib_cleanup(struct vnic_port *port) +{ + int i; + + for (i = 0; i < port->rx_rings_num; ++i) + vnic_destroy_rx_ring(port->rx_ring[i]); + + ib_dereg_mr(port->mr); + ib_dealloc_pd(port->pd); + + return; +} + +void vnic_ib_dispatch_event(struct ib_event *event) +{ + return; +} + +int vnic_ib_set_moder(struct vnic_login *login, u16 rx_usecs, u16 rx_frames, + u16 tx_usecs, u16 tx_frames) +{ + int rc, i; + + vnic_dbg_moder(login->name, "set coalescing params for mtu:%d to " + "rx_frames:%d rx_usecs:%d, " + "tx_frames:%d tx_usecs:%d, " + "adaptive_rx_coal:%d, " + "adaptive_tx_coal:%d, " + "sample_interval:%d, " + "port.state: %d\n", + login->dev->mtu, + rx_frames, rx_usecs, + tx_frames, tx_usecs, + login->adaptive_rx_coal, 0, + login->sample_interval, login->port->attr.state); + + for (i = 0; i < login->tx_rings_num; ++i) { + rc = ib_modify_cq(login->tx_res[i].cq, tx_frames, tx_usecs); + if (rc && rc != -ENOSYS) { + vnic_warn(login->name, "failed modifying tx_res," + " rc %d, tx ring index %d\n", rc, i); + return rc; + } + } + + for (i = 0; i < login->rx_rings_num; ++i) { + rc = ib_modify_cq(login->rx_res[i].cq, rx_frames, rx_usecs); + if (rc && rc != -ENOSYS) { + vnic_warn(login->name, "failed modifying rx_res," + " rc %d, rx ring index %d\n", rc, i); + return rc; + } + } + + return 0; +} + +int vnic_ib_down(struct net_device *dev) +{ + return 0; +} + +int vnic_ib_up(struct net_device *dev) +{ + return 0; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_mac.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_mac.c new file mode 100644 index 0000000000000..996d70dbbc802 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_mac.c @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "vnic.h" +#include "vnic_data.h" +#include "vnic_fip_discover.h" + +static void vnic_mace_dealloc(struct vnic_mac *mace) +{ + ASSERT(mace); + kfree(mace); +} + +static struct vnic_mac *vnic_mace_alloc(const u8 *mac, u16 vnic_id) +{ + struct vnic_mac *mace; + + mace = kzalloc(sizeof *mace, GFP_ATOMIC); + if (!mace) + return ERR_PTR(-ENOMEM); + + /* set mac entry fields */ + memcpy(mace->mac, mac, ETH_ALEN); + mace->created = jiffies; + mace->last_tx = jiffies; + mace->vnic_id = vnic_id; + + return mace; +} + +static void vnic_mace_del(struct vnic_login *login, struct vnic_mac *mace) +{ + ASSERT(mace); + rb_erase(&mace->rb_node, &login->mac_tree); +} + +static int vnic_mace_add(struct vnic_login *login, struct vnic_mac *mace) +{ + struct rb_node **n = &login->mac_tree.rb_node, *pn = NULL; + struct vnic_mac *mace_t; + int rc; + + while (*n) { + pn = *n; + mace_t = rb_entry(pn, struct vnic_mac, rb_node); + rc = memcmp(mace->mac, mace_t->mac, ETH_ALEN); + if (rc < 0) + n = &pn->rb_left; + else if (rc > 0) + n = &pn->rb_right; + else { + rc = -EEXIST; + goto out; + } + } + + rb_link_node(&mace->rb_node, pn, n); + rb_insert_color(&mace->rb_node, &login->mac_tree); + rc = 0; + +out: + return rc; +} + +/* vnic_mace_search -- + * Return entry pointer if found, or ERR_PTR(-ENODATA) if not found. + */ +static struct vnic_mac *vnic_mace_search(struct vnic_login *login, u8 *mac) +{ + struct rb_node *n = login->mac_tree.rb_node; + struct vnic_mac *mace_t; + int rc; + + ASSERT(login); + ASSERT(mac); + + while (n) { + mace_t = rb_entry(n, struct vnic_mac, rb_node); + ASSERT(mace_t); + rc = memcmp(mac, mace_t->mac, ETH_ALEN); + if (rc < 0) + n = n->rb_left; + else if (rc > 0) + n = n->rb_right; + else + goto out; + } + + mace_t = ERR_PTR(-ENODATA); + +out: + return mace_t; +} + +/* vnic_mace_update -- + * Remove: -ENODATA if not found, if removed, update ref_cnt, return 0 + * Add: -ENOMEM if no mem, -EEXIST if already exists, + * if added, update ref_cnt, return 0 + * NOTE: ref counters must be updated here, as this function is + * shared among multiple entry points + */ +int vnic_mace_update(struct vnic_login *login, u8 *mac, u16 vnic_id, int remove) +{ + struct vnic_mac *mace; + int rc; + + mace = vnic_mace_search(login, mac); + if (remove) { + if (IS_ERR(mace)) + return -ENODATA; + vnic_mace_del(login, mace); + vnic_mace_dealloc(mace); + /* update ref cnt */ + ASSERT(atomic_read(&login->vnic_child_cnt)); + atomic_dec(&login->vnic_child_cnt); + } else { + if (PTR_ERR(mace) != -ENODATA) + return -EEXIST; + + /* test ref cnt */ + if (atomic_read(&login->vnic_child_cnt) + 1 > vnic_child_max) { + vnic_warn(login->name, "too many child vNics, max %d\n", + vnic_child_max); + return -EUSERS; /* too many users */ + } + + mace = vnic_mace_alloc(mac, vnic_id); + if (IS_ERR(mace)) + return PTR_ERR(mace); + + rc = vnic_mace_add(login, mace); + if (rc) { + vnic_mace_dealloc(mace); + return rc; + } + /* update ref cnt */ + atomic_inc(&login->vnic_child_cnt); + vnic_dbg_mac(login->name, + "updated mac "MAC_6_PRINT_FMT" remove %d\n", + MAC_6_PRINT_ARG(mac), remove); + } + + return 0; +} + +/* this function can be called from fast data-path + * need to make sure that login instance is protected here + * likely/unlikely below were added to match the hard_start_xmit fast data flow + * + caller must hold login->mac_rwlock (read_lock is enough because we only + * queue the job here) + * + it queues a job to create a child + */ +int vnic_child_update(struct vnic_login *login, u8 *mac, int remove) +{ + struct vnic_mac 
*mace; + char *cmd_str; + struct fip_hadmin_cmd *cmd_hadmin; + int count, rc = -EINVAL; + u16 vnic_id = 0; + + vnic_dbg_func(login->name); + + mace = vnic_mace_search(login, mac); + + /* if asked to add, and data already exists, abort */ + if (likely(!remove && !IS_ERR(mace))) { + mace->last_tx = jiffies; + return -EEXIST; + } + + if (!remove) { + /* test if there are too many child vNics; the same check exists in + * vnic_mace_update(), but we have it here as well to let + * vnic_set_mac return a friendly rc + */ + if (atomic_read(&login->vnic_child_cnt) + 1 > vnic_child_max) { + vnic_warn(login->name, "too many child vNics, " + "max %d\n", vnic_child_max); + return -EUSERS; /* too many users */ + } + + ASSERT(mace); + /* generate new vnic_id only when new child is being added */ + vnic_id = atomic_inc_return(&login->port->vnic_child_ids); + /* set bit 14 so we avoid conflict with normal host/net admin */ + vnic_id %= (1 << (VNIC_ID_LEN - 2)); + vnic_id |= (1 << (VNIC_ID_LEN - 2)); + + /* TODO: update hadmin user-script and manual to make hadmin + * vnic_id interval >= 16K (1<<14 == 16384) so bit 14 is clear + * for parent host admin. + * to avoid atomic counter wrap around, move to bitmap array + */ + } else { + /* if asked to remove, and data not found, abort */ + if (IS_ERR(mace)) + return -ENODATA; + + ASSERT(mace); + vnic_id = mace->vnic_id; + } + + /* allocate cmd structs, too big to be local vars + * use GFP_ATOMIC because this func can be called from data path + */ + cmd_str = kmalloc(sizeof *cmd_str * PAGE_SIZE, GFP_ATOMIC); + if (!cmd_str) + return -ENOMEM; + + cmd_hadmin = kmalloc(sizeof *cmd_hadmin, GFP_ATOMIC); + if (!cmd_hadmin) { + kfree(cmd_str); + return -ENOMEM; + } + + /* inherit command from parent, change: + * name, parent, mac, vnic_id and source + * Note: cannot use parent login->fip_vnic->cmd here + * in order to support net-admin-vnics + */ + vnic_login_cmd_init(cmd_hadmin); + + /* child vNic name scheme: + * eth<parent cnt>.c<vnic_id> + * Note: avoid sysfs files conflict (that's why the parent's unique cnt + * must be included in the name here) + */ + snprintf(cmd_hadmin->c_name, MAX_INPUT_LEN, "%s%u.c%u", + "eth", login->cnt, vnic_id); + snprintf(cmd_hadmin->c_mac, MAX_INPUT_LEN, MAC_6_PRINT_FMT, + MAC_6_PRINT_ARG(mac)); + snprintf(cmd_hadmin->c_vnic_id, MAX_INPUT_LEN, "%u", + vnic_id); + snprintf(cmd_hadmin->c_eport, MAX_INPUT_LEN, "%s", + login->fip_vnic->gw_info.gw_port_name); + snprintf(cmd_hadmin->c_parent, MAX_INPUT_LEN, "%s", + login->dev->name); + snprintf(cmd_hadmin->c_bxname, MAX_INPUT_LEN, "%s", + login->fip_vnic->gw_info.system_name); + snprintf(cmd_hadmin->c_bxguid, MAX_INPUT_LEN, VNIC_GUID_FMT, + VNIC_GUID_RAW_ARG(login->fip_vnic->gw_info.system_guid)); + + /* all hadmin vNics must use same BX format (guid vs. 
name) */ + if (login->fip_vnic->hadmined) { + snprintf(cmd_hadmin->c_bxname, MAX_INPUT_LEN, "%s", + login->fip_vnic->cmd.c_bxname); + snprintf(cmd_hadmin->c_bxguid, MAX_INPUT_LEN, "%s", + login->fip_vnic->cmd.c_bxguid); + } + + /* VLAN is optional, set it only when used by parent */ + if (login->vlan_used) + snprintf(cmd_hadmin->c_vid, MAX_INPUT_LEN, "%d", + login->fip_vnic->vlan); + + /* ready to set the command */ + count = vnic_login_cmd_set(cmd_str, cmd_hadmin); + if (!count) + goto out; + + /* queue job (similar to sysfs write function, + * will eventually call fip_discover_hadmin_update_parent() -> + * vnic_mace_update() + */ + count = fip_hadmin_sysfs_update(login->port, cmd_str, count, remove); + if (count <= 0 && count != -EEXIST) + goto out; + + /* at this point, job queued, return success */ + rc = 0; + +out: + kfree(cmd_str); + kfree(cmd_hadmin); + return rc; +} + +void vnic_child_flush(struct vnic_login *login, int all) +{ + struct rb_node *n; + struct vnic_mac *mace, *mace_t; + LIST_HEAD(local_list); + + vnic_dbg_func(login->name); + + n = rb_first(&login->mac_tree); + while (n) { + mace = rb_entry(n, struct vnic_mac, rb_node); + list_add_tail(&mace->list, &local_list); + n = rb_next(n); + } + + list_for_each_entry_safe(mace, mace_t, &local_list, list) { + list_del(&mace->list); + /* if not-flush-all, and mac is dev_addr mac, skip this entry */ + if (!all && !memcmp(login->dev->dev_addr, mace->mac, ETH_ALEN)) + continue; + vnic_child_update(login, mace->mac, 1); + vnic_mace_del(login, mace); + vnic_mace_dealloc(mace); + } + + +} + +/* find parent vNic + * add the child vnic to its mac_tree + * sync child qp_base_num with parent + * for child removal, it's ok not to find the parent, or the child mac entry + */ +int vnic_parent_update(struct vnic_port *port, char *name, u16 vnic_id, + u8 *mac, u32 *qp_base_num_ptr, char *parent_name, + int remove) +{ + struct vnic_login *login; + int rc = -ENODATA; + + vnic_dbg_func(name); + + mutex_lock(&port->mlock); + list_for_each_entry(login, &port->login_list, list) { + vnic_dbg_mac(name, "checking parent %s for child %s (expect %s)\n", + login->dev->name, name, parent_name); + /* check if parent vnic has valid QPN and not being destroyed */ + if (!strcmp(login->dev->name, parent_name) && + test_bit(VNIC_STATE_LOGIN_PRECREATE_2, &login->fip_vnic->login_state) && + !login->fip_vnic->flush) { + /* sync qp_base_num with parent */ + if (qp_base_num_ptr) + *qp_base_num_ptr = login->qp_base_num; + + /* update mac_tree and mace vnic_id */ + write_lock_bh(&login->mac_rwlock); + rc = vnic_mace_update(login, mac, vnic_id, remove); + write_unlock_bh(&login->mac_rwlock); + + break; + } + } + + mutex_unlock(&port->mlock); + + /* for vNic removal, ignore rc */ + return remove ? 0 : rc; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_main.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_main.c new file mode 100644 index 0000000000000..7e17e8de5a2ca --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_main.c @@ -0,0 +1,1179 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_data.h" + +void vnic_login_refresh_mcasts(struct vnic_port *port) +{ + struct vnic_login *login; + + vnic_dbg_mark(); + mutex_lock(&port->mlock); + list_for_each_entry(login, &port->login_list, list) + vnic_tree_mcast_detach(&login->mcast_tree); + list_for_each_entry(login, &port->login_list, list) + { + if (vnic_sa_query) { + /* take the tx lock to make sure no delete function is called at the time */ + netif_tx_lock_bh(login->dev); + vnic_neigh_invalidate(login); + netif_tx_unlock_bh(login->dev); + } + + vnic_tree_mcast_attach(&login->mcast_tree); + } + mutex_unlock(&port->mlock); +} + +int vnic_login_pre_create_1(struct vnic_port *port, + struct fip_vnic_data *vnic) +{ + struct vnic_login *login; + struct net_device *dev; + + /* set login to zero first (for parent_used case) */ + vnic->login = NULL; + + /* if parent_used, skip */ + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return 0; + } else { + vnic_dbg_func(vnic->name); + } + + /* create netdev per login, vlan configuration is done from outside */ + dev = vnic_alloc_netdev(port); + if (IS_ERR(dev)) { + vnic_err(port->name, "vnic_alloc_netdev failed\n"); + goto err; + } + + login = vnic_netdev_priv(dev); + login->fip_vnic = vnic; + vnic->login = login; + login->vlan_used = vnic->vlan_used; + login->dev->hard_header_len += (vnic->vlan_used && vnic->hadmined)? VLAN_HLEN: 0; + vnic_dbg_fip(vnic->name,"creating vnic, hadmin=%d vlan_used=%d hard_header_len += %d\n", + vnic->hadmined, vnic->vlan_used, (vnic->vlan_used && vnic->hadmined)? 
VLAN_HLEN: 0); + set_bit(VNIC_STATE_LOGIN_PRECREATE_1, &vnic->login_state); + + return 0; + +err: + return -ENODEV; +} + +int vnic_login_pre_create_2(struct fip_vnic_data *vnic, int qps_num, int is_lag) +{ + struct vnic_login *login = vnic->login; + int i, j; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return 0; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + login->qps_num = qps_num; + login->qkey = VNIC_DATA_QKEY; + login->is_lag = is_lag; + VNIC_TXQ_SET_ACTIVE(login, min(login->tx_rings_num, login->qps_num)); + + /* prepare padding for runt packets */ + login->pad_va = kzalloc(VNIC_EOIB_ZLEN_MAX, GFP_KERNEL); + if (!login->pad_va) + return -ENOMEM; + + login->pad_dma = ib_dma_map_single(login->port->dev->ca, login->pad_va, + VNIC_EOIB_ZLEN_MAX, DMA_TO_DEVICE); + if (ib_dma_mapping_error(login->port->dev->ca, login->pad_dma)) + goto err; + + /* create TX resources */ + for (i = 0; i < login->tx_rings_num; ++i) { + if (vnic_create_tx_res(login, i)) { + vnic_err(login->name, "vnic_create_tx_res failed," + " index %d\n", i); + goto free_tx_res; + } + } + + /* create RX resources */ + for (j = 0; j < login->rx_rings_num; ++j) { + if (vnic_create_rx_res(login, j)) { + vnic_err(login->name, "vnic_create_rx_res failed," + " index %d\n", j); + goto free_rx_res; + } + } + + /* create QPs */ + if (vnic_create_qp_range(login)) { + vnic_err(login->name, "vnic_create_qp_range failed\n"); + goto free_rx_res; + } + + /* first QP is the base QP */ + login->qp_base_num = login->qp_res[0].qp->qp_num; + vnic->qp_base_num = login->qp_base_num; + + /* update state */ + set_bit(VNIC_STATE_LOGIN_PRECREATE_2, &vnic->login_state); + + login->queue_stopped = 0; + + /* calls vnic_do_get_stats() */ + queue_delayed_work(login_wq, &login->stats_task, VNIC_STATS_DELAY); + + return 0; + +free_rx_res: + for (--j; j >= 0; --j) + vnic_destroy_rx_res(login, j); + + i = login->tx_rings_num; +free_tx_res: + for (--i; i >= 0; --i) + vnic_destroy_tx_res(login, i); +/*free_pad:*/ + ib_dma_unmap_single(login->port->dev->ca, login->pad_dma, + VNIC_EOIB_ZLEN_MAX, DMA_TO_DEVICE); +err: + kfree(login->pad_va); + return -ENODEV; +} + +int vnic_login_register_netdev(struct fip_vnic_data *vnic, + const char *mac, + const char *name) +{ + struct vnic_login *login = vnic->login; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + vnic_info("%s created (parent %s mac "MAC_6_PRINT_FMT")\n", + name, vnic->parent_name, + MAC_6_PRINT_ARG(vnic->mac_cache)); + return 0; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + /* set netdev name and mac */ + if (name) + strncpy(login->dev->name, name, IFNAMSIZ); + if (mac) { + memcpy(login->dev->dev_addr, mac, ETH_ALEN); + /* save original mac */ + memcpy(login->dev_addr, mac, ETH_ALEN); + } + + /* set device features according to all_vlan mode */ + login->dev->features |= NETIF_F_HIGHDMA; + + //ronni - fixme. add comment here + if (!vnic->all_vlan_gw) { + login->dev->features |= NETIF_F_VLAN_CHALLENGED; + login->dev->features &= ~NETIF_F_HW_VLAN_FILTER; + } else + login->dev->features |= NETIF_F_HW_VLAN_FILTER; + + /* register netdev */ + if (register_netdev(login->dev)) { + vnic_err(login->name, "register_netdev failed name=%s mac=" + MAC_6_PRINT_FMT" login->dev=%p\n", + name ? name : "net_admin", + MAC_6_PRINT_ARG(login->dev->dev_addr), login->dev); + goto err; + } + + /* encode the port number in dev_id: + * This allows us to associate the net device + * with the underlying device's port. 
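+ * (IB port numbers are 1-based while dev_id is 0-based, hence the -1) 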
+ */ + login->dev->dev_id = login->port->num - 1; + + if (vnic_create_dentry(login)) { + vnic_err(login->name, "vnic_create_dentry failed\n"); + goto err; + } + + /* print info only after register_netdev so dev->name is valid */ + sprintf(login->name, "%s", login->dev->name); + vnic_info("%s created (%s port %d)\n", + login->dev->name, + login->port->dev->ca->name, login->port->num); + + /* disable tx queues and carrier. They will be started + * after create 2 is called the mcast is attached ... + */ + netif_tx_disable(login->dev); + netif_carrier_off(login->dev); + + mutex_lock(&login->port->mlock); + vnic_dbg_mac(login->name, "added to login_list\n"); + list_add_tail(&login->list, &login->port->login_list); + mutex_unlock(&login->port->mlock); + + set_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state); + + return 0; + +err: + return -EINVAL; +} + +int vnic_login_complete_ack(struct fip_vnic_data *vnic, + struct fip_login_data *login_data, + struct fip_shared_vnic_data *shared_vnic) +{ + struct vnic_mcast *mcaste, *mcaste_bcast, *mcast_shared = NULL; + struct vnic_login *login = vnic->login; + int rc; + int first_time_vlan = 0; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return 0; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + /* + * TODO, check if you need them all, check overlap with gw_neigh + * check how pkey is passed from FIP + */ + login->pkey = login_data->pkey; + login->pkey_index = login_data->pkey_index; + login->n_mac_mcgid = login_data->n_mac_mcgid; + login->gw_port_id = login_data->port_id; + + /*GW should send the data SL from the login packet*/ + login->sl = login_data->sl; + + login->vnic_id = login_data->vnic_id; + + memcpy(login->mgid_prefix, login_data->mgid_prefix, VNIC_MGID_PREFIX_LEN); + memcpy(login->vnic_name, login_data->vnic_name, sizeof(login_data->vnic_name)); + memcpy(login->vendor_id, login_data->vendor_id, sizeof(login_data->vendor_id)); + + VNIC_STR_STRIP(login->vnic_name); + VNIC_STR_STRIP(login->vendor_id); /* set ZLEN (varies per VLAN support) */ + + /* set VLAN */ + login->zlen = ETH_ZLEN + (vnic_encap_headroom? VNIC_ENCAP_LEN: 0); + first_time_vlan = !login->vlan_used; /* always false for hadmin vnics with vlans */ + login->vlan_used = login_data->vp; + login->all_vlan_gw = login_data->all_vlan_gw; + if ((VNIC_VLAN_ENABLED(login))) { + login->vid = cpu_to_be16(login_data->vlan); + if (first_time_vlan) { + vnic_dbg_fip(login->dev->name,"Updating hard_header_len %d+%d=%d\n", + login->dev->hard_header_len, VLAN_HLEN, + login->dev->hard_header_len + VLAN_HLEN); + login->dev->hard_header_len += VLAN_HLEN; + } + login->zlen = ETH_ZLEN + VLAN_HLEN + (vnic_encap_headroom? 
VNIC_ENCAP_LEN: 0); + } + + /* create gw_neigh (no RSS when sending to the GW) + * user zero mac to describe GW L2 address + */ + login->gw_neigh = + vnic_neighe_alloc(login, NULL, login_data->lid, + login_data->qpn, 0); + if (IS_ERR(login->gw_neigh)) { + vnic_err(login->name, "failed to alloc gw neigh\n"); + goto err; + } + + /* alloc mcast entries here to simplify the error flow */ + mcaste = vnic_mcast_alloc(login->port, NULL, NULL); + if (IS_ERR(mcaste)) + goto err_free_gw_ah; + mcaste_bcast = vnic_mcast_alloc(login->port, NULL, NULL); + if (IS_ERR(mcaste_bcast)) { + vnic_mcast_dealloc(mcaste); + goto err_free_gw_ah; + } + /* used by shared vnic mcast group */ + if (shared_vnic && shared_vnic->enabled) { + mcast_shared = vnic_mcast_alloc(login->port, NULL, NULL); + if (IS_ERR(mcast_shared)) { + vnic_mcast_dealloc(mcaste); + vnic_mcast_dealloc(mcaste_bcast); + goto err_free_gw_ah; + } + } + + /* attach to default mgid */ + __vnic_mcaste_fill(login, mcaste, login->gw_port_id, ETH_ZERO_MAC, 0, vnic_mcast_create); + mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC; + mcaste->retry = VNIC_MCAST_ULIMIT_RETRY; + mcaste->attach_cb = __bcast_attach_cb; + mcaste->detach_cb = __bcast_detach_cb; + mcaste->attach_cb_ctx = login; + mcaste->detach_cb_ctx = login; + rc = vnic_mcast_add(&login->mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(&login->mcast_tree, mcaste); + ASSERT(!rc); + + /* attach to bcast mgid (use default mlid) */ + if (login->n_mac_mcgid || vnic_mgid_data_type) { + __vnic_mcaste_fill(login, mcaste_bcast, login->gw_port_id, ETH_BCAST_MAC, 0, 0); + mcaste_bcast->backoff_factor = VNIC_MCAST_BACKOF_FAC; + mcaste_bcast->retry = VNIC_MCAST_ULIMIT_RETRY; + /* The port gid is overun by the default gid as part of the mgid over + * same mlid hack */ + memcpy(&mcaste_bcast->port_gid, &mcaste->port_gid, GID_LEN); + rc = vnic_mcast_add(&login->mcast_tree, mcaste_bcast); + ASSERT(!rc); + rc = vnic_mcast_attach(&login->mcast_tree, mcaste_bcast); + ASSERT(!rc); + } else { + vnic_mcast_dealloc(mcaste_bcast); + } + + login->shared_vnic = 0; + /* attach to bcast mgid (use default mlid) */ + if (shared_vnic && shared_vnic->enabled) { + u8 rss_hash = shared_vnic->ip[0] ^ shared_vnic->ip[1] ^ + shared_vnic->ip[2] ^ shared_vnic->ip[3]; + + login->shared_vnic = 1; + __vnic_mcaste_fill(login, mcast_shared, login->gw_port_id, shared_vnic->emac, 0, 0); + mcast_shared->backoff_factor = VNIC_MCAST_BACKOF_FAC; + mcast_shared->retry = VNIC_MCAST_ULIMIT_RETRY; + memcpy(&mcast_shared->port_gid, &mcaste->port_gid, GID_LEN); + mcast_shared->gid.raw[12]= rss_hash; + + vnic_dbg_mcast(login->name, "vnic %s attaching shared vnic 1 " + "MGID "VNIC_GID_FMT"\n", login->name, + VNIC_GID_RAW_ARG(mcast_shared->gid.raw)); + mcaste = mcast_shared; + memcpy(mcaste->mac, ETH_BCAST_MAC, ETH_ALEN); + rc = vnic_mcast_add(&login->mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(&login->mcast_tree, mcaste); + ASSERT(!rc); + } + + /* set state */ + set_bit(VNIC_STATE_LOGIN_CREATE_2, &vnic->login_state); + + /* call vnic_open() if open was called when we were not ready to handle it */ + if (test_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state)) +#ifndef _BP_NO_NDO_OPS + login->dev->netdev_ops->ndo_open(login->dev); +#else + login->dev->open(login->dev); +#endif + + return 0; + +err_free_gw_ah: + vnic_neighe_dealloc(login->gw_neigh); +err: + return -EINVAL; +} + +/* + * When destroying login, call to stop login wq tasks. do not call from + * login_wq context. 
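+ * (it synchronously cancels work queued on login_wq and, on older kernels,
+ * flushes login_wq, so calling it from that workqueue would deadlock)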
+*/ +void vnic_login_destroy_stop_wq(struct fip_vnic_data *vnic, enum fip_flush flush) +{ + struct vnic_login *login = vnic->login; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + if (test_bit(VNIC_STATE_LOGIN_PRECREATE_1, &vnic->login_state)) { + /* cancel vnic_auto_moder() */ + vnic_dbg_mark(); + mutex_lock(&login->moder_lock); + login->queue_stopped = 1; + mutex_unlock(&login->moder_lock); +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&login->stats_task); + if (cancel_delayed_work_sync(&login->mcast_task)) + dev_put(login->dev); + cancel_delayed_work_sync(&login->restart_task); +#else + cancel_delayed_work(&login->stats_task); + if (cancel_delayed_work(&login->mcast_task)) + dev_put(login->dev); + cancel_delayed_work(&login->restart_task); + flush_workqueue(login_wq); +#endif + } +} + +/* + * When destroy login data struct. Assumes all login wq tasks are stopped. + * Can be called from any context, might block for a few secs. +*/ +void vnic_login_destroy_wq_stopped(struct fip_vnic_data *vnic, enum fip_flush flush) +{ + struct vnic_login *login = vnic->login; + unsigned long flags; + int i; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + vnic_info("%s destroyed (parent %s mac "MAC_6_PRINT_FMT")\n", + vnic->interface_name, vnic->parent_name, + MAC_6_PRINT_ARG(vnic->mac_cache)); + /* Note: vNics can be logged out by BXM (bypass sysfs calls) + * so we need to cleanup the parent here as well + * if we reach this function from sysfs calls, + * then vnic_parent_update will have no effect here (ok) + */ + vnic_parent_update(vnic->port, vnic->name, vnic->vnic_id, + vnic->mac_cache, NULL, vnic->parent_name, 1); + return; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + /* the cleanup procedure depends on our state, our vnic type + * (host/network admin), and the cleanup level required. In network admined + * vnics there is a single create state and only one cleanup level (full). + * for host admined there are two create states (init, regular) and two + * cleanup level. The flow depends on the reason for the cleanup. */ + vnic_dbg_data(login->name, "vnic_login_destroy flush=%d\n", flush); + + /* we need to change state to prevent from completion to re-open the TX + * queue once we close it. Before calling stop() function, need to make + * sure that all on-going hard_start_xmit() calls are done. 
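+	 * netif_tx_disable() below does that: it takes every TX queue's xmit
+	 * lock, so it returns only once in-flight transmits have finished.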
+ */ + + if (test_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state)) { + set_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state); + netif_tx_disable(login->dev); + vnic_dbg_mark(); + } + + if (test_and_clear_bit(VNIC_STATE_LOGIN_CREATE_2, &vnic->login_state)) { + if (test_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state)) { + /* calls vnic_stop() */ +#ifndef _BP_NO_NDO_OPS + login->dev->netdev_ops->ndo_stop(login->dev); +#else + login->dev->stop(login->dev); +#endif + set_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state); + vnic_dbg_mark(); + } + vnic_mcast_del_all(&login->mcast_tree); + vnic_member_remove_all(login); + vnic_neighe_dealloc(login->gw_neigh); + vnic_dbg_mark(); + } + if (test_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state)) + clear_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state); + + if (flush == FIP_FULL_FLUSH && + test_and_clear_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state)) { + mutex_lock(&login->port->mlock); + vnic_dbg_mac(login->name, "delete from login_list\n"); + list_del(&login->list); + mutex_unlock(&login->port->mlock); + + /* print info if register_netdev was called before so + * dev->name is valid + */ + vnic_info("%s destroyed (%s port %d)\n", login->dev->name, + login->port->dev->ca->name, login->port->num); + + /* use irq save so caller function supports any context */ + write_lock_irqsave(&login->mac_rwlock, flags); + vnic_child_flush(login, 1); + write_unlock_irqrestore(&login->mac_rwlock, flags); + + vnic_delete_dentry(login); + unregister_netdev(login->dev); + vnic_dbg_mark(); + } + + vnic_dbg_mark(); + /* login_ctx was in pre created state [always true] */ + spin_lock_bh(&login->stats_lock); + if (test_and_clear_bit(VNIC_STATE_LOGIN_PRECREATE_2, &vnic->login_state)) { + spin_unlock_bh(&login->stats_lock); + vnic_dbg_mark(); + /* take port->mlock in case of refresh event is being called vnic_refresh_mcasts */ + mutex_lock(&login->port->mlock); + /* tx queues are already stopped here */ + vnic_neigh_del_all(login); + vnic_mcast_del_all(&login->mcast_tree); + for (i = 0; i < login->qps_num; ++i) + vnic_destroy_qp(login, i); + mutex_unlock(&login->port->mlock); + + for (i = 0; i < login->rx_rings_num; ++i) + vnic_destroy_rx_res(login, i); + for (i = 0; i < login->tx_rings_num; ++i) + vnic_destroy_tx_res(login, i); + ib_dma_unmap_single(login->port->dev->ca, login->pad_dma, + VNIC_EOIB_ZLEN_MAX, DMA_TO_DEVICE); + kfree(login->pad_va); + } else + spin_unlock_bh(&login->stats_lock); + + if (flush == FIP_FULL_FLUSH && + test_and_clear_bit(VNIC_STATE_LOGIN_PRECREATE_1, &vnic->login_state)) { + vnic_free_netdev(login); + } +} + +int vnic_vhube_add(struct fip_vnic_data *vnic, struct vnic_table_entry *vhube) +{ + struct vnic_neigh *neighe; + struct vnic_login *login = vnic->login; + int rc; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return 0; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + vnic_dbg_data(login->name, "adding vhube lid 0x%02x qpn 0x%x, mac " + MAC_6_PRINT_FMT"\n", vhube->lid, vhube->qpn, + MAC_6_PRINT_ARG(vhube->mac)); + + neighe = vnic_neighe_alloc(login, vhube->mac, vhube->lid, + vhube->qpn, vhube->rss); + if (IS_ERR(neighe)) + return (int)PTR_ERR(neighe); + + vnic_dbg_mark(); + /* when adding new neighe, make sure that TX queues are not running. 
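+	 * (the TX path resolves neighbours from this tree under the same lock)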
*/ + netif_tx_lock_bh(login->dev); + rc = vnic_neighe_add(login, neighe); + netif_tx_unlock_bh(login->dev); + if (rc) { + vnic_neighe_dealloc(neighe); + return rc; + } + + return 0; +} + +void vnic_vhube_flush(struct fip_vnic_data *vnic) +{ + struct vnic_login *login = vnic->login; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + /* when adding new neighe, make sure that TX queues are not running. */ + vnic_dbg_mark(); + netif_tx_lock_bh(login->dev); + vnic_neigh_del_all(login); + netif_tx_unlock_bh(login->dev); + + return; +} + +void vnic_vhube_del(struct fip_vnic_data *vnic, u8* mac) +{ + struct vnic_neigh *neighe; + struct vnic_login *login = vnic->login; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + vnic_dbg_mark(); + /* when adding new neighe, make sure that TX queues are not running. */ + netif_tx_lock_bh(login->dev); + neighe = vnic_neighe_search(login, mac); + if (IS_ERR(neighe)) { + vnic_warn(login->name, "couldn't find "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(mac)); + } else { + vnic_neighe_del(login, neighe); + vnic_neighe_dealloc(neighe); + } + netif_tx_unlock_bh(login->dev); + return; +} + +struct fip_login_data login_data; +struct fip_vnic_data vnic; +struct vnic_login *__vnic_login_create(struct vnic_port *port, int index) +{ + struct vnic_login *login; + int rc, no_bxm_n_rss = 0x4; + int qps_num = (port->rx_rings_num > 1) ? (1 << no_bxm_n_rss) : 1; + + /* pre create vnic */ + rc = vnic_login_pre_create_1(port, &vnic); + if (rc) { + vnic_err(port->name, "vnic_login_pre_create_1 failed" + " for %s port %d index %d\n", + port->dev->ca->name, port->num, index); + goto err; + } + + login = vnic.login; + + rc = vnic_login_pre_create_2(&vnic, qps_num, 0); + if (rc) { + vnic_err(port->name, "vnic_login_pre_create_2 failed" + " for %s port %d index %d\n", + port->dev->ca->name, port->num, index); + goto create_fail; + } + + /* create vnic */ + memset(&login_data, 0, sizeof(struct fip_login_data)); + sprintf(login_data.vendor_id, "%s", NOT_AVAILABLE_STRING); + sprintf(login_data.vnic_name, "%s", NOT_AVAILABLE_STRING); + memcpy(login_data.mgid_prefix, NO_BXM_MGID_PREFIX, VNIC_MGID_PREFIX_LEN); + login_data.qpn = 0xa00000; + login_data.lid = 1; + login_data.pkey = 0xffff; + login_data.mtu = 1500; + + /* random_ether_addr(mac); */ + memcpy(login_data.mac, port->gid.raw + 10, ETH_ALEN); + login_data.mac[0] += index * 0x10; + /* mcast bit must be zero */ + login_data.mac[0] &= 0xfe; + vnic_dbg_mark(); + if (vnic_login_register_netdev(&vnic, login_data.mac, NULL)) { + vnic_err(login->name, "vnic_login_register_netdev failed\n"); + goto create_fail; + } + if (vnic_login_complete_ack(&vnic, &login_data, NULL)) { + vnic_err(login->name, "vnic_login_complete_ack failed\n"); + goto create_fail; + } + + return login; + +create_fail: + vnic_login_destroy(login->fip_vnic, FIP_FULL_FLUSH); +err: + return ERR_PTR(-ENODEV); +} + +int vnic_port_data_init(struct vnic_port *port) +{ + int i, no_bxm_vnic_per_port = 1; + + vnic_dbg_mark(); + mutex_lock(&port->start_stop_lock); + for (i = 0; i < no_bxm_vnic_per_port; ++i) { + __vnic_login_create(port, i); + } + mutex_unlock(&port->start_stop_lock); + + return 0; + /*TODO - JPM: handle vnic_login_create failure */ +} + +void vnic_port_data_cleanup(struct vnic_port *port) +{ + struct vnic_login *login, *login_t; + + vnic_dbg_mark(); + /* 
vnic_login_destroy() acquires the port->mlock, cannot hold it here */
+	list_for_each_entry_safe(login, login_t,
+				 &port->login_list, list) {
+		vnic_dbg_data(login->name, "login %s\n", login->name);
+		vnic_login_destroy(login->fip_vnic, FIP_FULL_FLUSH);
+	}
+}
+
+/* ALI TODO: check if need to replace login ptr with vnic */
+void debug_dump_members(struct vnic_login *login, struct vnic_gw_info *member)
+{
+	int i;
+
+	vnic_warn(login->name, "Error members_debug_dump "
+		  "member id=%d gw id = %d active_count=%d\n",
+		  member->member_id, member->gw_id,
+		  login->lag_member_active_count);
+
+	/* go over the member table and dump each entry's mapping state */
+	for (i = 0; i < MAX_LAG_MEMBERS; i++) {
+		vnic_warn(login->name, "%d member %d used %x gw_id %d\n",
+			  i, login->lag_gw_neigh[i].member_id,
+			  login->lag_gw_neigh[i].info,
+			  login->lag_gw_neigh[i].gw_id);
+	}
+}
+
+static void vnic_build_map_histogram(struct vnic_login *login, int member_id, int *hist)
+{
+	int i;
+
+	memset(hist, 0, sizeof(int) * MAX_LAG_MEMBERS);
+
+	/* go over map and count how many entries are mapped to each member */
+	for (i = 0; i < LAG_MAP_TABLE_SIZE; i++) {
+		ASSERT(login->lag_gw_map[i] >= 0 && login->lag_gw_map[i] < MAX_LAG_MEMBERS);
+		hist[login->lag_gw_map[i]]++;
+	}
+}
+
+static void _vnic_remove_member_from_map(struct vnic_login *login, int member_id)
+{
+	int user_count[MAX_LAG_MEMBERS] = {0};
+	int i, j;
+	int continue_flag;
+	int thresh;
+
+	login->lag_member_active_count--;
+	if (login->lag_member_active_count > 0) {
+		/* go over map and count how many entries are mapped to each member */
+		vnic_build_map_histogram(login, member_id, user_count);
+
+		thresh = 2; /* it might be possible to find a better lower boundary */
+
+		for (i = 0; i < LAG_MAP_TABLE_SIZE; i++) {
+			if (login->lag_gw_map[i] != member_id)
+				continue;
+
+			continue_flag = 1;
+			while (continue_flag) {
+				for (j = 0; j < MAX_LAG_MEMBERS; j++) {
+					if (j == member_id)
+						continue;
+
+					/* Only use members that are connected and are short of entries */
+					if (login->lag_gw_neigh[j].info & GW_MEMBER_INFO_MAPPED &&
+					    user_count[j] < thresh) {
+						login->lag_gw_map[i] = j;
+						user_count[j]++;
+						continue_flag = 0;
+						break;
+					}
+				}
+				if (j == MAX_LAG_MEMBERS)
+					thresh++;
+			}
+		}
+	}
+}
+
+static void _vnic_add_member_to_map(struct vnic_login *login, int member_id)
+{
+	int i;
+	int expected;
+	int user_count[MAX_LAG_MEMBERS] = {0};
+	int continue_flag;
+	int thresh;
+
+	/* this is the first active port, use it for all map entries */
+	if (!login->lag_member_active_count) {
+		for (i = 0; i < LAG_MAP_TABLE_SIZE; i++)
+			login->lag_gw_map[i] = member_id;
+		login->lag_member_active_count++;
+	} else {
+		/* go over map and count how many entries are mapped to each member;
+		 * we will use the count to reassign entries from the most heavily
+		 * used members */
+		vnic_build_map_histogram(login, member_id, user_count);
+
+		/* when adding new member, make sure that TX queues are not running.
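+		 * Callers hold netif_tx_lock_bh(). Rebalance sketch with
+		 * illustrative numbers only (the real bound is LAG_MAP_TABLE_SIZE):
+		 * for a 32-entry map and 3 active members, expected = 32 / 3 = 10
+		 * and thresh = 32 % 3 = 2; entries are taken from members owning
+		 * more than expected + thresh slots until the new member holds
+		 * roughly 'expected' of them, lowering thresh after each pass.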
*/ + login->lag_member_active_count++; + expected = LAG_MAP_TABLE_SIZE / login->lag_member_active_count; + thresh = LAG_MAP_TABLE_SIZE % login->lag_member_active_count; + continue_flag = 1; + while (continue_flag) { + for (i = 0; i < LAG_MAP_TABLE_SIZE; i++) { + if (user_count[login->lag_gw_map[i]] > expected + thresh) { + user_count[login->lag_gw_map[i]]--; + login->lag_gw_map[i] = member_id; + user_count[login->lag_gw_map[i]]++; + if (user_count[member_id] >= expected) { + continue_flag = 0; + break; + } + } + } + thresh--; + } + } +} + +void __bcast_member_attach_cb(struct vnic_mcast *mcaste, void *gw_ptr) +{ + struct vnic_gw_info *member = gw_ptr; + + /* When SA is local, mcast join works even when port is down */ + if (member->neigh.login->port->attr.state != IB_PORT_ACTIVE) + return; + + vnic_dbg_lag(member->neigh.login->name, "__bcast_member_attach_cb for member id %d and " + "gw_id=%d\n", member->member_id, member->gw_id); + + netif_tx_lock_bh(member->neigh.login->dev); + member->info |= GW_MEMBER_INFO_MCAST; + + if (member->info & GW_MEMBER_INFO_EPORT_UP && + !(member->info & GW_MEMBER_INFO_MAPPED)) { + _vnic_add_member_to_map(member->neigh.login, member->member_id); + member->info |= GW_MEMBER_INFO_MAPPED; + } + netif_tx_unlock_bh(member->neigh.login->dev); +} + +void __bcast_member_detach_cb(struct vnic_mcast *mcaste, void *gw_ptr) +{ + struct vnic_gw_info *member = gw_ptr; + + vnic_dbg_lag(member->neigh.login->name, "__bcast_member_detach_cb for member id %d and " + "gw_id=%d\n", member->member_id, member->gw_id); + + netif_tx_lock_bh(member->neigh.login->dev); + if (member->info & GW_MEMBER_INFO_MAPPED) + _vnic_remove_member_from_map(member->neigh.login, member->member_id); + + member->info &= ~(GW_MEMBER_INFO_MAPPED | GW_MEMBER_INFO_MCAST); + netif_tx_unlock_bh(member->neigh.login->dev); +} + +/* + * create MGIDs and join the default MCAST addresses. The mcaste are added to the + * list contained within member struct. If more MGIDs are used by the vnic when + * a member is added we will join those too using the members GW_ID. 
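+ * The per-member MGIDs are built like the default login MGIDs, except that
+ * the member's own GW_ID is encoded into them.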
+*/
+static int _vnic_add_member_mgid(struct vnic_login *login, struct vnic_gw_info *member)
+{
+	struct vnic_mcast *mcaste, *mcaste_bcast;
+	int rc;
+#ifndef _BP_NO_MC_LIST
+	struct dev_mc_list *mclist;
+#else
+	struct netdev_hw_addr *ha;
+#endif
+
+	mcaste = vnic_mcast_alloc(login->port, NULL, NULL);
+	if (IS_ERR(mcaste))
+		return -ENOMEM;
+
+	/* attach to default mgid */
+	__vnic_mcaste_fill(login, mcaste, member->gw_id, ETH_ZERO_MAC, 0, vnic_mcast_create);
+	mcaste->attach_cb = __bcast_member_attach_cb;
+	mcaste->detach_cb = __bcast_member_detach_cb;
+	mcaste->attach_cb_ctx = member;
+	mcaste->detach_cb_ctx = member;
+	mcaste->priv_data = member;
+	rc = vnic_mcast_add(&login->mcast_tree, mcaste);
+	if (rc) {
+		debug_dump_members(login, member);
+		ASSERT(!rc);
+	}
+
+	rc = vnic_mcast_attach(&login->mcast_tree, mcaste);
+	if (rc) {
+		debug_dump_members(login, member);
+		ASSERT(!rc);
+	}
+
+	if (login->n_mac_mcgid) {
+		mcaste_bcast = vnic_mcast_alloc(login->port, NULL, NULL);
+		if (IS_ERR(mcaste_bcast))
+			goto free_mcasts;
+
+		__vnic_mcaste_fill(login, mcaste_bcast, member->gw_id, ETH_BCAST_MAC, 0, 0);
+		/* The port gid is overrun by the default gid as part of the mgid
+		 * over same mlid hack */
+		memcpy(&mcaste_bcast->port_gid, &mcaste->port_gid, GID_LEN);
+		mcaste_bcast->priv_data = member;
+		rc = vnic_mcast_add(&login->mcast_tree, mcaste_bcast);
+		ASSERT(!rc);
+		rc = vnic_mcast_attach(&login->mcast_tree, mcaste_bcast);
+		ASSERT(!rc);
+	}
+
+	/* hold the tx lock so set_multicast_list() won't change mc_list */
+	netif_tx_lock_bh(login->dev);
+#ifndef _BP_NO_MC_LIST
+	for (mclist = login->dev->mc_list; mclist; mclist = mclist->next) {
+		u8 *mmac = mclist->dmi_addr;
+#else
+	netdev_for_each_mc_addr(ha, login->dev) {
+		u8 *mmac = ha->addr;
+#endif
+		/* do not add the default MGIDs because they are always used */
+		if (IS_ZERO_MAC(mmac))
+			continue;
+		if (IS_BCAST_MAC(mmac))
+			continue;
+
+		vnic_dbg_lag(login->name, "_vnic_add_member_mgid for "
+			     MAC_6_PRINT_FMT" and member gw_id=%d\n",
+			     MAC_6_PRINT_ARG(mcaste->mac), member->gw_id);
+
+		if (_vnic_mcast_attach_mgid(login, mmac, mcaste, member,
+					    member->gw_id))
+			goto attach_failed;
+	}
+	netif_tx_unlock_bh(login->dev);
+
+	return 0;
+
+attach_failed:
+	netif_tx_unlock_bh(login->dev);
+free_mcasts:
+	vnic_mcast_del_user(&login->mcast_tree, member);
+	return -ENOMEM;
+}
+
+int vnic_member_add(struct vnic_login *login, int member_id, struct lag_member *member)
+{
+	struct vnic_gw_info *member_e;
+	int ret;
+
+	if (member_id >= MAX_LAG_MEMBERS || member_id < 0)
+		return -1;
+
+	/* member_e is not initialized yet, log the values from the request */
+	vnic_dbg_lag(login->name, "vnic_member_add id:%d gw_id:%d lid:%x qpn:%x sl:%d\n",
+		     member_id, member->gw_port_id, member->lid, member->qpn, member->sl);
+	/* member id is already in use */
+	if (login->lag_gw_neigh[member_id].info & GW_MEMBER_INFO_CREATED)
+		return -1;
+
+	member_e = &login->lag_gw_neigh[member_id];
+
+	/* create new entry */
+	member_e->member_id = member_id;
+	member_e->neigh.lid = member->lid;
+	member_e->neigh.qpn = member->qpn;
+	member_e->gw_id = member->gw_port_id;
+	member_e->neigh.login = login;
+	INIT_DELAYED_WORK(&member_e->neigh.destroy_task, vnic_neighe_dealloc_task);
+	skb_queue_head_init(&member_e->neigh.pkt_queue);
+	init_completion(&member_e->neigh.query_comp);
+	complete(&member_e->neigh.query_comp); /* mark as complete since no query is running */
+	member_e->neigh.valid = 0;
+	member_e->neigh.pquery = ERR_PTR(-ENODATA);
+	member_e->neigh.query_id = -1;
+	member_e->neigh.ah = ERR_PTR(-ENODATA); /* ah query will be done via datapath */
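+	/* Lazy vs. eager address handles: with vnic_sa_query set, the AH stays
+	 * ERR_PTR(-ENODATA) and the first transmit triggers an SA path query;
+	 * otherwise it is allocated eagerly below from the member LID.
+	 */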
+	if (!vnic_sa_query) {
+		member_e->neigh.ah = vnic_ah_alloc(login, member->lid);
+		if (IS_ERR(member_e->neigh.ah))
+			return -ENOMEM;
+	}
+	/* need to add multicast code */
+	ret = _vnic_add_member_mgid(login, member_e);
+	if (ret)
+		goto free_ah;
+
+	netif_tx_lock_bh(login->dev);
+	member_e->info = GW_MEMBER_INFO_CREATED;
+	if (member->eport_state)
+		member_e->info |= GW_MEMBER_INFO_EPORT_UP;
+	login->lag_member_count++;
+	netif_tx_unlock_bh(login->dev);
+
+	return 0;
+free_ah:
+	if (!IS_ERR(member_e->neigh.ah))
+		ib_destroy_ah(member_e->neigh.ah);
+	return ret;
+}
+
+void vnic_member_remove_all(struct vnic_login *login)
+{
+	int i;
+
+	if (!login->is_lag)
+		return;
+
+	for (i = 0; i < MAX_LAG_MEMBERS; i++)
+		vnic_member_remove(login, i);
+}
+
+int vnic_member_remove(struct vnic_login *login, int member_id)
+{
+	struct vnic_gw_info *member_e;
+
+	vnic_dbg_lag(login->name, "vnic_member_remove for id %d\n", member_id);
+
+	if (member_id >= MAX_LAG_MEMBERS || member_id < 0)
+		return -1;
+
+	member_e = &login->lag_gw_neigh[member_id];
+
+	vnic_dbg_lag(login->name, "vnic_member_remove id:%d gw_id:%d lid:%x qpn:%x sl:%d\n",
+		     member_id, member_e->gw_id, member_e->neigh.lid,
+		     member_e->neigh.qpn, member_e->neigh.sl);
+
+	/* member id is not in use */
+	if (!(member_e->info & GW_MEMBER_INFO_CREATED))
+		return -1;
+
+	if (member_e->neigh.query_id >= 0 && member_e->neigh.pquery && !IS_ERR(member_e->neigh.pquery))
+		ib_sa_cancel_query(member_e->neigh.query_id, member_e->neigh.pquery);
+
+	netif_tx_lock_bh(login->dev);
+	if (member_e->info & GW_MEMBER_INFO_MAPPED)
+		_vnic_remove_member_from_map(login, member_e->member_id);
+	member_e->info &= ~(GW_MEMBER_INFO_MAPPED);
+	member_e->neigh.valid = 0;
+	netif_tx_unlock_bh(login->dev);
+
+	/* wait for completion after the entry was removed from login data path */
+	wait_for_completion(&member_e->neigh.query_comp);
+
+	/* modification of map will be done through mcast CB if needed */
+	vnic_mcast_del_user(&login->mcast_tree, member_e);
+
+	if (member_e->neigh.ah && !IS_ERR(member_e->neigh.ah))
+		ib_destroy_ah(member_e->neigh.ah);
+	member_e->neigh.ah = ERR_PTR(-ENODATA);
+	member_e->info = 0;
+	login->lag_member_count--;
+
+	return 0;
+}
+
+void vnic_member_prop(struct vnic_login *login, struct lag_properties *prop)
+{
+	if (login->lag_prop.hash_mask != prop->hash_mask) {
+		netif_tx_lock_bh(login->dev);
+		memcpy(&login->lag_prop, prop,
+		       sizeof(login->lag_prop));
+		netif_tx_unlock_bh(login->dev);
+	}
+}
+
+/*
+ * Modify a specific LAG eport member's parameters. The parameters might not be
+ * "interesting" and might not affect data traffic. They might require creating
+ * a new ah, or might even result in a modification of the transmit hash mapping
+ * function.
+ */
+int vnic_member_modify(struct vnic_login *login, int member_id, struct lag_member *member)
+{
+	struct vnic_gw_info *member_e;
+
+	if (member_id >= MAX_LAG_MEMBERS || member_id < 0)
+		return -1;
+
+	member_e = &login->lag_gw_neigh[member_id];
+
+	vnic_dbg_lag(login->name, "vnic_member_modify id:%d gw_id:%d lid:%x qpn:%x sl:%d\n",
+		     member_id, member_e->gw_id, member_e->neigh.lid,
+		     member_e->neigh.qpn, member_e->neigh.sl);
+
+	/* member id is not in use */
+	if (!(member_e->info & GW_MEMBER_INFO_CREATED))
+		return -1;
+
+	/* change in LID requires a new ah */
+	/* TODO Test this */
+	if (member_e->neigh.lid != member->lid) {
+		/* take tx lock to make sure the ah is not being used */
+		if (vnic_sa_query) {
+			/* Cancel the SA query in case one is running */
+			if (member_e->neigh.query_id >= 0 && member_e->neigh.pquery && !IS_ERR(member_e->neigh.pquery))
+				ib_sa_cancel_query(member_e->neigh.query_id, member_e->neigh.pquery);
+			netif_tx_lock_bh(login->dev);
+			member_e->neigh.lid = member->lid;
+			member_e->neigh.valid = 0;
+			if (member_e->neigh.ah && !IS_ERR(member_e->neigh.ah)) {
+				/* lid is not the same: destroy AH */
+				ib_destroy_ah(member_e->neigh.ah);
+				member_e->neigh.ah = ERR_PTR(-ENODATA);
+			}
+			netif_tx_unlock_bh(login->dev);
+		} else {
+			struct ib_ah *ah, *ah1;
+
+			ah = member_e->neigh.ah;
+			ah1 = vnic_ah_alloc(login, member->lid);
+			if (IS_ERR(ah1))
+				return -ENOMEM;
+			netif_tx_lock_bh(login->dev);
+			member_e->neigh.lid = member->lid;
+			member_e->neigh.ah = ah1;
+			netif_tx_unlock_bh(login->dev);
+			ib_destroy_ah(ah);
+		}
+	}
+
+	if (member_e->neigh.qpn != member->qpn)
+		member_e->neigh.qpn = member->qpn;
+
+	netif_tx_lock_bh(login->dev);
+	/* link changed from up to down */
+	if (member_e->info & GW_MEMBER_INFO_MAPPED && !member->eport_state) {
+		_vnic_remove_member_from_map(login, member_id);
+		member_e->info &= ~(GW_MEMBER_INFO_MAPPED | GW_MEMBER_INFO_EPORT_UP);
+	}
+
+	/* link changed from down to up and mcasts are connected */
+	if (!(member_e->info & GW_MEMBER_INFO_MAPPED) &&
+	    member->eport_state) {
+		if (member_e->info & GW_MEMBER_INFO_MCAST) {
+			_vnic_add_member_to_map(login, member_id);
+			member_e->info |= (GW_MEMBER_INFO_MAPPED | GW_MEMBER_INFO_EPORT_UP);
+		} else
+			member_e->info |= GW_MEMBER_INFO_EPORT_UP;
+	}
+	netif_tx_unlock_bh(login->dev);
+
+	return 0;
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_neigh.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_neigh.c
new file mode 100644
index 0000000000000..a331aebbc6dc4
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_neigh.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#include "vnic.h" +#include "vnic_data.h" + +void vnic_neighe_dealloc_task(struct work_struct *work) +{ + struct vnic_neigh *neighe = + container_of(work, struct vnic_neigh, destroy_task.work); + if (IS_NEIGH_QUERY_RUNNING(neighe)) + ib_sa_cancel_query(neighe->query_id, neighe->pquery); + wait_for_completion(&neighe->query_comp); + if (neighe->ah && !IS_ERR(neighe->ah)) + ib_destroy_ah(neighe->ah); + kfree(neighe); +} + +void vnic_neighe_dealloc(struct vnic_neigh *neighe) +{ + ASSERT(neighe); + /* calls vnic_neighe_dealloc_task */ + queue_delayed_work(neighe->login->neigh_wq, &neighe->destroy_task, 0); +} + +struct ib_ah *vnic_ah_alloc(struct vnic_login *login, u16 dlid) +{ + struct ib_ah_attr av; + struct ib_ah *ah; + + memset(&av, 0, sizeof(av)); + av.dlid = dlid; + av.port_num = login->port->num; + av.sl = login->sl; /* PATH Query is need here to allocate the data sl*/ + ah = ib_create_ah(login->port->pd, &av); + if (IS_ERR(ah)) { + return ERR_PTR(-ENOMEM); + } + return(ah); +} + +struct vnic_neigh *vnic_neighe_alloc(struct vnic_login *login, + const u8 *mac, + u16 dlid, u32 dqpn, u8 rss) +{ + struct vnic_neigh *neighe; + neighe = kzalloc(sizeof *neighe, GFP_ATOMIC); + if (!neighe) + return ERR_PTR(-ENOMEM); + INIT_DELAYED_WORK(&neighe->destroy_task, vnic_neighe_dealloc_task); + skb_queue_head_init(&neighe->pkt_queue); + if (mac) + memcpy(neighe->mac, mac, ETH_ALEN); + neighe->rss = rss; + neighe->ah = ERR_PTR(-ENODATA); + if (!vnic_sa_query) { + neighe->ah = vnic_ah_alloc(login, dlid); + if (IS_ERR(neighe->ah)) { + kfree(neighe); + return ERR_PTR(-ENOMEM); + } + } + init_completion(&neighe->query_comp); + complete(&neighe->query_comp); /* mark as complete since no query is running */ + neighe->pquery = ERR_PTR(-ENODATA); + neighe->query_id = -1; + neighe->qpn = dqpn; + neighe->lid = dlid; + neighe->login = login; + + return neighe; +} + +void vnic_neighe_del(struct vnic_login *login, struct vnic_neigh *neighe) +{ + ASSERT(neighe); + rb_erase(&neighe->rb_node, &login->neigh_tree); +} + +int vnic_neighe_add(struct vnic_login *login, struct vnic_neigh *neighe) +{ + struct rb_node **n = &login->neigh_tree.rb_node, *pn = NULL; + struct vnic_neigh *neighe_t; + int rc; + + while (*n) { + pn = *n; + neighe_t = rb_entry(pn, struct vnic_neigh, rb_node); + rc = memcmp(neighe->mac, neighe_t->mac, ETH_ALEN); + if (rc < 0) + n = &pn->rb_left; + else if (rc > 0) + n = &pn->rb_right; + else { + rc = -EEXIST; + goto out; + } + } + + rb_link_node(&neighe->rb_node, pn, n); + rb_insert_color(&neighe->rb_node, &login->neigh_tree); + rc = 0; + +out: + return rc; +} + +struct vnic_neigh *vnic_neighe_search(struct vnic_login *login, u8 *mac) +{ + struct rb_node *n = login->neigh_tree.rb_node; + struct vnic_neigh *neighe_t; + int rc; + + while (n) { + neighe_t = rb_entry(n, struct vnic_neigh, rb_node); + rc = memcmp(mac, neighe_t->mac, ETH_ALEN); + if (rc < 0) + n = n->rb_left; + else if (rc > 0) + n = n->rb_right; + else { + vnic_dbg_data(login->name, + "found: mac "MAC_6_PRINT_FMT" vid %d " + "qpn 0x%06x lid 0x%02x\n", + MAC_6_PRINT_ARG(neighe_t->mac), + be16_to_cpu(login->vid), neighe_t->qpn, + neighe_t->lid); + goto out; + } + } + neighe_t = ERR_PTR(-ENODATA); + +out: + return neighe_t; +} + +void vnic_neigh_del_all(struct vnic_login *login) +{ + struct rb_node *n; + struct vnic_neigh *neighe; + + ASSERT(login); + n = rb_first(&login->neigh_tree); + while (n) { + neighe = rb_entry(n, struct vnic_neigh, rb_node); + vnic_neighe_del(login, neighe); + n = rb_first(&login->neigh_tree); + 
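+		/* dealloc only after the node is unlinked; the actual teardown
+		 * is deferred to a workqueue (vnic_neighe_dealloc_task) */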
vnic_neighe_dealloc(neighe);
+	}
+}
+
+void vnic_neigh_invalidate(struct vnic_login *login)
+{
+	struct vnic_neigh *neighe;
+	struct rb_node *n;
+	int i;
+
+	if (login->gw_neigh && !IS_ERR(login->gw_neigh))
+		login->gw_neigh->valid = 0;
+
+	n = rb_first(&login->neigh_tree);
+	while (n) {
+		neighe = rb_entry(n, struct vnic_neigh, rb_node);
+		neighe->valid = 0;
+		n = rb_next(n);
+	}
+
+	if (login->is_lag)
+		for (i = 0; i < MAX_LAG_MEMBERS; i++)
+			login->lag_gw_neigh[i].neigh.valid = 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_netdev.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_netdev.c
new file mode 100644
index 0000000000000..abfd2e237671c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_netdev.c
@@ -0,0 +1,1085 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#include "vnic.h" +#include "vnic_data.h" + +extern struct net_device_stats *mlx4_vnic_stats_func_container(struct net_device *n); + +static int mlx4_vnic_vlan_rx_add_vid(struct net_device *dev, __be16 proto, + unsigned short vid) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + vnic_dbg_data(login->name, "add VLAN:%d was called\n", vid); + return 0; +} + +static int mlx4_vnic_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, + unsigned short vid) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + vnic_dbg_data(login->name, "Kill VID:%d was called\n", vid); + return 0; +} + +void vnic_carrier_update(struct vnic_login *login) +{ + int attached, eport_up, eport_enforce, carrier_ok; + + ASSERT(login); + attached = test_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->fip_vnic->login_state); + eport_up = fip_vnic_get_eport_state(login->fip_vnic); + eport_enforce = vnic_eport_state_enforce; + carrier_ok = netif_carrier_ok(login->dev); + + /* bring carrier up */ + if (!carrier_ok && attached && (!eport_enforce || eport_up)) { + set_bit(VNIC_STATE_NETDEV_CARRIER_ON, &login->netdev_state); + netif_carrier_on(login->dev); + vnic_info("%s link is up\n", login->dev->name); + return; + } + + /* bring carrier down */ + if (carrier_ok && (!attached || (!eport_up && eport_enforce))) { + clear_bit(VNIC_STATE_NETDEV_CARRIER_ON, &login->netdev_state); + netif_carrier_off(login->dev); + vnic_info("%s link is down\n", login->dev->name); + return; + } + +} + +void __bcast_attach_cb(struct vnic_mcast *mcaste, void *login_ptr) +{ + struct vnic_login *login = login_ptr; + + /* When SA is local, mcast join works even when port is down */ + if (login->port->attr.state != IB_PORT_ACTIVE) + return; + set_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->fip_vnic->login_state); + vnic_carrier_update(login); +} + +void __bcast_detach_cb(struct vnic_mcast *mcaste, void *login_ptr) +{ + struct vnic_login *login = login_ptr; + + clear_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->fip_vnic->login_state); + vnic_carrier_update(login); +} + +/* this function cannot sleep, avoid any mutex() in consequent calls */ +static int vnic_set_mac(struct net_device *dev, void *_mac) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + struct sockaddr *saddr = _mac; + u8 *mac = (u8 *)(saddr->sa_data); + int rc = 0; + + vnic_dbg_func(login->name); + + vnic_dbg_mac(login->name, "mac "MAC_6_PRINT_FMT" => "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG((u8 *)(dev->dev_addr)), + MAC_6_PRINT_ARG(mac)); + + /* must support child vNics for mac modification */ + if (!vnic_child_max) + return -ENOSYS; + + /* skip if invalid address */ + if (unlikely(!is_valid_ether_addr(mac))) + return -EINVAL; + + /* skip if same mac was already set */ + if (!(memcmp((u8 *)(dev->dev_addr), mac, ETH_ALEN))) + return 0; + + /* already in bh, calls vnic_child_update that queues a job, + * so read_lock is enough + */ + read_lock(&login->mac_rwlock); + + /* if mac same as original, delete child, set mac and return */ + if (!(memcmp(mac, login->dev_addr, ETH_ALEN))) + goto out; + + /* else, this is a new child vNic, + * add new child vNic + * NOTE: pay attention that the GC should not destroy a child vNic that + * is being used as mac-change even if it was created by different + * source. 
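+	 * (-EEXIST from vnic_child_update() below just means such a child is
+	 * already registered and is not treated as an error here)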
+ */ + rc = vnic_child_update(login, mac, 0); + if (rc && rc != -EEXIST) + goto err; + +out: + memcpy(dev->dev_addr, mac, ETH_ALEN); + vnic_child_update(login, (u8 *)(dev->dev_addr), 1); + vnic_dbg_mac(login->name, "mac changed successfully to " + MAC_6_PRINT_FMT"\n", MAC_6_PRINT_ARG(mac)); + +err: + read_unlock(&login->mac_rwlock); + return rc; +} + +static void vnic_set_multicast_list(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + vnic_dbg_func(login->name); + + /* test promisc flag changes */ + if (is_ucast_promisc(login) && !login->promisc) { + /* promisc is being set */ + if (!vnic_child_max) { + /* must support child vNics for promisc mode */ + vnic_info("%s promisc mode cannot be set " + "(vnic_child_max %u)\n", + dev->name, vnic_child_max); + } else if (vnic_src_mac_enforce) { + /* cannot support promisc if source mac is enforced + * because sender should be able to use any smac + */ + vnic_info("%s promisc mode cannot be set " + "(vnic_src_mac_enforce %u)\n", + dev->name, vnic_src_mac_enforce); + } else { + login->promisc = 1; + vnic_dbg_mac(dev->name, + "entered promiscuous mode: confirmed\n"); + } + } else if (!is_ucast_promisc(login) && login->promisc) { + /* promisc is being cleared */ + login->promisc = 0; + write_lock(&login->mac_rwlock); + vnic_child_flush(login, 0); + write_unlock(&login->mac_rwlock); + vnic_dbg_mac(dev->name, + "left promiscuous mode: confirmed\n"); + } + + /* test mcast changes */ + if (!no_bxm && !login->queue_stopped) { + dev_hold(dev); + if (!queue_delayed_work(login_wq, &login->mcast_task, HZ / 100)) + dev_put(dev); + } +} + +static void vnic_auto_moder(struct vnic_login *login) +{ + unsigned long period = + (unsigned long)(jiffies - login->last_moder_jiffies); + unsigned long packets; + unsigned long rate; + unsigned long avg_pkt_size; + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_pkt_diff; + unsigned long rx_pkt_diff; + int moder_time; + + period = (unsigned long)(jiffies - login->last_moder_jiffies); +#if 0 + vnic_dbg_moder_v(login->name, "adaptive_rx_coal %d, period %d, " + "sample_interval %d, state %d\n", + login->adaptive_rx_coal, period, + login->sample_interval, login->port->attr.state); +#endif + + if (!login->adaptive_rx_coal || period < login->sample_interval * HZ) + return; + + /* TODO: when NAPI is disabled, the RX completion will be called from + * IRQ context (and not BH context) and thus spin_lock_bh should be + * replaced with spin_lock_irq + */ + spin_lock_bh(&login->stats_lock); + rx_packets = login->stats.rx_packets; + rx_bytes = login->stats.rx_bytes; + tx_packets = login->stats.tx_packets; + spin_unlock_bh(&login->stats_lock); + + if (!login->last_moder_jiffies || !period) + goto out_set; + + tx_pkt_diff = ((unsigned long)(tx_packets - + login->last_moder_tx_packets)); + rx_pkt_diff = ((unsigned long)(rx_packets - login->last_moder_packets)); + packets = max(tx_pkt_diff, rx_pkt_diff); + rate = packets * HZ / period; + avg_pkt_size = packets ? ((unsigned long)(rx_bytes - + login->last_moder_bytes)) / + packets : 0; + + if (rate > VNIC_RX_RATE_THRESH && avg_pkt_size > VNIC_AVG_PKT_SMALL) { + /* If tx and rx packet rates are not balanced, assume that + * traffic is mainly BW bound and apply maximum moderation. 
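+		 * (the 2:3 ratio tests below are what detect that imbalance).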
+ * Otherwise, moderate according to packet rate */ + if (2 * tx_pkt_diff > 3 * rx_pkt_diff || + 2 * rx_pkt_diff > 3 * tx_pkt_diff) { + moder_time = login->rx_usecs_high; + } else { + if (rate < login->pkt_rate_low) + moder_time = login->rx_usecs_low; + else if (rate > login->pkt_rate_high) + moder_time = login->rx_usecs_high; + else + moder_time = (rate - login->pkt_rate_low) * + (login->rx_usecs_high - login->rx_usecs_low) / + (login->pkt_rate_high - login->pkt_rate_low) + + login->rx_usecs_low; + } + } else { + moder_time = login->rx_usecs_low; + } + + if (moder_time != login->last_moder_time) { + vnic_dbg_moder(login->name, "tx rate:%lu rx_rate:%lu\n", + tx_pkt_diff * HZ / period, + rx_pkt_diff * HZ / period); + vnic_dbg_moder(login->name, + "Rx moder_time changed from:%lu to %d period:%lu" + " [jiff] packets:%lu avg_pkt_size:%lu rate:%lu" + " [p/s])\n", login->last_moder_time, moder_time, + period, packets, avg_pkt_size, rate); + login->last_moder_time = moder_time; + vnic_ib_set_moder(login, + login->last_moder_time, login->rx_frames, + login->tx_usecs, login->tx_frames); + } + +out_set: + login->last_moder_packets = rx_packets; + login->last_moder_tx_packets = tx_packets; + login->last_moder_bytes = rx_bytes; + login->last_moder_jiffies = jiffies; +} + +void vnic_dump_stats(struct vnic_login *login) +{ + unsigned long *stats, *login_stats = (unsigned long *)(&login->stats); + int i, j, len = sizeof(struct net_device_stats) / sizeof(unsigned long); + struct net_device_stats stats_tmp; + + spin_lock_bh(&login->stats_lock); + /* tx stats are distributed between tx_res entries */ + stats_tmp = login->stats; + memset(&login->stats, 0, sizeof(struct net_device_stats)); + for (i = 0; i < login->tx_rings_num; ++i) { + stats = (unsigned long *)(&login->tx_res[i].stats); + for (j = 0; j < len; ++j) + login_stats[j] += stats[j]; + } + + /* rx stats are in login->stats */ + login->stats.rx_bytes = stats_tmp.rx_bytes; + login->stats.rx_packets = stats_tmp.rx_packets; + login->stats.rx_errors = stats_tmp.rx_errors; + login->stats.rx_dropped = stats_tmp.rx_dropped; + spin_unlock_bh(&login->stats_lock); +} + +static void vnic_do_get_stats(struct work_struct *work) +{ + struct vnic_login *login = + container_of(work, struct vnic_login, stats_task.work); + + mutex_lock(&login->moder_lock); + vnic_dump_stats(login); + + if (login->queue_stopped) + goto out; + + if (!(test_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state))) + goto resched; + + if (login->port->attr.state == IB_PORT_ACTIVE) + vnic_auto_moder(login); + +resched: + /* calls vnic_do_get_stats() */ + if (!login->queue_stopped) + queue_delayed_work(login_wq, &login->stats_task, VNIC_STATS_DELAY); +out: + mutex_unlock(&login->moder_lock); +} + +static void vnic_mcast_reattach(struct work_struct *work) +{ + struct vnic_mcast *mcaste, *mcaste_t; + struct rb_node *n; + unsigned long flags; + union vhub_mgid mgid; + LIST_HEAD(local_list); + int i; + struct vnic_gw_info *lag_member; + struct vnic_login *login; + struct net_device *dev; +#ifndef _BP_NO_MC_LIST + struct dev_mc_list *mclist; +#else + struct netdev_hw_addr *ha; +#endif + + login = container_of(work, struct vnic_login, mcast_task.work); + dev = login->dev; + + vnic_dbg_mcast(login->name, "set_multicast_list was notified\n"); + if (login->queue_stopped) { + dev_put(dev); + return; + } + + /* detach all mcast (except default and bcast mcasts) */ + spin_lock_irqsave(&login->mcast_tree.mcast_rb_lock, flags); + if (!list_empty(&login->mcast_tree.reattach_list)) { + /* an event is being 
processed */
+		spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+		goto retry;
+	}
+
+	for (n = rb_first(&login->mcast_tree.mcast_tree); n; n = rb_next(n)) {
+		mcaste = rb_entry(n, struct vnic_mcast, rb_node);
+		if (IS_ZERO_MAC(mcaste->mac))
+			continue;
+		if (IS_BCAST_MAC(mcaste->mac))
+			continue;
+		list_add_tail(&mcaste->list, &local_list);
+	}
+
+	list_for_each_entry(mcaste, &local_list, list) {
+		vnic_mcast_del(&login->mcast_tree, mcaste);
+		mcaste->attach_task_cnt = 0;
+	}
+
+	spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+
+	vnic_dbg_mcast(login->name, "local_list is %s empty n_mac_mcgid %u\n",
+		       (list_empty(&local_list) ? "" : "not"),
+		       login->n_mac_mcgid);
+
+	list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) {
+		list_del(&mcaste->list);
+		vnic_mcast_detach(&login->mcast_tree, mcaste);
+		vnic_mcast_dealloc(mcaste);
+	}
+
+	/* attach all mcasts in mc_list */
+	vhub_mgid_create(login->mgid_prefix, ETH_ZERO_MAC, login->n_mac_mcgid,
+			 CREATE_VHUB_ID(login->vid, login->gw_port_id),
+			 VHUB_MGID_DATA, 0, &mgid);
+
+	spin_lock_irqsave(&login->mcast_tree.mcast_rb_lock, flags);
+	mcaste_t = vnic_mcast_search(&login->mcast_tree, &mgid.ib_gid);
+	if (IS_ERR(mcaste_t) || !test_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->fip_vnic->login_state)) {
+		vnic_dbg_data(login->name, "default mgid not ready\n");
+		spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+		dev_put(dev);
+		return;
+	}
+	spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+
+	/* hold the tx lock so set_multicast_list() won't change mc_list */
+	netif_tx_lock_bh(dev);
+#ifndef _BP_NO_MC_LIST
+	for (mclist = login->dev->mc_list; mclist; mclist = mclist->next) {
+		u8 *mmac = mclist->dmi_addr;
+#else
+	netdev_for_each_mc_addr(ha, login->dev) {
+		u8 *mmac = ha->addr;
+#endif
+		/* do not add the default MGIDs because they are always used */
+		if (IS_ZERO_MAC(mmac))
+			continue;
+		if (IS_BCAST_MAC(mmac))
+			continue;
+
+		/* attach to the legacy GW / LAG gw id MGID */
+		if (_vnic_mcast_attach_mgid(login, mmac, mcaste_t, login,
+					    login->gw_port_id))
+			goto attach_failed;
+
+		if (!login->is_lag)
+			continue;
+
+		for (i = 0; i < MAX_LAG_MEMBERS; i++) {
+			lag_member = &login->lag_gw_neigh[i];
+			/* member id is already in use */
+			if (lag_member->info & GW_MEMBER_INFO_CREATED)
+				/* attach to the legacy GW / LAG gw id MGID */
+				if (_vnic_mcast_attach_mgid(login, mmac,
+							    mcaste_t,
+							    lag_member,
+							    lag_member->gw_id))
+					goto attach_failed;
+		}
+	}
+	netif_tx_unlock_bh(dev);
+	dev_put(dev);
+	return;
+
+attach_failed:
+	netif_tx_unlock_bh(dev);
+	vnic_mcast_del_all(&login->mcast_tree);
+
+retry:
+	if (!login->queue_stopped) {
+		if (!queue_delayed_work(login_wq, &login->mcast_task, HZ / 100))
+			dev_put(dev);
+	} else
+		dev_put(dev);
+}
+
+static int vnic_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct vnic_login *login = vnic_netdev_priv(dev);
+
+	if (new_mtu > login->max_mtu) {
+		vnic_warn(login->name, "failed: new_mtu %d > %d\n", new_mtu,
+			  login->max_mtu);
+		return -EINVAL;
+	}
+
+	vnic_dbg_data(login->name, "mtu %d -> %d\n", dev->mtu, new_mtu);
+	dev->mtu = new_mtu;
+
+	return 0;
+}
+
+static void vnic_set_default_moder(struct vnic_login *login)
+{
+	login->rx_frames = VNIC_RX_COAL_TARGET / login->dev->mtu + 1;
+	login->rx_usecs = VNIC_RX_COAL_TIME;
+	login->tx_frames = VNIC_TX_COAL_PKTS;
+	login->tx_usecs = VNIC_TX_COAL_TIME;
+	login->pkt_rate_low = VNIC_RX_RATE_LOW;
+	login->rx_usecs_low = VNIC_RX_COAL_TIME_LOW;
+	login->pkt_rate_high = VNIC_RX_RATE_HIGH;
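+	/* vnic_auto_moder() interpolates linearly between these bounds when
+	 * the packet rate falls between pkt_rate_low and pkt_rate_high:
+	 *   moder = (rate - low) * (usecs_high - usecs_low) / (high - low)
+	 *           + usecs_low
+	 * so, purely as an illustration, a rate halfway between the bounds
+	 * yields the midpoint of rx_usecs_low..rx_usecs_high.
+	 */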
login->sample_interval = VNIC_SAMPLE_INTERVAL; + login->adaptive_rx_coal = 1; + login->last_moder_time = VNIC_AUTO_CONF; + login->last_moder_jiffies = 0; + login->last_moder_packets = 0; + login->last_moder_tx_packets = 0; + login->last_moder_bytes = 0; + + vnic_dbg_data(login->name, "default coalescing params for mtu:%d to " + "rx_frames:%d rx_usecs:%d " + "tx_frames:%d tx_usecs:%d\n", + login->dev->mtu, + login->rx_frames, login->rx_usecs, + login->tx_frames, login->tx_usecs); +} + +#ifndef _BP_NAPI_POLL +int vnic_napi_alloc(struct vnic_login *login, int rx_res_index) +{ + + struct napi_struct *napi = &login->rx_res[rx_res_index].napi; + + netif_napi_add(login->dev, napi, vnic_poll_cq_rx, vnic_napi_weight); + + return 0; +} + +void vnic_napi_enable(struct vnic_login *login, int rx_res_index) +{ + + struct napi_struct *napi = &login->rx_res[rx_res_index].napi; + napi_enable(napi); +} + +static void vnic_napi_disable(struct vnic_login *login, int rx_res_index) +{ + struct napi_struct *napi = &login->rx_res[rx_res_index].napi; + + if (!napi->poll) + return; + + napi_disable(napi); +} + +static void vnic_napi_dealloc(struct vnic_login *login, int rx_res_index) +{ +#ifndef _BP_NAPI_NO_DEL + struct napi_struct *napi = &login->rx_res[rx_res_index].napi; + + netif_napi_del(napi); +#else + return; +#endif +} + +#else +int vnic_napi_alloc(struct vnic_login *login, int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + char name[IFNAMSIZ]; + + snprintf(name, IFNAMSIZ, "%s-N%d", login->name, rx_res_index); + rx_res->poll_dev = + alloc_netdev(0, name, ether_setup); + if (!rx_res->poll_dev) + return -ENOMEM; + + rx_res->poll_dev = rx_res->poll_dev; + rx_res->poll_dev->priv = rx_res; + rx_res->poll_dev->weight = vnic_napi_weight; + rx_res->poll_dev->poll = vnic_poll_cq_rx; + + return 0; +} + +void vnic_napi_enable(struct vnic_login *login, int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + + ASSERT(rx_res->poll_dev); + set_bit(__LINK_STATE_START, &rx_res->poll_dev->state); +} + +static void vnic_napi_disable(struct vnic_login *login, int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + struct net_device *poll_dev = rx_res->poll_dev; + + if (!poll_dev) + return; + + while (test_bit(__LINK_STATE_RX_SCHED, &poll_dev->state)) + msleep(VNIC_NAPI_SCHED_TIMEOUT); +} + +static void vnic_napi_dealloc(struct vnic_login *login, int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + struct net_device *poll_dev = rx_res->poll_dev; + + if (!poll_dev) + return; + + free_netdev(poll_dev); + rx_res->poll_dev = NULL; +} +#endif + +static int _vnic_open(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int i; + + /* Todo add locks here */ + if (!(test_bit(VNIC_STATE_LOGIN_CREATE_2, &login->fip_vnic->login_state))) { + set_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state); + return 0; + } + + if (test_and_set_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state)) + return 0; + + clear_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state); + + /* ARM RX handlers */ + for (i = 0; i < login->rx_rings_num; ++i) { + login->rx_res[i].stopped = 0; + if (ib_req_notify_cq(login->rx_res[i].cq, IB_CQ_NEXT_COMP)) { + vnic_err(login->name, "ib_req_notify_cq failed\n"); + goto err; + } + } + + /* ARM TX handlers */ + for (i = 0; i < login->tx_rings_num; ++i) { + login->tx_res[i].stopped = 0; + spin_lock_init(&login->tx_res[i].lock); + if (!vnic_tx_polling && + 
ib_req_notify_cq(login->tx_res[i].cq, IB_CQ_NEXT_COMP)) { + vnic_err(login->name, "ib_req_notify_cq failed\n"); + goto err; + } + } + + /* enable napi*/ + for (i = 0; i < login->napi_num; ++i) + vnic_napi_enable(login, i); + + /* move QP to RTS, post recv skb */ + if (vnic_ib_open(dev)) + goto err_napi; + + /* dummy call */ + if (vnic_ib_up(dev)) + goto err_ib_stop; + + /* configure */ + vnic_set_default_moder(login); + if (vnic_ib_set_moder(login, login->last_moder_time, login->rx_frames, + login->tx_usecs, login->tx_frames)) + vnic_warn(login->name, "vnic_ib_set_moder failed!\n"); + + /* start interface TX queue */ + VNIC_TXQ_START_ALL(login); + + /* report and return */ + vnic_info("%s is opened\n", dev->name); + + return 0; + +err_ib_stop: + vnic_ib_stop(dev); +err_napi: + /* disable napi*/ + for (i = 0; i < login->napi_num; ++i) + vnic_napi_disable(login, i); +err: + clear_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state); + return -EINVAL; +} + +static int vnic_open(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int ret; + + vnic_dbg_func(login->name); + + mutex_lock(&login->state_lock); + ret = _vnic_open(dev); + mutex_unlock(&login->state_lock); + return ret; +} + +static int _vnic_stop(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int i, _watchdog_timeo = dev->watchdog_timeo; + + /* check if already stopped */ + if (!(test_and_clear_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state))) + return 0; + + /* Set trans_start to jiffies and watchdog_timeo to max + * to avoid spurious transmit timeouts in the interval between + * tx queue stopped and carrier down. + */ + dev->trans_start = jiffies; + dev->watchdog_timeo = 0x7fffffff; + + VNIC_TXQ_STOP_ALL(login); + + /* disable rx handlers */ + for (i = 0; i < login->rx_rings_num; ++i) + login->rx_res[i].stopped = 1; + + /* disable tx handlers */ + for (i = 0; i < login->tx_rings_num; ++i) + login->tx_res[i].stopped = 1; + + /* disable napi managers */ + for (i = 0; i < login->napi_num; ++i) + vnic_napi_disable(login, i); + + vnic_ib_down(dev); + vnic_ib_stop(dev); + + /* restore watchdog_timeo */ + dev->watchdog_timeo = _watchdog_timeo; + + vnic_info("%s is stopped\n", dev->name); + + return 0; +} + +static int vnic_stop(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int ret; + + vnic_dbg_func(login->name); + + mutex_lock(&login->state_lock); + ret = _vnic_stop(dev); + mutex_unlock(&login->state_lock); + + return ret; +} + +int vnic_restart(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int rc = 0; + + if (login->queue_stopped || !test_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state)) + return rc; + + set_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state); + netif_tx_disable(login->dev); + + mutex_lock(&login->state_lock); + _vnic_stop(login->dev); + + clear_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state); + set_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state); + + rc = _vnic_open(login->dev); + mutex_unlock(&login->state_lock); + + return rc; +} + +static void vnic_restart_task(struct work_struct *work) +{ + struct vnic_login *login = + container_of(work, struct vnic_login, restart_task.work); + + vnic_restart(login->dev); +} + +struct net_device_stats *vnic_get_stats(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + if (dev->reg_state != NETREG_REGISTERED) + return &dev->stats; + + spin_lock_bh(&login->stats_lock); + if 
(test_bit(VNIC_STATE_LOGIN_PRECREATE_2, &login->fip_vnic->login_state)) + memcpy(&dev->stats, &login->stats, sizeof(login->stats)); + spin_unlock_bh(&login->stats_lock); + + return &dev->stats; +} + +static void vnic_tx_timeout(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + vnic_warn(login->name, "TX timeout called on port: %d, " + "latency: %d msec, stopped: %d, carrier_ok: %d," + "queue_stopped: %d, watchdog_timeo: %d msec\n", + login->port->num, + jiffies_to_msecs(jiffies - dev->trans_start), + netif_queue_stopped(dev), netif_carrier_ok(dev), + login->queue_stopped, + jiffies_to_msecs(dev->watchdog_timeo)); + + if (netif_carrier_ok(dev)) { + VNIC_STATS_DO_INC(login->port_stats.tx_timeout); + if (!login->queue_stopped) { + vnic_warn(login->name, "TX timeout, queueing rings restart\n"); + queue_delayed_work(login_wq, &login->restart_task, HZ / 100); + } + } +} + +#ifndef _BP_NETDEV_NO_TMQ +u16 vnic_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback) +{ + /* Notes: + * - In kernel 2.6.32 the skb->mac_header 0x1a is not set when + * select_queue() is called + * - In OVM Server 3.0, DomU tx skb network and transport + * headers are not set + */ + skb_reset_mac_header(skb); + skb_set_network_header(skb, ETH_HLEN); + skb_set_transport_header(skb, + ETH_HLEN + + (skb->protocol == htons(ETH_P_IPV6) ? + sizeof(struct ipv6hdr) : ip_hdrlen(skb))); + + return vnic_hash(dev, skb) % dev->real_num_tx_queues; +} + +#endif + +#ifndef _BP_NO_NDO_OPS +static struct net_device_ops vnic_netdev_ops = { + .ndo_open = vnic_open, + .ndo_stop = vnic_stop, + .ndo_start_xmit = vnic_tx, + .ndo_get_stats = vnic_get_stats, + .ndo_set_rx_mode = vnic_set_multicast_list, + .ndo_change_mtu = vnic_change_mtu, + .ndo_tx_timeout = vnic_tx_timeout, + .ndo_set_mac_address = vnic_set_mac, + .ndo_vlan_rx_add_vid = mlx4_vnic_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = mlx4_vnic_vlan_rx_kill_vid, +#ifndef _BP_NETDEV_NO_TMQ + .ndo_select_queue = vnic_select_queue, +#endif +}; +#endif + +static void vnic_setup(struct net_device *dev) +{ + ether_setup(dev); + + dev->hard_header_len += VNIC_SKB_GET_ENCAP_OFFSET; + dev->watchdog_timeo = VNIC_WATCHDOG_TIMEOUT; + +#ifndef _BP_NO_NDO_OPS + if (!vnic_change_mac) + vnic_netdev_ops.ndo_set_mac_address = NULL; + + dev->netdev_ops = &vnic_netdev_ops; +#else + dev->open = vnic_open; + dev->stop = vnic_stop; + dev->hard_start_xmit = vnic_tx; + dev->get_stats = mlx4_vnic_stats_func_container; + dev->set_multicast_list = vnic_set_multicast_list; + dev->change_mtu = vnic_change_mtu; + dev->tx_timeout = vnic_tx_timeout; + dev->set_mac_address = vnic_set_mac; + dev->vlan_rx_add_vid = mlx4_vnic_vlan_rx_add_vid; + dev->vlan_rx_kill_vid = mlx4_vnic_vlan_rx_kill_vid; + + if (!vnic_change_mac) + dev->set_mac_address = NULL; + +#ifndef _BP_NETDEV_NO_TMQ + dev->select_queue = vnic_select_queue; +#endif +#endif // _BP_NO_NDO_OPS +} + +static int vnic_get_frag_header(struct skb_frag_struct *frags, void **mac_hdr, + void **ip_hdr, void **tcpudp_hdr, + u64 *hdr_flags, void *priv) +{ + struct iphdr *iph; + *mac_hdr = page_address(frags->page.p) + frags->page_offset; + *ip_hdr = iph = (struct iphdr *)(*mac_hdr + ETH_HLEN); + *tcpudp_hdr = (struct tcphdr *)(iph + (iph->ihl << 2)); + *hdr_flags = LRO_IPV4 | LRO_TCP; + + return 0; +} + +static int vnic_get_skb_header(struct sk_buff *skb, void **iphdr, + void **tcphdr, u64 *hdr_flags, void *priv) +{ + struct iphdr *iph; + struct tcphdr *tcph; + + if (unlikely(skb->protocol 
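+	/* LRO header inspection: aggregate only IPv4/TCP frames whose
+	 * checksum was already verified by HW; everything else is passed
+	 * up the stack unaggregated
+	 */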
!= htons(ETH_P_IP))) + return -1; + + if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY)) + return -1; + + iph = (struct iphdr *)(skb->data + ETH_HLEN); + if (iph->protocol != IPPROTO_TCP) + return -1; + + tcph = (struct tcphdr *)(iph + (iph->ihl << 2)); + + if (ntohs(iph->tot_len) < (iph->ihl * 4 + tcph->doff * 4)) + return -1; + + *hdr_flags = LRO_IPV4 | LRO_TCP; + *iphdr = iph; + *tcphdr = tcph; + + return 0; +} + +static int vnic_lro_enable(struct vnic_login *login, int rx_res_index) +{ + struct net_lro_mgr *lro = &login->rx_res[rx_res_index].lro; + + lro->dev = login->dev; + lro->features = login->napi_num ? LRO_F_NAPI : 0; + lro->frag_align_pad = NET_IP_ALIGN; + lro->ip_summed = CHECKSUM_UNNECESSARY; + lro->ip_summed_aggr = CHECKSUM_UNNECESSARY; + lro->max_desc = login->lro_num; + lro->max_aggr = VNIC_MAX_LRO_AGGR; + lro->lro_arr = login->rx_res[rx_res_index].lro_desc; + + if (lro->max_aggr > MAX_SKB_FRAGS) + lro->max_aggr = MAX_SKB_FRAGS; + + if (!vnic_rx_linear) + lro->get_frag_header = vnic_get_frag_header; + else + lro->get_skb_header = vnic_get_skb_header; + + return 0; +} + +static void vnic_lro_disable(struct vnic_login *login, int rx_res_index) +{ + /* nop */ + return; +} + +struct net_device *vnic_alloc_netdev(struct vnic_port *port) +{ + struct vnic_login_info *info; + struct vnic_login *login; + struct net_device *dev; + static int vnic_cnt = 0; + int i; + + dev = VNIC_TXQ_ALLOC_NETDEV(sizeof *info, "eth%d", vnic_setup, port->tx_rings_num); + if (!dev) { + vnic_err(port->name, "VNIC_TXQ_ALLOC_NETDEV failed " + "(size %Zu, tx_rings_num %d)\n", + sizeof *info, port->tx_rings_num); + goto err; + } + + /* this is a *very* large beast... */ + login = vmalloc(sizeof *login); + if (!login) { + vnic_err(port->name, "failed to allocate login struct (%Zu)\n", + sizeof *login); + goto free_netdev; + } + + /* init fields */ + memset(login, 0, sizeof *login); + info = netdev_priv(dev); + info->login = login; + login->dev = dev; + login->port = port; + login->max_mtu = VNIC_BUF_SIZE(login->port) - IB_GRH_BYTES - + VNIC_ENCAP_LEN - ETH_HLEN - VLAN_HLEN; + login->cnt = ++vnic_cnt; + /* name will be overwritten later */ + sprintf(login->name, "%s-%d", "vnic", login->cnt); + sprintf(login->desc, "%s-P%d", + login->port->dev->ca->node_desc, port->num); + + login->neigh_wq = create_singlethread_workqueue(login->name); + if (!login->neigh_wq) { + vnic_err(NULL, "create_singlethread_workqueue failed for %s\n", + login->name); + goto free_login; + } + + login->rx_csum = 1; + login->rx_rings_num = port->rx_rings_num; + login->tx_rings_num = port->tx_rings_num; +#ifdef _BP_NETDEV_NO_TMQ + /* if the kernel doesn't support Multiple TX queues, + * then use only one TX queue */ + login->tx_rings_num = 1; +#endif + vnic_dbg_mark(); + spin_lock_init(&login->lock); + spin_lock_init(&login->stats_lock); + rwlock_init(&login->mac_rwlock); + atomic_set(&login->vnic_child_cnt, 0); + vnic_mcast_root_init(&login->mcast_tree); + mutex_init(&login->moder_lock); + mutex_init(&login->state_lock); + SET_NETDEV_DEV(login->dev, login->port->dev->ca->dma_device); + INIT_DELAYED_WORK(&login->stats_task, vnic_do_get_stats); + INIT_DELAYED_WORK(&login->mcast_task, vnic_mcast_reattach); + INIT_DELAYED_WORK(&login->restart_task, vnic_restart_task); + + vnic_set_ethtool_ops(dev); + /* init ethtool */ + dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + dev->hw_features |= NETIF_F_RXCSUM | NETIF_F_RXHASH; + dev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6; + dev->features |= dev->hw_features; + + /* init 
NAPI (must be before LRO init) */ + login->napi_num = login->rx_rings_num; + for (i = 0; i < login->napi_num; ++i) { + if (vnic_napi_alloc(login, i)) { + vnic_err(login->name, "NAPI alloc %d failed\n", i); + goto free_napi; + } + } + +#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO) + login->dev->features |= NETIF_F_GRO; +#elif defined(NETIF_F_LRO) + login->lro_num = vnic_lro_num; + login->lro_mng_num = vnic_lro_num ? login->rx_rings_num : 0; + login->dev->features |= vnic_lro_num ? NETIF_F_LRO : 0; +#endif + for (i = 0; i < login->lro_mng_num; ++i) { + if (vnic_lro_enable(login, i)) { + vnic_err(login->name, "vnic_lro_enable %d failed\n", i); + goto free_lro; + } + } + + return dev; + +free_lro: + for (--i; i >= 0; --i) + vnic_lro_disable(login, i); + + i = login->napi_num; +free_napi: + for (--i; i >= 0; --i) + vnic_napi_dealloc(login, i); +free_login: + vfree(login); +free_netdev: + free_netdev(dev); +err: + return ERR_PTR(-ENODEV); +} + +void vnic_free_netdev(struct vnic_login *login) +{ + int i; + + vnic_dbg_func(login->name); + + for (i = 0; i < login->lro_mng_num; ++i) + vnic_lro_disable(login, i); + for (i = 0; i < login->napi_num; ++i) + vnic_napi_dealloc(login, i); + flush_workqueue(login->neigh_wq); + destroy_workqueue(login->neigh_wq); + free_netdev(login->dev); + vfree(login); +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_rx.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_rx.c new file mode 100644 index 0000000000000..0051dee4882ea --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_rx.c @@ -0,0 +1,677 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+static inline void free_single_frag(struct vnic_rx_ring *ring, int e, int i)
+{
+	ib_dma_unmap_single(ring->port->dev->ca,
+			    ring->rx_info[e].dma_addr[i],
+			    ring->frag_info[i].frag_size,
+			    PCI_DMA_FROMDEVICE);
+	ring->rx_info[e].dma_addr[i] = 0;
+	put_page(ring->rx_info[e].frags[i].page.p);
+}
+
+#ifndef _BP_NETDEV_NO_TMQ
+/* this function is used only in no_bxm mode;
+ * it's not implemented in netdevice.h so we have it here,
+ * based on netif_tx_lock()
+ */
+static inline int vnic_netif_tx_trylock(struct net_device *dev)
+{
+	int i, cpu;
+
+	spin_lock(&dev->tx_global_lock);
+	cpu = smp_processor_id();
+	for (i = 0; i < dev->num_tx_queues; ++i) {
+		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+		if (__netif_tx_trylock(txq)) {
+			set_bit(__QUEUE_STATE_FROZEN, &txq->state);
+			__netif_tx_unlock(txq);
+		} else {
+			goto unlock;
+		}
+	}
+
+	return 1;
+
+unlock:
+	/* based on netif_tx_unlock() */
+	for (--i; i >= 0; --i) {
+		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+		clear_bit(__QUEUE_STATE_FROZEN, &txq->state);
+		if (!test_bit(QUEUE_STATE_ANY_XOFF, &txq->state))
+			__netif_schedule(txq->qdisc);
+	}
+	spin_unlock(&dev->tx_global_lock);
+
+	return 0;
+}
+#else
+#define vnic_netif_tx_trylock(dev) netif_tx_trylock(dev)
+#endif
+
+int vnic_rx(struct vnic_login *login, struct sk_buff *skb, struct ib_wc *wc)
+{
+	ASSERT(skb);
+	vnic_dbg_skb("RX", skb, (unsigned long)-1, (unsigned long)0);
+
+	if (no_bxm) {
+		/* In no_bxm mode, we update the neigh table based on ARP replies;
+		 * QPN & LID are retrieved from the IB completion.
+		 * ATTENTION: in RSS mode, make sure that ARPs are
+		 * sent on the base QPN
+		 */
+		struct vnic_neigh *neighe;
+		struct ethhdr *eth_hdr = (struct ethhdr *)skb->data;
+		struct arphdr *arp_hdr = (struct arphdr *)(skb->data + ETH_HLEN);
+		u16 eth_proto = ntohs(eth_hdr->h_proto);
+		u16 arp_proto = ntohs(arp_hdr->ar_op);
+
+		if (eth_proto != ETH_P_ARP)
+			goto out;
+		if (arp_proto == ARPOP_REQUEST)
+			vnic_dbg_data(login->name, "ARP REQUEST\n");
+		else
+			vnic_dbg_data(login->name, "ARP REPLY\n");
+
+		/* don't stop the TX queue, only try; this way we avoid blocking
+		 * IRQs in the TX flow (performance wise).
+		 * other vnic_neighe_* functions are not called in parallel
+		 * to this flow (in no_bxm mode)
+		 */
+		if (!vnic_netif_tx_trylock(login->dev))
+			goto out;
+
+		neighe = vnic_neighe_search(login, eth_hdr->h_source);
+		if (!IS_ERR(neighe)) {
+			/* if the IB address didn't change, do nothing */
+			if (neighe->qpn == wc->src_qp &&
+			    neighe->lid == wc->slid)
+				goto unlock;
+			/* else, del the old neigh entry, and add a new one */
+			vnic_neighe_del(login, neighe);
+			vnic_neighe_dealloc(neighe);
+		}
+
+		/* RSS: assume that your neighbours are like you */
+		neighe = vnic_neighe_alloc(login, eth_hdr->h_source,
+					   wc->slid, wc->src_qp,
+					   login->rx_rings_num > 1 ?
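+					   /* last argument presumably marks the
+					    * neighbour as RSS-capable when we
+					    * ourselves run multiple RX rings
+					    */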
+					   1 : 0);
+		if (IS_ERR(neighe))
+			goto unlock;
+		if (vnic_neighe_add(login, neighe))
+			vnic_neighe_dealloc(neighe);
+unlock:
+		netif_tx_unlock(login->dev);
+	}
+out:
+	/* shared_vnic may receive PACKET_OTHERHOST;
+	 * we 'fix' the pkt_type here so the kernel
+	 * won't drop it
+	 */
+	if (skb->pkt_type == PACKET_OTHERHOST && login->shared_vnic)
+		skb->pkt_type = PACKET_HOST;
+
+	netif_receive_skb(skb);
+
+	return 0;
+}
+
+struct sk_buff *vnic_alloc_rx_skb(struct vnic_rx_ring *ring, int buf_ind,
+				  gfp_t gfp_flag)
+{
+	struct ib_device *ca = ring->port->dev->ca;
+	struct sk_buff *skb;
+	u64 mapping;
+	int buf_size = VNIC_BUF_SIZE(ring->port);
+
+	skb = alloc_skb(buf_size, gfp_flag);
+	if (!skb) {
+		vnic_dbg_data(ring->port->name,
+			      "alloc_skb for size %d failed\n", buf_size);
+		goto err_alloc;
+	}
+
+	mapping = ib_dma_map_single(ca, skb->data, buf_size, DMA_FROM_DEVICE);
+	if (unlikely(ib_dma_mapping_error(ca, mapping))) {
+		vnic_dbg_data(ring->port->name,
+			      "ib_dma_map_single len %d failed\n", buf_size);
+		goto err_map;
+	}
+
+	ring->rx_info[buf_ind].skb = skb;
+	ring->rx_info[buf_ind].dma_addr[0] = mapping;
+
+	return skb;
+
+err_map:
+	dev_kfree_skb_any(skb);
+err_alloc:
+	return NULL;
+}
+
+static int frag_sizes[] = {
+	FRAG_SZ0,
+	FRAG_SZ1,
+	FRAG_SZ2,
+	FRAG_SZ3
+};
+
+/* Calculate the last offset position that accommodates a full fragment
+ * (assuming fragment size = stride - align)
+ */
+static int vnic_last_alloc_offset(struct vnic_rx_ring *ring, u16 stride, u16 align)
+{
+	u16 res = VNIC_ALLOC_SIZE % stride;
+	u16 offset = VNIC_ALLOC_SIZE - stride - res + align;
+
+	vnic_dbg_data(ring->port->name, "calculated last offset for stride:%d align:%d "
+		      "res:%d offset:%d\n", stride, align, res, offset);
+	return offset;
+}
+
+static int vnic_init_allocator(struct vnic_rx_ring *ring)
+{
+	struct vnic_rx_alloc *page_alloc;
+	int i;
+
+	if (vnic_rx_linear)
+		return 0;
+
+	for (i = 0; i < ring->num_frags; i++) {
+		page_alloc = &ring->page_alloc[i];
+		page_alloc->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, VNIC_ALLOC_ORDER);
+		if (!page_alloc->page)
+			goto out;
+
+		page_alloc->offset = ring->frag_info[i].frag_align;
+		vnic_dbg_data(ring->port->name, "Initialized allocator:%d with page:%p\n",
+			      i, page_alloc->page);
+	}
+	return 0;
+
+out:
+	while (i--) {
+		page_alloc = &ring->page_alloc[i];
+		if (page_alloc->page) {
+			put_page(page_alloc->page);
+			page_alloc->page = NULL;
+		}
+	}
+	return -ENOMEM;
+}
+
+static void vnic_destroy_allocator(struct vnic_rx_ring *ring)
+{
+	struct vnic_rx_alloc *page_alloc;
+	int i;
+
+	if (vnic_rx_linear)
+		return;
+
+	for (i = 0; i < ring->num_frags; i++) {
+		page_alloc = &ring->page_alloc[i];
+		vnic_dbg_data(ring->port->name, "Freeing allocator:%d count:%d\n",
+			      i, page_count(page_alloc->page));
+		if (page_alloc->page) {
+			put_page(page_alloc->page);
+			page_alloc->page = NULL;
+		}
+	}
+}
+
+/*
+ * allocate a single fragment on a single ring entry and map it
+ * to HW address.
+ */ +static int vnic_alloc_frag(struct vnic_rx_ring *ring, + struct vnic_frag_data *frags_data, int i) +{ + struct vnic_frag_info *frag_info = &ring->frag_info[i]; + struct vnic_rx_alloc *page_alloc = &ring->page_alloc[i]; + struct skb_frag_struct *skb_frags = &frags_data->frags[i]; + struct skb_frag_struct skbf = *skb_frags; + struct page *page; + struct ib_device *ib_device = ring->port->dev->ca; + u64 dma; + int decision; + + if (vnic_rx_linear) + return 0; + + if (page_alloc->offset >= frag_info->last_offset) { + decision = 0; + /* Allocate new page */ + page = alloc_pages(GFP_ATOMIC | __GFP_COMP, VNIC_ALLOC_ORDER); + if (!page) { + /*frags_data->dma_addr[i] = NULL; + ring->rx_info[wr_id].info = VNIC_FRAG_ALLOC_FAIL; + ring->need_refill = 1; */ + return -ENOMEM; + } + skbf.page.p = page_alloc->page; + skbf.page_offset = page_alloc->offset; + } else { + decision = 1; + page = page_alloc->page; + get_page(page); + skbf.page.p = page; + skbf.page_offset = page_alloc->offset; + } + + skbf.size = frag_info->frag_size; + dma = ib_dma_map_single(ib_device, page_address(skbf.page.p) + + skbf.page_offset, frag_info->frag_size, + PCI_DMA_FROMDEVICE); + if (unlikely(ib_dma_mapping_error(ib_device, dma))) { + vnic_dbg_data(ring->port->name, + "ib_dma_map_single len %d failed\n", + frag_info->frag_size); + put_page(page); + return -ENOMEM; + } + + if (!decision) { + page_alloc->page = page; + page_alloc->offset = frag_info->frag_align; + } else + page_alloc->offset += frag_info->frag_stride; + + *skb_frags = skbf; + frags_data->dma_addr[i] = dma; + + return 0; +} + +void vnic_calc_rx_buf(struct vnic_rx_ring *ring) +{ + int eff_mtu = VNIC_BUF_SIZE(ring->port), buf_size = 0, i = 0; + + if (vnic_rx_linear) { + ring->num_frags = 1; + return; + } + + while (buf_size < eff_mtu) { + ring->frag_info[i].frag_size = + (eff_mtu > buf_size + frag_sizes[i]) ? 
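+			/* take a full frag_sizes[i] chunk while more MTU bytes
+			 * remain; the last fragment is trimmed to the leftover
+			 */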
+ frag_sizes[i] : eff_mtu - buf_size; + ring->frag_info[i].frag_prefix_size = buf_size; + if (!i) { + ring->frag_info[i].frag_align = NET_IP_ALIGN; + ring->frag_info[i].frag_stride = + ALIGN(frag_sizes[i] + NET_IP_ALIGN, SMP_CACHE_BYTES); + } else { + ring->frag_info[i].frag_align = 0; + ring->frag_info[i].frag_stride = + ALIGN(frag_sizes[i], SMP_CACHE_BYTES); + } + ring->frag_info[i].last_offset = + vnic_last_alloc_offset(ring, + ring->frag_info[i].frag_stride, + ring->frag_info[i].frag_align); + buf_size += ring->frag_info[i].frag_size; + i++; + } + + ring->num_frags = i; + ring->rx_skb_size = eff_mtu; + ring->log_rx_info = ROUNDUP_LOG2(i * sizeof(struct skb_frag_struct)); + + vnic_dbg(ring->port->name, "Rx buffer scatter-list (ring %d effective-mtu:%d " + "num_frags:%d):\n", ring->index ,eff_mtu, ring->num_frags); + for (i = 0; i < ring->num_frags; i++) { + vnic_dbg(ring->port->name, "frag:%d - size:%d prefix:%d align:%d " + "stride:%d last_offset:%d\n", i, + ring->frag_info[i].frag_size, + ring->frag_info[i].frag_prefix_size, + ring->frag_info[i].frag_align, + ring->frag_info[i].frag_stride, + ring->frag_info[i].last_offset); + } +} + +static void vnic_empty_rx_entry(struct vnic_rx_ring *ring, int i) +{ + int frag_num, buf_size = VNIC_BUF_SIZE(ring->port); + struct ib_device *ca = ring->port->dev->ca; + struct sk_buff *skb; + u64 mapping; + + if (vnic_rx_linear) { + for (frag_num = 0; frag_num < ring->num_frags; frag_num++) { + mapping = ring->rx_info[i].dma_addr[0]; + skb = ring->rx_info[i].skb; + if (mapping) + ib_dma_unmap_single(ca, mapping, buf_size, DMA_FROM_DEVICE); + if (skb) + dev_kfree_skb_any(skb); + } + + return; + } + + /* non linear buffers */ + for (frag_num = 0; frag_num < ring->num_frags; frag_num++) + free_single_frag(ring, i, frag_num); +} + +static int vnic_fill_rx_buffer(struct vnic_rx_ring *ring) +{ + struct vnic_frag_data *frags_data = &ring->rx_info[0]; + struct sk_buff *skb; + struct ib_device *ca = ring->port->dev->ca; + int buf_ind, frag_num, buf_size = VNIC_BUF_SIZE(ring->port); + u64 mapping; + + if (vnic_rx_linear) { + for (buf_ind = 0; buf_ind < ring->size; buf_ind++) { + skb = vnic_alloc_rx_skb(ring, buf_ind, GFP_KERNEL); + if (!skb) + goto err_linear; + } + + return 0; + } + + /* non linear buffers */ + for (buf_ind = 0; buf_ind < ring->size; buf_ind++, frags_data++) { + for (frag_num = 0; frag_num < ring->num_frags; frag_num++) { + if (vnic_alloc_frag(ring, frags_data, frag_num)) + goto err_frags; + } + } + + return 0; + +err_linear: + for (buf_ind = 0; buf_ind < ring->size; buf_ind++) { + mapping = ring->rx_info[buf_ind].dma_addr[0]; + skb = ring->rx_info[buf_ind].skb; + if (mapping) + ib_dma_unmap_single(ca, mapping, buf_size, DMA_FROM_DEVICE); + if (skb) + dev_kfree_skb_any(skb); + } + + return -ENOMEM; + +err_frags: + for (--frag_num; frag_num >= 0; frag_num--) + free_single_frag(ring, buf_ind, frag_num); + + for (--buf_ind; buf_ind >= 0; buf_ind--) + vnic_empty_rx_entry(ring, buf_ind); + + return -ENOMEM; +} + +/* + * free entire ring full of fragments. 
+ */
+static void vnic_empty_rx_buffer(struct vnic_rx_ring *ring)
+{
+	int buf_ind;
+
+	for (buf_ind = 0; buf_ind < ring->size; buf_ind++)
+		vnic_empty_rx_entry(ring, buf_ind);
+
+	ring->size = 0;
+}
+
+void vnic_destroy_rx_ring(struct vnic_rx_ring *ring)
+{
+	if (!ring)
+		return;
+	vnic_empty_rx_buffer(ring);
+	vnic_destroy_allocator(ring);
+	vfree(ring->rx_info);
+	vnic_ib_free_ring(ring);
+	kfree(ring);
+}
+
+int vnic_unmap_and_replace_rx(struct vnic_rx_ring *ring, struct ib_device *dev,
+			      struct skb_frag_struct *skb_frags_rx,
+			      u64 wr_id, int length)
+{
+	struct vnic_frag_info *frag_info;
+	struct vnic_frag_data *rx_info = &ring->rx_info[wr_id];
+	int nr;
+	dma_addr_t dma;
+
+	/* Collect used fragments while replacing them in the HW descriptors */
+	for (nr = 0; nr < ring->num_frags; nr++) {
+		frag_info = &ring->frag_info[nr];
+		if (length <= frag_info->frag_prefix_size)
+			break;
+
+		/* Save page reference in skb */
+		skb_frags_rx[nr].page = rx_info->frags[nr].page;
+		skb_frags_rx[nr].size = rx_info->frags[nr].size;
+		skb_frags_rx[nr].page_offset = rx_info->frags[nr].page_offset;
+		dma = rx_info->dma_addr[nr];
+
+		/* Allocate a replacement page */
+		if (vnic_alloc_frag(ring, rx_info, nr))
+			goto fail;
+
+		/* Unmap buffer */
+		ib_dma_unmap_single(dev, dma, skb_frags_rx[nr].size,
+				    PCI_DMA_FROMDEVICE);
+	}
+
+	/* Adjust size of last fragment to match actual length */
+	if (nr > 0)
+		skb_frags_rx[nr - 1].size = length -
+			ring->frag_info[nr - 1].frag_prefix_size;
+	return nr;
+
+fail:
+	/* Drop all accumulated fragments (which have already been replaced in
+	 * the descriptor) of this packet; remaining fragments are reused... */
+	while (nr > 0) {
+		nr--;
+		put_page(skb_frags_rx[nr].page.p);
+	}
+
+	return 0;
+}
+
+int vnic_rx_skb(struct vnic_login *login, struct vnic_rx_ring *ring,
+		struct ib_wc *wc, int ip_summed, char *eth_hdr_va)
+{
+	u64 wr_id = (unsigned int)wc->wr_id;
+	struct sk_buff *skb;
+	int used_frags;
+	char *va = eth_hdr_va;
+	int length = wc->byte_len - VNIC_EOIB_HDR_SIZE - VNIC_VLAN_OFFSET(login),
+	    linear_length = (length <= SMALL_PACKET_SIZE) ?
+		length : SMALL_PACKET_SIZE, hdr_len = min(length, HEADER_COPY_SIZE),
+	    offset = NET_IP_ALIGN + 16;
+	struct ib_device *ib_dev = login->port->dev->ca;
+
+	/* alloc a small linear SKB */
+	skb = alloc_skb(linear_length + offset, GFP_ATOMIC);
+	if (unlikely(!skb))
+		return -ENOMEM;
+
+	skb_record_rx_queue(skb, ring->index);
+	skb_reserve(skb, offset);
+
+	if (vnic_linear_small_pkt && length <= SMALL_PACKET_SIZE) {
+		u64 dma;
+
+		/* We are copying all relevant data to the skb - temporarily
+		 * synch buffers for the copy
+		 */
+		dma = ring->rx_info[wr_id].dma_addr[0] + VNIC_EOIB_HDR_SIZE +
+			VNIC_VLAN_OFFSET(login);
+		ib_dma_sync_single_for_cpu(ib_dev, dma, length,
+					   DMA_FROM_DEVICE);
+		skb_copy_to_linear_data(skb, va, length);
+		ib_dma_sync_single_for_device(ib_dev, dma, length,
+					      DMA_FROM_DEVICE);
+		skb->tail += length;
+	} else {
+		/* unmap the needed fragments and reallocate them. Fragments that
+		 * were not used will not be reused as is.
*/ + used_frags = vnic_unmap_and_replace_rx(ring, ib_dev, + skb_shinfo(skb)->frags, + wr_id, wc->byte_len); + if (!used_frags) + goto free_and_repost; + + skb_shinfo(skb)->nr_frags = used_frags; + + /* Copy headers into the skb linear buffer */ + memcpy(skb->data, va, hdr_len); + skb->tail += hdr_len; + /* Skip headers in first fragment */ + skb_shinfo(skb)->frags[0].page_offset += + (VNIC_EOIB_HDR_SIZE + VNIC_VLAN_OFFSET(login) + + hdr_len); + + /* Adjust size of first fragment */ + skb_shinfo(skb)->frags[0].size -= + (VNIC_EOIB_HDR_SIZE + VNIC_VLAN_OFFSET(login) + + hdr_len); + skb->data_len = length - hdr_len; + } + + /* update skb fields */ + skb->len = length; + skb->truesize = length + sizeof(struct sk_buff); + skb->ip_summed = ip_summed; + skb->dev = login->dev; + skb->protocol = eth_type_trans(skb, skb->dev); + + return vnic_rx(login, skb, wc); + +free_and_repost: + dev_kfree_skb(skb); + return -ENODEV; + +} + +static void vnic_set_rx_sge(struct vnic_rx_ring *ring) +{ + int i; + + ring->wr.num_sge = ring->num_frags; + ring->wr.next = NULL; + ring->wr.sg_list = ring->sge; + for (i = 0; i < ring->num_frags; ++i) { + ring->sge[i].lkey = ring->port->mr->lkey; + ring->sge[i].length = ring->frag_info[i].frag_size; + } +} + +struct vnic_rx_ring *vnic_create_rx_ring(struct vnic_port *port, int index) +{ + int rc, rx_info, size = vnic_rx_rings_len; + struct vnic_rx_ring *ring; + + ring = kzalloc(sizeof *ring, GFP_KERNEL); + if (!ring) + return ERR_PTR(-ENOMEM); + + /* init attributes */ + ring->port = port; + ring->size = size; + ring->index = index; + spin_lock_init(&ring->lock); + + /* init rx ring IB resources */ + if (vnic_ib_init_ring(ring)) { + vnic_err(port->name, "vnic_ib_init_ring failed\n"); + goto free_ring; + } + + rx_info = size * roundup_pow_of_two(sizeof(struct vnic_frag_data)); + ring->rx_info = vmalloc(rx_info); + if (!ring->rx_info) { + vnic_err(port->name, "Failed allocating rx_info ring" + " (%d bytes)\n", rx_info); + goto free_ib; + } + memset(ring->rx_info, 0, rx_info); + + /* determine the sizes of the fragments as result of mtu */ + vnic_calc_rx_buf(ring); + + rc = vnic_init_allocator(ring); + if (rc) { + vnic_err(port->name, "Failed initializing ring" + " allocator %d\n", rc); + goto free_rxinfo; + } + + rc = vnic_fill_rx_buffer(ring); + if (rc) { + vnic_err(port->name, "vnic_fill_rx_buffer failed %d\n", rc); + goto free_allocator; + } + + /* set rx WQEs drafts */ + vnic_set_rx_sge(ring); + + /* Initailize all descriptors and post to srq */ + rc = vnic_post_recvs(ring); + if (rc) { + vnic_err(port->name, "vnic_post_recvs failed %d\n", rc); + goto free_rx_buffer; + } + + return ring; + +free_rx_buffer: + /* TODO: we are freeing posted packets need to move SRQ + * to error and free them first + */ + vnic_empty_rx_buffer(ring); +free_allocator: + vnic_destroy_allocator(ring); +free_rxinfo: + vfree(ring->rx_info); +free_ib: + vnic_ib_free_ring(ring); +free_ring: + kfree(ring); + + return ERR_PTR(-EINVAL); +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_tx.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_tx.c new file mode 100644 index 0000000000000..0233d4fe7e1e4 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_tx.c @@ -0,0 +1,622 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_data.h" + +static int vnic_ucast_send(struct vnic_login *login, struct sk_buff *skb, + struct neighbour *neighbour, int tx_res_index); +/* Push VLAN & EoIB headers and calculate RSS hash value + * We do the RSS hash here because we already check IP|TCP|UDP + * in this function for EoIB fields, so we make use of that + * and do RSS too. + */ +static struct eoibhdr eoib_h_draft = { + .encap_data = ((VNIC_EOIB_HDR_VER << 4) | (VNIC_EOIB_HDR_SIG << 6)), + .seg_off = 0, + .seg_id = 0 +}; + +void vnic_learn_mac(struct net_device *dev, u8 *mac, int remove) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + vnic_dbg_func(login->name); + + /* skip invalid address */ + if (unlikely(!is_valid_ether_addr(mac))) + return; + + /* skip parent vNic address (original dev_addr) */ + if (!(memcmp(login->dev_addr, mac, ETH_ALEN))) + return; + + vnic_dbg_mac(login->name, "learn mac "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(mac)); + + /* update child vNic list, ignore returned code */ + read_lock_bh(&login->mac_rwlock); + vnic_child_update(login, mac, remove); + read_unlock_bh(&login->mac_rwlock); +} + +u32 vnic_hash(struct net_device *dev, struct sk_buff *skb) +{ + struct tcphdr *tr_h = tcp_hdr(skb); + struct iphdr *ip_h = ip_hdr(skb); + struct ipv6hdr *ip_h6 = (struct ipv6hdr *)ip_h; + u32 hash = 0, addrlen, i; + + /* All mcast traffic is sent and received on 1st queue + * because only the 1st QP is attached to the MGIDs + * TODO: consider distributing tx/rx mcast traffic as well + */ + if (is_multicast_ether_addr(skb_mac_header(skb))) + goto out; + + switch (ntohs(skb->protocol)) { + case ETH_P_IP: + /* In IPv4, access TCP/UDP header only when IP packet is not + * fragmented: flags == DF == 0x02. 
+ */ + if (ntohs(ip_h->frag_off) >> 13 == 0x2 && + (ip_h->protocol == IPPROTO_TCP || + ip_h->protocol == IPPROTO_UDP)) { + hash ^= (u32)ntohl(ip_h->saddr); + hash ^= (u32)ntohl(ip_h->daddr); + hash ^= (u32)ntohs(tr_h->source); + hash ^= (u32)ntohs(tr_h->dest); + } + break; + case ETH_P_IPV6: + /* In IPv6, access TCP/UDP header only when IP packet is not + * fragmented: main header nexthdr field points to TCP/UDP + */ + if (ip_h6->nexthdr == IPPROTO_TCP || + ip_h6->nexthdr == IPPROTO_UDP) { + addrlen = ARRAY_LEN(ip_h6->saddr.in6_u.u6_addr32); + for (i = 0; i < addrlen; ++i) { + hash ^= (u32)ntohl(ip_h6->saddr.in6_u.u6_addr32[i]); + hash ^= (u32)ntohl(ip_h6->daddr.in6_u.u6_addr32[i]); + } + tr_h = (struct tcphdr *)((void *)ip_h6 + sizeof *ip_h6); + hash ^= (u32)ntohs(tr_h->source); + hash ^= (u32)ntohs(tr_h->dest); + } + } +out: + VNIC_SKB_SET_HASH(skb, hash); + return hash; +} + +u8 vnic_lag_hash(struct sk_buff *skb, u16 hash_mask, u16 vid) +{ + struct tcphdr *tr_h = tcp_hdr(skb); + struct iphdr *ip_h = ip_hdr(skb); + struct ipv6hdr *ip_h6 = (struct ipv6hdr *)ip_h; + u32 hash = 0, addrlen, i; + struct ethhdr *eth = (struct ethhdr *)skb_mac_header(skb); + u32 hash_dmac, hash_smac, hash_prot, hash_vid; + u32 hash_sip = 0, hash_dip = 0, hash_sp = 0, hash_dp = 0; + u8 res_hash; + u8 *tmp; + + hash_dmac = *(u32 *)(ð->h_dest[ETH_ALEN - sizeof hash_smac]); + hash_smac = *(u32 *)(ð->h_source[ETH_ALEN - sizeof hash_smac]); + hash_prot = (u32)ntohs(skb->protocol); + hash_vid = (u32)vid; + + if (hash_mask & GW_LAG_LAYER_2_3) { + switch (hash_prot) { + case ETH_P_IP: + /* In IPv4, access TCP/UDP header only when IP packet is not + * fragmented: flags == DF == 0x02. + */ + if (ntohs(ip_h->frag_off) >> 13 == 0x2 && + (ip_h->protocol == IPPROTO_TCP || + ip_h->protocol == IPPROTO_UDP)) { + hash_sip = (u32)(ip_h->saddr); + hash_dip = (u32)(ip_h->daddr); + hash_sp = (u32)(tr_h->source); + hash_dp = (u32)(tr_h->dest); + } + break; + case ETH_P_IPV6: + /* In IPv6, access TCP/UDP header only when IP packet is not + * fragmented: main header nexthdr field points to TCP/UDP + */ + if (ip_h6->nexthdr == IPPROTO_TCP || + ip_h6->nexthdr == IPPROTO_UDP) { + addrlen = ARRAY_LEN(ip_h6->saddr.in6_u.u6_addr32); + for (i = 0; i < addrlen; ++i) { + hash_sip ^= (u32)(ip_h6->saddr.in6_u.u6_addr32[i]); + hash_dip ^= (u32)(ip_h6->daddr.in6_u.u6_addr32[i]); + } + tr_h = (struct tcphdr *)((void *)ip_h6 + sizeof *ip_h6); + hash_sp = (u32)(tr_h->source); + hash_dp = (u32)(tr_h->dest); + } + } + } + + hash ^= (hash_mask & GW_LAG_HASH_DMAC) ? hash_dmac : 0; + hash ^= (hash_mask & GW_LAG_HASH_SMAC) ? hash_smac : 0; + hash ^= (hash_mask & GW_LAG_HASH_TPID) ? hash_prot : 0; + hash ^= (hash_mask & GW_LAG_HASH_VID) ? hash_vid : 0; + hash ^= (hash_mask & GW_LAG_HASH_SIP) ? hash_sip : 0; + hash ^= (hash_mask & GW_LAG_HASH_DIP) ? hash_dip : 0; + hash ^= (hash_mask & GW_LAG_HASH_SPORT) ? hash_sp : 0; + hash ^= (hash_mask & GW_LAG_HASH_DPORT) ? 
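+	/* only the fields selected by hash_mask contribute; the 32-bit
+	 * hash is folded below to a single byte by XORing its four bytes,
+	 * e.g. 0xA1B2C3D4 -> 0xA1 ^ 0xB2 ^ 0xC3 ^ 0xD4
+	 */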
hash_dp : 0; + + tmp = (u8 *)&hash; + res_hash = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3]; + + return res_hash; +} + +static inline int vnic_header_encap(struct sk_buff *skb) +{ + struct vnic_login *login = vnic_netdev_priv(skb->dev); + struct eoibhdr *eoib_h; + struct iphdr *ip_h = ip_hdr(skb); + struct ipv6hdr *ip_h6 = (struct ipv6hdr *)ip_h; + + /* push VLAN header + * TODO: when VID iz zero, push header only when prio exists, i.e.: + * if (VNIC_VLAN_ENABLED(login) && (login->vid || login->user_prio)) + */ + if (VNIC_VLAN_ENABLED(login) && login->vid) { + struct vlan_ethhdr *veth = + (struct vlan_ethhdr *)skb_push(skb, VLAN_HLEN); + ASSERT(veth); + vnic_dbg_data_v(login->name, "push vlan tag with ID %u\n", + be16_to_cpu(login->vid)); + memmove(skb->data, skb->data + VLAN_HLEN, 2 * ETH_ALEN); + veth->h_vlan_proto = htons(ETH_P_8021Q); + veth->h_vlan_TCI = login->vid; + } + + /* push EoIB header */ + if (vnic_encap_headroom) + skb_push(skb, VNIC_ENCAP_LEN); + + /* reset MAC header here, it can be changed for the following reasons: + * - vnic_encap_headroom is set, thus EoIB header is pushed + * - VLAN is enabled, thus VLAN header is pushed + * - some kernels (e.g., 2.6.18-194.el5) call dev_hard_start_xmit() + * without setting the mac header pointer + */ + skb_set_mac_header(skb, VNIC_SKB_GET_ENCAP_OFFSET); + + /* enforce source mac*/ + if (vnic_src_mac_enforce) + memcpy(skb_mac_header(skb) + ETH_ALEN, + login->dev->dev_addr, ETH_ALEN); + + /* set EoIB header VER/SIG, others set to zero */ + eoib_h = VNIC_SKB_GET_ENCAP(skb); + *eoib_h = eoib_h_draft; + + /* set EoIB header IP_CHK */ + switch (ntohs(skb->protocol)) { + case ETH_P_IP: + VNIC_EOIB_HDR_SET_IP_CHK_OK(eoib_h); + if (ip_h->protocol == IPPROTO_TCP) + VNIC_EOIB_HDR_SET_TCP_CHK_OK(eoib_h); + else if (ip_h->protocol == IPPROTO_UDP) + VNIC_EOIB_HDR_SET_UDP_CHK_OK(eoib_h); + break; + case ETH_P_IPV6: + VNIC_EOIB_HDR_SET_IP_CHK_OK(eoib_h); + if (ip_h6->nexthdr == IPPROTO_TCP) + VNIC_EOIB_HDR_SET_TCP_CHK_OK(eoib_h); + else if (ip_h6->nexthdr == IPPROTO_UDP) + VNIC_EOIB_HDR_SET_UDP_CHK_OK(eoib_h); + } + +#ifdef _BP_NETDEV_NO_TMQ + /* if TSS is enabled, use the hash value calculated by + * vnic_select_queue() otherwise call vnic_hash() + */ + vnic_hash(skb->dev, skb); +#endif + + return 0; +} + +static void vnic_neigh_path_query_complete(int status, + struct ib_sa_path_rec *pathrec, + void *context) +{ + struct vnic_neigh *neigh = context; + struct ib_ah *old_ah, *new_ah; + struct net_device *dev = neigh->login->dev; + struct sk_buff_head skqueue; + struct vnic_login *login = neigh->login; + + if (status) { + vnic_dbg_data(neigh->login->name, "neigh %d "MAC_6_PRINT_FMT" path query complete FAILED\n", + neigh->lid, MAC_6_PRINT_ARG(neigh->mac)); + goto drop_pkts; + } else { + struct ib_ah_attr av; + struct sk_buff *skb; + vnic_dbg_data(login->name, "neigh %d "MAC_6_PRINT_FMT" path query complete sucess SL=%d\n", + neigh->lid, MAC_6_PRINT_ARG(neigh->mac), pathrec->sl); + if(ib_init_ah_from_path(login->port->dev->ca, login->port->num, pathrec, &av)){ + vnic_warn(login->name, "ib_init_ah_from_path %d "MAC_6_PRINT_FMT" failed!\n", + neigh->lid, MAC_6_PRINT_ARG(neigh->mac)); + goto drop_pkts; + } + + old_ah = neigh->ah; + new_ah = ib_create_ah(login->port->pd, &av); + if (IS_ERR(new_ah) || !new_ah) { + vnic_warn(login->name, "ib_create_ah %d "MAC_6_PRINT_FMT" failed!\n", + neigh->lid, MAC_6_PRINT_ARG(neigh->mac)); + + goto drop_pkts; + } + + neigh->sl = pathrec->sl; + skb_queue_head_init(&skqueue); + netif_tx_lock_bh(login->dev); + neigh->ah = new_ah; 
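+		/* publish the new AH, the valid flag and the cleared query id
+		 * together under the TX lock, so concurrent senders see a
+		 * consistent state: they either queue on the old one or
+		 * transmit with the fresh AH
+		 */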
+ neigh->valid = 1; + neigh->query_id = -1; + while ((skb = __skb_dequeue(&neigh->pkt_queue))) + __skb_queue_tail(&skqueue, skb); + netif_tx_unlock_bh(login->dev); + + /* retransmit all pending packets */ + while ((skb = __skb_dequeue(&skqueue))) { + /* reset skb headers */ + /* TODO ALL VLAN ?? */ + if (VNIC_VLAN_ENABLED(login) && login->vid) + skb_pull(skb, VLAN_HLEN); + if (vnic_encap_headroom) + skb_pull(skb, VNIC_ENCAP_LEN); + + skb->dev = dev; + dev_queue_xmit(skb); + } + + if (old_ah && !IS_ERR(old_ah)) + ib_destroy_ah(old_ah); + } + complete(&neigh->query_comp); + return; + +drop_pkts: + netif_tx_lock_bh(dev); + neigh->query_id = -1; /* this will cause a retry */ + while (!skb_queue_empty(&neigh->pkt_queue)) + { + struct sk_buff *skb = skb_dequeue(&neigh->pkt_queue); + int tx_res_index; + struct vnic_tx_res *tx_res; + skb->dev = dev; + tx_res_index = VNIC_TXQ_GET_HASH(skb, login->real_tx_rings_num); + ASSERT(tx_res_index <= login->tx_rings_num); + tx_res = &login->tx_res[tx_res_index]; + VNIC_STATS_DO_INC(tx_res->stats.tx_dropped); + dev_kfree_skb_any(skb); + } + netif_tx_unlock_bh(dev); + complete(&neigh->query_comp); +} + +int vnic_neighe_path_query(struct vnic_neigh *neighe) +{ + ib_sa_comp_mask comp_mask; + struct ib_sa_path_rec p_rec; + u16 slid = neighe->login->port->attr.lid; + vnic_dbg_data(neighe->login->vnic_name,"neighe SL Query slid %d dlid %d dmac:"MAC_6_PRINT_FMT"\n", + slid, neighe->lid, MAC_6_PRINT_ARG(neighe->mac)); + + comp_mask = IB_SA_PATH_REC_SERVICE_ID | + IB_SA_PATH_REC_DLID | + IB_SA_PATH_REC_SLID | + IB_SA_PATH_REC_PKEY; + + if (IS_NEIGH_QUERY_RUNNING(neighe)) + ib_sa_cancel_query(neighe->query_id, neighe->pquery); + + init_completion(&neighe->query_comp); + neighe->query_id = -1; + neighe->pquery = NULL; + + p_rec.dlid = cpu_to_be16(neighe->lid); + p_rec.slid = cpu_to_be16(slid); + p_rec.service_id = cpu_to_be64(EOIB_SERVICE_ID); + p_rec.pkey = cpu_to_be16(neighe->login->pkey); + + neighe->query_id = ib_sa_path_rec_get(&vnic_sa_client, + neighe->login->port->dev->ca, + neighe->login->port->num, + &p_rec, + comp_mask, + 1000/*TOUT*/, + GFP_ATOMIC, + vnic_neigh_path_query_complete, + neighe, + &neighe->pquery); + if (neighe->query_id < 0) { + vnic_dbg_data(neighe->login->vnic_name, "FAILED neigh SL Query slid %d dlid %d dmac:"MAC_6_PRINT_FMT"\n", + slid, neighe->lid, MAC_6_PRINT_ARG(neighe->mac)); + complete(&neighe->query_comp); + } + return neighe->query_id; +} + +static int vnic_ucast_send(struct vnic_login *login, struct sk_buff *skb, + struct neighbour *neighbour, int tx_res_index) +{ + struct vnic_neigh *neighe; + int hash; + + neighe = vnic_neighe_search(login, skb_mac_header(skb)); + if (IS_ERR(neighe)) { + vnic_dbg_data(login->name, "no dst_neigh and no vnic_neigh - " + "gw unicast packet\n"); + + /* for egress unicast traffic of a shared vnic, + * replace src mac by shared mac + */ + if (login->shared_vnic) + memcpy(skb_mac_header(skb) + ETH_ALEN, + login->shared_mac, ETH_ALEN); + + if (!login->is_lag) + neighe = login->gw_neigh; + else { + if (unlikely(!login->lag_member_active_count)) + return -ENOENT; + + /* use hash value precomputed and mapping to find LAG GW to send to */ + hash = vnic_lag_hash(skb, login->lag_prop.hash_mask, login->vid); + hash = hash % LAG_MAP_TABLE_SIZE; + neighe = &login->lag_gw_neigh[login->lag_gw_map[hash]].neigh; + } + + /* update GW statistics */ + VNIC_STATS_ADD(login->port_stats.gw_tx_bytes, skb->len); + VNIC_STATS_INC(login->port_stats.gw_tx_packets); + } else { + vnic_dbg_data(login->name, + "no dst_neigh but 
vnic_neigh exists - " + "local unicast packet\n"); + } + + /* TODO: in VNIC_NEIGH_GET_DQPN use neigh qps_num instead of login */ + vnic_dbg_data(login->name, "vnic_send to (base qpn 0x%06x) dqpn 0x%06x" + " dlid 0x%08x %s\n", neighe->qpn, + VNIC_NEIGH_GET_DQPN(skb, neighe), neighe->lid, + neighe == login->gw_neigh ? "[GW]" : ""); + + if (unlikely(vnic_sa_query && !neighe->valid)) { + /* query neigh ah*/ + vnic_dbg_data(login->name, "AH is not %s, running path query: LID=%d mac="MAC_6_PRINT_FMT"\n", + !IS_ERR(neighe->ah) && neighe->ah ? "valid":"found", + neighe->lid, MAC_6_PRINT_ARG(neighe->mac)); + + if (!IS_NEIGH_QUERY_RUNNING(neighe)) + vnic_neighe_path_query(neighe); + + if (IS_ERR(neighe->ah) || !neighe->ah) + { /* AH is not ready yet, Queue pkt */ + if (skb_queue_len(&neighe->pkt_queue) > VNIC_SKB_QUEUE_LEN || !IS_NEIGH_QUERY_RUNNING(neighe)) + return 1; /* Drop in case queue is full or no query is currently runnig*/ + __skb_queue_tail(&neighe->pkt_queue, skb); + return 0; + } + /* if ah is initialized send anyway */ + } + vnic_send(login, skb, neighe->ah, VNIC_NEIGH_GET_DQPN(skb, neighe), tx_res_index); + return 0; +} + +void vnic_mcast_send(struct vnic_login *login, struct sk_buff *skb, int tx_res_index) +{ + struct vnic_mcast *mcaste; + union vhub_mgid mgid; + struct ethhdr *eth; + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + struct ib_ah_attr *av = &tx_res->mcast_av; + struct ib_ah *ah; + u16 gw_id; + int hash; + + eth = (struct ethhdr *)skb_mac_header(skb); + + /* for LAG GW, perform hashing on mcast address */ + if (login->is_lag && login->lag_member_active_count) { + hash = vnic_lag_hash(skb, login->lag_prop.hash_mask, login->vid); + hash = hash % LAG_MAP_TABLE_SIZE; + gw_id = login->lag_gw_neigh[login->lag_gw_map[hash]].gw_id; + } + else + gw_id = login->gw_port_id; + + /* retrieve the mlid */ + vhub_mgid_create(login->mgid_prefix, ETH_ZERO_MAC, login->n_mac_mcgid, + CREATE_VHUB_ID(login->vid, gw_id), + VHUB_MGID_DATA, 0, &mgid); + + spin_lock(&login->mcast_tree.mcast_rb_lock); + mcaste = vnic_mcast_search(&login->mcast_tree, &mgid.ib_gid); + if (unlikely(IS_ERR(mcaste) || !mcaste->ah)) { + vnic_dbg_data(login->name, "couldn't find mcaste for " + MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(eth->h_dest)); + spin_unlock(&login->mcast_tree.mcast_rb_lock); + goto drop; + } + + spin_lock(&mcaste->lock); + vhub_mgid_create(login->mgid_prefix, eth->h_dest, login->n_mac_mcgid, + CREATE_VHUB_ID(login->vid, gw_id), + vnic_mgid_data_type, 0, &mgid); + vnic_dbg_mcast_v(login->name, "sending to ETH "MAC_6_PRINT_FMT"-> " + "GID "VNIC_GID_FMT" (mask %d bit)\n", + MAC_6_PRINT_ARG(eth->h_dest), + VNIC_GID_ARG(mgid.ib_gid), + login->n_mac_mcgid); + + av->dlid = be16_to_cpu(mcaste->port_mcaste->rec.mlid); + av->static_rate = mcaste->port_mcaste->rec.rate; + av->sl = mcaste->port_mcaste->rec.sl; + memcpy(&av->grh.dgid, mgid.ib_gid.raw, GID_LEN); + + ah = ib_create_ah(login->port->pd, av); + spin_unlock(&mcaste->lock); + spin_unlock(&login->mcast_tree.mcast_rb_lock); + + if (!ah || IS_ERR(ah)) + goto drop; + + vnic_send(login, skb, ah, IB_MULTICAST_QPN, tx_res_index); + ib_destroy_ah(ah); + /* used as a counter for multicast TX packets (not RX) */ + VNIC_STATS_DO_INC(tx_res->stats.multicast); + + return; + +drop: + VNIC_STATS_DO_INC(tx_res->stats.tx_dropped); + dev_kfree_skb_any(skb); +} + +int vnic_tx(struct sk_buff *skb, struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int tx_res_index = 0, headroom = dev->hard_header_len - ETH_HLEN; + struct 
vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + + ASSERT(dev); + ASSERT(skb); +#ifdef VNIC_PROFILLNG + login->prof_arr[login->prof_arr_it].cnt++; + /* copy only fields for reporting, data buffer is invalid */ + login->prof_arr[login->prof_arr_it].skb = *skb; + login->prof_arr[login->prof_arr_it].skb.data = NULL; + login->prof_arr[login->prof_arr_it].tstamp = current_kernel_time(); + login->prof_arr[login->prof_arr_it].jiffies = jiffies; + login->prof_arr[login->prof_arr_it].nr_frags = skb_shinfo(skb)->nr_frags; + login->prof_arr_it = (login->prof_arr_it + 1) % VNIC_PROFILLNG_SKB_MAX; + +#endif + + /* drop zero length skbs */ + if (unlikely(!skb->len)) + goto drop; + + /* sometimes, vnic_tx is called before carrier is up FM #100882 */ + if (unlikely(!test_bit(VNIC_STATE_NETDEV_CARRIER_ON, &login->netdev_state))) + goto drop; + + /* check headroom and reallocate skb if needed: + * If VLAN used: need VLAN_HLEN (4) Bytes + * If vnic_encap_headroom set: need VNIC_ENCAP_LEN (4) Bytes + * when vnic_encap_headroom is clear, we do not encap EoIB header + * into the headroom, but rather use additional SG entry to hold it + */ + + if (unlikely(skb_headroom(skb) < headroom)) { + struct sk_buff *skb_new; + + skb_new = skb_realloc_headroom(skb, headroom); + if (!skb_new) + goto drop; + + dev_kfree_skb(skb); + skb = skb_new; + VNIC_STATS_INC(login->port_stats.realloc_packets); + } + /* don't use dev->header_ops, use vnic_header_encap() inline + * function instead, because when raw socket is used or BR_CTL mode + * then header_ops are not called as expected, and we'll end up sending + * the packet without EoIB header + */ + if (unlikely(vnic_header_encap(skb))) + goto drop; + + /* in promiscuous mode, learn the source mac */ + if (is_ucast_promisc(login) && vnic_learn_mac_enabled) + vnic_learn_mac(dev, skb_mac_header(skb) + ETH_ALEN, 0); + + /* get TX resource for this SKB, keep it after vnic_header_encap() + * so if we don't have kernel multiple queue support we use the + * RSS hash result for TSS + */ + tx_res_index = VNIC_TXQ_GET_HASH(skb, login->real_tx_rings_num); + ASSERT(tx_res_index <= login->tx_rings_num); + tx_res = &login->tx_res[tx_res_index]; + + + /* send ucast/mcast packet */ + vnic_dbg_skb("TX", skb, (unsigned long)(vnic_encap_headroom ? 0 : -1), + (unsigned long)(vnic_encap_headroom ? 
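+		     /* dump offsets differ: with vnic_encap_headroom the EoIB
+		      * header was already pushed into the skb headroom, while
+		      * otherwise it is added later as a separate SG entry
+		      */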
VNIC_ENCAP_LEN : 0)); +#if 0 /* neighbour caching disabled */ + if (likely(skb->dst && skb->dst->neighbour)) { + if (is_multicast_ether_addr(skb_mac_header(skb))) { + vnic_dbg_data(login->name, + "dst_neigh exists but no vnic_neigh - " + "multicast packet\n"); + vnic_mcast_send(login, skb, tx_res_index); + } else { + vnic_dbg_data(login->name, + "dst_neigh exists but no vnic_neigh - " + "unicast packet\n"); + vnic_ucast_send(login, skb, skb->dst->neighbour, tx_res_index); + } + } else +#endif + { + if (is_multicast_ether_addr(skb_mac_header(skb))) { + vnic_dbg_data(login->name, + "no dst_neigh - multicast packet\n"); + vnic_mcast_send(login, skb, tx_res_index); + } else { + vnic_dbg_data(login->name, + "no dst_neigh - unicast packet\n"); + if (unlikely(vnic_ucast_send(login, skb, NULL, tx_res_index))) + goto drop; + } + } + + return NETDEV_TX_OK; + +drop: + dev_kfree_skb(skb); + return NETDEV_TX_OK; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip.h new file mode 100644 index 0000000000000..0f77c1abde17f --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip.h @@ -0,0 +1,1025 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _VNIC_FIP_H +#define _VNIC_FIP_H + +#include "vnic.h" + + +#define FIP_TYPE(FIPT) FIP_TYPE_##FIPT +#define FIP_TYPE_IDX(FIPT) FIP_TYPE_IDX_##FIPT + +#define FIP_CASE(FIPT) case FIP_TYPE(FIPT): return FIP_TYPE_IDX(FIPT) + +#define FIP_CASE_STR(FIPT) case FIP_TYPE(FIPT): return # FIPT +#define FIP_SUBCODE_CASE_STR(SUBCODE) case (SUBCODE): return # SUBCODE + +#define FIP_MASK(FIPT) (((u64)1) << FIP_TYPE_IDX(FIPT)) + +#define ADV_EXT_TYPE(FIPT) ADV_EXT_TYPE_##FIPT +#define ADV_EXT_IDX(FIPT) ADV_EXT_IDX_##FIPT + +#define GUID_FORMAT "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x" +#define MGID_PREFIX_FMT "%02x:%02x:%02x:%02x:%02x" +#define GUID_ARG(g) (g)[0], (g)[1], (g)[2], (g)[3], (g)[4], (g)[5], (g)[6], (g)[7] +#define MGID_PRE_ARG(g) (g)[0], (g)[1], (g)[2], (g)[3], (g)[4] + +enum { + FIP_TYPE(VENDOR_ID) = 13, + FIP_TYPE(ADDRESS) = 240, + FIP_TYPE(GW_INFORMATION)= 241, + FIP_TYPE(LOGIN) = 242, + FIP_TYPE(VHUB_UPDATE) = 243, + FIP_TYPE(VHUB_TABLE) = 244, + FIP_TYPE(VNIC_IDENTITY) = 245, + FIP_TYPE(PARTITION) = 246, + FIP_TYPE(GW_IDENTIFIER) = 248, + FIP_TYPE(KA_PARAMS) = 249, + FIP_TYPE(EXT_DESC) = 254, +}; + +enum { + FIP_TYPE_IDX(VENDOR_ID), + FIP_TYPE_IDX(ADDRESS), + FIP_TYPE_IDX(GW_INFORMATION), + FIP_TYPE_IDX(LOGIN), + FIP_TYPE_IDX(VHUB_UPDATE), + FIP_TYPE_IDX(VHUB_TABLE), + FIP_TYPE_IDX(VNIC_IDENTITY), + FIP_TYPE_IDX(PARTITION), + FIP_TYPE_IDX(GW_IDENTIFIER), + FIP_TYPE_IDX(KA_PARAMS), + FIP_TYPE_IDX(EXT_DESC), +}; + +enum { + ADV_EXT_TYPE(CAP) = 40, + ADV_EXT_TYPE(BOOT) = 18, + ADV_EXT_TYPE(LAG) = 41, + ADV_EXT_TYPE(MEMBER) = 42, + ADV_EXT_TYPE(PC_ID) = 43, /* Power Cycle ID */ + ADV_EXT_TYPE(CTRL_IPORT) = 240, +}; + +enum { + ADV_EXT_IDX(CAP), + ADV_EXT_IDX(BOOT), + ADV_EXT_IDX(LAG), + ADV_EXT_IDX(PC_ID), + ADV_EXT_IDX(CTRL_IPORT), +}; + + +enum { + EPORT_STATE_DOWN = 0, + EPORT_STATE_UP = 1, +}; + +enum fip_packet_type { + FIP_DISCOVER_UCAST = 0, + FIP_DISCOVER_MCAST = 1 +}; + +enum { + FIP_TABLE_HDR_MIDDLE = 0, + FIP_TABLE_HDR_FIRST = 1, + FIP_TABLE_HDR_LAST = 2, + FIP_TABLE_HDR_ONLY = 3 +}; + +enum { + FIP_EXT_LAG_W_POLICY_HOST = 1, + FIP_EXT_LAG_W_POLICY_UCAST = 1 << 2 +}; + +/* string "mellanox" */ +#define FIP_VENDOR_MELLANOX { 0x6d, 0x65, 0x6c, 0x6c, 0x61, 0x6e, 0x6f, 0x78 } + + +#define FIP_TEST_PKT_LENGTH(port, length, type) \ + if ((length) != sizeof(type) + IB_GRH_BYTES) { \ + vnic_dbg_fip(port->name, "Dump packet:" \ + "at %d unexpected size. length %d expected %d\n", \ + __LINE__, (int)length, \ + (int)(sizeof(type) + IB_GRH_BYTES)); \ + return -EINVAL; \ + } + +/* + * copy string b to string a and NULL termination. + * length a must be >= length b+1. 
+ */ +#define TERMINATED_MEMCPY(a,b) \ + do { \ + ASSERT(sizeof(a)>=sizeof(b)+1); \ + memcpy((a), (b), sizeof(b)); \ + (a)[sizeof(b)] = '\0'; \ + } while (0); + + +enum { + FIP_MAX_ADDR_TLVS = 6, + FIP_MAX_TLVS = 32, + FIP_MAX_EXT_DESC = 32, +}; + +struct fip_fip_type { + u8 type; + u8 length; + u16 reserved; +}; + +struct fip_header_simple { + __be16 opcode; + u8 reserved; + u8 subcode; + __be16 list_length; + __be16 flags; +}; + +struct fip_vendor_id_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; +}; + +struct fip_address_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be32 gwtype_qpn; + __be16 sl_gwportid; + __be16 lid; + u8 guid[8]; +}; + +struct fip_gw_information_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + u8 h_nmac_mgid; + u8 n_rss_mgid_tss_qpn; + __be16 n_rss_qpn_vnics; +}; + +struct fip_login_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be16 mtu; + __be16 vnic_id; + __be16 flags_vlan; + u8 mac[6]; + u8 eth_gid_prefix[5]; + u8 antispoofing; + __be16 vfields; + __be32 syndrom_ctrl_qpn; + u8 vnic_name[16]; +}; + +struct context_table_entry { + u8 v_rss_type; + u8 reserved; + u8 mac[ETH_ALEN]; + __be32 qpn; + u8 reserved1; + u8 sl; + __be16 lid; +}; + +struct fip_vhub_update_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be32 state_vhub_id; + __be32 tusn; +}; + +struct fip_vhub_table_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be32 vp_vhub_id; + __be32 tusn; + __be16 hdr; + __be16 table_size; +}; + +struct fip_vnic_identity_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be32 flags_vhub_id; + __be32 tusn; + __be16 vnic_id; + u8 mac[6]; + u8 port_guid[8]; + u8 vnic_name[16]; +}; + +struct fip_partition_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be16 reserved; + __be16 pkey; +}; + +struct fip_gw_identifier_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + u8 sys_guid[8]; + u8 sys_name[32]; + u8 gw_port_name[8]; +}; + +struct fip_ka_params_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be32 adv_period; + __be32 ka_period; + __be32 vnic_ka_period; +}; + +struct fip_ext_desc_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; +}; + +struct fip_extended_type { + u8 ext_type; + u8 len; + u8 reserved; + u8 mandatory; +}; + +struct fip_ext_type_cap { + struct fip_extended_type et; + u32 reserved[4]; +}; + +struct fip_ext_type_boot { + struct fip_extended_type et; + u8 boot_prio; + u8 reserved; + __be16 discovery_timeout; +}; + +struct fip_ext_type_lag_props { + struct fip_extended_type et; + u8 gw_type; + u8 reserved; + __be16 lag_hash; + u8 weight_policy_flags; + u8 ca_threshold; + __be16 link_down_pol_thresh; + u32 reserved2[2]; +}; + +struct fip_ext_type_power_cycle_id { + struct fip_extended_type et; + __be64 power_cycle_id; + u32 reserved; +} __attribute__((packed)); + +struct fip_ext_type_hostname { + struct fip_extended_type et; + u8 hostname[32]; +}; + +struct fip_ext_type_ctrl_iport { + struct fip_extended_type et; + u8 vendor_id[8]; + __be32 gwtype_qpn; + __be16 sl_gwportid; + __be16 lid; + u8 guid[8]; +}; + +struct fip_ext_type_lag_member { + __be32 qpn; + __be16 sl_gw_portid; + __be16 lid; + u8 guid[8]; + u8 eport_state; + u8 reserved1; + u8 weight; + u8 link_utilization; + u32 reserved2; +}; + +struct fip_ext_type_lag_members { + struct fip_extended_type et; + struct fip_ext_type_lag_member lagm[0]; +}; + +struct fip_ext_group { + struct fip_ext_desc_tlv *fed[FIP_MAX_EXT_DESC]; + int num; +}; + +struct fip_address_group { + struct fip_address_tlv *fa[FIP_MAX_ADDR_TLVS]; + int num; +}; + +struct 
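+/* like fip_address_group above, the next group keeps pointers into the
+ * parsed packet (here: context table entries), not copies
+ */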
fip_context_group { + struct context_table_entry *cte; + int num; +}; + +struct fip_content { + struct fip_eoib_ver *eoib_ver; + struct fip_header_simple *fh; + struct fip_vendor_id_tlv *fvend; + struct fip_address_group fa; + struct fip_gw_information_tlv *fgwi; + struct fip_login_tlv *fl; + struct fip_vhub_update_tlv *fvu; + struct fip_vhub_table_tlv *fvt; + struct fip_vnic_identity_tlv *fvi; + struct fip_partition_tlv *fp; + struct fip_gw_identifier_tlv *fgid; + struct fip_ka_params_tlv *fka; + struct fip_ext_group fed; + struct fip_context_group cte; + u64 mask; + u16 offsets[FIP_MAX_TLVS]; + int num; +}; + +/**************************************************************************/ +/* packet format structs */ +/**************************************************************************/ +#define VENDOR_ID_LENGTH 8 + +struct fip_eoib_ver { + u8 version; + u8 reserved[3]; +}; + +struct fip_fip_header { + __be16 opcode; + u8 reserved; + u8 subcode; + __be16 list_length; + __be16 flags; + struct fip_fip_type type; + u8 vendor_id[VNIC_VENDOR_LEN]; +}; + +struct fip_discover_base { + struct fip_fip_type type; + u8 vendor_id[VNIC_VENDOR_LEN]; + u32 qpn; + u16 sl_port_id; + u16 lid; + u8 guid[GUID_LEN]; +}; + +struct eoib_adv_gw_info { /* Gabi */ + struct fip_fip_type type; + u8 vendor_id[VNIC_VENDOR_LEN]; + u8 system_guid[GUID_LEN]; + u8 system_name[VNIC_SYSTEM_NAME_LEN]; + u8 gw_port_name[VNIC_GW_PORT_NAME_LEN]; +}; + +/* keep alive information */ +struct eoib_adv_ka_info { /* Gabi */ + struct fip_fip_type type; + u8 vendor_id[VNIC_VENDOR_LEN]; + u32 gw_adv_period; + u32 gw_period; + u32 vnic_ka_period; +}; + +struct eoib_advertise { + struct fip_eoib_ver version; + struct fip_fip_header fip; + struct fip_discover_base base; + struct fip_fip_type type_1; + u8 vendor_id[VNIC_VENDOR_LEN]; + u8 flags; + u8 reserved; + u16 num_net_vnics; + struct eoib_adv_gw_info gw_info; /* Gabi */ + struct eoib_adv_ka_info ka_info; /* Gabi */ +}; + +struct syndrom_dword { + u8 syndrom; + u8 reserved[3]; +}; + +union syn_qp_ctrl { + struct syndrom_dword syn; + u32 ctl_qpn; +}; + +struct eoib_login { + struct fip_eoib_ver eoib_ver; + struct fip_header_simple fh; + struct fip_vendor_id_tlv fvend; + struct fip_address_tlv fa; + struct fip_login_tlv fl; +}; + +struct fip_solicit_legacy { + struct fip_eoib_ver version; + struct fip_header_simple fh; + struct fip_vendor_id_tlv fvend; + struct fip_address_tlv addr; +}; + +struct fip_solicit_new { + struct fip_eoib_ver version; + struct fip_header_simple fh; + struct fip_vendor_id_tlv fvend; + struct fip_address_tlv addr; + struct fip_ext_desc_tlv ext; + struct fip_ext_type_cap ext_cap; + struct fip_ext_type_hostname ext_hostname; +}; + +union fip_vhub_id { + struct { + u8 flags; + u8 reserved[3]; + } flags; + u32 vhub_id; +}; + +struct eoib_context_table { + struct fip_eoib_ver version; + struct fip_fip_header fip; + struct fip_fip_type type_1; + u8 vendor_id[VNIC_VENDOR_LEN]; + union fip_vhub_id vhub_id; + u32 tusn; + u8 flags; + u8 reserved; + u16 table_size; + /* here come the context entries */ +}; + +/* this is the number of DWORDS to subtract from type_1->length + * to get the size of the entries / 4. 
(size in dwords from start + * of vendor_id field until the first context entry + 1 for checksum + */ +#define FIP_TABLE_SUB_LENGTH 6 + +/* + * eoib_host_update will be used for vHub context requests, + * keep alives and logouts + */ +struct eoib_host_update { + struct fip_eoib_ver version; + struct fip_fip_header fip; + struct fip_fip_type type_1; + u8 vendor_id[VNIC_VENDOR_LEN]; + union fip_vhub_id vhub_id; + u32 tusn; + u16 vnic_id; + u8 mac[ETH_ALEN]; + u8 port_guid[GUID_LEN]; + u8 vnic_name[VNIC_NAME_LEN]; +}; + +enum fip_packet_fields { + EOIB_FIP_OPCODE = 0xFFF9, + FIP_FIP_HDR_LENGTH = 3, + FIP_FIP_HDR_TYPE = 13, + + /* keep all subcodes here */ + FIP_HOST_SOL_SUB_OPCODE = 0x1, + FIP_GW_ADV_SUB_OPCODE = 0x2, + FIP_HOST_LOGIN_SUB_OPCODE = 0x3, + FIP_GW_LOGIN_SUB_OPCODE = 0x4, + FIP_HOST_LOGOUT_SUB_OPCODE = 0x5, + FIP_GW_UPDATE_SUB_OPCODE = 0x6, + FIP_GW_TABLE_SUB_OPCODE = 0x7, + FIP_HOST_ALIVE_SUB_OPCODE = 0x8, + FIP_MAX_SUBCODES, + /* end subcodes section */ + + FIP_FIP_FCF_FLAG = 0x1, + FIP_FIP_SOLICITED_FLAG = 0x2, + FIP_FIP_ADVRTS_FLAG = 0x4, + FIP_FIP_FP_FLAG = 0x80, + FIP_FIP_SP_FLAG = 0x40, + + FIP_BASIC_LENGTH = 7, + FIP_BASIC_TYPE = 240, + + FIP_ADVERTISE_LENGTH_1 = 4, + FIP_ADVERTISE_TYPE_1 = 241, + FIP_ADVERTISE_HOST_VLANS = 0x80, + FIP_ADVERTISE_NUM_VNICS_MASK = 0x0FFF, + FIP_ADVERTISE_N_RSS_SHIFT = 12, + FIP_ADVERTISE_HOST_EN_MASK = 0x80, + FIP_ADVERTISE_ALL_VLAN_GW_MASK = 0x60, + FIP_ADVERTISE_GW_PORT_ID_MASK = 0x0FFF, + FIP_ADVERTISE_SL_SHIFT = 12, + + FIP_ADVERTISE_GW_LENGTH = 15, + FIP_ADVERTISE_GW_TYPE = 248, + + FIP_ADVERTISE_KA_LENGTH = 6, + FIP_ADVERTISE_KA_TYPE = 249, + + FIP_LOGIN_LENGTH_1 = 13, + FIP_LOGIN_TYPE_1 = 242, + FIP_LOGIN_LENGTH_2 = 4, + FIP_LOGIN_TYPE_2 = 246, + + FIP_LOGIN_V_FLAG = 0x8000, + FIP_LOGIN_M_FLAG = 0x4000, + FIP_LOGIN_VP_FLAG = 0x2000, + FIP_LOGIN_H_FLAG = 0x1000, + FIP_LOGIN_VLAN_MASK = 0x0FFF, + FIP_LOGIN_DMAC_MGID_MASK = 0x3F, + FIP_LOGIN_RSS_MGID_MASK = 0x0F, + FIP_LOGIN_RSS_MASK = 0x10, + FIP_LOGIN_RSS_SHIFT = 4, + FIP_LOGIN_CTRL_QPN_MASK = 0xFFFFFF, + FIP_LOGIN_VNIC_ID_BITS = 16, + FIP_LOGIN_ALL_VLAN_GW_FLAG = 0x0040, + + FIP_LOGOUT_LENGTH_1 = 13, + FIP_LOGOUT_TYPE_1 = 245, + + FIP_HOST_UPDATE_LENGTH = 13, + FIP_HOST_UPDATE_TYPE = 245, + FIP_HOST_VP_FLAG = 0x01, + FIP_HOST_U_FLAG = 0x80, + FIP_HOST_R_FLAG = 0x40, + + FIP_CONTEXT_UP_LENGTH = 9, + FIP_CONTEXT_UP_TYPE = 243, + FIP_CONTEXT_UP_EPORT_MASK = 0x30, + FIP_CONTEXT_UP_EPORT_SHIFT = 4, + FIP_CONTEXT_V_FLAG = 0x80, + FIP_CONTEXT_RSS_FLAG = 0x40, + FIP_CONTEXT_TYPE_MASK = 0x0F, + + FIP_CONTEXT_TBL_TYPE = 244, + FIP_CONTEXT_TBL_SEQ_MASK = 0xC0, + FIP_CONTEXT_TBL_SEQ_FIRST = 0x40, + FIP_CONTEXT_TBL_SEQ_LAST = 0x80, + + FKA_ADV_PERIOD = 8000, /* in mSecs */ + FKA_ADV_MISSES = 3 +}; + +enum fip_login_syndroms { + FIP_SYNDROM_SUCCESS = 0, + FIP_SYNDROM_HADMIN_REJECT = 1, + FIP_SYNDROM_GW_RESRC = 2, + FIP_SYNDROM_NO_NADMIN = 3, + FIP_SYNDROM_UNRECOGNISED_HOST = 4, + FIP_SYNDROM_UNSUPPORTED_PARAM = 5, + FIP_SYNDROM_GW_IS_LAG_MEMBER = 6, + FIP_SYNDROM_DUPLICATE_ADDRESS = 7, +}; + +/* + * Send a multicast or unicast solicit packet. The multicast packet is sent + * to the discover mcast group. Unicast packets are sent to the dqpn + dlid + * supplied. The dlid, dqpn, sl are ignored for multicast packets. + * functionreturns 0 on success and error code on failure +*/ +int fip_solicit_send(struct fip_discover *discover, + enum fip_packet_type multicast, u32 dqpn, + u16 dlid, u8 sl, int new_prot); + +/* + * Send a unicast login packet. 
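+ * On the wire this is a struct eoib_login (defined above): the EoIB
+ * version dword, a simple FIP header, and the vendor-id, address and
+ * login TLVs.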
This function supports both host and + * network admined logins. function returns 0 on success and + * error code on failure +*/ +int fip_login_send(struct fip_vnic_data *vnic); + +int fip_logout_send(struct fip_vnic_data *vnic); + +/* + * This function creates and sends a few types of packets (all ucast): + * vHub context request - new=1, logout=0 + * vHub context update / vnic keep alive - new=0, logout=0 + * vnic logout - new=0, logout=1 +*/ +int fip_update_send(struct fip_vnic_data *vnic, int request_new, int logout); + +/* + * Check if a received packet is a FIP packet, And if so return its subtype. + * The FIP type is also returned in fip_type and can be either EOIB_FIP_OPCODE + * or FCOIB_FIP_OPCODE. If the packet is not a FIP packet -EINVAL is returned. +*/ +int fip_pkt_parse(char *buffer, int length, int *fip_type); + +/* + * Already know that this is a FIP packet, return its subtype. +*/ +int fip_pkt_get_subtype_bh(char *buffer); + +/* + * parse a packet that is suspected of being an advertise packet. The packet + * returns 0 for a valid advertise packet and an error code other wise. The + * packets "interesting" details are returned in data. +*/ +int fip_advertise_parse_bh(struct fip_discover *discover, struct fip_content *fc, + struct fip_gw_data *data); + +/* + * parse a packet that is suspected of being an login ack packet. The packet + * returns 0 for a valid login ack packet and an error code other wise. The + * packets "interesting" details are returned in data. +*/ +int fip_login_parse(struct fip_discover *discover, struct fip_content *fc, + struct fip_login_data *data); + +static inline int _map_generic_pkt(struct vnic_port *port, + struct fip_ring_entry *tx_ring_entry, + void *mem, int pkt_size) +{ + /* alloc packet to be sent */ + tx_ring_entry->mem = mem; + + /* map packet to bus */ + tx_ring_entry->bus_addr = + ib_dma_map_single(port->dev->ca, + tx_ring_entry->mem, pkt_size, DMA_TO_DEVICE); + + if (unlikely(ib_dma_mapping_error(port->dev->ca, + tx_ring_entry->bus_addr))) { + vnic_warn(port->name, + "send_generic_pkt failed to map to pci\n"); + return -ENOMEM; + } + tx_ring_entry->length = pkt_size; + + return 0; +} + +static inline int alloc_map_fip_buffer(struct ib_device *ca, + struct fip_ring_entry *me, + int size, gfp_t mask) +{ + me->mem = kmalloc(size, mask); + if (!me->mem) { + vnic_warn(ca->name, "failed to alloc memory (%d)\n", size); + return -ENOMEM; + } + + me->bus_addr = ib_dma_map_single(ca, me->mem, size, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, me->bus_addr))) { + kfree(me->mem); + vnic_warn(ca->name, "ib_dma_mapping_error failed\n"); + return -ENOMEM; + } + me->length = size; + me->entry_posted = 0; + + return 0; +} + +#define DELAYED_WORK_CLEANUP_JIFFS 2 +#define FIP_MAX_PKT_PRINT_LENGTH 120 +#define FIP_OP_RECV (1ul << 31) + +static const char fip_discover_mgid[GID_LEN] = { + 0xFF, 0x12, 0xE0, 0x1B, + 0x00, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00}; +static const char fip_solicit_mgid[GID_LEN] = { + 0xFF, 0x12, 0xE0, 0x1B, + 0x00, 0x07, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00}; + + +/* TODO - remove this: for initial debug only */ +void fip_dbg_dump_raw_pkt(int level, void *buff, + int length, int is_tx, char *name); +enum { + FIP_ETH_HEADER_LEN = 14, + FIP_ENCAP_LEN = 4, + FIP_PROTOCOL_RX_SIZE = 16, /* must be power of 2 */ + FIP_PROTOCOL_TX_SIZE = 64, /* must be power of 2 */ + FIP_LOGIN_RX_SIZE = 64, /* must be power of 2 */ + FIP_LOGIN_TX_SIZE = 64, /* must be power of 2 */ + + 
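+ /*
+ * The ring sizes above must stay powers of 2: the ring code derives
+ * slot indexes by masking, e.g. index = head & (ring->size - 1) (see
+ * send_generic_ucast_pkt() below), which only equals head % size when
+ * size is a power of 2.
+ */
+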
/* timeout in seconds between LOGIN and ACK */
+ FIP_LOGIN_TIMEOUT = 8,
+ FIP_RESOLICIT_TIME = 8,
+
+ IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + FIP_ENCAP_LEN,
+};
+
+struct fip_rcv_pkt {
+ struct list_head list;
+ struct fip_content *fc;
+ int length;
+ void *mem;
+};
+
+/*
+ * Alloc the discover CQ and QP, and configure the QP to RTS.
+ * Alloc the RX + TX rings and queue work for the discover
+ * finite state machine code. If complete is set, it clears
+ * possible previous GW / VNIC data structs on init.
+ */
+int fip_discover_init(struct vnic_port *port, struct fip_discover *discover,
+ u16 pkey, int complete);
+
+/*
+ * Free the discover TX and RX rings, QP and CQ. If complete
+ * is set, it clears possible previous GW / VNIC data structs
+ * by using a "complete" flush, otherwise vnic data is preserved.
+*/
+int fip_discover_cleanup(struct vnic_port *port, struct fip_discover *discover, int complete);
+
+/*
+ * Send a single multicast packet.
+ * Returns 0 on success, non-zero on failure.
+*/
+int fip_mcast_send(struct vnic_port *port, struct ib_qp *qp,
+ unsigned int wr_id, u64 mapping,
+ int size, u16 pkey_index, struct vnic_mcast *mcast);
+/*
+ * Send a single unicast packet.
+ * Returns 0 on success, non-zero on failure.
+*/
+int fip_ucast_send(struct vnic_port *port, struct ib_ah *ah,
+ struct ib_qp *qp,
+ unsigned int wr_id, u64 mapping,
+ int size, u16 pkey_index, u32 dest_qpn, u16 dlid,
+ u32 qkey, u8 sl);
+/*
+ * Configure a newly allocated QP and move it
+ * from reset->init->RTR->RTS.
+ */
+int fip_init_qp(struct vnic_port *port, struct ib_qp *qp,
+ u16 pkey_index, char *name);
+
+/*
+ * Allocate a single RX buffer (of the given size), map it to the PCI
+ * bus and post it to the QP for receive. The _id parameter identifies
+ * the entry in the receive queue when its completion is received.
+ * Kernel and bus addresses are returned in mem_entry.
+ * Returns 0 on success, else failure.
+ */
+int fip_post_receive(struct vnic_port *port, struct ib_qp *qp, int size,
+ int _id, struct fip_ring_entry *mem_entry, char *name);
+
+/* triggered by a core event */
+void fip_qp_to_reset(struct ib_qp *qp, char *name);
+void fip_flush_rings(struct vnic_port *port,
+ struct ib_cq *cq,
+ struct ib_qp *qp,
+ struct fip_ring *rx_ring,
+ struct fip_ring *tx_ring,
+ char *name);
+void fip_free_rings(struct vnic_port *port,
+ struct fip_ring *rx_ring,
+ struct fip_ring *tx_ring,
+ char *name);
+
+/*
+ * This function allocates the tx buffers and initializes the head and
+ * tail indexes.
+ */
+int fip_init_tx(int size, struct fip_ring *tx_ring, char *name);
+
+/*
+ * Create the RX ring for the given QP and post its receive buffers
+ * (the RX half of configuring the discover QP).
+ */
+int fip_init_rx(struct vnic_port *port, int ring_size, struct ib_qp *qp,
+ struct fip_ring *rx_ring, char *name);
+
+/*
+ * This is a general purpose CQ completion function that handles
+ * completions on RX and TX rings. It can serve all users that are
+ * using RX and TX rings.
+ * RX completions are distinguished from TX completions by the MSB
+ * (see FIP_OP_RECV), which is set for RX and clear for TX. For RX, the
+ * memory is unmapped from the PCI bus and the head is incremented. For
+ * TX the memory is unmapped and then freed.
+ * The function returns the number of packets received.
+*/
+int fip_comp(struct vnic_port *port,
+ struct ib_cq *cq,
+ struct fip_ring *rx_ring,
+ struct fip_ring *tx_ring,
+ char *name);
+
+/*
+ * This function is the driving engine of the vnic logic. It manages the
+ * vNic state machines. 
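+ *
+ * A minimal sketch of the dispatch pattern such a work handler follows
+ * (illustrative only; the state names are from the FIP_VNIC_* set used
+ * by this driver):
+ *
+ *	vnic = container_of(work, struct fip_vnic_data, vnic_task.work);
+ *	switch (vnic->state) {
+ *	case FIP_VNIC_LOGIN:		/* send a login request */
+ *	case FIP_VNIC_WAIT_4_ACK:	/* wait for / verify the login ack */
+ *	...				/* rings, mcast and vhub init states */
+ *	}
+ *	if (recall)
+ *		queue_delayed_work(fip_wq, &vnic->vnic_task, recall_time);
+ *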
+ * Some of the states in the state machine could have been removed because
+ * they contain "actions" and not states. Still, it is easier to maintain
+ * the code this way and it gives an easy mechanism for exception handling
+ * and retries.
+ * Only call this function from fip_wq context.
+*/
+void fip_vnic_fsm(struct work_struct *work);
+
+/*
+ * Mark the vnic for deletion and trigger a delayed call to the cleanup
+ * function. In the past the vnic was moved to another list, but this
+ * might cause vnic duplication if new vnics are added to the GW. Even
+ * if the vnic is being flushed we need to know it is there.
+ *
+ * Note: This deletion method ensures that all pending vnic work requests
+ * are cleared without depending on the calling context.
+*/
+void fip_vnic_close(struct fip_vnic_data *vnic, enum fip_flush flush);
+
+/*
+ * Free vnic resources. This includes closing the data vnic (data QPs etc.)
+ * and the discovery resources. If the vnic can be totally destroyed (no
+ * pending work) the vnic will be removed from the GW list and its memory
+ * freed. If not, the vnic will not be freed and the function will return an
+ * error. The caller needs to call this function again to complete the
+ * operation.
+*/
+int fip_vnic_destroy(struct fip_vnic_data *vnic);
+
+struct fip_vnic_data *fip_vnic_alloc(struct vnic_port *port,
+ struct fip_gw_data *gw,
+ int hadmin,
+ u16 vnic_id);
+
+/*
+ * Look for a vnic in the GW vnic list. The search key used is either the
+ * vnic_id, which is unique, or the mac+vlan pair. A match on either key will
+ * result in the return of the vnic. Both keys are necessary because the host
+ * assigned delete flow might not have access to the vnic_id. The search
+ * disregards vnics that are undergoing full flush (they will be removed soon).
+*/
+struct fip_vnic_data *fip_vnic_find_in_list(struct fip_gw_data *gw,
+ u16 vnic_id, u8 *mac,
+ u16 vlan, u8 vlan_used);
+
+/*
+ * Process an incoming login ack packet. The packet was already parsed and
+ * its data was placed in *data. The function creates RX and TX rings for the
+ * vnic and starts the multicast join procedure.
+ * This function should not be called for packets other than login ack packets.
+*/
+void fip_vnic_login_ack_recv(struct fip_vnic_data *vnic,
+ struct fip_login_data *data);
+
+/*
+ * This function should be called when the building of a vhub context
+ * table is done and the vnic state should transition to CONNECTED.
+*/
+int fip_vnic_tbl_done(struct fip_vnic_data *vnic);
+int fip_vnic_mcast_recnct(struct fip_vnic_data *vnic);
+
+/*
+ * Init the vnic's vHub table data structures, before using them.
+ */
+void vhub_ctx_init(struct fip_vnic_data *vnic);
+void vhub_table_free(struct vhub_elist *elist);
+
+/*
+ * Clear and free the vnic's vHub context table data structures.
+ */
+void vhub_ctx_free(struct fip_vnic_data *vnic);
+
+/*
+ * This function handles a vhub context table packet. The table will
+ * be processed only if we do not have an up-to-date local copy of
+ * our own. The table update supports multi-packet tables, so care
+ * must be taken in building the complete table.
+*/
+int vhub_handle_tbl(struct fip_vnic_data *vnic, struct fip_content *fc,
+ u32 vhub_id, u32 tusn);
+
+/*
+ * This function handles vhub context update packets. There are three flows
+ * in handling update packets. 
The first is before the main table is up + * to date, the second is after the table is up to date but before it was + * passed to the ownership of the data vnic (login struct) and the local + * lists are freed, and the last is when the table maintanence is done + * by the data vnic. This function handles all cases. +*/ +int vhub_handle_update(struct fip_vnic_data *vnic, + u32 vhub_id, u32 tusn, + struct vnic_table_entry *data); + +/* + * This function writes the main vhub table to the data (login) vnic. + * You should call it when the data vnic is ready for it and after the + * table is up to date (and the update list was applied to the main list) + */ +int fip_vnic_write_tbl(struct fip_vnic_data *vnic); + +/* sysfs entries for hadmin vNics*/ +int vnic_create_hadmin_dentry(struct fip_vnic_data *vnic); +void vnic_delete_hadmin_dentry(struct fip_vnic_data *vnic); +void extract_memb_extended(struct fip_ext_type_lag_members *ext_lag_membs, + int ext_length, + struct lag_members *lagm, + char *name); +int handle_member_update(struct fip_vnic_data *vnic, struct lag_members *lm); +int extract_vhub_extended(struct fip_ext_desc_tlv *fed, + struct fip_vnic_data *vnic); +static inline int send_generic_ucast_pkt(struct vnic_port *port, + struct ib_ah *ah, + struct fip_ring *tx_ring, + void *mem, int pkt_size, + struct ib_qp *qp, + int pkey_index, + u32 dst_qpn, u16 dst_lid, + u32 qkey, u8 sl) +{ + int index, rc; + unsigned long flags; + unsigned long tail; + + /* + * we are only allowed to update the head at task level so no need to + * perform any locks here + */ + spin_lock_irqsave(&tx_ring->ring_lock, flags); + index = tx_ring->head & (tx_ring->size - 1); + + vnic_dbg_fip(port->name, "send ucast packet\n"); + + spin_lock(&tx_ring->head_tail_lock); + tail = tx_ring->tail; + spin_unlock(&tx_ring->head_tail_lock); + + /* ring full try again */ + if (tx_ring->head - tail >= tx_ring->size) { + vnic_warn(port->name, "send_generic_pkt ring full: QPN 0x%x: tail=%ld head=%ld diff=%ld\n", + qp->qp_num, tx_ring->tail, tx_ring->head, tx_ring->head - tx_ring->tail); + rc = -EAGAIN; + goto err; + } + + + rc = _map_generic_pkt(port, &tx_ring->ring[index], mem, pkt_size); + if (rc) + goto err; + + rc = fip_ucast_send(port, ah, qp, index, + tx_ring->ring[index].bus_addr, + pkt_size, pkey_index, dst_qpn, dst_lid, + qkey, sl); + + if (rc) { + vnic_warn(port->name, "fip_ucast_send() failed (%d)\n", rc); + rc = -ENODEV; + goto error_unmap_dma; + } + + tx_ring->head++; + + spin_unlock_irqrestore(&tx_ring->ring_lock, flags); + return 0; + +error_unmap_dma: + ib_dma_unmap_single(port->dev->ca, + tx_ring->ring[index].bus_addr, + pkt_size, DMA_TO_DEVICE); +err: + spin_unlock_irqrestore(&tx_ring->ring_lock, flags); + return rc; +} + +static inline const char *eport_state_str(int state) +{ + switch (state) { + case EPORT_STATE_DOWN: return "Down"; + case EPORT_STATE_UP: return "Up"; + default:return "Invalid"; + } +} + +#endif /* _VNIC_FIP_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.c new file mode 100644 index 0000000000000..71829aaa626ae --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.c @@ -0,0 +1,2183 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_fip.h" +#include "vnic_fip_discover.h" +#include "vnic_fip_pkt.h" + +#define FIP_MAX_PKT_PRINT_LENGTH 120 + +static void fip_purge_gws(struct work_struct *work); +static void fip_discover_gw_fsm(struct work_struct *work); +static void fip_discover_hadmin_update(struct work_struct *work); +static void fip_discover_fsm(struct work_struct *work); +void fip_close_gw(struct fip_gw_data *gw, enum fip_flush flush); + +/* TODO - remove this: for initial debug only */ +void fip_dbg_dump_raw_pkt(int level, void *buff, + int length, int is_tx, char *name) +{ + int i; + int tmp_len; + u32 *data_ptr; + unsigned char *tmp_data_ptr; + + if (!(vnic_msglvl & VNIC_DEBUG_PKT_DUMP)) + return; + + printk(KERN_DEBUG "%s %s: packet length is %d\n", + is_tx ? "TX" : "RX", name, length); + + length = (length > FIP_MAX_PKT_PRINT_LENGTH) ? + FIP_MAX_PKT_PRINT_LENGTH : length; + + tmp_len = (length >> 2) + 1; + data_ptr = (u32 *)buff; + for (i = 0; i < tmp_len; i++) { + if (!is_tx && i == IB_GRH_BYTES >> 2) + printk(KERN_DEBUG "========================\n"); + tmp_data_ptr = (unsigned char *)&data_ptr[i]; + printk(KERN_DEBUG "%02x %02x %02x %02x \n", + tmp_data_ptr[0], tmp_data_ptr[1], + tmp_data_ptr[2], tmp_data_ptr[3]); + } +} + +/* + * Configure the discover QP. 
This includes configuring rx+tx + * moving the discover QP to RTS and creating the tx and rx rings + */ +int fip_discover_start_rings(struct fip_discover *discover, + struct fip_ring *rx_ring, + struct fip_ring *tx_ring, + struct ib_cq *cq, + struct ib_qp *qp) +{ + int rc; + + rc = fip_init_tx(tx_ring->size, tx_ring, discover->name); + if (rc) { + vnic_warn(discover->name, "fip_init_tx failed rc %d\n", rc); + /* set RX ring size to 0 as indication of the failure + so RX rings won't be freed, no need to set tx_ring->size + since fip_init_tx error flow will handle it */ + rx_ring->size = 0; + return rc; + } + + rc = fip_init_rx(discover->port, rx_ring->size, qp, rx_ring, discover->name); + if (rc) { + vnic_warn(discover->name, "fip_init_rx returned %d\n", rc); + goto release_queues; + } + + return 0; + +release_queues: + fip_flush_rings(discover->port, cq, qp, rx_ring, tx_ring, discover->name); + fip_free_rings(discover->port, rx_ring, tx_ring, discover->name); + + return rc; +} + +int fip_discover_init_rings(struct vnic_port *port, + struct fip_discover *discover, + struct fip_ring *rx_ring, + struct fip_ring *tx_ring, + struct ib_cq **cq, + struct ib_qp **qp, + ib_comp_handler comp_handler) +{ + struct ib_qp_init_attr qp_init_attr; + struct ib_device *ca = port->dev->ca; + + + *cq = ib_create_cq(ca, comp_handler, NULL, discover, + rx_ring->size + tx_ring->size, 0); + if (IS_ERR(*cq)) { + vnic_warn(discover->name, "failed to create CQ\n"); + goto out; + } + + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + qp_init_attr.cap.max_send_wr = tx_ring->size; + qp_init_attr.cap.max_recv_wr = rx_ring->size; + qp_init_attr.cap.max_send_sge = 1; + qp_init_attr.cap.max_recv_sge = 1; + qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.qp_type = IB_QPT_UD; + qp_init_attr.send_cq = *cq; + qp_init_attr.recv_cq = *cq; + + *qp = ib_create_qp(port->pd, &qp_init_attr); + if (IS_ERR(*qp)) { + vnic_warn(discover->name, "failed to create QP\n"); + goto error_free_cq; + } + + /* move QP to RTS */ + if (fip_init_qp(discover->port, *qp, discover->pkey_index, discover->name)) { + vnic_warn(discover->name, "fip_init_qp failed for qp\n"); + goto error_free_qp; + } + + /* init RX + TX rings */ + if (fip_discover_start_rings(discover, rx_ring, tx_ring, *cq, *qp)) { + vnic_warn(discover->name, "failed to start rings\n"); + goto error_free_qp; + } + + /* enable receiving CQ comps, triggers fip_discover_comp() */ + if (ib_req_notify_cq(*cq, IB_CQ_NEXT_COMP)) { + vnic_warn(discover->name, "ib_req_notify_cq failed for cq\n"); + goto error_release_rings; + } + + return 0; + +error_release_rings: + fip_flush_rings(discover->port, *cq, *qp, rx_ring, tx_ring, discover->name); + fip_free_rings(discover->port, rx_ring, tx_ring, discover->name); +error_free_qp: + ib_destroy_qp(*qp); +error_free_cq: + ib_destroy_cq(*cq); +out: + *qp = NULL; + *cq = NULL; + return -ENODEV; +} + +/* + * This function handles completions of both TX and RX + * packets. RX packets are unmapped lightly parsed moved to a list + * and passed to thread processing. TX packets are unmapped and freed. + * Note: this function is called from interrupt context + */ +static void fip_discover_comp(struct ib_cq *cq, void *discover_ptr) +{ + struct fip_discover *discover = discover_ptr; + + /* handle completions. 
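+ * This callback runs in interrupt context (the CQ was armed with
+ * IB_CQ_NEXT_COMP in fip_discover_init_rings()), so it must not
+ * block; it only drains the rings before handing off.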
On RX packets this will call discover_process_rx + * from thread context to continue processing */ + if (fip_comp(discover->port, discover->cq, + &discover->rx_ring, &discover->tx_ring, + discover->name)) + fip_discover_process_rx(discover); +} + +/* + * Alloc the discover CQ, QP. Configure the QP to RTS. + * alloc the RX + TX rings and queue work for discover + * finite state machine code. + */ +int fip_discover_init(struct vnic_port *port, struct fip_discover *discover, + u16 pkey, int complete) +{ + int rc; + + discover->port = port; + discover->flush = FIP_NO_FLUSH; + discover->state = FIP_DISCOVER_INIT; + discover->rx_ring.size = FIP_PROTOCOL_RX_SIZE; + discover->tx_ring.size = FIP_PROTOCOL_TX_SIZE; + discover->new_prot_gws = 0; + discover->old_prot_gws = 0; + + /* This is in preparation for pkey discovery */ + + init_completion(&discover->flush_complete); + + INIT_DELAYED_WORK(&discover->fsm_task, fip_discover_fsm); + INIT_DELAYED_WORK(&discover->cleanup_task, fip_purge_gws); + INIT_DELAYED_WORK(&discover->hadmin_update_task, fip_discover_hadmin_update); + INIT_WORK(&discover->pkt_rcv_task_bh, fip_discover_process_rx_bh); + spin_lock_init(&discover->rcv_list.lock); + INIT_LIST_HEAD(&discover->rcv_list.list); + spin_lock_init(&discover->lock); + + + if (complete) { + discover->pkey = pkey; + INIT_LIST_HEAD(&discover->gw_list); + init_rwsem(&discover->l_rwsem); + sprintf(discover->name, "%s_P%x", port->name, discover->pkey); + } + INIT_LIST_HEAD(&discover->hadmin_cache); + vnic_mcast_root_init(&discover->mcast_tree); + + if (!ib_find_pkey(port->dev->ca, port->num, discover->pkey, &discover->pkey_index)) { + rc = fip_discover_init_rings(port, discover, &discover->rx_ring, + &discover->tx_ring, &discover->cq, + &discover->qp, fip_discover_comp); + if (rc) { + vnic_warn(discover->name, "descovered init failed rc=%d\n", rc); + return rc; + } + + /* start discover FSM code */ + /* calls fip_discover_fsm() */ + queue_delayed_work(fip_wq, &discover->fsm_task, 0); + } else { + vnic_warn(discover->name, "Configured PKEY 0x%X is not supported on port\n", discover->pkey); + discover->pkey_index = ILLEGAL_PKEY_INDEX; + } + + + return 0; +} + +void fip_recv_list_flush(struct fip_discover *discover) +{ + struct list_head discov_recv_local; + struct fip_rcv_pkt *rcv, *rcv1; + unsigned long flags; + + INIT_LIST_HEAD(&discov_recv_local); + + spin_lock_irqsave(&discover->rcv_list.lock, flags); + list_replace_init(&discover->rcv_list.list, &discov_recv_local); + spin_unlock_irqrestore(&discover->rcv_list.lock, flags); + + list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) { + list_del(&rcv->list); + kfree(rcv); + } + return; +} + +/* + * free the discover TX and RX rings, QP and CQ. + * May not be called from fip wq context. + */ +int fip_discover_cleanup(struct vnic_port *port, struct fip_discover *discover, int complt) +{ + if (discover->state == FIP_DISCOVER_OFF) + return -EINVAL; + + /* move FSM to flush state and wait for the FSM + * to finish whatever it is doing before we continue + */ + vnic_dbg_mark(); + init_completion(&discover->flush_complete); + discover->flush = complt ? 
FIP_FULL_FLUSH : FIP_PARTIAL_FLUSH; + cancel_delayed_work(&discover->fsm_task); +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&discover->hadmin_update_task); +#else + cancel_delayed_work(&discover->hadmin_update_task); + flush_workqueue(fip_wq); +#endif + /* flush any hadmin entries leftovers */ + { + struct fip_hadmin_cache *hadmin, *hadmin_t; + + spin_lock_irq(&discover->lock); + list_for_each_entry_safe(hadmin, hadmin_t, + &discover->hadmin_cache, next) { + list_del(&hadmin->next); + kfree(hadmin); + } + spin_unlock_irq(&discover->lock); + } + + /* calls fip_discover_fsm() */ + queue_delayed_work(fip_wq, &discover->fsm_task, 0); + vnic_dbg_mark(); + /* calls fip_discover_fsm() */ + wait_for_completion(&discover->flush_complete); + vnic_dbg_mark(); + + /* make sure that discover FSM is idle */ +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&discover->fsm_task); +#else + cancel_delayed_work(&discover->fsm_task); + flush_workqueue(fip_wq); +#endif + + if (discover->pkey_index != ILLEGAL_PKEY_INDEX) { + fip_flush_rings(port, discover->cq, discover->qp, + &discover->rx_ring, &discover->tx_ring, + discover->name); + fip_free_rings(port, &discover->rx_ring, &discover->tx_ring, + discover->name); + + fip_recv_list_flush(discover); + if (discover->qp) + ib_destroy_qp(discover->qp); + discover->qp = NULL; + + if (discover->cq) + ib_destroy_cq(discover->cq); + discover->cq = NULL; + } + + return 0; +} + +/* + * This function runs in interrupt context + * It does sanity checking of the packet, moves it to a list and passes + * handling to a thread. + */ +void fip_discover_process_rx(struct fip_discover *discover) +{ + struct vnic_port *port = discover->port; + int mtu_size = FIP_UD_BUF_SIZE(port->max_mtu_enum); + int rc; + int queue_packet, one_or_more_queued = 0; + struct fip_rcv_pkt *rcv, *rcv1; + struct list_head discov_recv_local; + int index; + struct fip_content *fc; + int err; + struct fip_ring_entry *ring; + + INIT_LIST_HEAD(&discov_recv_local); + + if (discover->flush != FIP_NO_FLUSH) + return; + + while (discover->rx_ring.head != discover->rx_ring.tail) { + fc = NULL; + queue_packet = 0; + index = discover->rx_ring.tail & (discover->rx_ring.size - 1); + ring = &discover->rx_ring.ring[index]; + + if (ring->entry_posted == 1 && + discover->state == FIP_DISCOVER_SOLICIT) { + fc = kzalloc(sizeof *fc, GFP_ATOMIC); + if (likely(fc)) { + /* login is the first state we RX packets in */ + rc = fip_packet_parse(port, ring->mem + IB_GRH_BYTES, + ring->length - IB_GRH_BYTES, fc); + if (!rc) + fip_discover_rx_packet(&queue_packet, fc); + } else + vnic_warn(discover->name, "allocation failed\n"); + } + if (queue_packet) { + int length; + + length = ring->length - IB_GRH_BYTES; + rcv = kmalloc(sizeof *rcv, GFP_ATOMIC); + if (!rcv) { + vnic_dbg_fip(discover->name, "failed kmalloc\n"); + kfree(fc); + } else { + struct fip_ring_entry me; + + err = alloc_map_fip_buffer(port->dev->ca, &me, + mtu_size, GFP_ATOMIC); + if (err) { + kfree(fc); + kfree(rcv); + } else { + rcv->length = length; + rcv->fc = fc; + rcv->mem = ring->mem; + list_add_tail(&rcv->list, &discov_recv_local); + one_or_more_queued++; + ib_dma_unmap_single(port->dev->ca, + ring->bus_addr, + mtu_size, DMA_FROM_DEVICE); + *ring = me; + } + } + } else + kfree(fc); + + rc = fip_post_receive(port, discover->qp, + FIP_UD_BUF_SIZE(discover->port->max_mtu_enum), + index, ring, discover->name); + if (rc) + vnic_warn(discover->name, "fip_post_receive rc %d\n", rc); + + discover->rx_ring.tail++; + } + + if (one_or_more_queued) { + 
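+ /*
+ * Hand the packets collected above to the fip_wq worker: take the
+ * discover lock first so a concurrent flush cannot slip in between
+ * the splice onto rcv_list and the queue_work() call; if a flush
+ * has started, the local list is freed below instead.
+ */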
spin_lock(&discover->lock); + if (likely(discover->flush == FIP_NO_FLUSH)) { + spin_lock(&discover->rcv_list.lock); + list_splice_init(&discov_recv_local, discover->rcv_list.list.prev); + spin_unlock(&discover->rcv_list.lock); + /* calls fip_discover_process_rx_bh */ + queue_work(fip_wq, &discover->pkt_rcv_task_bh); + spin_unlock(&discover->lock); + } else { + spin_unlock(&discover->lock); + list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) { + list_del(&rcv->list); + kfree(rcv->fc); + kfree(rcv->mem); + kfree(rcv); + } + } + } + + return; +} + +/* + * This function is the RX packet handler bottom half. It runs on the fip wq. +*/ +void fip_discover_process_rx_bh(struct work_struct *work) +{ + struct fip_discover *discover = + container_of(work, struct fip_discover, pkt_rcv_task_bh); + int rc; + struct list_head discov_recv_local; + struct fip_rcv_pkt *rcv, *rcv1; + unsigned long flags; + + INIT_LIST_HEAD(&discov_recv_local); + + /* the irqsave is needed because debug kernel above 2.6.27 complains about + * hard irq safe to hard irq unsafe on discover.lock */ + spin_lock_irqsave(&discover->rcv_list.lock, flags); + list_replace_init(&discover->rcv_list.list, &discov_recv_local); + spin_unlock_irqrestore(&discover->rcv_list.lock, flags); + + if (discover->flush != FIP_NO_FLUSH) { + list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) { + list_del(&rcv->list); + kfree(rcv->fc); + kfree(rcv->mem); + kfree(rcv); + } + return; + } + + list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) { + rc = fip_discover_rx_packet_bh(discover, rcv->fc); + if (rc) + vnic_warn(discover->name, "discover_rx_packet rc %d\n", rc); + + list_del(&rcv->list); + kfree(rcv->fc); + kfree(rcv->mem); + kfree(rcv); + } + return; +} + +static inline int fip_close_all_vnics(struct fip_gw_data *gw, enum fip_flush flush) +{ + struct fip_vnic_data *vnic; + int open_vnics = 0; + + vnic_dbg_func(gw->discover->name); + + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) { + open_vnics++; + fip_vnic_close(vnic, flush); + } + return open_vnics; +} + +static int fip_gw_create_vnics(struct fip_gw_data *gw) +{ + struct fip_vnic_data *vnic; + unsigned long first_free_vnic; + struct fip_vnic_send_info gw_address; + int i; + + gw->info.gw_num_vnics = (gw->info.gw_num_vnics > FIP_MAX_VNICS_PER_GW) ? + FIP_MAX_VNICS_PER_GW : gw->info.gw_num_vnics; + + + gw->info.gw_num_vnics = vnic_net_admin ? gw->info.gw_num_vnics : 0; + fip_vnic_create_gw_param(&gw_address, gw->info.gw_qpn, VNIC_FIP_QKEY, + gw->info.gw_lid, vnic_gw_ctrl_sl(gw)); + /* for host admined */ + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) { + if (vnic->hadmined) { + if (gw->info.hadmined_en) + fip_hadmin_vnic_refresh(vnic, &gw_address); + else { + vnic_dbg_fip(gw->discover->name, + "fip_gw_create_vnics hadmin disabled, " + "close open hadmin vnics\n"); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + } + } + } + + /* for network admined */ + for (i = gw->vnic_count; i < gw->info.gw_num_vnics; i++) { + vnic_dbg_fip(gw->discover->name, "fip_gw_create_vnics available" + " vnics %d needed %d\n", + gw->vnic_count, gw->info.gw_num_vnics); + + /* start network assigned at half array. 
leave first half to host admin */ + first_free_vnic = find_first_zero_bit(gw->n_bitmask, + FIP_MAX_VNICS_PER_GW); + if (first_free_vnic >= FIP_MAX_VNICS_PER_GW) + return -ENOMEM; + + vnic = fip_vnic_alloc(gw->discover->port, gw, 0 /* hadmin */, first_free_vnic); + if (!vnic) + return -ENOMEM; + + fip_vnic_set_gw_param(vnic, &gw_address); + set_bit(first_free_vnic, gw->n_bitmask); + list_add_tail(&vnic->gw_vnics, &gw->vnic_list); + gw->vnic_count++; + + /* calls fip_vnic_fsm() */ + cancel_delayed_work(&vnic->vnic_task); + fip_vnic_fsm(&vnic->vnic_task.work); + } + + return 0; +} + +/* + * This function goes over vnics and closes network administrated vNics + * that are not open and do not receive neighbor table info (there + * is no way for the BXM to tell the vNics to close before the + * vnic is listening to the neighbour tables). +*/ +static int fip_gw_close_nonopen_vnics(struct fip_gw_data *gw) +{ + struct fip_vnic_data *vnic; + int closed_vnics = 0; + + vnic_dbg_fip(gw->discover->name, "Try to close non open vnics\n"); + + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) { + vnic_dbg_fip(gw->discover->name, "check vnic %s, hadmin %d state %d\n", + vnic->name, vnic->hadmined, vnic->state); + if (!vnic->hadmined && vnic->state < FIP_VNIC_VHUB_DONE) { + vnic_dbg_fip(gw->discover->name, "closing vnic %s\n", vnic->name); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + closed_vnics++; + } + } + + return closed_vnics; +} + +/* permanently delete all vnics pending delete. The function goes over + * the list of vnics awaiting deletion and tries to delete them. If the + * vnic destructor returns an error value (currently busy) the function + * will requeue it self for another try. The function will also test if + * new vnics need to be added as a result of vnic removal. + */ +static void fip_purge_vnics(struct work_struct *work) +{ + struct fip_gw_data *curr_gw = + container_of(work,struct fip_gw_data, vnic_cleanup_task.work); + struct fip_vnic_data *vnic, *tmp_vnic; + int vnic_id, rc, del_cnt = 0, retry = 0; + unsigned long *bitmask; + + vnic_dbg_fip(curr_gw->discover->name, "fip_purge_vnics\n"); + + list_for_each_entry_safe(vnic, tmp_vnic, &curr_gw->vnic_list, gw_vnics) { + enum fip_flush f; + vnic_id = vnic->vnic_id; + bitmask = vnic->hadmined ? NULL : curr_gw->n_bitmask; + + /* If successful vnic is removed from list and destroyed */ + f = vnic->flush; + if (f != FIP_NO_FLUSH) { + rc = fip_vnic_destroy(vnic); + if (!rc) { + del_cnt++; + if (f == FIP_FULL_FLUSH && bitmask) + clear_bit(vnic_id, bitmask); + } else { + retry |= rc; + } + } + + /* limit the number of vnics to purge in each loop to let other + * tasks on same wq to run (i.e., avoid starvation). + */ + if (del_cnt > 2) { + retry = 1; + break; + } + } + + /* This means we still have vnics that refuse to close retry later */ + if (retry){ + vnic_dbg_mark(); + /* calls fip_purge_vnics() */ + queue_delayed_work(fip_wq, &curr_gw->vnic_cleanup_task, HZ / 10); + } else { + vnic_dbg_fip(curr_gw->discover->name, "fip_purge_vnics, all GW" + " vnics closed\n"); + + if (curr_gw->hadmin_gw && curr_gw->state == FIP_GW_HOST_ADMIN && list_empty(&curr_gw->vnic_list)) { + vnic_warn(curr_gw->discover->name, + "Removing Host admin GW %s with no vnics\n", + (char*)curr_gw->info.vol_info.gw_port_name); + fip_close_gw(curr_gw, FIP_FULL_FLUSH); + } + /* test and open new vnics if vnics are missing */ + /* ALITODO: after GW timeout, a vnic is re-created! why is that? 
+ if (fip_gw_create_vnics(curr_gw)) { + vnic_dbg_mark(); + queue_delayed_work(fip_wq, + &curr_gw->vnic_cleanup_task, HZ); + } + */ + } +} + +/* + * This function adds or removes a single host admined vnic to a GW. + * First the function searches for the vnic. The search function + * disregards vnics that are undergoing a complete flush. +*/ +int fip_gw_update_hadmin_gw(struct fip_gw_data *gw, + struct fip_hadmin_cache *hadmin_entry) +{ + struct fip_vnic_data *vnic; + int vnic_id = hadmin_entry->vnic_id, rc = 0; + + /* set bit 16 for hadmin vNics (by spec) */ + vnic_id |= (1 << (VNIC_ID_LEN - 1)); + + vnic = fip_vnic_find_in_list(gw, vnic_id, hadmin_entry->mac, + hadmin_entry->vlan, + hadmin_entry->vlan_used); + + /* remove: if vNic found - remove it and exit */ + if (hadmin_entry->remove) { + if (vnic) + fip_vnic_close(vnic, FIP_FULL_FLUSH); + else + vnic_dbg_fip(gw->discover->name, "vNic to remove is" + " not found (name:%s mac:"MAC_6_PRINT_FMT + " vlan:%d id:%d)\n", + hadmin_entry->interface_name, + MAC_6_PRINT_ARG(hadmin_entry->mac), + hadmin_entry->vlan, vnic_id); + goto out; + } + + /* add: if vNic found - report error, otherwise add new vNic */ + if (vnic) { + /* skip error reporting between child vNics conflict, + * as vnic_learn_mac() may learn same child while it's still + * pending. TODO: improve this to avoid such cases. + */ + if (hadmin_entry->parent_used && vnic->parent_used) + goto out; + vnic_warn(gw->discover->name, "vNic creation failed, duplicate" + " vNic detected (name:%s mac:"MAC_6_PRINT_FMT + " vlan:%d id:%d & existing name:%s mac:" + MAC_6_PRINT_FMT" vlan:%d id:%d)\n", + hadmin_entry->interface_name, + MAC_6_PRINT_ARG(hadmin_entry->mac), + hadmin_entry->vlan, vnic_id, vnic->interface_name, + MAC_6_PRINT_ARG(vnic->login_data.mac), + vnic->login_data.vlan, vnic->login_data.vnic_id); + goto out; + } + +#if 0 + /* if the GW is in all_vlan mode, + * the host can only create vlans in this mode. 
+ * However if it is not in all_vlan mode, the host must not create + * vlans in this mode */ + if ((gw->info.all_vlan_gw && !hadmin_entry->all_vlan_gw + && hadmin_entry->vlan_used) || + (!gw->info.all_vlan_gw && hadmin_entry->all_vlan_gw)) { + vnic_warn(gw->discover->name, "vnic creation failed, all_vlan" + " gateway policy must be enforced between the gateway" + " and the host\n"); + rc = -EINVAL; + goto out; + } +#endif + + vnic = fip_vnic_alloc(gw->discover->port, gw, 1 /* hadmin */, vnic_id); + if (!vnic) { + rc = -ENOMEM; + goto out; + } + + /* hand over info from hadmin to vnic struct */ + memcpy(vnic->login_data.mac, hadmin_entry->mac, sizeof(vnic->login_data.mac)); + memcpy(vnic->interface_name, hadmin_entry->interface_name, + sizeof(vnic->interface_name)); + vnic->login_data.vlan = hadmin_entry->vlan; + vnic->login_data.vp = hadmin_entry->vlan_used; + vnic->login_data.all_vlan_gw = hadmin_entry->all_vlan_gw; + memcpy(vnic->shared_vnic.ip, hadmin_entry->shared_vnic_ip, + sizeof(vnic->shared_vnic.ip)); + memcpy(vnic->shared_vnic.emac, hadmin_entry->shared_vnic_mac, + sizeof(vnic->shared_vnic.emac)); + vnic->shared_vnic.enabled = is_valid_ipv4(hadmin_entry->shared_vnic_ip); + vnic->vnic_id = vnic_id; /* will be overwritten later */ + vnic->vlan_used = hadmin_entry->vlan_used; + vnic->parent_used = hadmin_entry->parent_used; + memcpy(vnic->parent_name, hadmin_entry->parent_name, + sizeof(vnic->parent_name)); + vnic->qp_base_num = hadmin_entry->qp_base_num; + vnic->vlan = hadmin_entry->vlan; + vnic->cmd = hadmin_entry->cmd; + vnic->all_vlan_gw = hadmin_entry->all_vlan_gw; + + /* create dentry */ + rc = vnic_create_hadmin_dentry(vnic); + if (rc) + goto init_failed; + + rc = fip_vnic_hadmin_init(gw->discover->port, vnic); + if (rc) + goto init_failed; + + list_add_tail(&vnic->gw_vnics, &gw->vnic_list); + + /* calls fip_vnic_fsm() */ + fip_vnic_fsm(&vnic->vnic_task.work); + + return 0; + +init_failed: + vnic_delete_hadmin_dentry(vnic); + kfree(vnic); +out: + return rc; +} + +/* + * Queue the GW for deletion. And trigger a delayed call to the cleanup + * function. + * Note: This deletion method insures that all pending GW work requests + * are cleared without dependency of the calling context. +*/ +void fip_close_gw(struct fip_gw_data *gw, enum fip_flush flush) +{ + enum fip_flush tmp_flush = gw->hadmin_gw ? flush : FIP_FULL_FLUSH; + + if (tmp_flush == FIP_PARTIAL_FLUSH && gw->state < FIP_GW_HOST_ADMIN) + return; + + /* close already in process, disregard*/ + if (gw->flush >= tmp_flush) + return; + + gw->flush = tmp_flush; + gw->info.gw_num_vnics = 0; + cancel_delayed_work(&gw->gw_task); + + /* This is not mandatory but will save us time because there is a + * better chance that all vnics would be destroyed before trying to + * destroy the GW */ + fip_close_all_vnics(gw, tmp_flush); + + /* calls fip_purge_gws() */ + queue_delayed_work(fip_wq, &gw->discover->cleanup_task, DELAYED_WORK_CLEANUP_JIFFS); +} + +/* + * Free GW resources. This includes destroying the vnics. If the GW can be + * totally destroyed (no pending work for the GW and all the vnics have been + * destroyed) the GW will be removed from the GWs list and it's memory + * freed. If the GW can not be closed at this time it will not be freed + * and the function will return an error. + * In this case the caller needs to recall the unction to complete the + * operation. 
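+ * (fip_purge_gws() below is that caller: it requeues itself every
+ * DELAYED_WORK_CLEANUP_JIFFS jiffies for as long as fip_free_gw()
+ * keeps returning -EBUSY.)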
+ * Do not call this function directly use: fip_close_gw + */ +static int fip_free_gw(struct fip_discover *discover, struct fip_gw_data *gw) +{ + struct fip_vnic_data *vnic; + int vnic_close_fail = 0; + + gw->info.gw_num_vnics = 0; + + if (delayed_work_pending(&gw->gw_task)) + return -EBUSY; + + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) + vnic_close_fail |= (vnic->flush != FIP_NO_FLUSH); + + /* true if vnics need to be closed */ + /* if some of the vnics are still open return and retry later */ + if (vnic_close_fail) + return -EBUSY; + + if (delayed_work_pending(&gw->vnic_cleanup_task)) + return -EBUSY; + + /* + * it is possible that during gw removal we added the GW again. Test GW + * list to ensure it is not in the list already before adding it again. + */ + if (gw->state > FIP_GW_HOST_ADMIN) { + if (gw->info.gw_prot_new) + discover->new_prot_gws--; + else + discover->old_prot_gws--; + } + if (gw->flush == FIP_PARTIAL_FLUSH) { + gw->state = FIP_GW_HOST_ADMIN; + gw->flush = FIP_NO_FLUSH; + } else { + list_del(&gw->list); + if (!IS_ERR(gw->pquery) && gw->query_id >= 0) + ib_sa_cancel_query(gw->query_id, gw->pquery); + wait_for_completion(&gw->query_comp); + kfree(gw); + } + return 0; +} + +/* + * permanently delete all GWs pending delete. The function goes over + * the list of GWs awaiting deletion and tries to delete them. If the + * GW destructor returns an error value (currently busy) the function + * will requeue it self for another try. + */ +static void fip_purge_gws(struct work_struct *work) +{ + struct fip_discover *discover = + container_of(work, struct fip_discover, cleanup_task.work); + struct fip_gw_data *gw, *tmp_gw; + int gw_close_fail = 0; + + down_write(&discover->l_rwsem); + list_for_each_entry_safe(gw, tmp_gw, &discover->gw_list, list) { + if (gw->flush != FIP_NO_FLUSH) { + gw_close_fail |= fip_free_gw(discover, gw); + } + } + up_write(&discover->l_rwsem); + + /* This means we still have vnics that refuse to close, retry later */ + if (gw_close_fail) { + vnic_dbg_fip(discover->name, "still have open GWs\n"); + /* calls fip_purge_gws() */ + queue_delayed_work(fip_wq, &discover->cleanup_task, + DELAYED_WORK_CLEANUP_JIFFS); + } else { + vnic_dbg_fip(discover->name, "fip_purge_gws all gws" + " closed and freed\n"); + } +} + +static int fip_free_gw_done(struct fip_discover *discover, enum fip_flush flush) +{ + struct fip_gw_data *curr_gw; + int rc; + + down_read(&discover->l_rwsem); + if (flush == FIP_FULL_FLUSH) { + rc = list_empty(&discover->gw_list); + up_read(&discover->l_rwsem); + return rc; + } + + list_for_each_entry(curr_gw, &discover->gw_list, list) { + if (curr_gw->flush != FIP_NO_FLUSH) { + up_read(&discover->l_rwsem); + return 0; + } + } + + up_read(&discover->l_rwsem); + return 1; +} + +/* + * Go over the GW list and try to close the GWs. It is possible that some + * of the GWs have pending work and therefore can not be closed. We can not + * sleep on this because we might be running on the same context as the one + * we are waiting for. 
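+ * Instead, freeing completes by polling, roughly as follows (a sketch;
+ * the poll interval is arbitrary):
+ *
+ *	fip_free_gw_list(discover, flush);
+ *	while (!fip_free_gw_done(discover, flush))
+ *		msleep(10);	/* must not run on fip_wq itself */
+ *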
The user should call this function once and then test + * if the free is done by polling (must release wq context) fip_free_gw_done + */ +static int fip_free_gw_list(struct fip_discover *discover, enum fip_flush flush) +{ + struct fip_gw_data *curr_gw; + + down_read(&discover->l_rwsem); + list_for_each_entry(curr_gw, &discover->gw_list, list) + fip_close_gw(curr_gw, flush); + up_read(&discover->l_rwsem); + + vnic_dbg_fip(discover->name, "fip_free_gw_list not done\n"); + return 0; +} + +static inline void update_gw_address(struct fip_gw_data *gw, + struct fip_gw_data_info *new_gw_data) +{ + gw->info.gw_qpn = new_gw_data->gw_qpn; + gw->info.gw_lid = new_gw_data->gw_lid; + gw->info.gw_port_id = new_gw_data->gw_port_id; + gw->info.gw_sl = new_gw_data->gw_sl; + memcpy(gw->info.gw_guid, new_gw_data->gw_guid, sizeof gw->info.gw_guid); + + vnic_dbg_fip(gw->discover->name, "GW address was modified. " + "QPN: 0x%x, LID: 0x%x, guid: " GUID_FORMAT + "port id: %d, SL: %d\n", gw->info.gw_qpn, + gw->info.gw_lid, GUID_ARG(gw->info.gw_guid), + gw->info.gw_port_id, gw->info.gw_sl); + /* restart fsm to path query */ + if (vnic_sa_query) + fip_discover_gw_fsm_move(gw, FIP_GW_CTRL_PATH_QUERY); +} + +int fip_gw_modified(struct fip_gw_data *gw, + struct fip_gw_data_info *new_gw_data) +{ + char *name = gw->discover->name; + ASSERT(new_gw_data); + + vnic_dbg_fip(name, "fip_gw_modified called, gw_num_vnics %d -> %d\n", + gw->info.gw_num_vnics, new_gw_data->gw_num_vnics); + + if (memcmp(gw->info.gw_guid, new_gw_data->gw_guid, + sizeof(gw->info.gw_guid)) || + gw->info.gw_lid != new_gw_data->gw_lid || + gw->info.gw_port_id != new_gw_data->gw_port_id || + gw->info.gw_qpn != new_gw_data->gw_qpn || + (!vnic_sa_query && gw->info.gw_sl != new_gw_data->gw_sl)) { + /* TODO: Make sure that the GW doesn't change the sl sent in solicitation */ + /* In this case the GW address might be modified even + in 'good flow' */ + if (gw->info.gw_type == GW_TYPE_LAG && + gw->info.ext_lag.ucast) + update_gw_address(gw, new_gw_data); + else { + vnic_dbg_fip(name, "fip_gw_modified changing " + "unsupported parameter closing GW\n"); + fip_close_gw(gw, FIP_PARTIAL_FLUSH); + } + } else if (gw->info.gw_num_vnics < new_gw_data->gw_num_vnics) { + vnic_dbg_fip(name, "fip_gw_modified changing num " + "vnics from %d to %d\n", gw->info.gw_num_vnics, + new_gw_data->gw_num_vnics); + gw->info.gw_num_vnics = new_gw_data->gw_num_vnics; + if (fip_gw_create_vnics(gw)) + vnic_err(name, "fip_gw_create_vnics failed\n"); + + } else if (gw->info.gw_num_vnics > new_gw_data->gw_num_vnics) { + gw->info.gw_num_vnics = new_gw_data->gw_num_vnics; + fip_gw_close_nonopen_vnics(gw); + if (gw->vnic_count < gw->info.gw_num_vnics) + fip_gw_create_vnics(gw); + vnic_dbg_fip(name, "fip_gw_modified changing num " + "vnics from %d to %d\n", gw->info.gw_num_vnics, + new_gw_data->gw_num_vnics); + } else if (gw->info.n_rss_qpn != new_gw_data->n_rss_qpn) { + gw->info.n_rss_qpn = new_gw_data->n_rss_qpn; + vnic_dbg_fip(name, "fip_gw_modified changing n_rss_qpn " + "from %d to %d\n", gw->info.n_rss_qpn, + new_gw_data->n_rss_qpn); + } else if (gw->info.hadmined_en != new_gw_data->hadmined_en) { + if (fip_gw_create_vnics(gw)) + vnic_err(name, "fip_gw_create_vnics failed\n"); + } + + return 0; +} + +static inline int is_none_zero_guid(u8 *guid) +{ + int i; + u8 ored = 0; + + if (!guid) + return 0; + + for (i = 0; i < 8; ++i) + ored |= guid[i]; + + return !!ored; +} + +/* + * Look for a GW in the GW list. 
+ * The search need one identifier to identify the Box (either GUID or system name) + * and one identifier for the external port (port_id or eport_name). + * This function uses what ever data is available for the search since + * various callers do not have access to a single pair of ids. + * use NULL for unknown strings and GW_PORT_ID_UNKNOWN for unknown port_id. + * GW that are undergoing complete flush are disregarded by the search. + */ +struct fip_gw_data *fip_find_gw_in_list( + struct fip_discover *discover, + int port_id, + u8 *eport_name, + u8 *gw_guid, + u8 *system_guid, + u8 *system_name, + int is_login) +{ + struct fip_gw_data *curr_gw; + int use_guid = is_none_zero_guid(gw_guid); + int use_system_name = system_name && strlen(system_name) > 0; + int use_system_guid = is_none_zero_guid(system_guid); + int use_eport = eport_name && strlen(eport_name) > 0; + int use_port_id = port_id >= 0; + int port_id_pass; + int eport_match; + + if(!((use_eport || use_port_id) && + (use_guid || use_system_name || use_system_guid))) { + vnic_dbg_fip_v(discover->name, + "fip_find_gw_in_list not enough param for search\n"); + return NULL; + } + + if (use_system_name) + vnic_dbg_fip_v(discover->name, "system name %s\n", system_name); + + if (use_guid) + vnic_dbg_fip_v(discover->name, "gw guid "VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(gw_guid)); + + if (use_system_guid) + vnic_dbg_fip_v(discover->name, "system guid "VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(system_guid)); + + if (use_eport) + vnic_dbg_fip_v(discover->name, "eport %s\n", eport_name); + + if (use_port_id) + vnic_dbg_fip_v(discover->name, "port_id 0x%x\n", port_id); + + down_read(&discover->l_rwsem); + list_for_each_entry(curr_gw, &discover->gw_list, list) { + vnic_dbg_fip_v(discover->name, "check gw on eport %s, gw_guid "VNIC_GUID_FMT" " + "system_guid "VNIC_GUID_FMT", flush %d\n", + curr_gw->info.vol_info.gw_port_name, + VNIC_GUID_RAW_ARG(curr_gw->info.gw_guid), + VNIC_GUID_RAW_ARG(curr_gw->info.vol_info.system_guid), + curr_gw->flush); + + if (curr_gw->flush == FIP_FULL_FLUSH) + continue; + + /* for login ack, skip non connected GWs */ + if (is_login && use_port_id && curr_gw->state == FIP_GW_HOST_ADMIN) /* skip dangling hadmined GWs */ + continue; + + /* use the eport names only if you don't have port_id indexes + * This is in order to enable port_id changes. 
+ * in case of host admin GW, ignore gw_port_id since the old GW + * will never be flushed and the new GW id can change */ + port_id_pass = use_port_id && (curr_gw->info.gw_port_id != (u16)-1) && !(curr_gw->hadmin_gw && use_eport); + eport_match = (use_eport && !port_id_pass && + !strncmp(curr_gw->info.vol_info.gw_port_name, + eport_name,VNIC_GW_PORT_NAME_LEN)) || + (port_id_pass && (port_id == curr_gw->info.gw_port_id)); + if (!eport_match) + continue; + + if (use_guid && !memcmp(curr_gw->info.gw_guid, gw_guid, GUID_LEN)) + goto found; + + if (use_system_guid && + !memcmp(curr_gw->info.vol_info.system_guid, + system_guid, GUID_LEN)) + goto found; + + if(use_system_name && + !strncmp(curr_gw->info.vol_info.system_name, system_name, + VNIC_SYSTEM_NAME_LEN)) + goto found; + } + + up_read(&discover->l_rwsem); + vnic_dbg_fip(discover->name, "gw not found!\n"); + return NULL; +found: + if (curr_gw->hadmin_gw && use_eport && use_port_id && + !strncmp(curr_gw->info.vol_info.gw_port_name,eport_name,VNIC_GW_PORT_NAME_LEN) && + curr_gw->info.gw_port_id != port_id) { + vnic_info("%s:["VNIC_GUID_FMT"] %s eport ID changed from %d to %d\n", + curr_gw->info.vol_info.system_name, + VNIC_GUID_RAW_ARG(curr_gw->info.vol_info.system_guid), + curr_gw->info.vol_info.gw_port_name, + curr_gw->info.gw_port_id, port_id); + } + + up_read(&discover->l_rwsem); + return curr_gw; +} + +/* + * Alloc and init a new GW struct + */ +static struct fip_gw_data *fip_discover_create_gw(struct fip_discover *discover) +{ + struct fip_gw_data *gw_data; + + gw_data = kzalloc(sizeof(struct fip_gw_data), GFP_KERNEL); + if (!gw_data) + goto out; + + INIT_DELAYED_WORK(&gw_data->gw_task, fip_discover_gw_fsm); + INIT_DELAYED_WORK(&gw_data->vnic_cleanup_task, fip_purge_vnics); + INIT_LIST_HEAD(&gw_data->vnic_list); + gw_data->discover = discover; + gw_data->pquery = ERR_PTR(-ENODATA); + gw_data->query_id = -1; + init_completion(&gw_data->query_comp); + complete(&gw_data->query_comp); + mutex_init(&gw_data->mlock); + +out: + return gw_data; +} + +static void fip_discover_hadmin_update(struct work_struct *work) +{ + struct fip_discover *discover = + container_of(work, struct fip_discover, + hadmin_update_task.work); + struct fip_hadmin_cache *hadmin_entry; + struct fip_hadmin_cache *hadmin_tmp; + struct fip_gw_data *curr_gw; + struct list_head hadmin_head; + char *name; + int flush, used_guid, rc; + + /* move list from hadmin_cache to a temporary list */ + spin_lock_irq(&discover->lock); + list_replace(&discover->hadmin_cache, &hadmin_head); + INIT_LIST_HEAD(&discover->hadmin_cache); + flush = discover->flush; + spin_unlock_irq(&discover->lock); + + if (flush != FIP_NO_FLUSH) + goto out; + + /* process hadmin list */ + list_for_each_entry_safe(hadmin_entry, hadmin_tmp, &hadmin_head, next) { + name = (char *)(hadmin_entry->interface_name); + vnic_dbg_mac(name, "parent_used %d, remove %d\n", + hadmin_entry->parent_used, + hadmin_entry->remove); + if (hadmin_entry->parent_used) { + rc = vnic_parent_update(discover->port, hadmin_entry->interface_name, + hadmin_entry->vnic_id, hadmin_entry->mac, + &(hadmin_entry->qp_base_num), + hadmin_entry->parent_name, + hadmin_entry->remove); + if (rc) + continue; + } + + used_guid = is_valid_guid(hadmin_entry->system_guid); + curr_gw = fip_find_gw_in_list(discover, NOT_AVAILABLE_NUM, + hadmin_entry->eport_name, + NULL, + used_guid ? hadmin_entry->system_guid : NULL, + used_guid ? 
NULL : hadmin_entry->system_name, 0/* is_login */); + if (!hadmin_entry->remove) { + /* in case no GW or GW is being removed create a new one */ + if (!curr_gw || curr_gw->flush == FIP_FULL_FLUSH) { + curr_gw = fip_discover_create_gw(discover); + if (!curr_gw) { + vnic_warn(discover->name, "failed to create hadmin GW\n"); + continue; + } else { + down_write(&discover->l_rwsem); + list_add_tail(&curr_gw->list, &discover->gw_list); + up_write(&discover->l_rwsem); + } + + memcpy(curr_gw->info.vol_info.system_guid, + hadmin_entry->system_guid, GUID_LEN); + memcpy(curr_gw->info.vol_info.gw_port_name, + hadmin_entry->eport_name, + VNIC_GW_PORT_NAME_LEN); + if (used_guid) + strcpy(curr_gw->info.vol_info.system_name, + NOT_AVAILABLE_STRING); + else + memcpy(curr_gw->info.vol_info.system_name, + hadmin_entry->system_name, + VNIC_SYSTEM_NAME_LEN); + + curr_gw->info.gw_port_id = hadmin_entry->gw_port_id; + curr_gw->state = FIP_GW_HOST_ADMIN; + } + + curr_gw->hadmin_gw = 1; + fip_gw_update_hadmin_gw(curr_gw, hadmin_entry); + } else if(curr_gw) + fip_gw_update_hadmin_gw(curr_gw, hadmin_entry); + + list_del(&hadmin_entry->next); + kfree(hadmin_entry); + } + +out: + /* flush hadmin_tmp list and exit */ + list_for_each_entry_safe(hadmin_entry, hadmin_tmp, &hadmin_head, next) + kfree(hadmin_entry); +} + +static const char *gw_state_to_str(enum fip_gw_state state) +{ + switch (state) { + case FIP_GW_CONNECTED: + return "FIP_GW_CONNECTED"; + case FIP_GW_CTRL_PATH_QUERY: + return "FIP_GW_CTRL_PATH_QUERY"; + case FIP_GW_DATA_PATH_QUERY: + return "FIP_GW_DATA_PATH_QUERY"; + case FIP_GW_HOST_ADMIN: + return "FIP_GW_HOST_ADMIN"; + case FIP_GW_SEND_SOLICIT: + return "FIP_GW_SEND_SOLICIT"; + default: + return "UNKNOWN"; + } +} + +int fip_gw_sysfs_show(struct vnic_port *port, char *buf) +{ + struct fip_gw_data *gw; + char *p = buf; + struct fip_discover *discover; + + mutex_lock(&port->start_stop_lock); + list_for_each_entry(discover, &port->fip.discover_list, discover_list) { + + down_read(&discover->l_rwsem); + + list_for_each_entry(gw, &discover->gw_list, list) { + p += _sprintf(p, buf, "IOA_PORT %s:%d\n", + gw->discover->port->dev->ca->name, + gw->discover->port->num); + p += _sprintf(p, buf, "BX_NAME %s\n", + gw->info.vol_info.system_name); + if (!(*(u64 *)(gw->info.vol_info.system_guid))) + p += _sprintf(p, buf, "BX_GUID %s\n", NOT_AVAILABLE_STRING); + else + p += _sprintf(p, buf, "BX_GUID "VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(gw->info.vol_info.system_guid)); + p += _sprintf(p, buf, "EPORT_NAME %s\n", gw->info.vol_info.gw_port_name); + p += _sprintf(p, buf, "EPORT_ID %u\n", gw->info.gw_port_id); + p += _sprintf(p, buf, "STATE %s\n", gw_state_to_str(gw->state)); + p += _sprintf(p, buf, "GW_TYPE %s\n", gw->info.gw_type == GW_TYPE_LAG ? + "AGGREGATED" : "LEGACY"); + p += _sprintf(p, buf, "PKEY 0x%x\n", discover->pkey); + p += _sprintf(p, buf, "ALL_VLAN %s\n", + gw->state == FIP_GW_CONNECTED ? + (gw->info.all_vlan_gw ? 
"yes" : "no") : NOT_AVAILABLE_STRING); + p += _sprintf(p, buf, "CTRL_SL %d\n", gw->ctrl_prec.sl); + p += _sprintf(p, buf, "DATA_SL %d\n", gw->data_prec.sl); + p += _sprintf(p, buf, "\n"); + } + + up_read(&discover->l_rwsem); + } + + mutex_unlock(&port->start_stop_lock); + return (p - buf); +} + +static int fip_discover_rx_advertise_bh(struct fip_discover *discover, + struct fip_gw_data *advertise_data) +{ + struct fip_gw_data *gw_data; + int update_entry = 0; + + /* see if we received advertise packets from this GW before */ + gw_data = fip_find_gw_in_list(discover, + advertise_data->info.gw_port_id, + advertise_data->info.vol_info.gw_port_name, + advertise_data->info.gw_guid, + advertise_data->info.vol_info.system_guid, + advertise_data->info.vol_info.system_name, 0/* is_login */); + + /* + * GW not found in GW list. Create a new GW structure + * and add it to the GW list. + */ + if (!gw_data) { + gw_data = fip_discover_create_gw(discover); + if (!gw_data) { + vnic_dbg_fip(discover->name, "Could not create gw\n"); + return -ENOMEM; + } + gw_data->keep_alive_jiffies = jiffies; + + down_write(&discover->l_rwsem); + list_add_tail(&gw_data->list, &discover->gw_list); + up_write(&discover->l_rwsem); + update_entry = 1; + } else { + gw_data->keep_alive_jiffies = jiffies; + vnic_dbg_fip(discover->name, "gw_data->flush %d\n", gw_data->flush); + if (gw_data->flush != FIP_NO_FLUSH) + return 0; + + if (gw_data->state <= FIP_GW_SEND_SOLICIT) + update_entry = 1; + } + + /* If GW is in multicast state (based on received mcast packet), + * replace it with the newer up-to-date packet info. + */ + if (update_entry) { + if (gw_data->state < FIP_GW_CTRL_PATH_QUERY) { + down_write(&discover->l_rwsem); + if (advertise_data->info.gw_prot_new) + discover->new_prot_gws++; + else + discover->old_prot_gws++; + up_write(&discover->l_rwsem); + } + memcpy(&gw_data->info, &advertise_data->info, + sizeof(struct fip_gw_data_info)); + if (gw_data->state < FIP_GW_SEND_SOLICIT) + gw_data->state = vnic_sa_query? FIP_GW_CTRL_PATH_QUERY : FIP_GW_SEND_SOLICIT; + } else { + /* If the pc_id in the adv doesn't match the one + saved - there was a power cycle, so we want to close + the GW */ + if (advertise_data->info.ext_pc_id.valid && + (advertise_data->info.ext_pc_id.power_cycle_id != + gw_data->info.ext_pc_id.power_cycle_id)) { + vnic_dbg_fip_p0(discover->name, "received advertisement with " + "pc_id %llu when expecting %llu. closing the GW", + advertise_data->info.ext_pc_id.power_cycle_id, + gw_data->info.ext_pc_id.power_cycle_id); + fip_close_gw(gw_data, FIP_PARTIAL_FLUSH); + goto no_repost; + } + + /* TBD: enforce discard ?? */ + if (gw_data->info.gw_type != advertise_data->info.gw_type) + vnic_dbg_fip_p0(discover->name, "gateway type must not change\n"); + + /* update GW descriptors that do not require additional processing. 
+	   These will be updated as part of GW_MODIFY flow */
+		mutex_lock(&gw_data->mlock);
+		if (advertise_data->info.ext_pc_id.valid)
+			memcpy(&gw_data->info.ext_pc_id, &advertise_data->info.ext_pc_id,
+			       sizeof(gw_data->info.ext_pc_id));
+
+		memcpy(&gw_data->info.vol_info, &advertise_data->info.vol_info,
+		       sizeof(gw_data->info.vol_info));
+		if (gw_data->info.ext_lag.valid) {
+			gw_data->info.ext_lag.hash = advertise_data->info.ext_lag.hash;
+			gw_data->info.ext_lag.ca = advertise_data->info.ext_lag.ca;
+			gw_data->info.ext_lag.ca_thresh = advertise_data->info.ext_lag.ca_thresh;
+			gw_data->info.ext_lag.weights_policy = advertise_data->info.ext_lag.weights_policy;
+		}
+		mutex_unlock(&gw_data->mlock);
+	}
+
+	/* if multicast advertisement received */
+	if (advertise_data->info.flags & FIP_RCV_MULTICAST) {
+		vnic_dbg_fip(discover->name, "FIP_RCV_MULTICAST ADVERTISE, state %d\n",
+			     gw_data->state);
+		/* we are beyond accepting mcast advertisements */
+		if (gw_data->state > FIP_GW_SEND_SOLICIT)
+			goto out;
+
+		vnic_dbg_fip(discover->name, "received mcast advertise sending"
+			     " ucast solicit to GW qpn %d lid %d flags 0x%x\n",
+			     gw_data->info.gw_qpn, gw_data->info.gw_lid,
+			     gw_data->info.flags);
+	} else { /* unicast advertisement received */
+		int ack_received = advertise_data->info.flags & FIP_GW_AVAILABLE;
+
+		vnic_dbg_fip(discover->name, "received ucast advertise from GW "
+			     "qpn %d lid %d flags 0x%x, ack_received %s "
+			     "gw_num_vnics %d gw->state=%d, "VNIC_GUID_FMT"\n",
+			     gw_data->info.gw_qpn, gw_data->info.gw_lid,
+			     gw_data->info.flags, ack_received ? "yes" : "no",
+			     gw_data->info.gw_num_vnics, gw_data->state,
+			     VNIC_GUID_RAW_ARG(gw_data->info.gw_guid));
+
+		if (ack_received) {
+			/* if this is the first ACK received */
+			switch (gw_data->state) {
+			case FIP_GW_CTRL_PATH_QUERY:
+				/*
+				 * in case we are in FIP_GW_CTRL_PATH_QUERY we wait until it completes
+				 * to move us to FIP_GW_SEND_SOLICIT
+				 */
+				break;
+			case FIP_GW_SEND_SOLICIT:
+				/* in case we received an ack in this state we move to DATA_PATH_QUERY */
+				gw_data->state = vnic_sa_query ? FIP_GW_DATA_PATH_QUERY : FIP_GW_CONNECTED;
+				break;
+			case FIP_GW_CONNECTED:
+				/*
+				 * received an ACK and we are connected. We need to
+				 * check for changes in the GW and apply them if needed
+				 */
+				if (!fip_gw_modified(gw_data, &advertise_data->info))
+					gw_data->state = FIP_GW_CONNECTED;
+				goto no_repost;
+			default:
+				break;
+			}
+		} else /* !ack_received */ {
+			fip_close_gw(gw_data, FIP_PARTIAL_FLUSH);
+			goto no_repost;
+		}
+		/*
+		 * we don't accept ACKs in transient states.
+		 * This should not be a problem since a flood of multiple ACKs
+		 * is not an expected flow, and if the packets are similar
+		 * (no updates) it doesn't matter anyway.
+		 */
+	}
+
+out:
+	vnic_dbg_fip(discover->name, "out gw->state=%d\n", gw_data->state);
+	/*
+	 * we will call the GW FSM to handle the new state
+	 */
+	cancel_delayed_work(&gw_data->gw_task);
+	fip_discover_gw_fsm(&gw_data->gw_task.work);
+no_repost:
+	return 0;
+}
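+/*
+ * For reference, a summary of the advertise-driven state progression
+ * implemented above (vnic_sa_query is the module parameter selecting
+ * whether SA path queries are performed):
+ *
+ *	mcast advertise          -> FIP_GW_CTRL_PATH_QUERY
+ *	                            (FIP_GW_SEND_SOLICIT if !vnic_sa_query)
+ *	ucast advertise with ACK -> FIP_GW_DATA_PATH_QUERY
+ *	                            (FIP_GW_CONNECTED if !vnic_sa_query)
+ *	ucast advertise, no ACK  -> fip_close_gw(gw, FIP_PARTIAL_FLUSH)
+ *	power_cycle_id mismatch  -> fip_close_gw(gw, FIP_PARTIAL_FLUSH)
+ */
+
+/*
+ * This function handles a single received packet that is expected to be
+ * a GW advertisement or a login ACK packet. The function first parses the
+ * packet, decides what the packet type is, and then validates the packet
+ * according to its type. This function runs in ka_wq task context.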
+ */
+void fip_discover_rx_packet(int *queue, struct fip_content *fc)
+{
+	*queue = 0;
+	switch (fc->fh->subcode) {
+	case FIP_GW_ADV_SUB_OPCODE:
+	case FIP_GW_LOGIN_SUB_OPCODE:
+		*queue = 1;
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Print FIP syndrome number and string
+ */
+static void fip_print_syndrome(struct fip_vnic_data *vnic, int synd) {
+	char *syndstr;
+
+	switch (synd) {
+	case FIP_SYNDROM_HADMIN_REJECT:
+		syndstr = "FIP_SYNDROM_HADMIN_REJECT";
+		break;
+	case FIP_SYNDROM_GW_RESRC:
+		syndstr = "FIP_SYNDROM_GW_RESRC";
+		break;
+	case FIP_SYNDROM_NO_NADMIN:
+		syndstr = "FIP_SYNDROM_NO_NADMIN";
+		break;
+	case FIP_SYNDROM_UNRECOGNISED_HOST:
+		syndstr = "FIP_SYNDROM_UNRECOGNISED_HOST";
+		break;
+	case FIP_SYNDROM_UNSUPPORTED_PARAM:
+		syndstr = "FIP_SYNDROM_UNSUPPORTED_PARAM";
+		break;
+	case FIP_SYNDROM_GW_IS_LAG_MEMBER:
+		syndstr = "FIP_SYNDROM_GW_IS_LAG_MEMBER";
+		break;
+	case FIP_SYNDROM_DUPLICATE_ADDRESS:
+		syndstr = "FIP_SYNDROM_DUPLICATE_ADDRESS";
+		break;
+	default:
+		syndstr = "FIP_OTHER";
+	}
+
+	vnic_warn(vnic->name, "SYNDROME 0x%x: %s\n",
+		  synd, syndstr);
+}
+
+static void handle_login_packet(struct fip_discover *discover,
+				struct fip_login_data *login_data)
+{
+	struct fip_gw_data *gw;
+	struct fip_vnic_data *vnic;
+	int mac_vlan_refused = 0;
+	int synd;
+
+	/* find the GW that this login belongs to */
+	gw = fip_find_gw_in_list(discover,
+				 login_data->port_id,
+				 NULL,
+				 login_data->guid,
+				 NULL, NULL, 1/* is_login */);
+
+	if (!gw) {
+		vnic_warn(discover->name, "dropping login ack with vnic_id:%d mac:"MAC_6_PRINT_FMT
+			  " BX port_id:%d GUID: "VNIC_GUID_FMT", GW not found!\n",
+			  login_data->vnic_id,
+			  MAC_6_PRINT_ARG(login_data->mac),
+			  login_data->port_id,
+			  VNIC_GUID_RAW_ARG(login_data->guid));
+		return;
+	}
+
+	vnic = fip_vnic_find_in_list(gw, login_data->vnic_id,
+				     login_data->mac,
+				     login_data->vlan,
+				     login_data->vp);
+	if (!vnic) {
+		vnic_warn(discover->name, "dropping login ack with vnic_id:%d mac:"MAC_6_PRINT_FMT
+			  " BX port_id:%d GUID: "VNIC_GUID_FMT", vnic not found!\n",
+			  login_data->vnic_id,
+			  MAC_6_PRINT_ARG(login_data->mac),
+			  login_data->port_id,
+			  VNIC_GUID_RAW_ARG(login_data->guid));
+		return;
+	}
+
+	/*
+	 * For host administered vNICs we must have login and login ack
+	 * macs equal and different from all zeros. Login and login
+	 * ack must agree on vlan presence, and if a vlan is present, the
+	 * vlans must be identical. Otherwise, the request is rejected.
+	 */
+	if (vnic->hadmined) {
+		if (!IS_ZERO_MAC(vnic->login_data.mac) &&
+		    memcmp(vnic->login_data.mac, login_data->mac, ETH_ALEN)) {
+			vnic_dbg_fip(discover->name, "fip_discover_rx_packet"
+				     " host admined mac refused\n");
+			mac_vlan_refused = 1;
+		} else if (vnic->login_data.all_vlan_gw != login_data->all_vlan_gw)
+			vnic_dbg_fip(discover->name,
+				     "fip_discover_rx_packet"
+				     " host and GW disagree on all_vlan mode\n");
+		/* If the host is not working in all_vlan_gw policy -
+		   check the requested vlan against the accepted */
+		else if (!gw->info.all_vlan_gw &&
+			 (vnic->login_data.vp != login_data->vp ||
+			  (login_data->vp == 1 &&
+			   vnic->login_data.vlan != login_data->vlan))) {
+			vnic_dbg_fip(discover->name,
+				     "fip_discover_rx_packet host"
+				     " admined vlan refused\n");
+			mac_vlan_refused = 1;
+		}
+	}
+
+	/* process a login packet for the specific vnic */
+	synd = (int)login_data->syndrome;
+	if (synd || mac_vlan_refused) {
+		char *vnic_name = vnic->hadmined ?
+			(char *)vnic->interface_name : (char *)vnic->name;
+		/* print syndrome as long as backlog limit is not exceeded */
+		if (vnic->synd_backlog++ >= vnic_synd_backlog)
+			return;
+
+		vnic_warn(discover->name, "%s login failed "
+			  "(mac "MAC_6_PRINT_FMT" vlan %d) "
+			  "backlog %d/%d\n",
+			  vnic_name,
+			  MAC_6_PRINT_ARG(vnic->mac_cache),
+			  (vnic->vlan_used ? vnic->vlan : -1),
+			  vnic->synd_backlog, vnic_synd_backlog);
+
+		if (mac_vlan_refused)
+			vnic_warn(vnic->name, "MAC/VLAN refused\n");
+
+		fip_print_syndrome(vnic, synd);
+
+		if (synd == FIP_SYNDROM_UNRECOGNISED_HOST) {
+			vnic_info("%s %s sending ucast solicit to Gateway\n",
+				  discover->name, vnic_name);
+			if (fip_solicit_send(gw->discover,
+					     FIP_DISCOVER_UCAST,
+					     gw->info.gw_qpn,
+					     gw->info.gw_lid,
+					     vnic_gw_ctrl_sl(gw),
+					     gw->info.gw_prot_new))
+				vnic_warn(discover->name, "%s failed to send ucast solicit\n", vnic_name);
+		}
+	} else {
+		vnic->all_vlan_gw = !!((!vnic->hadmined && vnic->gw->info.all_vlan_gw) ||
+				       (vnic->hadmined && vnic->login_data.all_vlan_gw));
+		fip_vnic_login_ack_recv(vnic, login_data);
+	}
+}
+
+/*
+ * This function handles a single received packet that is expected to be
+ * a GW advertisement or a login ACK packet. The function first parses the
+ * packet, decides what the packet type is, and then processes the packet
+ * according to its type. This function runs in task context.
+ */
+int fip_discover_rx_packet_bh(struct fip_discover *discover, struct fip_content *fc)
+{
+	struct fip_gw_data *advertise_data = NULL;
+	struct fip_login_data *login_data = NULL;
+	int rc;
+	int ret = 0;
+
+	switch (fc->fh->subcode) {
+	case FIP_GW_ADV_SUB_OPCODE:
+		advertise_data = kzalloc(sizeof *advertise_data, GFP_KERNEL);
+		if (!advertise_data) {
+			vnic_warn(discover->name,
+				  "Failed to allocate %zu bytes",
+				  sizeof *advertise_data);
+			return -ENOMEM;
+		}
+
+		rc = fip_advertise_parse_bh(discover, fc, advertise_data);
+		if (!rc)
+			ret = fip_discover_rx_advertise_bh(discover,
+							   advertise_data);
+		kfree(advertise_data);
+		break;
+
+	case FIP_GW_LOGIN_SUB_OPCODE:
+		login_data = kzalloc(sizeof *login_data, GFP_KERNEL);
+		if (!login_data) {
+			vnic_warn(discover->name,
+				  "Failed to allocate %zu bytes",
+				  sizeof *login_data);
+			return -ENOMEM;
+		}
+
+		rc = fip_login_parse(discover, fc, login_data);
+		if (!rc)
+			handle_login_packet(discover, login_data);
+
+		kfree(login_data);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * This function is a callback called upon successful join to a
+ * multicast group. The function checks if we have joined + attached
+ * to all required mcast groups and if so moves the discovery FSM to solicit.
+ */
+static void fip_discover_mcast_connect_cb(struct vnic_mcast *mcaste, void *ctx)
+{
+	struct fip_discover *discover = mcaste->priv_data;
+
+	if (mcaste->cur_attached && mcaste->req_attach) {
+		vnic_dbg_parse(discover->name, "attached mask = 0x%lx, req mask = 0x%lx\n",
+			       *mcaste->cur_attached, *mcaste->req_attach);
+		if ((*mcaste->cur_attached & *mcaste->req_attach) !=
+		    *mcaste->req_attach) {
+			return;
+		}
+	}
+
+	discover->discover_mcast_attached_jiffies = jiffies;
+	set_bit(MCAST_ATTACHED, &discover->discover_mcast_state);
+	/* in the case of a reconnect don't change state or send a solicit
+	 * packet
+	 */
+	if (discover->state < FIP_DISCOVER_SOLICIT) {
+		vnic_dbg_fip(discover->name, "fip_multicast_connected moved"
+			     " state to solicit\n");
+		spin_lock_irq(&discover->lock);
+		if (discover->flush == FIP_NO_FLUSH) {
+			/* delay sending the solicit packet by 0-100 msec */
+			int rand_delay = jiffies % 100; /*get_random_int()*/
+			discover->state = FIP_DISCOVER_SOLICIT;
+			cancel_delayed_work(&discover->fsm_task);
+			/* This is really (rand_delay / 1000) * HZ */
+			/* calls fip_discover_fsm() */
+			queue_delayed_work(fip_wq, &discover->fsm_task,
+					   (rand_delay * HZ) / 1000);
+		}
+		spin_unlock_irq(&discover->lock);
+	}
+	vnic_dbg_fip(discover->name, "discover_mcast_connect_cb done\n");
+}
+
+/*
+ * This function is a callback called upon a mcast detach event.
+ * This event can be triggered due to discovery teardown or due to an async
+ * event. Currently this code does not participate in the discovery's FSM.
+ */
+void fip_discover_mcast_deattach_cb(struct vnic_mcast *mcast, void *ctx)
+{
+//	struct vnic_mcast *mcast_other = ctx;
+	struct fip_discover *discover = mcast->priv_data;
+
+	discover->discover_mcast_detached_jiffies = jiffies;
+	clear_bit(MCAST_ATTACHED, &discover->discover_mcast_state);
+
+	vnic_dbg_fip(NULL, "fip_discover_mcast_deattach_cb\n");
+}
+
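+/*
+ * Attach accounting, for reference: both discovery mcast groups share the
+ * discover-wide req_attach/cur_attached bitmasks (see
+ * fip_discover_mcast_connect() below), so the connect callback above fires
+ * once per join but only advances the FSM when every requested group is
+ * attached:
+ *
+ *	req_attach   = FIP_MCAST_DISCOVER | FIP_MCAST_SOLICIT
+ *	cur_attached = FIP_MCAST_DISCOVER                      -> keep waiting
+ *	cur_attached = FIP_MCAST_DISCOVER | FIP_MCAST_SOLICIT  -> move to solicit
+ */
+
+/*
+ * Try to connect to the relevant mcast groups. If one of the mcast joins
+ * failed, the function should be recalled to try and complete the join
+ * process (for the mcast groups whose join was not performed).
+ * Note: a successful return of vnic_mcast_join means that the mcast join
+ * started, not that the join completed. Completion of the connection process
+ * is asynchronous and uses a supplied callback.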
+ */ +static int fip_discover_mcast_connect(struct fip_discover *discover) +{ + struct vnic_mcast *mcaste_disc, *mcaste_sol, *mcaste; + int rc; + + mcaste_disc = vnic_mcast_alloc(discover->port, &discover->req_attach, &discover->cur_attached); + if (IS_ERR(mcaste_disc)) + return -EINVAL; + + mcaste_sol = vnic_mcast_alloc(discover->port, &discover->req_attach, &discover->cur_attached); + if (IS_ERR(mcaste_sol)) { + vnic_mcast_dealloc(mcaste_disc); + return -EINVAL; + } + + set_bit(FIP_MCAST_DISCOVER, &discover->req_attach); + set_bit(FIP_MCAST_SOLICIT, &discover->req_attach); + + mcaste = mcaste_disc; + mcaste->priv_data = discover; + mcaste->attach_bit_nr = FIP_MCAST_DISCOVER; + memcpy(mcaste->mac, ETH_BCAST_MAC, ETH_ALEN); + memcpy(&mcaste->gid, fip_discover_mgid, GID_LEN); + if (discover->pkey != 0xffff) + *(u16 *)&mcaste->gid.raw[6] = htons(discover->pkey | 0x8000); + memcpy(&mcaste->port_gid, &mcaste->gid, GID_LEN); + mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC); + mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC; + mcaste->retry = VNIC_MCAST_ULIMIT_RETRY; + mcaste->attach_cb = fip_discover_mcast_connect_cb; + mcaste->detach_cb = fip_discover_mcast_deattach_cb; + mcaste->attach_cb_ctx = mcaste_sol; + mcaste->detach_cb_ctx = mcaste_sol; + mcaste->pkey = discover->pkey; + mcaste->qkey = VNIC_FIP_QKEY; + mcaste->qp = discover->qp; + mcaste->blocking = 0; + mcaste->join_state = 1; + rc = vnic_mcast_add(&discover->mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(&discover->mcast_tree, mcaste); /* MCAST_RECEIVE_ONLY */ + ASSERT(!rc); + + mcaste = mcaste_sol; + mcaste->priv_data = discover; + mcaste->attach_bit_nr = FIP_MCAST_SOLICIT; + memcpy(mcaste->mac, ETH_BCAST_MAC, ETH_ALEN); + memcpy(&mcaste->gid, fip_solicit_mgid, GID_LEN); + if (discover->pkey != 0xffff) + *(u16 *)&mcaste->gid.raw[6] = htons(discover->pkey | 0x8000); + memcpy(&mcaste->port_gid, &mcaste->gid, GID_LEN); + mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC); + mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC; + mcaste->retry = VNIC_MCAST_ULIMIT_RETRY; + mcaste->attach_cb = fip_discover_mcast_connect_cb; + mcaste->detach_cb = fip_discover_mcast_deattach_cb; + mcaste->attach_cb_ctx = mcaste_disc; + mcaste->detach_cb_ctx = mcaste_disc; + mcaste->pkey = discover->pkey; + mcaste->qkey = VNIC_FIP_QKEY; + mcaste->qp = discover->qp; + mcaste->blocking = 0; + mcaste->join_state = 1; + mcaste->sender_only = 1; + rc = vnic_mcast_add(&discover->mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(&discover->mcast_tree, mcaste); /* MCAST_SEND_ONLY */ + ASSERT(!rc); + + return 0; +} + +int fip_discover_mcast_reattach(struct fip_discover *discover, + struct vnic_port *port) +{ + int flush; + + spin_lock_irq(&discover->lock); + flush = discover->flush; + spin_unlock_irq(&discover->lock); + + if (flush == FIP_NO_FLUSH && + discover->state > FIP_DISCOVER_INIT) { + vnic_tree_mcast_detach(&discover->mcast_tree); + vnic_tree_mcast_attach(&discover->mcast_tree); + } + return 0; +} + +static void fip_discover_ctrl_path_query_complete( + int status, + struct ib_sa_path_rec *pathrec, + void *context) +{ + struct fip_gw_data *gw = context; + vnic_dbg_fip_p0(gw->discover->name, "fip ctrl path query complete status=%d\n", status); + if (!status) { + vnic_dbg_fip_p0(gw->discover->name, "fip ctrl path query success srcgid:"VNIC_GUID_FMT" dgid:"VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(pathrec->sgid.raw+8), + VNIC_GUID_RAW_ARG(pathrec->dgid.raw+8)); + gw->ctrl_prec = *pathrec; + fip_discover_gw_fsm_move(gw, 
FIP_GW_SEND_SOLICIT); + } else { + vnic_dbg_fip_p0(gw->discover->name, "fip ctrl path query FAILED ret=%d\n", status); + gw->query_id = -1; /* this will cause a retry */ + } + complete(&gw->query_comp); +} + +static void fip_discover_data_path_query_complete( + int status, + struct ib_sa_path_rec *pathrec, + void *context) +{ + struct fip_gw_data *gw = context; + vnic_dbg_fip_p0(gw->discover->name, "fip data path query complete status=%d\n", status); + if (!status) { + struct ib_sa_path_rec old_pathrec; + struct fip_vnic_data *vnic; + vnic_dbg_fip_p0(gw->discover->name, "fip data path query success srcgid:"VNIC_GUID_FMT" dgid:"VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(pathrec->sgid.raw+8), + VNIC_GUID_RAW_ARG(pathrec->dgid.raw+8)); + old_pathrec = gw->data_prec; + gw->data_prec = *pathrec; + if (old_pathrec.sl != gw->data_prec.sl) { + /* in case of SL change close the vnic to relogin with the new SL */ + vnic_info("[%s] %s %s Data SL changed from %d to %d\n", + gw->info.vol_info.system_name, + gw->discover->port->name, + gw->info.vol_info.gw_port_name, + old_pathrec.sl, gw->data_prec.sl); + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) { + if (vnic->flush != FIP_FULL_FLUSH && vnic->state >= FIP_VNIC_LOGIN) + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + } + } + fip_discover_gw_fsm_move(gw, FIP_GW_CONNECTED); + } else { + vnic_dbg_fip_p0(gw->discover->name, "fip data path query FAILED ret=%d\n", status); + gw->query_id = -1; /* this will cause a retry */ + } + complete(&gw->query_comp); +} + +static int fip_discover_path_query(struct fip_gw_data *gw, int is_data_sl) +{ + ib_sa_comp_mask comp_mask; + struct ib_sa_path_rec p_rec; + void(*callback)(int status, struct ib_sa_path_rec *resp, void *context); + + vnic_dbg_fip_p0(gw->discover->name, "fip path query %d of GW lid:%d sl=%d GID:"VNIC_GUID_FMT" SID=%llx data_path=%d!\n", + gw->query_path_cnt, + gw->info.gw_lid, + gw->info.gw_sl, + VNIC_GUID_RAW_ARG(gw->info.gw_guid), + is_data_sl ? EOIB_SERVICE_ID : EOIB_CTRL_SERVICE_ID, + is_data_sl); + + comp_mask = IB_SA_PATH_REC_SERVICE_ID | + IB_SA_PATH_REC_DGID | + IB_SA_PATH_REC_SGID | + IB_SA_PATH_REC_REVERSIBLE | + IB_SA_PATH_REC_PKEY; + + callback = is_data_sl ? fip_discover_data_path_query_complete : fip_discover_ctrl_path_query_complete; + memset(&p_rec, 0, sizeof(p_rec)); + + p_rec.service_id = is_data_sl ? 
cpu_to_be64(EOIB_SERVICE_ID) : cpu_to_be64(EOIB_CTRL_SERVICE_ID); + p_rec.sgid = gw->discover->port->gid; + /* copy the subnet prefix from source gid */ + memcpy(p_rec.dgid.raw, p_rec.sgid.raw, 8); + /* copy gw dgid */ + memcpy(p_rec.dgid.raw+8, gw->info.gw_guid,8); + p_rec.pkey = cpu_to_be16(gw->discover->pkey); + p_rec.reversible = cpu_to_be32(1); + + if (gw->query_id >= 0 && !IS_ERR(gw->pquery) && gw->pquery) { + ib_sa_cancel_query(gw->query_id, gw->pquery); + return -1; /* retry later */ + } + + init_completion(&gw->query_comp); + gw->query_path_cnt++; + gw->query_id = -1; + gw->pquery = ERR_PTR(-ENODATA); + + gw->query_id = + ib_sa_path_rec_get(&vnic_sa_client, + gw->discover->port->dev->ca, + gw->discover->port->num, + &p_rec, + comp_mask, + 2000 /*TOUT*/, + GFP_KERNEL, + callback, + gw, + &gw->pquery); + if (gw->query_id < 0) { + complete(&gw->query_comp); + vnic_dbg_fip_p0(gw->discover->name, "ib_sa_path_rec_get failed, error %d\n", gw->query_id); + gw->pquery = ERR_PTR(-ENODATA); + } + return gw->query_id; +} + +void fip_discover_gw_fsm_move(struct fip_gw_data *gw, enum fip_gw_state state) +{ + cancel_delayed_work(&gw->gw_task); + if (gw->pquery && !IS_ERR(gw->pquery) && gw->query_id >= 0) + ib_sa_cancel_query(gw->query_id, gw->pquery); + + gw->state = state; + gw->query_id = -1; + gw->query_path_cnt = 0; + queue_delayed_work(fip_wq, &gw->gw_task, 0); +} + + +static void fip_discover_gw_fsm(struct work_struct *work) +{ + struct fip_gw_data *curr_gw = + container_of(work, struct fip_gw_data, gw_task.work); + unsigned long next_wakeup = curr_gw->info.gw_adv_period; + unsigned long rand = jiffies % 100 + 1; + int ret; + + if (curr_gw->flush != FIP_NO_FLUSH) + return; + + if (test_bit(MCAST_ATTACHED, + &curr_gw->discover->discover_mcast_state)) { + if (time_after(jiffies, curr_gw->keep_alive_jiffies + next_wakeup)) { + if (time_after(jiffies, + curr_gw->discover->discover_mcast_attached_jiffies + + next_wakeup)) { + fip_close_gw(curr_gw, FIP_PARTIAL_FLUSH); + return; + } + } + } else { + /* close gw if 1 minute has elapsed since mcast detach */ + if (time_after(jiffies, + curr_gw->discover->discover_mcast_detached_jiffies + + 60*HZ)) { + fip_close_gw(curr_gw, FIP_PARTIAL_FLUSH); + return; + } + } + + switch (curr_gw->state) { + case FIP_GW_HOST_ADMIN: + break; + case FIP_GW_CTRL_PATH_QUERY: + if (curr_gw->query_path_cnt && curr_gw->query_id >= 0) { + /* PATH query is running */ + next_wakeup = msecs_to_jiffies(100); + break; + } + ret = fip_discover_path_query(curr_gw, 0/*ctrl SL*/); + if (ret < 0) + vnic_dbg_fip_p0(curr_gw->discover->name, "Query ctrl path Failed : retry num %d ...\n", curr_gw->query_path_cnt); + next_wakeup = msecs_to_jiffies(100); + break; + + case FIP_GW_SEND_SOLICIT: + curr_gw->query_path_cnt = 0; + curr_gw->query_id = -1; + curr_gw->pquery = ERR_PTR(-ENODATA); + vnic_dbg_fip(curr_gw->discover->name, "DISCOVER_LOGIN FIP_GW_SEND_SOLICIT\n"); + vnic_dbg_parse(curr_gw->discover->name, "new protocol %d\n", curr_gw->info.gw_prot_new); + ret = fip_solicit_send(curr_gw->discover, FIP_DISCOVER_UCAST, + curr_gw->info.gw_qpn, + curr_gw->info.gw_lid, + vnic_gw_ctrl_sl(curr_gw), + curr_gw->info.gw_prot_new); + if (ret) + next_wakeup = (100 + rand * HZ) / 200; + else + next_wakeup = (100 + rand * HZ) / 25; + break; + + case FIP_GW_DATA_PATH_QUERY: + if (curr_gw->query_path_cnt && curr_gw->query_id >= 0) { + /* PATH query is running */ + next_wakeup = msecs_to_jiffies(100); + break; + } + ret = fip_discover_path_query(curr_gw, 1/*data SL*/); + if (ret < 0) + 
vnic_dbg_fip_p0(curr_gw->discover->name, "Query data path Failed : retry num %d ...\n", curr_gw->query_path_cnt); + next_wakeup = msecs_to_jiffies(100); + break; + + case FIP_GW_CONNECTED: + vnic_dbg_fip(curr_gw->discover->name, "DISCOVER_LOGIN: GW_CONNECTED!!!\n"); + /* test vnic status */ + fip_gw_create_vnics(curr_gw); + break; + default: + ASSERT(0); + break; + } + + /* go to sleep until time out. We expect that we will be awaken by + * RX packets and never get to wake up due to timeout + */ + cancel_delayed_work(&curr_gw->gw_task); + queue_delayed_work(fip_wq, &curr_gw->gw_task, next_wakeup); +} + +static int is_new_solicit_prot(struct fip_discover *discover) +{ + vnic_dbg_parse(discover->name, "new gw %d, old gw %d\n", + discover->new_prot_gws, discover->old_prot_gws); + + if (!discover->old_prot_gws) { + if (!discover->new_prot_gws) { + /* mcast solicit sent before any + * advertise packets arrive. Use old format. + */ + return 0; + } else + return 1; + } + return 0; +} + +/* + * This is the discover finite state machine that runs the + * advertise and solicit packet exchange of the discovery + * proccess. + * It is assumed that this function is only called from work queue + * task context (for locking) + */ +static void fip_discover_fsm(struct work_struct *work) +{ + struct fip_discover *discover = + container_of(work, struct fip_discover, fsm_task.work); + struct vnic_port *port = discover->port; + int recall_time = -1, flush = discover->flush; + + /* we got a flush request and we have not performed it yet */ + if ((flush != FIP_NO_FLUSH) && + discover->state != FIP_DISCOVER_OFF) { + vnic_dbg_fip(discover->name, "discover_fsm switching to OFF\n"); + + recall_time = DELAYED_WORK_CLEANUP_JIFFS * 2; + + + if (discover->state != FIP_DISCOVER_CLEAR) { + fip_free_gw_list(discover, flush); + discover->state = FIP_DISCOVER_CLEAR; + } + + /* if we open GWs we will test again later */ + if (!fip_free_gw_done(discover, flush)) { + vnic_dbg_fip(discover->name, "fip_free_gw_list not done, recalling \n"); + goto recall_fsm; + } + + if (delayed_work_pending(&discover->cleanup_task)) + goto recall_fsm; + + vnic_dbg_fip(discover->name, "fip_free_gw_list done \n"); + vnic_dbg_mark(); + vnic_mcast_del_all(&discover->mcast_tree); + vnic_dbg_mark(); + discover->state = FIP_DISCOVER_OFF; + + /* signal the unload to continue */ + complete(&discover->flush_complete); + return; + } + + if (discover->state == FIP_DISCOVER_OFF) + return; + + if (!port->attr.lid) { + recall_time = 1 * HZ; + goto recall_fsm; + } + + switch (discover->state) { + int new_prot; + + case FIP_DISCOVER_INIT: + vnic_dbg_fip(discover->name, "FIP_DISCOVER_INIT\n"); + /* in init try and join the discover multicast group + * This is a preliminary request for all other progress + * will eventually call fip_discover_mcast_connect_cb() + */ + if (fip_discover_mcast_connect(discover)) { + vnic_warn(discover->name, "fip_discover_mcast_connect() " + "failed\n"); + recall_time = 1 * HZ; + } + break; + + case FIP_DISCOVER_SOLICIT: + new_prot = is_new_solicit_prot(discover); + vnic_dbg_fip(discover->name, "DISCOVER_SOLICIT\n"); + + /* send multicast solicit of type fip, if send is + * successfull move to login state and await advertise + * packets. 
It TX fail then retry + */ + fip_solicit_send(discover, FIP_DISCOVER_MCAST, 0, 0, 0, new_prot); + recall_time = FIP_RESOLICIT_TIME * HZ; + + break; + + case FIP_DISCOVER_OFF: + default: + ASSERT(0); + break; + + } + +recall_fsm: + if (recall_time >= 0) + queue_delayed_work(fip_wq, &discover->fsm_task, recall_time); + + return; +} + diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.h new file mode 100644 index 0000000000000..52e11d359a6cf --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FIP_DISCOVER_H +#define _FIP_DISCOVER_H + +#include "vnic.h" +#include "vnic_fip.h" + +/* TODO - rethink this */ +#define FIP_UD_MTU(ib_mtu) (ib_mtu - FIP_ENCAP_LEN - FIP_ETH_HEADER_LEN) +#define FIP_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES) + +#define FIP_MAX_BACKOFF_SECONDS 16 +#define FIP_MAX_VNICS_PER_GW (1 << 9) + +#define FIP_TIMEOUT_FACTOR(a) ((a)*5/2) + +enum fip_gw_state { + FIP_GW_HOST_ADMIN, + FIP_GW_CTRL_PATH_QUERY, + FIP_GW_SEND_SOLICIT, /* got mcast advertise & ctrl path query. sending solicit */ + FIP_GW_DATA_PATH_QUERY, + FIP_GW_CONNECTED /* we are already connected. 
do nothing */ +}; + + +enum { + GW_TYPE_SINGLE_EPORT = 0, + GW_TYPE_LAG = 1, +}; + +struct gw_ext_boot { + int valid; + int boot_prio; + int timeout; +}; + +struct gw_ext_lag { + int valid; + int hash; /* enum gw_ext_lag_hash_policy */ + int weights_policy; + int member_ka; + int ca; /* conjestion aware */ + int ca_thresh; + int ucast; /* gw supports unicat keep alives */ +}; + + +struct gw_ext_pc_id { + int valid; + u64 power_cycle_id; +}; + +struct fip_gw_data_info { + struct fip_gw_volatile_info vol_info; + long gw_adv_period; /* timeout in jiffies */ + long gw_period; /* timeout in jiffies */ + long vnic_ka_period; /* in jiffies */ + int flags; + u32 gw_qpn; + u16 gw_lid; + u16 gw_port_id; + u16 gw_num_vnics; + u16 n_rss_qpn; + u8 gw_sl; /* GW ctrl SL */ + u8 hadmined_en; + u8 all_vlan_gw; + u8 gw_vendor_id[VNIC_VENDOR_LEN+1]; + u8 gw_guid[GUID_LEN]; + int gw_type; + int gw_prot_new; + int ext_mask; + struct gw_ext_boot ext_boot; + struct gw_ext_lag ext_lag; + struct gw_ext_pc_id ext_pc_id; +}; + +struct fip_gw_data { + enum fip_flush flush; + int hadmin_gw; + struct mutex mlock; + struct fip_discover *discover; + struct list_head list; + unsigned long keep_alive_jiffies; + enum fip_gw_state state; + int vnic_count; + struct list_head vnic_list; + struct delayed_work gw_task; + struct delayed_work vnic_cleanup_task; + struct fip_gw_data_info info; + unsigned long n_bitmask[(FIP_MAX_VNICS_PER_GW >> 3) / + sizeof(unsigned long)]; + + struct ib_sa_path_rec ctrl_prec; + struct ib_sa_path_rec data_prec; + struct ib_sa_query *pquery; + int query_path_cnt; + int query_id; + struct completion query_comp; +}; + +enum fip_gw_data_flags { + FIP_IS_FIP = 1 << 0, /* protocol type */ + FIP_RCV_MULTICAST = 1 << 1, /* received mcast packet */ + FIP_GW_AVAILABLE = 1 << 2, /* GW available bit set in pkt */ + FIP_HADMINED_VLAN = 1 << 3, /* H bit set in advertise pkt */ +}; + +static inline u8 vnic_gw_ctrl_sl(struct fip_gw_data *gw) +{ + return vnic_sa_query? gw->ctrl_prec.sl : gw->info.gw_sl; +} + +/* + * TODO - we can do a nicer job here. stage 2 + * allocates memory and post receives + */ +int fip_post_discovery_rcv(struct vnic_port *port, + int ring_size, struct ib_qp *qp, + struct fip_ring *rx_ring); + +int fip_discover_mcast_reattach(struct fip_discover *discover, + struct vnic_port *port); + +/* + * This function handles a single received packet that are expected to be + * GW advertisements or login ACK packets. The function first parses the + * packet and decides what is the packet type and then handles the packets + * specifically according to its type. This functions runs in task context. +*/ +void fip_discover_rx_packet(int *queue, struct fip_content *fc); +int fip_discover_rx_packet_bh(struct fip_discover *discover, struct fip_content *fc); + +/* + * This function is the RX packet handler entry point at the thread level + * (unlike the completion handler that runs from interrupt context). + * the function calls a handler function and then reallocats the ring + * entry for the next receive. 
+*/ +void fip_discover_process_rx(struct fip_discover *discover); +void fip_discover_process_rx_bh(struct work_struct *work); +void fip_discover_gw_fsm_move(struct fip_gw_data *gw, enum fip_gw_state state); + +/* This function creates an info string from GW attributes published + * by the GW in advertisement pkts */ +int fip_get_short_gw_info(struct fip_gw_data *gw, char *buff); + + +int fip_packet_parse(struct vnic_port *port, void *packet, int size, + struct fip_content *fc); + +#endif /* _FIP_DISCOVER_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_ib.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_ib.c new file mode 100644 index 0000000000000..ba630673777a1 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_ib.c @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_fip.h" + +#define FIP_OP_RECV (1ul << 31) +/* TODO - rethink this */ +#define FIP_UD_MTU(ib_mtu) (ib_mtu - FIP_ENCAP_LEN - FIP_ETH_HEADER_LEN) +#define FIP_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES) + +static inline void fip_wr_pepare(struct vnic_port *port, + struct ib_send_wr *tx_wr, + struct ib_sge *tx_sge, + unsigned int wr_id, u64 mapping, + int size, u16 pkey_index) +{ + /* This is a fixed part */ + memset(tx_wr, 0, sizeof(struct ib_send_wr)); + tx_wr->num_sge = 1; + tx_wr->sg_list = tx_sge; + tx_wr->opcode = IB_WR_SEND; + tx_wr->send_flags = IB_SEND_SIGNALED; + tx_wr->wr.ud.pkey_index = pkey_index; + tx_wr->wr_id = wr_id; + + memset(tx_sge, 0, sizeof(struct ib_sge)); + tx_sge->lkey = port->mr->lkey; + tx_sge->addr = mapping; + tx_sge->length = size; +} + +/* + * send a single multicast packet. + * return 0 on success, other on failure. 
+*/ +int fip_mcast_send(struct vnic_port *port, + struct ib_qp *qp, + unsigned int wr_id, + u64 mapping, + int size, + u16 pkey_index, + struct vnic_mcast *mcast) +{ + struct ib_send_wr *bad_wr; + struct ib_sge tx_sge; + struct ib_send_wr tx_wr; + int ret; + + fip_wr_pepare(port, &tx_wr, &tx_sge, wr_id, mapping, size, pkey_index); + + tx_wr.wr.ud.ah = mcast->ah; + tx_wr.wr.ud.remote_qpn = 0xFFFFFFFF; /*dest_qpn; */ + tx_wr.wr.ud.remote_qkey = mcast->qkey; + + ret = ib_post_send(qp, &tx_wr, &bad_wr); + + return ret; +} + +/* + * send a single unicast packet. + * return 0 on success, other on failure. + */ +int fip_ucast_send(struct vnic_port *port, + struct ib_ah *ah, + struct ib_qp *qp, + unsigned int wr_id, + u64 mapping, + int size, + u16 pkey_index, u32 dest_qpn, u16 dlid, + u32 qkey, u8 sl) +{ + struct ib_send_wr *bad_wr; + struct ib_ah *new_ah = NULL; + struct ib_sge tx_sge; + struct ib_send_wr tx_wr; + int ret; + + fip_wr_pepare(port, &tx_wr, &tx_sge, wr_id, mapping, size, pkey_index); + + if (!ah) { + struct ib_ah_attr ah_attr = { + .dlid = dlid, + .port_num = port->num, + .sl = sl & 0xf, + }; + + new_ah = ib_create_ah(port->pd, &ah_attr); + if (IS_ERR(new_ah)) + return -1; + + tx_wr.wr.ud.ah = new_ah; + } else + tx_wr.wr.ud.ah = ah; + + tx_wr.wr.ud.remote_qpn = dest_qpn; + tx_wr.wr.ud.remote_qkey = qkey; + + ret = ib_post_send(qp, &tx_wr, &bad_wr); + + if (new_ah) + ib_destroy_ah(new_ah); + + return ret; +} + +/* + * This is a general purpose CQ completion function that handles + * completions on RX and TX rings. It can serve all users that are + * using RX and TX rings. + * RX completions are destinguished from TX comp by the MSB that is set + * for RX and clear for TX. For RX, the memory is unmapped from the PCI, + * The head is incremented. For TX the memory is unmapped and then freed. + * The function returns the number of packets received. +*/ +int fip_comp(struct vnic_port *port, + struct ib_cq *cq, + struct fip_ring *rx_ring, + struct fip_ring *tx_ring, + char *name) +{ +#define FIP_DISCOVER_WC_COUNT 4 + struct ib_wc ibwc[FIP_DISCOVER_WC_COUNT]; + int wrid, n, i; + int mtu_size = FIP_UD_BUF_SIZE(port->max_mtu_enum); + int rx_count = 0; + struct ib_device *dev = port->dev->ca; + + do { + /* + * poll for up to FIP_DISCOVER_WC_COUNT in one request. + * returns the number of WC actually polled + */ + n = ib_poll_cq(cq, FIP_DISCOVER_WC_COUNT, ibwc); + for (i = 0; i < n; ++i) { + /* + * use a mask on the id to decide if this is a receive + * or transmit WC + */ + if (ibwc[i].wr_id & FIP_OP_RECV) { + wrid = ibwc[i].wr_id & ~FIP_OP_RECV; + + ib_dma_sync_single_for_cpu(dev, + rx_ring->ring[wrid].bus_addr, + mtu_size, + DMA_FROM_DEVICE); + + if (likely(ibwc[i].status == IB_WC_SUCCESS)) { + rx_ring->ring[wrid].length = + ibwc[i].byte_len; + rx_count++; + } else + rx_ring->ring[wrid].entry_posted = 0; + + rx_ring->head++; + } else { /* TX completion */ + unsigned long flags; + wrid = ibwc[i].wr_id; + + /* unmap and free transmitted packet */ + ib_dma_unmap_single(dev, + tx_ring->ring[wrid]. 
+						    bus_addr, tx_ring->ring[wrid].length,
+						    DMA_TO_DEVICE);
+
+				kfree(tx_ring->ring[wrid].mem);
+				tx_ring->ring[wrid].mem = NULL;
+				tx_ring->ring[wrid].length = 0;
+				spin_lock_irqsave(&tx_ring->head_tail_lock, flags);
+				tx_ring->tail++;
+				spin_unlock_irqrestore(&tx_ring->head_tail_lock, flags);
+			}
+		}
+	} while (n == FIP_DISCOVER_WC_COUNT);
+
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+
+	return rx_count;
+}
+
+/* configure a newly allocated QP and move it
+ * from reset->init->RTR->RTS
+ */
+int fip_init_qp(struct vnic_port *port, struct ib_qp *qp, u16 pkey_index, char *name)
+{
+	struct ib_qp_attr qp_attr;
+	int attr_mask;
+
+	qp_attr.qp_state = IB_QPS_INIT;
+	qp_attr.qkey = VNIC_FIP_QKEY;
+	qp_attr.port_num = port->num;
+	qp_attr.pkey_index = pkey_index;
+	attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE;
+
+	if (ib_modify_qp(qp, &qp_attr, attr_mask))
+		goto out_fail;
+
+	qp_attr.qp_state = IB_QPS_RTR;
+	attr_mask &= ~IB_QP_PORT;
+	if (ib_modify_qp(qp, &qp_attr, attr_mask))
+		goto out_fail;
+
+	qp_attr.qp_state = IB_QPS_RTS;
+	qp_attr.sq_psn = 0;
+	attr_mask |= IB_QP_SQ_PSN;
+	attr_mask &= ~IB_QP_PKEY_INDEX;
+	if (ib_modify_qp(qp, &qp_attr, attr_mask))
+		goto out_fail;
+
+	return 0;
+
+out_fail:
+	qp_attr.qp_state = IB_QPS_RESET;
+	if (ib_modify_qp(qp, &qp_attr, IB_QP_STATE))
+		vnic_warn(name, "failed to modify QP to RESET state\n");
+
+	return -EINVAL;
+}
+
+void fip_qp_to_reset(struct ib_qp *qp, char *name)
+{
+	struct ib_qp_attr qp_attr;
+
+	qp_attr.qp_state = IB_QPS_RESET;
+	if (ib_modify_qp(qp, &qp_attr, IB_QP_STATE))
+		vnic_warn(name, "Failed to modify QP to RESET state\n");
+	return;
+}
+
+/*
+ * alloc a single buffer, map it and post it to the qp.
+ * id used to identify entry in receive queue.
+ */
+int fip_post_receive(struct vnic_port *port, struct ib_qp *qp, int size,
+		     int _id, struct fip_ring_entry *mem_entry, char *name)
+{
+	struct ib_recv_wr rx_wr, *bad_wr;
+	struct ib_sge rx_sge;
+	int rc;
+
+	rx_wr.wr_id = _id | FIP_OP_RECV;
+	rx_wr.next = NULL;
+	rx_wr.sg_list = &rx_sge;
+	rx_wr.num_sge = 1;
+	rx_sge.addr = mem_entry->bus_addr;
+	rx_sge.length = size;
+	rx_sge.lkey = port->mr->lkey;
+
+	ib_dma_sync_single_for_device(port->dev->ca, rx_sge.addr,
+				      FIP_UD_BUF_SIZE(port->max_mtu_enum),
+				      DMA_FROM_DEVICE);
+
+	rc = ib_post_recv(qp, &rx_wr, &bad_wr);
+	if (unlikely(rc)) {
+		vnic_warn(name, "post receive failed for buf id %d (rc %d)\n", _id, rc);
+		goto post_recv_failed;
+	}
+	mem_entry->entry_posted = 1;
+	return 0;
+
+post_recv_failed:
+	mem_entry->entry_posted = 0;
+	return -EIO;
+}
+
+void fip_flush_rings(struct vnic_port *port,
+		     struct ib_cq *cq,
+		     struct ib_qp *qp,
+		     struct fip_ring *rx_ring,
+		     struct fip_ring *tx_ring,
+		     char *name)
+{
+	vnic_dbg_fip(name, "fip_flush_rings called\n");
+	if (qp) {
+		fip_qp_to_reset(qp, name);
+		fip_comp(port, cq, rx_ring, tx_ring, name);
+	}
+}
+
+void fip_free_rings(struct vnic_port *port,
+		    struct fip_ring *rx_ring,
+		    struct fip_ring *tx_ring,
+		    char *name)
+{
+	struct ib_device *dev = port->dev->ca;
+	int i;
+
+	for (i = rx_ring->size - 1; i >= 0; --i) {
+		if (rx_ring->ring[i].mem) {
+			ib_dma_unmap_single(dev,
+					    rx_ring->ring[i].bus_addr,
+					    FIP_UD_BUF_SIZE(port->max_mtu_enum),
+					    DMA_FROM_DEVICE);
+			kfree(rx_ring->ring[i].mem);
+		}
+	}
+	rx_ring->size = 0;
+
+	for (i = tx_ring->size - 1; i >= 0; --i)
+		if (tx_ring->ring[i].length != 0) {
+			ib_dma_unmap_single(dev,
+					    tx_ring->ring[i].bus_addr,
+					    tx_ring->ring[i].length,
+					    DMA_TO_DEVICE);
+			kfree(tx_ring->ring[i].mem);
+		}
+	tx_ring->size = 0;
+
"Done cleaning RX and TX queues\n"); + + kfree(rx_ring->ring); + rx_ring->ring = NULL; + kfree(tx_ring->ring); + tx_ring->ring = NULL; +} + +/* + * TODO - we can do a nicer job here. stage 2 + * allocates memory and post receives + * TODO2: need to handle the bad flow to free all existing entries in the ring + */ +int fip_init_rx(struct vnic_port *port, + int ring_size, + struct ib_qp *qp, + struct fip_ring *rx_ring, + char *name) +{ + struct ib_device *dev = port->dev->ca; + int i, rc = 0, mtu_size = FIP_UD_BUF_SIZE(port->max_mtu_enum); + + rx_ring->size = ring_size; + rx_ring->ring = kzalloc(rx_ring->size * + sizeof(struct fip_ring_entry), + GFP_KERNEL); + if (!rx_ring->ring) { + vnic_warn(name, "failed to alloc fip RX ring, size %d\n", rx_ring->size); + rx_ring->size = 0; + return -ENOMEM; + } + + /* allocate the ring entries */ + for (i = 0; i < rx_ring->size; i++) { + rx_ring->ring[i].mem = kmalloc(mtu_size, GFP_KERNEL); + if (unlikely(!rx_ring->ring[i].mem)) { + rc = -ENOMEM; + goto error; + } + + rx_ring->ring[i].entry_posted = 0; + rx_ring->ring[i].length = mtu_size; + rx_ring->ring[i].bus_addr = ib_dma_map_single(dev, + rx_ring->ring[i].mem, + mtu_size, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(dev, rx_ring->ring[i].bus_addr))) { + rc = -ENODEV; + goto dma_error; + } + + if (fip_post_receive(port, qp, FIP_UD_BUF_SIZE(port->max_mtu_enum), + i, rx_ring->ring + i, name)) { + rc = -EIO; + goto post_recv_failed; + } + } + + rx_ring->head = 0; + rx_ring->tail = 0; + spin_lock_init(&rx_ring->head_tail_lock); + spin_lock_init(&rx_ring->ring_lock); + return 0; + +post_recv_failed: + ib_dma_unmap_single(dev, rx_ring->ring[i].bus_addr, + mtu_size, DMA_FROM_DEVICE); +dma_error: + kfree(rx_ring->ring[i].mem); + rx_ring->ring[i].mem = NULL; +error: + /* previous entries need to be freed after flushing the QP */ + return rc; +} + +/* + * This function allocates the tx buffers and initializes the head and + * tail indexes. + */ +int fip_init_tx(int size, struct fip_ring *tx_ring, char *name) +{ + tx_ring->size = size; + tx_ring->ring = kzalloc(tx_ring->size * + sizeof(struct fip_ring_entry), + GFP_KERNEL); + + if (!tx_ring->ring) { + vnic_warn(name, "failed to alloc fip TX ring, size %d\n", + tx_ring->size); + tx_ring->size = 0; + return -ENOMEM; + } + + tx_ring->head = 0; + tx_ring->tail = 0; + spin_lock_init(&tx_ring->head_tail_lock); + spin_lock_init(&tx_ring->ring_lock); + return 0; +} + diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_login.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_login.c new file mode 100644 index 0000000000000..55729f2ac0254 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_login.c @@ -0,0 +1,1752 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_discover.h"
+#include "vnic_fip_pkt.h"
+
+#ifndef work_pending /* back-port */
+#define work_pending(_work) test_bit(0, &(_work)->pending)
+#endif
+
+enum {
+	VNIC_LOGIN_REG_NETDEV_PENDING,
+	VNIC_LOGIN_REG_NETDEV_DONE,
+	VNIC_LOGIN_DESTROY_PENDING,
+	VNIC_LOGIN_DESTROY_DONE,
+	VNIC_LOGIN_DESTROY_FULL
+};
+
+static int fip_vnic_rings_create(struct vnic_port *port,
+				 struct fip_vnic_data *vnic);
+static void fip_vnic_rings_destroy(struct fip_vnic_data *vnic);
+static void fip_vnic_recv(struct fip_vnic_data *vnic);
+
+#ifdef _BP_HR_TIMER
+int fip_vnic_keepalive(struct hrtimer *timer);
+#else
+enum hrtimer_restart fip_vnic_keepalive(struct hrtimer *timer);
+#endif
+int fip_vnic_keepalive_send(struct fip_vnic_data *vnic, int source);
+
+
+#define QUEUE_VNIC_DWORK(vnic, task, time)			\
+do {								\
+	unsigned long flags;					\
+	spin_lock_irqsave(&vnic->lock, flags);			\
+	if (likely(vnic->flush == FIP_NO_FLUSH))		\
+		queue_delayed_work(fip_wq, task, time);		\
+	spin_unlock_irqrestore(&vnic->lock, flags);		\
+} while (0)
+
+#define REQUEUE_VNIC_DWORK(vnic, task, time)			\
+do {								\
+	cancel_delayed_work(task);				\
+	QUEUE_VNIC_DWORK(vnic, task, time);			\
+} while (0)
+
+
+/*
+ * Look for a vnic in the GW vnic list. The search key used is either the
+ * vnic_id, which is unique, or the mac+vlan pair. A match on either key
+ * will result in the return of the vnic. Both keys are necessary because
+ * the host-assigned delete flow might not have access to the vnic_id. The
+ * search disregards vnics that are undergoing a full flush (they will be
+ * removed soon).
+ */
+struct fip_vnic_data *fip_vnic_find_in_list(struct fip_gw_data *gw, u16 vnic_id,
+					    u8 *mac, u16 vlan, u8 vlan_used)
+{
+	struct fip_vnic_data *vnic;
+	int use_mac = mac ? 1 : 0;
+	int vlan_match;
+
+	ASSERT(gw);
+
+	if (list_empty(&gw->vnic_list))
+		return NULL;
+
+	/* do not use MAC 0:..:0 for vnic matches */
+	if (use_mac)
+		use_mac = !IS_ZERO_MAC(mac);
+
+	list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+		if (vnic->flush == FIP_FULL_FLUSH)
+			continue;
+
+		if (vnic->vnic_id == vnic_id)
+			return vnic;
+
+		if (vlan_used != vnic->login_data.vp)
+			continue;
+
+		vlan_match = !vlan_used ||
+			(vlan_used && (vlan == vnic->login_data.vlan));
+
+		if ((use_mac && !memcmp(vnic->login_data.mac, mac, ETH_ALEN)) &&
+		    vlan_match)
+			return vnic;
+	}
+	return NULL;
+}
+
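+/*
+ * For reference, this is how the login ack handler in vnic_fip_discover.c
+ * (handle_login_packet) performs this lookup, passing both keys:
+ *
+ *	vnic = fip_vnic_find_in_list(gw, login_data->vnic_id,
+ *				     login_data->mac, login_data->vlan,
+ *				     login_data->vp);
+ */
+
+/*
+ * This function handles completions of both TX and RX
+ * packets of vnics. RX packets are unmapped, lightly parsed, moved to a list
+ * and passed to thread processing. TX packets are unmapped and freed.
+ * Note: this function is called from interrupt context
+ */
+static void fip_vnic_comp(struct ib_cq *cq, void *vnic_ptr)
+{
+	struct fip_vnic_data *vnic = vnic_ptr;
+
+	/* handle completions.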
On RX packets this will call vnic_recv + * from thread context to continue processing */ + if (fip_comp(vnic->port, vnic->cq, &vnic->rx_ring, + &vnic->tx_ring, vnic->name)) + fip_vnic_recv(vnic); + + fip_vnic_keepalive_send(vnic, 0); +} + +/* + * read the state of the gw eport. This can be done from any context and therefore + * requires protection. +*/ +int fip_vnic_get_eport_state(struct fip_vnic_data *vnic) +{ + int i; + + if (no_bxm) + return 1; + + if (vnic->gw->info.gw_type == GW_TYPE_LAG) { + for (i = 0; i < MAX_LAG_MEMBERS; i++) { + if (!(vnic->lm.used_bitmask & 1 << i)) + continue; + + if (vnic->lm.memb[i].eport_state) + return 1; + } + return 0; + } else { + return atomic_read(&vnic->eport_state); + } +} + +/* + * get GW info funcs. +*/ +int fip_vnic_get_bx_name(struct fip_vnic_data *vnic, char *buff) +{ + struct fip_gw_data *gw = vnic->gw; + struct fip_gw_volatile_info tmp_info; + int rc; + + if (!gw) + return -EINVAL; + + mutex_lock(&gw->mlock); + memcpy(&tmp_info, &gw->info.vol_info, sizeof(tmp_info)); + mutex_unlock(&gw->mlock); + + rc = sprintf(buff, "%s", tmp_info.system_name); + + return rc < 0 ? rc : 0; +} + +int fip_vnic_get_bx_guid(struct fip_vnic_data *vnic, char *buff) +{ + struct fip_gw_data *gw = vnic->gw; + struct fip_gw_volatile_info tmp_info; + void *rc; + + memset(buff, 0, sizeof *buff); + + if (!gw) + return -EINVAL; + + mutex_lock(&gw->mlock); + memcpy(&tmp_info, &gw->info.vol_info, sizeof(tmp_info)); + mutex_unlock(&gw->mlock); + + rc = memcpy(buff, tmp_info.system_guid, GUID_LEN); + + return rc ? 0 : -EINVAL; +} + +int fip_vnic_get_all_vlan_mode(struct fip_vnic_data *vnic, char *buff) +{ + struct fip_gw_data *gw = vnic->gw; + int rc; + + if (!gw) + return -EINVAL; + + rc = sprintf(buff, "%s", gw->info.all_vlan_gw ? "yes" : "no"); + + return rc < 0 ? rc : 0; +} + +int fip_vnic_get_eport_name(struct fip_vnic_data *vnic, char *buff) +{ + + struct fip_gw_data *gw = vnic->gw; + struct fip_gw_volatile_info tmp_info; + int rc; + + if (!gw) + return -EINVAL; + + mutex_lock(&gw->mlock); + memcpy(&tmp_info, &gw->info.vol_info, sizeof(tmp_info)); + mutex_unlock(&gw->mlock); + + rc = sprintf(buff, "%s", tmp_info.gw_port_name); + + return rc < 0 ? rc : 0; +} + +u8 fip_vnic_get_bx_sl(struct fip_vnic_data *vnic) +{ + return vnic->gw->info.gw_sl; +} + +/* + * get GW info funcs. +*/ +int fip_vnic_get_gw_type(struct fip_vnic_data *vnic) +{ + struct fip_gw_data *gw = vnic->gw; + int lag = 0; + + if (!gw) + return -EINVAL; + + lag = gw->info.gw_type == GW_TYPE_LAG; + + return lag; +} + +/* + * get GW info funcs. +*/ +int fip_vnic_get_lag_eports(struct fip_vnic_data *vnic, char *buf) +{ + struct fip_gw_data *gw = vnic->gw; + int i; + struct lag_member *member; + char *p = buf; + + if (!gw) + return -EINVAL; + + if (gw->info.gw_type != GW_TYPE_LAG) + return -EINVAL; + + p += _sprintf(p, buf, "LAG_MEMBER_INFORMATION:\n"); + for (i=0; ilm.used_bitmask & 1 << i)) + continue; + + member = &vnic->lm.memb[i]; + p += _sprintf(p, buf, " %.2d ID=%.3X LID=%4X QPN=%8X STATE=%s\n", + i, member->gw_port_id, member->lid, member->qpn, + member->eport_state ? "UP" : "DOWN"); + } + + return p - buf; +} + +/* + * process an incoming login ack packet. The packet was already parsed and + * its data was placed in *data. The function creates RX and TX rings for the + * vnic and starts the multicast join procedure. + * This function should not be called for packets other then login ack packets. 
+ */ +void fip_vnic_login_ack_recv(struct fip_vnic_data *vnic, + struct fip_login_data *data) +{ + /* we allow login acks only in wait for ack in other states + * we ignore them */ + if (vnic->state != FIP_VNIC_WAIT_4_ACK) { + vnic_dbg_fip_v(vnic->name, + "vnic_login_ack_recv in state other" + " then FIP_VNIC_WAIT_4_ACK state %d\n", + vnic->state); + return; + } + + /* For LAG vnics, process login ack member data */ + if (vnic->gw->info.gw_type == GW_TYPE_LAG) + handle_member_update(vnic, &data->lagm); + + memcpy(&vnic->login_data, data, sizeof(vnic->login_data)); + + vnic->state = FIP_VNIC_RINGS_INIT; + + /* calls fip_vnic_fsm() */ + cancel_delayed_work(&vnic->vnic_task); + fip_vnic_fsm(&vnic->vnic_task.work); + // REQUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, 0); + return; +} + +/* + * This is a helper function we use in order to move the login create + * to another context so we don't block the fip thread for too long. + * The call stack triggered by this function calls register_netdev that + * might block for some time when netdev are removed in parallel. This + * stalls the fip_wq which causes KA not to be sent. +*/ +void fip_vnic_login_create(struct work_struct *work) +{ + struct fip_vnic_data *vnic = + container_of(work, struct fip_vnic_data, vnic_login_create_task); + char *name = NULL; + int rc; + + if (vnic->hadmined) + name = vnic->interface_name; + + rc = vnic_login_register_netdev(vnic, vnic->mac_cache, name); + + spin_lock_irq(&vnic->lock); + clear_bit(VNIC_LOGIN_REG_NETDEV_PENDING, &vnic->login_status); + if (!rc) + set_bit(VNIC_LOGIN_REG_NETDEV_DONE, &vnic->login_status); + spin_unlock_irq(&vnic->lock); +} + +/* + * Test if the create request posted earlier terminated or not. + * If yes and successfully returns 0, if still pending returns + * -EAGAIN , and if failed returns -EINVAL. if retry is set + * it will requeue a create attempt and try again. In this case + * the function will return -EAGAIN. +*/ +static int fip_vnic_test_login(struct fip_vnic_data *vnic, int retry) +{ + int ret = 0; + + spin_lock_irq(&vnic->lock); + + if (!test_bit(VNIC_LOGIN_REG_NETDEV_DONE, &vnic->login_status)) { + /* queue retry login create request */ + if (retry) { + if (!test_and_set_bit(VNIC_LOGIN_REG_NETDEV_PENDING, + &vnic->login_status)) { + memcpy(vnic->mac_cache, vnic->login_data.mac, ETH_ALEN); + vnic->vlan_used = vnic->login_data.vp; + vnic->vlan = vnic->login_data.vlan; + vnic->all_vlan_gw = vnic->login_data.all_vlan_gw; + + /* calls fip_vnic_login_create() */ + if (vnic->flush == FIP_NO_FLUSH) + queue_work(login_wq, &vnic->vnic_login_create_task); + } + ret = -EAGAIN; + } else { + if (test_bit(VNIC_LOGIN_REG_NETDEV_PENDING, + &vnic->login_status)) + ret = -EAGAIN; + else + ret = -EINVAL; + } + } + spin_unlock_irq(&vnic->lock); + + return ret; +} + + +/* + * This function should be called when the building of a vhub context + * table is done and the vnic state should transition to CONNECTED. + */ +int fip_vnic_tbl_done(struct fip_vnic_data *vnic) +{ + vnic->vhub_table.state = VHUB_TBL_UP2DATE; + vnic->vhub_table.tusn = vnic->vhub_table.main_list.tusn; + + if (vnic->state <= FIP_VNIC_VHUB_DONE) + vnic->state = FIP_VNIC_VHUB_DONE; + else + vnic->state = FIP_VNIC_VHUB_WRITE; + + cancel_delayed_work(&vnic->vnic_task); + fip_vnic_fsm(&vnic->vnic_task.work); + return 0; +} + +/* + * This function runs in interrupt context + * It does sanity checking of the packet, moves it to a list and passes + * handleing to a thread. 
+ */ +static void fip_vnic_recv(struct fip_vnic_data *vnic) +{ + struct fip_ring *rx_ring = &vnic->rx_ring; + int ret, length; + u32 vhub_id; + void *mem; + int queue_packet = 0; + int one_or_more_queued = 0; + int index; + int err; + + while (rx_ring->head != rx_ring->tail) { + struct fip_content *fc; + + queue_packet = 0; + index = rx_ring->tail & (vnic->rx_ring.size - 1); + + if (rx_ring->ring[index].entry_posted == 0) + goto repost; + + mem = rx_ring->ring[index].mem; + length = rx_ring->ring[index].length; + + + fc = kzalloc(sizeof *fc, GFP_ATOMIC); + if (!fc) { + vnic_warn(vnic->name, "kzalloc failed\n"); + goto repost; + } + + err = fip_packet_parse(vnic->port, mem + IB_GRH_BYTES, length - IB_GRH_BYTES, fc); + if (err) { + vnic_warn(vnic->name, "packet parse failed\n"); + kfree(fc); + goto repost; + } + + switch (fc->fh->subcode) { + case FIP_GW_UPDATE_SUB_OPCODE: + if (fc->fvu) { + vhub_id = be32_to_cpu(fc->fvu->state_vhub_id) & 0xffffff; + if (vnic->login_data.vhub_id == vhub_id) + queue_packet = 1; + } + + break; + case FIP_GW_TABLE_SUB_OPCODE: + if (vnic->state >= FIP_VNIC_VHUB_INIT && + vnic->vhub_table.state == VHUB_TBL_INIT) { + /* handle vhub context table packets */ + if (fc->fvt) { + vhub_id = be32_to_cpu(fc->fvt->vp_vhub_id) & 0xffffff; + if (vnic->login_data.vhub_id == vhub_id) + queue_packet = 1; + } + } + break; + default: + vnic_dbg_fip_v(vnic->name, + "received unexpected format packet\n"); + break; + } + + if (queue_packet && (likely(vnic->flush == FIP_NO_FLUSH))) { + struct fip_rcv_pkt *rcv; + struct fip_ring_entry me; + + /* record packet time for heart beat */ + vnic->keep_alive_jiffs = jiffies; + length -= IB_GRH_BYTES; + rcv = kzalloc(sizeof *rcv, GFP_ATOMIC); + if (!rcv) { + vnic_warn(vnic->name, "failed kmalloc\n"); + kfree(fc); + goto repost; + } + + /* replace it with new entry, and queue old one */ + err = alloc_map_fip_buffer(vnic->port->dev->ca, &me, + FIP_UD_BUF_SIZE(vnic->port->max_mtu_enum), + GFP_ATOMIC); + if (err) { + vnic_warn(vnic->name, "alloc_map_fip_buffer failed\n"); + kfree(fc); + kfree(rcv); + goto repost; + } + + /* unmap old entry */ + ib_dma_unmap_single(vnic->port->dev->ca, + rx_ring->ring[index].bus_addr, + FIP_UD_BUF_SIZE(vnic->port->max_mtu_enum), + DMA_FROM_DEVICE); + + rx_ring->ring[index] = me; + rcv->fc = fc; + rcv->length = length; + rcv->mem = mem; + spin_lock(&vnic->vnic_rcv_list.lock); + list_add_tail(&rcv->list, &vnic->vnic_rcv_list.list); + spin_unlock(&vnic->vnic_rcv_list.lock); + one_or_more_queued++; + } else + kfree(fc); +repost: + ret = fip_post_receive(vnic->port, vnic->qp, + FIP_UD_BUF_SIZE(vnic->port->max_mtu_enum), + index, rx_ring->ring + index, vnic->name); + if (ret) + vnic_warn(vnic->name, "fip_post_receive ret %d\n", ret); + + rx_ring->tail++; + } + + if (one_or_more_queued && (likely(vnic->flush == FIP_NO_FLUSH))) { + /* calls fip_vnic_recv_bh() */ + queue_work(fip_wq, &vnic->vnic_pkt_rcv_task_bh); + } + + return; +} + +void fip_vnic_recv_list_flush(struct fip_vnic_data *vnic) +{ + struct list_head vnic_recv_local; + struct fip_rcv_pkt *rcv, *rcv1; + unsigned long flags; + + INIT_LIST_HEAD(&vnic_recv_local); + + spin_lock_irqsave(&vnic->vnic_rcv_list.lock, flags); + list_replace_init(&vnic->vnic_rcv_list.list, &vnic_recv_local); + spin_unlock_irqrestore(&vnic->vnic_rcv_list.lock, flags); + + list_for_each_entry_safe(rcv, rcv1, &vnic_recv_local, list) { + list_del(&rcv->list); + kfree(rcv); + } + return; +} + +void lag_ctx_clear(struct fip_vnic_data *vnic) +{ + memset(&vnic->lm, 0, sizeof (vnic->lm)); +} + +/* 
+ * Handle the GW eport member info for a LAG GW. The function compares the
+ * member information to previous membership information that is stored in the
+ * vnic. The data path info is updated only after the login ack info was
+ * updated, to prevent race conditions.
+ * The vnic contains a local cache of the member info. The cache is updated
+ * in all cases other than if the write to the data path failed. If the write
+ * failed we will not update the cache and rely on periodic update packets
+ * for the retry.
+ * There are 4 possible flows per member entry:
+ * 1. the entry is cached in the vnic but not in the packet - remove from vnic
+ * 2. the entry is not cached in the vnic but is in the packet - add to vnic
+ * 3. entry is in vnic and in packet but with different params - modify vnic
+ * 4. entry is in vnic and in packet and with similar params - do nothing
+*/
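+/*
+ * Worked example (illustrative only, the numbers are not from any spec):
+ * suppose vnic->lm.used_bitmask == 0x3 (members 0 and 1 cached) and the
+ * packet carries lm->num == 2 entries, where packet entry 0 matches cached
+ * member 1 but with a new LID, and packet entry 1 is unknown. Then member 1
+ * is modified in place (flow 3), member 0 has no match so it is removed and
+ * bit 0 is cleared (flow 1), and the unknown entry is added at the first
+ * free index (flow 2). A matching entry with equal params is left as is
+ * (flow 4).
+ */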
+int handle_member_update(struct fip_vnic_data *vnic, struct lag_members *lm)
+{
+    int i, j;
+    char packet_used[MAX_LAG_MEMBERS];
+    char vnic_used[MAX_LAG_MEMBERS];
+    struct lag_member *vnic_mem, *pkt_mem;
+    int last_bit = 0;
+    #define EMPTY_ENTRY (char)0xff
+    /* we only update data path with new info after certain stage */
+    int write_through = !!(vnic->state >= FIP_VNIC_VHUB_WRITE);
+    int skip;
+    struct lag_properties lag_prop;
+    struct vnic_login *login = vnic->login;
+
+    memset(packet_used, EMPTY_ENTRY, sizeof(packet_used));
+    memset(vnic_used, EMPTY_ENTRY, sizeof(vnic_used));
+
+    /* if LAG is not enabled, or it's a child vNic, abort */
+    if (!vnic->gw->info.ext_lag.valid || vnic->parent_used)
+        return -EINVAL;
+
+    mutex_lock(&vnic->gw->mlock);
+    lag_prop.ca = vnic->gw->info.ext_lag.ca;
+    lag_prop.ca_thresh = vnic->gw->info.ext_lag.ca_thresh;
+    lag_prop.hash_mask = vnic->gw->info.ext_lag.hash;
+    lag_prop.weights_policy = vnic->gw->info.ext_lag.weights_policy;
+    mutex_unlock(&vnic->gw->mlock);
+    if (write_through)
+        vnic_member_prop(login, &lag_prop);
+
+    /* go over all known members, for each one search for a match in the
+     * packet member struct */
+    for (i = 0; i < MAX_LAG_MEMBERS; i++) {
+        if (!(vnic->lm.used_bitmask & 1 << i))
+            continue;
+
+        vnic_mem = &vnic->lm.memb[i];
+        for (j = 0; j < lm->num; j++) {
+            pkt_mem = &lm->memb[j];
+            /* find match for member in vnic data structure */
+            if (packet_used[j] == EMPTY_ENTRY &&
+                !memcmp(vnic_mem->guid, pkt_mem->guid, GUID_LEN) &&
+                vnic_mem->gw_port_id == pkt_mem->gw_port_id) {
+                /* found a match, check for change in parameters */
+                if (vnic->login) {
+                    /* check for change in member parameters */
+                    if (vnic_mem->lid != pkt_mem->lid ||
+                        vnic_mem->qpn != pkt_mem->qpn ||
+                        vnic_mem->eport_state != pkt_mem->eport_state ||
+                        vnic_mem->sl != pkt_mem->sl ||
+                        vnic_mem->link_utilization != pkt_mem->link_utilization) {
+                        vnic_dbg_lag_v(vnic->name, "handle_member_update entry %d modifying lid %d qpn %d state %d\n",
+                                       i, lm->memb[j].lid, lm->memb[j].qpn, lm->memb[j].eport_state);
+                        /* update data path if required and store update info locally */
+                        if (!write_through ||
+                            (write_through && !vnic_member_modify(login, i, &lm->memb[j])))
+                            *vnic_mem = lm->memb[j];
+                    }
+                }
+                packet_used[j] = i;
+                vnic_used[i] = j;
+                break;
+            }
+        }
+        /* if member was removed in last packet remove it */
+        if (vnic_used[i] == EMPTY_ENTRY) {
+            if (!write_through ||
+                (write_through && !vnic_member_remove(login, i))) {
+                vnic_dbg_lag_v(vnic->name, "handle_member_update entry %d removing lid %d qpn %d state %d\n",
+                               i, vnic->lm.memb[i].lid, vnic->lm.memb[i].qpn, vnic->lm.memb[i].eport_state);
+                vnic->lm.used_bitmask &= ~(1 << i);
+            }
+        }
+    }
+
+    /* go over packet and look for any new members */
+    for (j = 0; j < lm->num; j++) {
+        /* if entry was matched up already */
+        if (packet_used[j] != EMPTY_ENTRY)
+            continue;
+
+        skip = 0;
+        /* verify that the same GW_ID is not in use by another port */
+        for (i = 0; i < MAX_LAG_MEMBERS; i++) {
+            if (!(vnic->lm.used_bitmask & 1 << i))
+                continue;
+            if (vnic->lm.memb[i].gw_port_id == lm->memb[j].gw_port_id)
+                skip = 1;
+        }
+        if (skip)
+            continue;
+
+        /* look for an empty member id and add the member to it */
+        for (i = last_bit; i < MAX_LAG_MEMBERS; i++) {
+            if (vnic->lm.used_bitmask & 1 << i)
+                continue;
+
+            vnic_dbg_lag_v(vnic->name, "handle_member_update entry %d adding lid %d qpn %d state %d\n",
+                           i, lm->memb[j].lid, lm->memb[j].qpn, lm->memb[j].eport_state);
+            if (!write_through ||
+                (write_through && !vnic_member_add(login, i, &lm->memb[j]))) {
+                vnic->lm.used_bitmask |= (1 << i);
+                vnic->lm.memb[i] = lm->memb[j];
+            }
+
+            break;
+        }
+        last_bit = i;
+    }
+
+    return 0;
+}
+
+/* Write the initial member table to the datapath. If we fail we will
+ * delete the entry from the local cache and rely on periodic update
+ * packets for the retry */
+int fip_vnic_write_members(struct fip_vnic_data *vnic)
+{
+    int i;
+    struct lag_properties lag_prop;
+    struct vnic_login *login = vnic->login;
+
+    /* if LAG is not enabled, or it's a child vNic, abort */
+    if (!vnic->gw->info.ext_lag.valid || vnic->parent_used)
+        return -EINVAL;
+
+    lag_prop.ca = vnic->gw->info.ext_lag.ca;
+    lag_prop.ca_thresh = vnic->gw->info.ext_lag.ca_thresh;
+    lag_prop.hash_mask = vnic->gw->info.ext_lag.hash;
+    lag_prop.weights_policy = vnic->gw->info.ext_lag.weights_policy;
+    vnic_member_prop(login, &lag_prop);
+
+    /* go over all members, for each one used write it to the data path */
+    for (i = 0; i < MAX_LAG_MEMBERS; i++) {
+        if (!(vnic->lm.used_bitmask & 1 << i))
+            continue;
+
+        /* if update failed, delete the local entry; we will use the
+         * update packet flow for retries.
+         */
+        if (vnic_member_add(login, i, &vnic->lm.memb[i]))
+            vnic->lm.used_bitmask &= ~(1 << i);
+    }
+
+    return 0;
+}
+
+/* runs in the context of vnic->vnic_pkt_rcv_task_bh */
+void fip_vnic_recv_bh(struct work_struct *work)
+{
+    struct fip_vnic_data *vnic =
+        container_of(work, struct fip_vnic_data, vnic_pkt_rcv_task_bh);
+    int length;
+    u32 vhub_id, tusn;
+    int eport_state;
+    struct vnic_table_entry *vhub_entries;
+    struct list_head vnic_recv_local;
+    struct fip_rcv_pkt *rcv, *rcv1;
+    unsigned long flags;
+    int i, __eport_state;
+
+    INIT_LIST_HEAD(&vnic_recv_local);
+
+    spin_lock_irqsave(&vnic->vnic_rcv_list.lock, flags);
+    list_replace_init(&vnic->vnic_rcv_list.list, &vnic_recv_local);
+    spin_unlock_irqrestore(&vnic->vnic_rcv_list.lock, flags);
+
+    /* we are not interested in packets prior to FIP_VNIC_VHUB_INIT */
+    if (vnic->state < FIP_VNIC_VHUB_INIT ||
+        vnic->flush != FIP_NO_FLUSH) {
+        list_for_each_entry_safe(rcv, rcv1, &vnic_recv_local, list) {
+            kfree(rcv->fc);
+            kfree(rcv->mem);
+            list_del(&rcv->list);
+            kfree(rcv);
+        }
+    } else {
+        int err;
+
+        list_for_each_entry_safe(rcv, rcv1, &vnic_recv_local, list) {
+            length = rcv->length;
+
+            switch (rcv->fc->fh->subcode) {
+            case FIP_GW_UPDATE_SUB_OPCODE:
+                /* validate vhub id before processing packet */
+                vhub_id = be32_to_cpu(rcv->fc->fvu->state_vhub_id) & 0xffffff;
+                if (unlikely(vnic->login_data.vhub_id != vhub_id))
+                    break;
+
+                eport_state = be32_to_cpu(rcv->fc->fvu->state_vhub_id) >> 27 & 3;
+                __eport_state = (eport_state == 0) ?
+                    EPORT_STATE_DOWN : EPORT_STATE_UP;
+                atomic_set(&vnic->eport_state, __eport_state);
+
+                /* handle vhub context update packets */
+                if (rcv->fc->fed.num) {
+                    err = extract_vhub_extended(rcv->fc->fed.fed[0], vnic);
+                    if (err)
+                        vnic_warn(vnic->name, "extract_vhub_extended() failed\n");
+                }
+                if (rcv->fc->cte.num) {
+                    vhub_entries = kmalloc(rcv->fc->cte.num * sizeof *vhub_entries, GFP_KERNEL);
+                    if (!vhub_entries) {
+                        vnic_warn(vnic->port->name, "failed to allocate memory for update CTEs\n");
+                        goto free_entry;
+                    }
+
+                    tusn = be32_to_cpu(rcv->fc->fvu->tusn);
+                    for (i = 0; i < rcv->fc->cte.num; ++i) {
+                        vhub_entries[i].lid = be16_to_cpu(rcv->fc->cte.cte[i].lid);
+                        vhub_entries[i].qpn = be32_to_cpu(rcv->fc->cte.cte[i].qpn) & 0xffffff;
+                        vhub_entries[i].sl = rcv->fc->cte.cte[i].sl & 0xf;
+                        vhub_entries[i].rss = rcv->fc->cte.cte[i].v_rss_type & FIP_CONTEXT_RSS_FLAG ? 1 : 0;
+                        vhub_entries[i].valid = rcv->fc->cte.cte[i].v_rss_type & FIP_CONTEXT_V_FLAG ? 1 : 0;
+                        memcpy(vhub_entries[i].mac, rcv->fc->cte.cte[i].mac, sizeof(vhub_entries[i].mac));
+                        vhub_handle_update(vnic, vhub_id, tusn - rcv->fc->cte.num + i + 1, &vhub_entries[i]);
+                    }
+                    kfree(vhub_entries);
+                }
+
+                /* update vnic carrier only when vnic is ready:
+                 * not closing (non zero flush), and pre-registered
+                 */
+                if (!vnic->flush && vnic->login &&
+                    test_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state)) {
+                    vnic_carrier_update(vnic->login);
+                }
+                break;
+            case FIP_GW_TABLE_SUB_OPCODE:
+                /* handle vhub context table packets */
+                tusn = be32_to_cpu(rcv->fc->fvt->tusn);
+                vhub_id = be32_to_cpu(rcv->fc->fvt->vp_vhub_id) & 0xffffff;
+                vhub_handle_tbl(vnic, rcv->fc, vhub_id, tusn);
+                break;
+
+            default:
+                break;
+            }
+free_entry:
+            list_del(&rcv->list);
+            kfree(rcv->fc);
+            kfree(rcv->mem);
+            kfree(rcv);
+        }
+    }
+    return;
+}
+
+/*
+ * Mark the vnic for deletion and trigger a delayed call to the cleanup
+ * function. In the past the vnic was moved to another list, but this
+ * might cause vnic duplication if new vnics are added to the GW. Even
+ * if the vnic is being flushed we need to know it is there.
+ *
+ * Note: This deletion method ensures that all pending vnic work requests
+ * are cleared without depending on the calling context.
+ */
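+/*
+ * Note on flush levels: FIP_PARTIAL_FLUSH tears the vnic down but leaves it
+ * on the GW vnic list so the FSM can retry the login later, while
+ * FIP_FULL_FLUSH also removes the vnic from the GW and frees it (see
+ * fip_vnic_destroy). The helper below only ever raises the flush level
+ * (a request with vnic->flush >= tmp_flush is disregarded), never lowers it.
+ */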
+void fip_vnic_close(struct fip_vnic_data *vnic, enum fip_flush flush)
+{
+    int tmp_flush;
+
+    /* net admin -> full flush */
+    tmp_flush = vnic->hadmined ? flush : FIP_FULL_FLUSH;
+
+    /* child vNic -> full flush */
+    tmp_flush = (!vnic->parent_used) ? tmp_flush : FIP_FULL_FLUSH;
+
+    /* no need for partial cleanup in host admin idle */
+    if (tmp_flush == FIP_PARTIAL_FLUSH &&
+        vnic->state < FIP_VNIC_HADMIN_IDLE)
+        return;
+
+    /* close already in process, disregard */
+    spin_lock_irq(&vnic->lock);
+    if (vnic->flush >= tmp_flush) {
+        spin_unlock_irq(&vnic->lock);
+        return;
+    }
+    if (vnic->flush == FIP_NO_FLUSH && vnic->state > FIP_VNIC_WAIT_4_ACK)
+        fip_update_send(vnic, 0, 1 /* logout */);
+
+    vnic->flush = tmp_flush;
+    cancel_delayed_work(&vnic->vnic_gw_alive_task);
+    cancel_delayed_work(&vnic->vnic_task);
+    spin_unlock_irq(&vnic->lock);
+    /* after this point we should have no work that is not already pending
+     * for execution, and no new work will be added
+     */
+
+    if (vnic->hadmined && tmp_flush == FIP_FULL_FLUSH)
+        vnic_delete_hadmin_dentry(vnic);
+    else if (!vnic->hadmined)
+        /* vnic_count is relevant for net admin only */
+        vnic->gw->vnic_count--;
+
+    vnic_dbg_mark();
+
+    /* calls fip_purge_vnics() */
+    queue_delayed_work(fip_wq, &vnic->gw->vnic_cleanup_task,
+                       DELAYED_WORK_CLEANUP_JIFFS);
+}
+
+/*
+ * This is a helper function we use in order to move the login destroy
+ * to another context so we don't block the fip thread for too long.
+ */
+void fip_vnic_login_destroy(struct work_struct *work)
+{
+    struct fip_vnic_data *vnic =
+        container_of(work, struct fip_vnic_data,
+                     vnic_login_destroy_task);
+    int flush = vnic->flush;
+
+    vnic_login_destroy_wq_stopped(vnic, flush);
+
+    /* we don't want to use a lock here, so we verify that the
+     * flush level did not change between the request and now */
+    if (flush == FIP_FULL_FLUSH)
+        set_bit(VNIC_LOGIN_DESTROY_FULL, &vnic->login_status);
+
+    set_bit(VNIC_LOGIN_DESTROY_DONE, &vnic->login_status);
+}
+
+/*
+ * Free vnic resources. This includes closing the data vnic (data QPs etc.)
+ * and the discovery resources. If the vnic can be totally destroyed (no
+ * pending work) the vnic will be removed from the GW and its memory
+ * freed. If not, the vnic will not be freed and the function will return an
+ * error. The caller needs to call this function again to complete the
+ * operation.
+ * Note: Do not call this function to remove a vnic, use fip_vnic_close.
+ */
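+/*
+ * Usage note: fip_vnic_destroy() returns -EBUSY while packets or work items
+ * are still in flight; the delayed cleanup work queued by fip_vnic_close()
+ * (fip_purge_vnics) is expected to keep calling it until it returns 0.
+ */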
+int fip_vnic_destroy(struct fip_vnic_data *vnic)
+{
+    int pending;
+
+    vnic_dbg_func(vnic->name);
+    vnic_dbg_fip_p0(vnic->name, "fip_vnic_destroy called flow=%d state=%d mac" MAC_6_PRINT_FMT "\n",
+                    vnic->flush, vnic->state, MAC_6_PRINT_ARG(vnic->login_data.mac));
+
+    pending = work_pending(&vnic->vnic_pkt_rcv_task_bh) ||
+        delayed_work_pending(&vnic->vnic_gw_alive_task) ||
+        delayed_work_pending(&vnic->vnic_task);
+
+    /* verify no pending packets before we start tearing down the rings */
+    if (pending || fip_vnic_test_login(vnic, 0) == -EAGAIN)
+        goto retry_later;
+
+    if (!test_and_set_bit(VNIC_LOGIN_DESTROY_PENDING,
+                          &vnic->login_status)) {
+        vnic_login_destroy_stop_wq(vnic, vnic->flush);
+        /* calls fip_vnic_login_destroy() */
+        queue_work(login_wq, &vnic->vnic_login_destroy_task);
+    }
+
+    if (!test_bit(VNIC_LOGIN_DESTROY_DONE, &vnic->login_status))
+        goto retry_later;
+
+    clear_bit(VNIC_LOGIN_DESTROY_DONE, &vnic->login_status);
+    clear_bit(VNIC_LOGIN_DESTROY_PENDING, &vnic->login_status);
+
+    /* We need to test if, when we queued the destroy request, it was
+     * a partial flush but this has changed to a full flush.
+     * If so we need to try again */
+    if (vnic->flush == FIP_FULL_FLUSH &&
+        !test_bit(VNIC_LOGIN_DESTROY_FULL, &vnic->login_status))
+        goto retry_later;
+
+    hrtimer_cancel(&vnic->keepalive_timer);
+
+    if (vnic->state >= FIP_VNIC_VHUB_INIT) {
+        lag_ctx_clear(vnic);
+        vhub_ctx_free(vnic);
+    }
+
+    /* disconnect from mcast groups */
+    if (vnic->state >= FIP_VNIC_MCAST_INIT) {
+        vnic_mcast_del_all(&vnic->mcast_tree);
+        fip_vnic_rings_destroy(vnic);
+    }
+
+    if (vnic->state > FIP_VNIC_LOGIN)
+        ib_destroy_ah(vnic->ah);
+
+    if (vnic->flush == FIP_PARTIAL_FLUSH) {
+        if (vnic->hadmined) /* we close host admin vnics so they won't do any login from fip_vnic_fsm */
+            vnic->state = FIP_VNIC_CLOSED;
+        else
+            vnic->state = FIP_VNIC_HADMIN_IDLE;
+
+        vnic->flush = FIP_NO_FLUSH;
+        vnic->last_send_jiffs = 0;
+
+        vnic_dbg_fip_v(vnic->name, "fip_vnic_remove partial done vnic->retry_count=%d\n", vnic->retry_count);
+        if (!VNIC_MAX_RETRIES || ++vnic->retry_count < VNIC_MAX_RETRIES)
+            QUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, FIP_LOGIN_TIMEOUT * HZ);
+
+    } else {
+        list_del(&vnic->gw_vnics);
+        vnic_dbg_fip_v(vnic->name, "fip_vnic_remove full done\n");
+        kfree(vnic);
+    }
+
+    return 0;
+
+retry_later:
+    return -EBUSY;
+}
+
+int fip_vnic_keepalive_send(struct fip_vnic_data *vnic, int source_timer)
+{
+    int update;
+    unsigned long flags;
+    int ret = 0;
+
+    if (vnic->flush != FIP_NO_FLUSH)
+        return ret;
+
+    if (vnic->last_send_jiffs > 1 && jiffies - vnic->last_send_jiffs > vnic->gw->info.vnic_ka_period * 3 / 2)
+        vnic_dbg_fip_p0(vnic->name, "Delay in sending KA: should be %ld, actual time=%ld, source=%d\n",
+                        vnic->gw->info.vnic_ka_period, jiffies - vnic->last_send_jiffs, source_timer);
+
+    spin_lock_irqsave(&vnic->ka_lock, flags);
+    if (source_timer ||
+        (vnic->last_send_jiffs && jiffies - vnic->last_send_jiffs >
+         vnic->gw->info.vnic_ka_period * 6 / 5)) {
+
+        /* we need to have mcast attached before we ask for a table */
+        if (vnic->state >= FIP_VNIC_VHUB_INIT &&
+            vnic->vhub_table.state == VHUB_TBL_INIT)
+            update = 1;
+        else
+            update = 0;
+
+        /* send vnic keep alive to GW */
+        ret = fip_update_send(vnic, update, 0 /* not logout */);
+        if (!ret)
+            vnic->last_send_jiffs = jiffies;
+    }
+    spin_unlock_irqrestore(&vnic->ka_lock, flags);
+
+    return ret;
+}
+
+//void fip_vnic_keepalive(unsigned long data)
+#ifdef _BP_HR_TIMER
+int fip_vnic_keepalive(struct hrtimer *timer)
+#else
+enum hrtimer_restart fip_vnic_keepalive(struct hrtimer *timer)
+#endif
+{
+//  struct fip_vnic_data *vnic = (struct fip_vnic_data *)data;
+    struct fip_vnic_data *vnic = (struct fip_vnic_data *)
+        container_of(timer, struct fip_vnic_data, keepalive_timer);
+    unsigned long flags;
+    ktime_t ktime;
+    enum hrtimer_restart ret = HRTIMER_NORESTART;
+    int flush;
+
+    spin_lock_irqsave(&vnic->lock, flags);
+    flush = vnic->flush;
+    spin_unlock_irqrestore(&vnic->lock, flags);
+
+    if (flush != FIP_NO_FLUSH)
+        return ret;
+
+    fip_vnic_keepalive_send(vnic, 1);
+
+    /* mod_timer(&vnic->keepalive, jiffies + time); */
+    ret = HRTIMER_RESTART;
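+    /*
+     * vnic_ka_period is kept in jiffies while hrtimers take nanoseconds,
+     * so the period is scaled by (1e9 / HZ) ns per jiffy below. For
+     * example, with HZ=250 a period of 250 jiffies becomes
+     * 250 * 4000000 ns = 1 second.
+     */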
+    ktime = ktime_set(0, vnic->gw->info.vnic_ka_period * (1000000000 / HZ));
+    hrtimer_forward(&vnic->keepalive_timer, vnic->keepalive_timer.base->get_time(), ktime);
+
+    return ret;
+}
+
+void fip_vnic_gw_alive(struct work_struct *work)
+{
+    struct fip_vnic_data *vnic =
+        container_of(work, struct fip_vnic_data,
+                     vnic_gw_alive_task.work);
+    long time_to_timeout;
+
+    if (vnic->flush != FIP_NO_FLUSH)
+        return;
+
+    if (!test_bit(MCAST_ATTACHED, &vnic->vnic_mcaste_state)) {
+        if (time_after(jiffies, vnic->detached_ka_jiffs + 60*HZ)) {
+            vnic_dbg_fip_p0(vnic->name, "No GW keep alive timeout when mcast unattached "
+                            "QPN 0x%06x, LID 0x%04x\n", vnic->qp->qp_num,
+                            vnic->port->attr.lid);
+            fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+            return;
+        } else {
+            vnic_dbg_fip_p0(vnic->name, "Got ka poll when bcast not "
+                            "attached QPN 0x%06x, LID 0x%04x, ka=%u\n",
+                            vnic->qp->qp_num, vnic->port->attr.lid,
+                            jiffies_to_msecs(jiffies - vnic->detached_ka_jiffs));
+            time_to_timeout = vnic->gw->info.gw_period;
+        }
+    } else {
+        long jiffs_from_last;
+        jiffs_from_last = (jiffies - vnic->keep_alive_jiffs);
+        time_to_timeout = vnic->gw->info.gw_period - jiffs_from_last;
+    }
+
+    /* TODO: change receive of update to rearm the work timer so an
+     * expiration indicates a true timeout */
+    if (time_to_timeout <= 0) {
+        vnic_dbg_fip_p0(vnic->name, "GW keep alives timed out for "
+                        "QPN 0x%06x, LID 0x%04x timeout=%ld\n", vnic->qp->qp_num,
+                        vnic->port->attr.lid, time_to_timeout);
+        fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+    } else
+        QUEUE_VNIC_DWORK(vnic, &vnic->vnic_gw_alive_task,
+                         time_to_timeout + 1);
+}
+
+struct fip_vnic_data *fip_vnic_alloc(struct vnic_port *port,
+                                     struct fip_gw_data *gw,
+                                     int hadmin, u16 vnic_id)
+{
+    struct fip_vnic_data *vnic;
+
+    vnic = kzalloc(sizeof(struct fip_vnic_data), GFP_KERNEL);
+    if (!vnic) {
+        vnic_err(port->name, "failed to alloc vnic\n");
+        return NULL;
+    }
+
+    vnic->state = hadmin ? FIP_VNIC_HADMIN_IDLE : FIP_VNIC_LOGIN;
+    vnic->vnic_id = vnic_id;
+    vnic->gw = gw;
+    vnic->gw_info = gw->info.vol_info;
+    vnic->port = port;
+    vnic->hadmined = hadmin;
+    vnic->flush = FIP_NO_FLUSH;
+
+    sprintf(vnic->name, "vnic-%d", vnic_id); /* will be overwritten */
+
+    spin_lock_init(&vnic->lock);
+    spin_lock_init(&vnic->ka_lock);
+    INIT_DELAYED_WORK(&vnic->vnic_task, fip_vnic_fsm);
+    INIT_DELAYED_WORK(&vnic->vnic_gw_alive_task, fip_vnic_gw_alive);
+    INIT_WORK(&vnic->vnic_login_destroy_task, fip_vnic_login_destroy);
+    INIT_WORK(&vnic->vnic_login_create_task, fip_vnic_login_create);
+
+#ifdef _BP_HR_TIMER
+    hrtimer_init(&vnic->keepalive_timer, CLOCK_MONOTONIC, HRTIMER_REL);
+#else
+    hrtimer_init(&vnic->keepalive_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+#endif
+    vnic->keepalive_timer.function = fip_vnic_keepalive;
+
+    vnic_mcast_root_init(&vnic->mcast_tree);
+    atomic_set(&vnic->eport_state, EPORT_STATE_DOWN);
+
+    return vnic;
+}
+
+int fip_vnic_hadmin_init(struct vnic_port *port, struct fip_vnic_data *vnic)
+{
+    int rc;
+
+    vnic_dbg_func(port->name);
+
+    rc = vnic_login_pre_create_1(port, vnic);
+    if (rc) {
+        vnic_warn(port->name, "vnic_login_pre_create_1 failed, rc %d\n", rc);
+        goto pre_create_failed;
+    }
+
+    strncpy(vnic->login_data.vnic_name, vnic->interface_name,
+            sizeof(vnic->interface_name));
+
+    /* queue login create request */
+    fip_vnic_test_login(vnic, 1);
+
+    return 0;
+
+pre_create_failed:
+    return -ENODEV;
+}
+
+void fip_vnic_create_gw_param(struct fip_vnic_send_info *gw_address, u32 gw_qpn,
+                              u32 qkey, u16 gw_lid, u8 gw_sl)
+{
+    gw_address->gw_qpn = gw_qpn;
+    gw_address->qkey = qkey;
+    gw_address->gw_lid = gw_lid;
+    gw_address->gw_sl = gw_sl;
+}
+
+void fip_vnic_set_gw_param(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address)
+{
+    memcpy(&vnic->gw_address, gw_address, sizeof(vnic->gw_address));
+}
+
+int fip_hadmin_vnic_refresh(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address)
+{
+    vnic_dbg_fip(vnic->name, "fip_vnic_to_login host admin flow flush=%d"
+                 " state=%d\n", vnic->flush, vnic->state);
+    if (likely(vnic->flush == FIP_NO_FLUSH) &&
+        vnic->state <= FIP_VNIC_HADMIN_IDLE &&
+        (!VNIC_MAX_RETRIES || vnic->retry_count < VNIC_MAX_RETRIES)) {
+        fip_vnic_set_gw_param(vnic, gw_address);
+        cancel_delayed_work(&vnic->vnic_task);
+        vnic->state = FIP_VNIC_LOGIN;
+        fip_vnic_fsm(&vnic->vnic_task.work);
+    }
+    return 0;
+}
+
+/*
+ * Call the data vnic pre-create 1 + 2 in order to alloc and init the data
+ * vnic. This function updates the QP numbers that the data vnic will use.
+ * These QP numbers are needed for the login.
+ * This function does not clean up on failures. It assumes that the caller
+ * will call the login destroy.
+ */
+static int fip_vnic_login_init(struct vnic_port *port, struct fip_vnic_data *vnic)
+{
+    int qps_num;
+    int rc;
+
+    struct ib_ah_attr ah_attr = {
+        .dlid = vnic->gw_address.gw_lid,
+        .port_num = port->num,
+        .sl = vnic_gw_ctrl_sl(vnic->gw) & 0xf,
+    };
+
+    vnic_dbg_func(vnic->name);
+
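+    /*
+     * Illustration: with multiple RX rings and a GW advertising
+     * n_rss_qpn = 3, the vnic requests 1 << 3 = 8 data QPs; if that
+     * exceeds VNIC_MAX_NUM_CPUS the count collapses back to 1, which
+     * means RSS stays disabled for this login.
+     */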
+    /* If the driver wants to enable RSS (vnic_rss == 1) then the
+     * number of QPs is what the GW advertises: 1 << n_rss_qpn
+     */
+    qps_num = (port->rx_rings_num > 1) ? (1 << vnic->gw->info.n_rss_qpn) : 1;
+    qps_num = (qps_num == 0) ? 1 : qps_num;
+
+    /* However, we don't support just any qps_num; if the GW asks for more
+     * than VNIC_MAX_NUM_CPUS QPs, then we're not going to enable RSS
+     * -- qps_num == 1 means RSS is disabled, otherwise it's enabled
+     */
+    qps_num = qps_num <= VNIC_MAX_NUM_CPUS ? qps_num : 1;
+
+    /* set in vnic, so it can be reported back to the BXM */
+    vnic->qps_num = qps_num;
+
+    /* in host admin vnic->login should be non NULL */
+    if (!vnic->hadmined) {
+        rc = vnic_login_pre_create_1(port, vnic);
+        if (rc) {
+            vnic_warn(vnic->name,
+                      "vnic_login_pre_create_1 failed, "
+                      "rc %d\n", rc);
+            goto failed;
+        }
+    }
+
+    /* in host admin vnic->login should be non NULL */
+    rc = vnic_login_pre_create_2(vnic, qps_num,
+                                 vnic->gw->info.gw_type == GW_TYPE_LAG);
+    if (rc) {
+        vnic_warn(port->name, "vnic_login_pre_create_2 failed\n");
+        goto failed;
+    }
+
+    /* if parent_used, you must already have the base QPN */
+    ASSERT(!vnic->parent_used || vnic->qp_base_num);
+
+    vnic->ah = ib_create_ah(port->pd, &ah_attr);
+    if (IS_ERR(vnic->ah)) {
+        vnic_warn(vnic->name, "fip_vnic_login_init failed to create ah\n");
+        vnic->ah = NULL;
+        goto failed;
+    }
+
+    vhub_ctx_init(vnic);
+
+    return 0;
+
+failed:
+    return -ENODEV;
+}
+
+ " Check configuration in SM/BX\n", vnic->login_data.pkey); + goto out_w_err; + } + + vnic->pkey = vnic->login_data.pkey; + vnic->pkey_index = vnic->login_data.pkey_index; + + vnic_dbg_fip_v(vnic->name, "fip_vnic_rings_create pkey id %d " + "for pkey 0x%x\n", (int)vnic->pkey_index, + (int)vnic->pkey); + + vnic->cq = ib_create_cq(port->dev->ca, fip_vnic_comp, NULL, vnic, + vnic->rx_ring.size + vnic->tx_ring.size, 0); + if (IS_ERR(vnic->cq)) { + vnic_dbg_fip(vnic->name, "failed to create receive CQ\n"); + goto out_w_err; + } + + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + qp_init_attr.cap.max_send_wr = vnic->tx_ring.size; + qp_init_attr.cap.max_recv_wr = vnic->rx_ring.size; + qp_init_attr.cap.max_send_sge = 1; + qp_init_attr.cap.max_recv_sge = 1; + qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.qp_type = IB_QPT_UD; + qp_init_attr.send_cq = vnic->cq; + qp_init_attr.recv_cq = vnic->cq; + + vnic->qp = ib_create_qp(port->pd, &qp_init_attr); + if (IS_ERR(vnic->qp)) { + vnic_dbg_fip(vnic->name, "failed to create QP\n"); + goto error_free_cq; + } + + vnic_dbg_fip_v(vnic->name, "fip_vnic_rings_create QPN %d," + " LID %d\n", (int)vnic->qp->qp_num, (int)port->attr.lid); + + /* move QP from reset to RTS */ + if (fip_init_qp(vnic->port, vnic->qp, vnic->pkey_index, vnic->name)) { + vnic_dbg_fip(vnic->name, "fip_init_qp returned with error\n"); + goto error_free_qp; + } + + ret = fip_init_tx(vnic->tx_ring.size, &vnic->tx_ring, vnic->name); + if (ret) { + vnic_dbg_fip(vnic->name, "fip_init_tx failed ret %d\n", ret); + goto error_free_qp; + } + + ret = fip_init_rx(port, vnic->rx_ring.size, vnic->qp, + &vnic->rx_ring, vnic->name); + if (ret) { + vnic_dbg_fip(vnic->name, "fip_init_rx returned %d\n", ret); + goto error_release_rings; + } + + /* enable recieving CQ completions */ + if (ib_req_notify_cq(vnic->cq, IB_CQ_NEXT_COMP)) + goto error_release_rings; + vnic_dbg_fip_v(vnic->name, "fip_vnic_rings_create done OK\n"); + + return 0; + +error_release_rings: + fip_flush_rings(port, vnic->cq, vnic->qp, &vnic->rx_ring, + &vnic->tx_ring, vnic->name); + fip_free_rings(port, &vnic->rx_ring, &vnic->tx_ring, vnic->name); +error_free_qp: + ib_destroy_qp(vnic->qp); +error_free_cq: + ib_destroy_cq(vnic->cq); +out_w_err: + vnic->qp = NULL; + vnic->cq = NULL; + vnic->rx_ring.size = 0; + vnic->tx_ring.size = 0; + return -ENODEV; +} + +static void fip_vnic_rings_destroy(struct fip_vnic_data *vnic) +{ + fip_flush_rings(vnic->port, vnic->cq, vnic->qp, &vnic->rx_ring, + &vnic->tx_ring, vnic->name); + fip_free_rings(vnic->port, &vnic->rx_ring, &vnic->tx_ring, vnic->name); + fip_vnic_recv_list_flush(vnic); + ib_destroy_qp(vnic->qp); + ib_destroy_cq(vnic->cq); + vnic->qp = NULL; + vnic->cq = NULL; +} + +/* + * This function is a callback called upon successful join to a + * multicast group. The function checks if we have joined + attached + * to all required mcast groups and if so moves the discovery FSM to solicit. 
+
+static void fip_vnic_rings_destroy(struct fip_vnic_data *vnic)
+{
+    fip_flush_rings(vnic->port, vnic->cq, vnic->qp, &vnic->rx_ring,
+                    &vnic->tx_ring, vnic->name);
+    fip_free_rings(vnic->port, &vnic->rx_ring, &vnic->tx_ring, vnic->name);
+    fip_vnic_recv_list_flush(vnic);
+    ib_destroy_qp(vnic->qp);
+    ib_destroy_cq(vnic->cq);
+    vnic->qp = NULL;
+    vnic->cq = NULL;
+}
+
+/*
+ * This function is a callback called upon successful join to a
+ * multicast group. The function checks if we have joined + attached
+ * to all required mcast groups and if so moves the discovery FSM to solicit.
+ */
+void fip_vnic_mcast_cnct_cb(struct vnic_mcast *mcast, void *ctx)
+{
+    struct fip_vnic_data *vnic = mcast->priv_data;
+
+    vnic_dbg_fip(vnic->name, "fip_vnic_mcast_cnct_cb\n");
+    vnic_dbg_parse(vnic->name, "attached mask = 0x%lx, req mask = 0x%lx\n",
+                   *mcast->cur_attached, *mcast->req_attach);
+
+    if ((*mcast->cur_attached & *mcast->req_attach) != *mcast->req_attach)
+        return;
+
+    vnic->keep_alive_jiffs = jiffies;
+    set_bit(MCAST_ATTACHED, &vnic->vnic_mcaste_state);
+    /* in case of a new mcast connection switch to VHUB_INIT; for a
+     * reconnection stay in the current state */
+    if (vnic->state < FIP_VNIC_VHUB_INIT) {
+        vnic_dbg_fip(vnic->name,
+                     "fip_vnic_mcast_cnct_cb done joining mcasts\n");
+        vnic->state = FIP_VNIC_VHUB_INIT;
+        cancel_delayed_work(&vnic->vnic_task);
+        REQUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, 0);
+    }
+}
+
+/*
+ * This function is a callback called upon a mcast deattach event.
+ * This event can be triggered due to a vnic request or due to an async
+ * event. Currently this code does not participate in the vnic's FSM.
+ */
+void fip_vnic_mcast_deattach_cb(struct vnic_mcast *mcast, void *ctx)
+{
+    struct fip_vnic_data *vnic = mcast->priv_data;
+
+    vnic->detached_ka_jiffs = jiffies;
+    clear_bit(MCAST_ATTACHED, &vnic->vnic_mcaste_state);
+
+    vnic_dbg_fip(vnic->name, "fip_vnic_mcast_deattach_cb\n");
+}
+
+/*
+ * Try to connect to the relevant mcast groups. If one of the mcasts failed,
+ * the function should be recalled to try and complete the join process
+ * (for the mcast groups that the join process was not performed for).
+ * Note: A successful return of vnic_mcast_join means that the mcast join
+ * started, not that the join completed. Completion of the connection process
+ * is asynchronous and uses a supplied callback.
+ */
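+/*
+ * The function below joins up to three vHub MGIDs built by
+ * vhub_mgid_create(): an UPDATE group and a TABLE group always, plus a KA
+ * group when the GW is a LAG GW; the KA group is joined as sender-only and
+ * carries the multicast vnic keep alives.
+ */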
+int fip_vnic_mcast_cnct(struct fip_vnic_data *vnic)
+{
+    struct vnic_port *port = vnic->port;
+    union vhub_mgid mgid;
+    struct vnic_mcast *mcaste, *mcaste_upd, *mcaste_tbl;
+    struct vnic_mcast *uninitialized_var(mcaste_ka);
+    int rc;
+
+    vnic_dbg_fip(port->name, "fip_vnic_mcast_cnct called\n");
+
+    mcaste_upd = vnic_mcast_alloc(port, &vnic->req_attach, &vnic->cur_attached);
+    if (IS_ERR(mcaste_upd))
+        return -EINVAL;
+
+    mcaste_tbl = vnic_mcast_alloc(port, &vnic->req_attach, &vnic->cur_attached);
+    if (IS_ERR(mcaste_tbl)) {
+        rc = -EINVAL;
+        goto free_upd;
+    }
+
+    set_bit(FIP_MCAST_VHUB_UPDATE, &vnic->req_attach);
+    set_bit(FIP_MCAST_TABLE, &vnic->req_attach);
+
+    vnic_dbg_fip(port->name, "gw type is %d\n", vnic->gw->info.gw_type);
+    if (vnic->gw->info.gw_type == GW_TYPE_LAG) {
+        mcaste_ka = vnic_mcast_alloc(port, &vnic->req_attach, &vnic->cur_attached);
+        if (IS_ERR(mcaste_ka)) {
+            rc = -EINVAL;
+            goto free_tbl;
+        }
+        set_bit(FIP_MCAST_VHUB_KA, &vnic->req_attach);
+    }
+
+    mcaste = mcaste_upd;
+    mcaste->priv_data = vnic;
+    mcaste->attach_bit_nr = FIP_MCAST_VHUB_UPDATE;
+    memset(mcaste->mac, 0, ETH_ALEN);
+    vhub_mgid_create(vnic->login_data.mgid_prefix,
+                     mcaste->mac,
+                     vnic->login_data.n_mac_mcgid,
+                     vnic->login_data.vhub_id, VHUB_MGID_UPDATE,
+                     0, &mgid);
+    mcaste->gid = mgid.ib_gid;
+    mcaste->port_gid = mcaste->gid;
+    mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+    mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+    mcaste->retry = VNIC_MCAST_ULIMIT_RETRY;
+    mcaste->attach_cb = fip_vnic_mcast_cnct_cb;
+    mcaste->detach_cb = fip_vnic_mcast_deattach_cb;
+    mcaste->attach_cb_ctx = NULL;
+    mcaste->detach_cb_ctx = NULL;
+    mcaste->blocking = 0;
+    mcaste->qkey = VNIC_FIP_QKEY;
+    mcaste->pkey = vnic->pkey;
+    mcaste->qp = vnic->qp;
+    mcaste->create = vnic_mcast_create;
+    mcaste->join_state = 1;
+    rc = vnic_mcast_add(&vnic->mcast_tree, mcaste);
+    ASSERT(!rc);
+    rc = vnic_mcast_attach(&vnic->mcast_tree, mcaste); /* MCAST_RECEIVE_ONLY */
+    ASSERT(!rc);
+
+    mcaste = mcaste_tbl;
+    mcaste->priv_data = vnic;
+    mcaste->attach_bit_nr = FIP_MCAST_TABLE;
+    memset(mcaste->mac, 0, ETH_ALEN);
+    vhub_mgid_create(vnic->login_data.mgid_prefix,
+                     mcaste->mac,
+                     vnic->login_data.n_mac_mcgid,
+                     vnic->login_data.vhub_id, VHUB_MGID_TABLE,
+                     0, &mgid);
+    mcaste->gid = mgid.ib_gid;
+    mcaste->port_gid = mcaste->gid;
+    mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+    mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+    mcaste->retry = VNIC_MCAST_ULIMIT_RETRY;
+    mcaste->attach_cb = fip_vnic_mcast_cnct_cb;
+    mcaste->detach_cb = fip_vnic_mcast_deattach_cb;
+    mcaste->attach_cb_ctx = NULL;
+    mcaste->detach_cb_ctx = NULL;
+    mcaste->blocking = 0;
+    mcaste->qkey = VNIC_FIP_QKEY;
+    mcaste->pkey = vnic->pkey;
+    mcaste->qp = vnic->qp;
+    mcaste->create = vnic_mcast_create;
+    mcaste->join_state = 1;
+    rc = vnic_mcast_add(&vnic->mcast_tree, mcaste);
+    ASSERT(!rc);
+    rc = vnic_mcast_attach(&vnic->mcast_tree, mcaste); /* MCAST_RECEIVE_ONLY */
+    ASSERT(!rc);
+
+    if (vnic->gw->info.gw_type != GW_TYPE_LAG)
+        return 0;
+
+    mcaste = mcaste_ka;
+    mcaste->priv_data = vnic;
+    mcaste->attach_bit_nr = FIP_MCAST_VHUB_KA;
+    memset(mcaste->mac, 0, ETH_ALEN);
+    vhub_mgid_create(vnic->login_data.mgid_prefix,
+                     mcaste->mac,
+                     vnic->login_data.n_mac_mcgid,
+                     vnic->login_data.vhub_id, VHUB_MGID_KA,
+                     0, &mgid);
+    mcaste->gid = mgid.ib_gid;
+    mcaste->port_gid = mcaste->gid;
+    mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+    mcaste->backoff_factor = 1;
+    mcaste->retry = VNIC_MCAST_MAX_RETRY;
+    mcaste->attach_cb = fip_vnic_mcast_cnct_cb;
+    mcaste->detach_cb = fip_vnic_mcast_deattach_cb;
+    mcaste->attach_cb_ctx = NULL;
+    mcaste->detach_cb_ctx = NULL;
+    mcaste->blocking = 0;
+    mcaste->qkey = VNIC_FIP_QKEY;
+    mcaste->pkey = vnic->pkey;
+    mcaste->qp = vnic->qp;
+    mcaste->create = vnic_mcast_create;
+    mcaste->join_state = 1;
+    mcaste->sender_only = 1;
+    vnic->ka_mcast_gid = mcaste->gid;
+    rc = vnic_mcast_add(&vnic->mcast_tree, mcaste);
+    ASSERT(!rc);
+    rc = vnic_mcast_attach(&vnic->mcast_tree, mcaste);
+    ASSERT(!rc);
+
+    return 0;
+
+free_tbl:
+    vnic_mcast_dealloc(mcaste_tbl);
+
+free_upd:
+    vnic_mcast_dealloc(mcaste_upd);
+
+    return rc;
+}
+
+/*
+ * This function is the driving engine of the vnic logic. It manages the
+ * vnic's state machines.
+ * Some of the states in the state machine could have been removed because
+ * they contain "actions" and not states. Still, it is easier to maintain
+ * the code this way and it gives an easy mechanism for exception handling
+ * and retries.
+ * Only call this function from fip_wq context.
+ */
+void fip_vnic_fsm(struct work_struct *work)
+{
+    struct fip_vnic_data *vnic =
+        container_of(work, struct fip_vnic_data, vnic_task.work);
+    struct vnic_port *port = vnic->port;
+    int rc, recall_time = 0;
+    const long int msec_in_sec = 1000;
+    struct fip_vnic_send_info gw_address;
+    ktime_t ktime;
+
+    vnic_dbg_fip(port->name, "fip_vnic_fsm called vnic %d\n",
+                 vnic->vnic_id);
+
+    if (vnic->flush != FIP_NO_FLUSH)
+        return;
+
+    switch (vnic->state) {
+    case FIP_VNIC_CLOSED:
+        break;
+    case FIP_VNIC_HADMIN_IDLE:
+        if (vnic->gw->state < FIP_GW_CONNECTED)
+            break;
+        fip_vnic_create_gw_param(&gw_address, vnic->gw->info.gw_qpn, VNIC_FIP_QKEY,
+                                 vnic->gw->info.gw_lid, vnic_gw_ctrl_sl(vnic->gw));
+        fip_vnic_set_gw_param(vnic, &gw_address);
+        /* fall through */
+
+    case FIP_VNIC_LOGIN:
+        vnic_dbg_fip(port->name, "FIP_VNIC_LOGIN vnic %d\n",
+                     vnic->vnic_id);
+        /* get data QP numbers needed for the login request packet. If we
+         * fail we will close the vnic entirely */
+        rc = fip_vnic_login_init(vnic->port, vnic);
+        if (rc) {
+            fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+            vnic_warn(vnic->name, "fip_vnic_login_init failed, "
+                      "closing vnic rc %d\n", rc);
+            break;
+        }
+        vnic->state = FIP_VNIC_WAIT_4_ACK;
+        /* fall through */
+
+    case FIP_VNIC_WAIT_4_ACK:
+        vnic_dbg_fip(port->name, "FIP_VNIC_WAIT_4_ACK vnic %d\n",
+                     vnic->vnic_id);
+        /* resend login request every timeout */
+        vnic_dbg_fip(port->name, "fip_login_send vnic %d\n", vnic->vnic_id);
+        rc = fip_login_send(vnic);
+        if (!rc)
+            recall_time = FIP_LOGIN_TIMEOUT * msec_in_sec;
+        else
+            recall_time = 1 * msec_in_sec;
+
+        goto queue_vnic_work;
+
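+    /*
+     * Timing note: recall_time is in milliseconds (msec_in_sec == 1000)
+     * and is converted to jiffies at queue_vnic_work below, so a
+     * successful login send re-queues the FSM after FIP_LOGIN_TIMEOUT
+     * seconds while a failed send retries after one second.
+     */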
+    case FIP_VNIC_RINGS_INIT:
+        /* create QP and rings */
+        rc = fip_vnic_rings_create(vnic->port, vnic);
+        if (rc) {
+            fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+            vnic_warn(vnic->name, "fip_vnic_rings_create failed, "
+                      "closing vnic rc=%d\n", rc);
+            break;
+        }
+
+        vnic->last_send_jiffs = 1; /* use a non zero value to start transmission */
+        {
+            /* start vnic UCAST KA packets; this will also cause the BXM
+             * to send us the neighbor table */
+            if (vnic->gw->info.gw_type != GW_TYPE_LAG) {
+                ktime = ktime_set(0, 0);
+#ifdef _BP_HR_TIMER
+                hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_REL);
+#else
+                hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_MODE_REL);
+#endif
+            }
+        }
+
+        vnic->state = FIP_VNIC_MCAST_INIT;
+        /* fall through */
+
+    case FIP_VNIC_MCAST_INIT:
+        rc = fip_vnic_mcast_cnct(vnic);
+        if (rc) {
+            vnic_warn(vnic->name,
+                      "fip_vnic_mcast_cnct failed, rc %d\n", rc);
+            /* try again later */
+            recall_time = 1 * msec_in_sec;
+            goto queue_vnic_work;
+        }
+        vnic->state = FIP_VNIC_MCAST_INIT_DONE;
+        /* fall through */
+
+    case FIP_VNIC_MCAST_INIT_DONE:
+        /* wait for the mcast attach CB before continuing */
+        break;
+
+    case FIP_VNIC_VHUB_INIT:
+        /* the previous KA, if sent, did not request a table because MCASTs
+         * were not available. Send an extra KA packet that should trigger
+         * a table request in order to hasten things up */
+        fip_vnic_keepalive_send(vnic, 1);
+
+        if (vnic->gw->info.gw_type == GW_TYPE_LAG) {
+            /* start vnic MCAST KA packets; this will also cause the BXM
+             * to send us the neighbor table */
+            ktime = ktime_set(0, 0);
+#ifdef _BP_HR_TIMER
+            hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_REL);
+#else
+            hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_MODE_REL);
+#endif
+        }
+
+        /* start tracking GW keep alives, calls fip_vnic_gw_alive() */
+        QUEUE_VNIC_DWORK(vnic, &vnic->vnic_gw_alive_task,
+                         vnic->gw->info.gw_period);
+
+        vnic->state = FIP_VNIC_VHUB_INIT_DONE;
+        /* fall through */
+
+    case FIP_VNIC_VHUB_INIT_DONE:
+        /* we are waiting to receive a full vhub table. The KA will handle
+         * retries if we do not get the table we are expecting */
+
+        /* queue login create request */
+        if (fip_vnic_test_login(vnic, 1)) {
+            recall_time = 1 * msec_in_sec;
+            goto queue_vnic_work;
+        }
+
+        break;
+
+    case FIP_VNIC_VHUB_DONE:
+        if (fip_vnic_test_login(vnic, 1)) {
+            recall_time = 1 * msec_in_sec;
+            goto queue_vnic_work;
+        }
+
+        if (vnic_login_complete_ack(vnic, &vnic->login_data, &vnic->shared_vnic)) {
+            vnic_warn(vnic->name,
+                      "vnic_login_complete_ack failed\n");
+            recall_time = 1 * msec_in_sec;
+            goto queue_vnic_work;
+        }
+
+        /* for LAG write member info */
+        fip_vnic_write_members(vnic);
+
+        vnic->state = FIP_VNIC_VHUB_WRITE;
+        /* fall through */
+
+    case FIP_VNIC_VHUB_WRITE:
+        /* write the vhub table to login */
+        fip_vnic_write_tbl(vnic);
+        vnic->state = FIP_VNIC_CONNECTED;
+        /* fall through */
+
+    case FIP_VNIC_CONNECTED:
+        vnic->retry_count = 0;
+        break;
+    default:
+        ASSERT(0);
+        break;
+    }
+
+    vnic_dbg_fip(port->name, "state %d gw_lid %d gw_qpn %d\n",
+                 vnic->state, vnic->gw_address.gw_lid, vnic->gw_address.gw_qpn);
+    return;
+
+queue_vnic_work:
+    QUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, recall_time * HZ / msec_in_sec);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_main.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_main.c
new file mode 100644
index 0000000000000..07a6f2ebe54d7
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_main.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_discover.h"
+
+struct workqueue_struct *fip_wq;
+
+void fip_refresh_mcasts(struct fip_discover *discover)
+{
+    struct fip_gw_data *gw;
+    struct fip_vnic_data *vnic;
+
+    fip_discover_mcast_reattach(discover, discover->port);
+
+    down_read(&discover->l_rwsem);
+    list_for_each_entry(gw, &discover->gw_list, list)
+        list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+            if (vnic->flush != FIP_FULL_FLUSH && vnic->state > FIP_VNIC_MCAST_INIT)
+                vnic_tree_mcast_detach(&vnic->mcast_tree);
+        }
+
+    list_for_each_entry(gw, &discover->gw_list, list) {
+        list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+            if (vnic->flush != FIP_FULL_FLUSH && vnic->state > FIP_VNIC_MCAST_INIT)
+                vnic_tree_mcast_attach(&vnic->mcast_tree);
+        }
+        /* restart path query */
+        if (vnic_sa_query && gw->state >= FIP_GW_CTRL_PATH_QUERY && gw->flush == FIP_NO_FLUSH)
+            fip_discover_gw_fsm_move(gw, FIP_GW_CTRL_PATH_QUERY);
+    }
+    up_read(&discover->l_rwsem);
+}
+
+void port_fip_discover_restart(struct work_struct *work)
+{
+    struct vnic_port *port =
+        container_of(work, struct vnic_port, discover_restart_task.work);
+    struct fip_discover *discover;
+    struct vnic_login *login;
+
+    vnic_dbg_mark();
+    mutex_lock(&port->start_stop_lock);
+    vnic_dbg_mark();
+    mutex_lock(&port->mlock);
+    if (vnic_port_query(port))
+        vnic_warn(port->name, "vnic_port_query failed\n");
+
+    /* bring vnic links down */
+    list_for_each_entry(login, &port->login_list, list)
+        vnic_mcast_del_all(&login->mcast_tree);
+
+    mutex_unlock(&port->mlock);
+    list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+        if (fip_discover_cleanup(port, discover, 0)) {
+            vnic_dbg(port->name, "fip_discover_cleanup flushed\n");
+            goto out;
+        }
+    }
+
+    list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+        if (fip_discover_init(port, discover, discover->pkey, 0)) {
+            vnic_warn(port->name, "failed to alloc discover resources\n");
+        }
+    }
+out:
+    mutex_unlock(&port->start_stop_lock);
+    return;
+}
+
+void vnic_port_fip_cleanup(struct vnic_port *port, int lock)
+{
+    struct fip_discover *discover, *tmp_discover;
+
+    if (lock)
+        mutex_lock(&port->start_stop_lock);
+
+    list_for_each_entry_safe(discover, tmp_discover, &port->fip.discover_list, discover_list) {
+        vnic_dbg_fip_p0(port->name, "Discovery cleanup of PKEY=0x%x\n", discover->pkey);
+
+        list_del(&discover->discover_list);
+        vnic_info("Removed fip discovery %s port %d pkey 0x%x\n",
+                  port->dev->ca->name, port->num, discover->pkey);
+        fip_discover_cleanup(port, discover, 1);
+        kfree(discover);
+    }
+
+    if (lock)
+        mutex_unlock(&port->start_stop_lock);
+}
+
+int vnic_port_fip_init(struct vnic_port *port)
+{
+    int rc;
+    struct fip_discover *discover;
+    int i;
+
+    if (no_bxm)
+        return 0;
+
+    vnic_discovery_pkeys_count = vnic_discovery_pkeys_count > MAX_NUM_PKEYS_DISCOVERY ?
+        MAX_NUM_PKEYS_DISCOVERY : vnic_discovery_pkeys_count;
+
+    if (vnic_discovery_pkeys_count == 0 ||
+        (vnic_discovery_pkeys_count == MAX_NUM_PKEYS_DISCOVERY &&
+         vnic_discovery_pkeys[0] == 0)) {
+        vnic_discovery_pkeys[0] = 0xffff;
+        vnic_discovery_pkeys_count = 1;
+        vnic_dbg_fip_p0(port->name, "Creating default PKEY for Discovery\n");
+    }
+
+    mutex_lock(&port->start_stop_lock);
+
+    for (i = 0; i < vnic_discovery_pkeys_count; i++) {
+        vnic_discovery_pkeys[i] &= 0xffff;
+        vnic_discovery_pkeys[i] |= 0x8000;
+
+        vnic_dbg_fip_p0(port->name, "Init Discovery=%d on PKEY=0x%x\n", i, vnic_discovery_pkeys[i]);
+
+        discover = kzalloc(sizeof(struct fip_discover), GFP_KERNEL);
+        if (!discover) {
+            vnic_warn(port->name, "discover alloc failed\n");
+            rc = -ENOMEM;
+            goto fail;
+        }
+
+        INIT_LIST_HEAD(&discover->discover_list);
+
+        vnic_info("Added fip discovery %s port %d PKEY 0x%x\n",
+                  port->dev->ca->name, port->num,
+                  vnic_discovery_pkeys[i]);
+
+        list_add_tail(&discover->discover_list, &port->fip.discover_list);
+        rc = fip_discover_init(port, discover, vnic_discovery_pkeys[i], 1);
+        if (rc) {
+            vnic_warn(port->name, "fip_discover_init pkey=0x%x "
+                      "failed\n", discover->pkey);
+            list_del(&discover->discover_list);
+            kfree(discover);
+            goto fail;
+        }
+    }
+    mutex_unlock(&port->start_stop_lock);
+    return 0;
+
+fail:
+    mutex_unlock(&port->start_stop_lock);
+    vnic_port_fip_cleanup(port, 1);
+    return rc;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.c
new file mode 100644
index 0000000000000..078d4aa0ea5f2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.c
@@ -0,0 +1,863 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_discover.h"
+#include "vnic_fip_pkt.h"
+
+const struct eoib_host_update base_update_pkt = {
+    .fip.subcode = FIP_HOST_ALIVE_SUB_OPCODE,
+    .fip.type.type = FIP_FIP_HDR_TYPE,
+    .fip.type.length = FIP_FIP_HDR_LENGTH,
+    .fip.vendor_id = FIP_VENDOR_MELLANOX,
+
+    .type_1.type = FIP_HOST_UPDATE_TYPE,
+    .type_1.length = FIP_HOST_UPDATE_LENGTH,
+    .vendor_id = FIP_VENDOR_MELLANOX,
+};
+
+const struct eoib_host_update base_logout_pkt = {
+    .fip.subcode = FIP_HOST_LOGOUT_SUB_OPCODE,
+    .fip.type.type = FIP_FIP_HDR_TYPE,
+    .fip.type.length = FIP_FIP_HDR_LENGTH,
+    .fip.vendor_id = FIP_VENDOR_MELLANOX,
+
+    .type_1.type = FIP_LOGOUT_TYPE_1,
+    .type_1.length = FIP_LOGOUT_LENGTH_1,
+    .vendor_id = FIP_VENDOR_MELLANOX,
+};
+
+static int extract_adv_extended(struct fip_ext_desc_tlv *fed,
+                                struct fip_gw_data_info *info)
+{
+    struct fip_ext_type_cap *extended_cap;
+    struct fip_ext_type_boot *extended_boot;
+    struct fip_ext_type_power_cycle_id *extended_pc_id;
+    struct fip_ext_type_lag_props *extended_lag = NULL;
+    struct fip_extended_type *ext_hdr;
+    int length_to_go, ext_length;
+
+    vnic_dbg_parse("", "extracting extended descriptor\n");
+
+    length_to_go = (((int)fed->ft.length) << 2) - sizeof(*fed);
+    ext_hdr = (struct fip_extended_type *)(fed + 1);
+
+    while (length_to_go > 0) {
+        ext_length = ((int)ext_hdr->len) << 2;
+
+        vnic_dbg_parse(NULL, "Advertise parse, sub-tlv "
+                       "type %d length %d address=%p\n",
+                       ext_hdr->ext_type, ext_length, ext_hdr);
+
+        if (ext_length < sizeof(*ext_hdr) ||
+            ext_length > length_to_go) {
+            vnic_dbg_parse(NULL, "Extended length error. "
+                           "Length=%d\n", ext_length);
+            return -EINVAL;
+        }
+
+        if (ext_hdr->ext_type == ADV_EXT_TYPE(CAP) &&
+            ext_length == sizeof(*extended_cap)) { /* capabilities */
+            /* do nothing */
+        } else if (ext_hdr->ext_type == ADV_EXT_TYPE(LAG) && /* LAG */
+                   ext_length == sizeof(*extended_lag)) {
+            extended_lag = (struct fip_ext_type_lag_props *)ext_hdr;
+            info->gw_type = extended_lag->gw_type;
+            info->ext_lag.hash = be16_to_cpu(extended_lag->lag_hash);
+            info->ext_lag.weights_policy = extended_lag->weight_policy_flags >> 4;
+            info->ext_lag.member_ka = (extended_lag->weight_policy_flags & 0x8) >> 3;
+            info->ext_lag.ca = !!(extended_lag->weight_policy_flags &
+                                  FIP_EXT_LAG_W_POLICY_HOST);
+            info->ext_lag.ca_thresh = extended_lag->ca_threshold;
+            info->ext_lag.ucast = !!(extended_lag->weight_policy_flags &
+                                     FIP_EXT_LAG_W_POLICY_UCAST);
+            info->ext_lag.valid = 1;
+        } else if (ext_hdr->ext_type == ADV_EXT_TYPE(BOOT) &&
+                   ext_length == sizeof(*extended_boot)) { /* boot */
+            extended_boot = (struct fip_ext_type_boot *)ext_hdr;
+            info->ext_boot.boot_prio = extended_boot->boot_prio;
+            info->ext_boot.timeout = extended_boot->discovery_timeout;
+            info->ext_boot.valid = 1;
+        } else if (ext_hdr->ext_type == ADV_EXT_TYPE(PC_ID) &&
+                   ext_length == sizeof(*extended_pc_id)) { /* Power Cycle ID */
+            extended_pc_id = (struct fip_ext_type_power_cycle_id *)ext_hdr;
+            info->ext_pc_id.power_cycle_id =
+                be64_to_cpu(extended_pc_id->power_cycle_id);
+            info->ext_pc_id.valid = 1;
Skipping, type" + " %d length %d\n", + ext_hdr->ext_type, ext_length); + + ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length); + length_to_go -= ext_length; + } + + return 0; +} + +int fip_advertise_parse_bh(struct fip_discover *discover, struct fip_content *fc, + struct fip_gw_data *data) +{ + long ka_time; + int err = 0; + + /* make sure we have at least a single address descriptor */ + if (fc->fa.num < 1 || !fc->fgwi || !fc->fgid || !fc->fka) + return -EINVAL; + + data->info.flags = be16_to_cpu(fc->fh->flags) & FIP_FIP_ADVRTS_FLAG ? FIP_GW_AVAILABLE : 0; + + data->info.flags |= + (be16_to_cpu(fc->fh->flags) & FIP_FIP_SOLICITED_FLAG) ? 0 : + FIP_RCV_MULTICAST; + + data->info.flags |= FIP_IS_FIP; + data->info.flags |= (fc->fh->flags & FIP_ADVERTISE_HOST_VLANS) ? + FIP_HADMINED_VLAN : 0; + + data->info.gw_qpn = be32_to_cpu(fc->fa.fa[0]->gwtype_qpn) & 0xffffff; + data->info.gw_lid = be16_to_cpu(fc->fa.fa[0]->lid); + data->info.gw_port_id = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) & + FIP_ADVERTISE_GW_PORT_ID_MASK; + data->info.gw_sl = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) >> FIP_ADVERTISE_SL_SHIFT; /*ignore this value.*/ + memcpy(data->info.gw_guid, fc->fa.fa[0]->guid, sizeof(data->info.gw_guid)); + data->info.gw_num_vnics = be16_to_cpu(fc->fgwi->n_rss_qpn_vnics) & + FIP_ADVERTISE_NUM_VNICS_MASK; + + data->info.n_rss_qpn = be16_to_cpu(fc->fgwi->n_rss_qpn_vnics) >> + FIP_ADVERTISE_N_RSS_SHIFT; + data->info.hadmined_en = (fc->fgwi->h_nmac_mgid & FIP_ADVERTISE_HOST_EN_MASK); + data->info.all_vlan_gw = !!(fc->fgwi->h_nmac_mgid & FIP_ADVERTISE_ALL_VLAN_GW_MASK); + + TERMINATED_MEMCPY(data->info.gw_vendor_id, fc->fgwi->vendor_id); + memcpy(data->info.vol_info.system_guid, fc->fgid->sys_guid, + sizeof(data->info.vol_info.system_guid)); + TERMINATED_MEMCPY(data->info.vol_info.system_name, + fc->fgid->sys_name); + TERMINATED_MEMCPY(data->info.vol_info.gw_port_name, fc->fgid->gw_port_name); + + ka_time = be32_to_cpu(fc->fka->adv_period); + ka_time = ka_time ? ka_time : FKA_ADV_PERIOD; + /* do not let KA go under 2 secs */ + ka_time = (ka_time < 2000) ? 2000 : ka_time; + data->info.gw_adv_period = FIP_TIMEOUT_FACTOR(msecs_to_jiffies(ka_time)); + + ka_time = be32_to_cpu(fc->fka->ka_period); + ka_time = ka_time ? ka_time : FKA_ADV_PERIOD; + data->info.gw_period = FIP_TIMEOUT_FACTOR(msecs_to_jiffies(ka_time)); + + ka_time = be32_to_cpu(fc->fka->vnic_ka_period); + ka_time = ka_time ? 
+    data->info.vnic_ka_period = msecs_to_jiffies(ka_time);
+
+    data->info.gw_type = GW_TYPE_SINGLE_EPORT;
+    if (fc->fed.num > 0) {
+        if (fc->fed.num == 1) {
+            /* new version bxm mode */
+            data->info.gw_prot_new = 1;
+            err = extract_adv_extended(fc->fed.fed[0], &data->info);
+            if (err)
+                vnic_dbg_parse(discover->name, "invalid extended descriptor\n");
+        } else {
+            vnic_dbg_parse(discover->name, "too many extended descriptors\n");
+            return -EINVAL;
+        }
+    }
+
+    return err;
+}
+
+static int send_generic_mcast_pkt(struct vnic_port *port,
+                                  struct fip_ring *tx_ring,
+                                  void *mem, int pkt_size,
+                                  struct ib_qp *qp,
+                                  int pkey_index,
+                                  struct vnic_mcast *mcast)
+{
+    int index, rc;
+    unsigned long flags;
+    unsigned long tail;
+
+    /*
+     * we are only allowed to update the head at task level so no need to
+     * perform any locks here
+     */
+    spin_lock_irqsave(&tx_ring->ring_lock, flags);
+    index = tx_ring->head & (tx_ring->size - 1);
+    vnic_dbg_fip(port->name, "mcast packet\n");
+
+    spin_lock(&tx_ring->head_tail_lock);
+    tail = tx_ring->tail;
+    spin_unlock(&tx_ring->head_tail_lock);
+
+    /* ring full, try again */
+    if (tx_ring->head - tail >= tx_ring->size) {
+        vnic_warn(port->name, "send_generic_mcast_pkt ring full: QPN 0x%x: tail=%ld head=%ld diff=%ld\n",
+                  qp->qp_num, tx_ring->tail, tx_ring->head, tx_ring->head - tx_ring->tail);
+        rc = -EAGAIN;
+        goto err;
+    }
+
+    rc = _map_generic_pkt(port, &tx_ring->ring[index], mem, pkt_size);
+    if (rc)
+        goto err;
+
+    rc = fip_mcast_send(port, qp, index,
+                        tx_ring->ring[index].bus_addr,
+                        pkt_size, pkey_index, mcast);
+    if (rc) {
+        vnic_warn(port->name,
+                  "send_generic_mcast_pkt: fip_mcast_send ret %d\n",
+                  rc);
+        rc = -ENODEV;
+        goto error_unmap_dma;
+    }
+
+    tx_ring->head++;
+
+    spin_unlock_irqrestore(&tx_ring->ring_lock, flags);
+    return 0;
+
+error_unmap_dma:
+    ib_dma_unmap_single(port->dev->ca,
+                        tx_ring->ring[index].bus_addr,
+                        pkt_size, DMA_TO_DEVICE);
+err:
+    spin_unlock_irqrestore(&tx_ring->ring_lock, flags);
+    return rc;
+}
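+
+/*
+ * Note: the TX ring head/tail counters are free-running; an entry slot is
+ * head masked with (size - 1) and "head - tail >= size" detects a full
+ * ring without wrapping the counters (this assumes the ring size is a
+ * power of two).
+ */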
+
+static void *alloc_solicit_pkt(int new_prot, char *node_desc)
+{
+    void *ptr;
+    struct fip_solicit_new *nptr;
+    struct fip_solicit_legacy *optr;
+    int size = new_prot ? sizeof *nptr : sizeof *optr;
+
+    ptr = kzalloc(size, GFP_KERNEL);
+    if (!ptr)
+        return ERR_PTR(-ENOMEM);
+    optr = ptr;
+    optr->version.version = 1;
+    optr->fh.opcode = cpu_to_be16(EOIB_FIP_OPCODE);
+    optr->fh.subcode = FIP_HOST_SOL_SUB_OPCODE;
+    optr->fh.list_length = cpu_to_be16((size - offsetof(typeof(*optr), fvend)) / 4);
+    optr->fvend.ft.type = FIP_TYPE(VENDOR_ID);
+    optr->fvend.ft.length = sizeof optr->fvend / 4;
+    strncpy(optr->fvend.vendor_id, "mellanox", sizeof optr->fvend.vendor_id);
+    optr->addr.ft.type = FIP_TYPE(ADDRESS);
+    optr->addr.ft.length = sizeof optr->addr / 4;
+    strncpy(optr->addr.vendor_id, "mellanox", sizeof optr->addr.vendor_id);
+    if (new_prot) {
+        nptr = ptr;
+        nptr->ext.ft.type = 254;
+        nptr->ext.ft.length = sizeof nptr->ext / 4;
+        strncpy(nptr->ext.vendor_id, "mellanox", sizeof nptr->ext.vendor_id);
+        nptr->ext_cap.et.ext_type = 40;
+        nptr->ext_cap.et.len = sizeof nptr->ext_cap / 4;
+        nptr->ext_cap.et.mandatory = 1;
+        nptr->ext_hostname.et.ext_type = 39;
+        nptr->ext_hostname.et.len = sizeof nptr->ext_hostname / 4;
+        strncpy(nptr->ext_hostname.hostname, node_desc, sizeof nptr->ext_hostname.hostname);
+    }
+
+    return ptr;
+}
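+
+/*
+ * Note: FIP descriptor length fields count 4-byte words (the parser
+ * converts them back to bytes with "length << 2"), which is why each
+ * ft.length and list_length above is a byte count divided by 4.
+ */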
+
+int fip_solicit_send(struct fip_discover *discover,
+                     enum fip_packet_type multicast,
+                     u32 dqpn, u16 dlid, u8 sl, int new_prot)
+{
+    int rc = 0;
+    unsigned long flags, flags1;
+    struct fip_solicit_legacy *optr;
+    int size = new_prot ? sizeof(struct fip_solicit_new) : sizeof *optr;
+
+    ASSERT(discover);
+
+    /* alloc packet to be sent */
+    optr = alloc_solicit_pkt(new_prot, discover->port->dev->ca->node_desc);
+    if (IS_ERR(optr))
+        return PTR_ERR(optr);
+
+    /* we set bit 24 to signify that we're a new host */
+    optr->addr.gwtype_qpn = cpu_to_be32(discover->qp->qp_num | 0x1000000);
+    optr->addr.lid = cpu_to_be16(discover->port->attr.lid);
+    /* send the SL to the GW */
+    optr->addr.sl_gwportid = cpu_to_be16(sl << FIP_ADVERTISE_SL_SHIFT);
+
+    memcpy(optr->addr.guid, &discover->port->gid.global.interface_id, sizeof(optr->addr.guid));
+    vnic_dbg_fip(discover->name, "fip_solicit_send creating multicast %d"
+                 " solicit packet\n", multicast);
+
+    fip_dbg_dump_raw_pkt(0, optr, size, 1, "sending solicit packet");
+
+    if (multicast) {
+        struct vnic_mcast *mcaste;
+        union ib_gid gid;
+
+        memcpy(&gid, fip_solicit_mgid, GID_LEN);
+        spin_lock_irqsave(&discover->mcast_tree.mcast_rb_lock, flags);
+        mcaste = vnic_mcast_search(&discover->mcast_tree, &gid);
+        /* it is possible for the MCAST entry or AH to be missing in
+         * transient states (after events). This is a valid condition,
+         * but we can't send the packet
+         */
+        if (!IS_ERR(mcaste) && mcaste->ah) {
+            spin_lock_irqsave(&mcaste->lock, flags1);
+            rc = send_generic_mcast_pkt(discover->port, &discover->tx_ring,
+                                        optr, size, discover->qp,
+                                        discover->pkey_index,
+                                        mcaste);
+            spin_unlock_irqrestore(&mcaste->lock, flags1);
+        } else
+            kfree(optr);
+
+        spin_unlock_irqrestore(&discover->mcast_tree.mcast_rb_lock, flags);
+    } else {
+        rc = send_generic_ucast_pkt(discover->port, NULL, &discover->tx_ring,
+                                    optr, size, discover->qp,
+                                    discover->pkey_index,
+                                    dqpn, dlid, VNIC_FIP_QKEY, sl);
+    }
+    if (rc)
+        goto error_free_mem;
+
+    return 0;
+
+error_free_mem:
+    vnic_warn(discover->name, "discover_send error ret %d\n", rc);
+    kfree(optr);
+    return -ENOMEM;
+}
+
+static void *alloc_login_pkt(struct fip_vnic_data *vnic)
+{
+    struct eoib_login *ptr;
+    int size = sizeof *ptr;
+
+    ptr = kzalloc(size, GFP_KERNEL);
+    if (!ptr)
+        return ERR_PTR(-ENOMEM);
+
+    ptr->eoib_ver.version = 1;
+    ptr->fh.opcode = cpu_to_be16(EOIB_FIP_OPCODE);
+    ptr->fh.subcode = FIP_HOST_LOGIN_SUB_OPCODE;
+    ptr->fh.list_length = cpu_to_be16((size - offsetof(typeof(*ptr), fvend)) / 4);
+    ptr->fvend.ft.type = FIP_TYPE(VENDOR_ID);
+    ptr->fvend.ft.length = sizeof ptr->fvend / 4;
+    strncpy(ptr->fvend.vendor_id, "mellanox", sizeof ptr->fvend.vendor_id);
+    ptr->fa.ft.type = FIP_TYPE(ADDRESS);
+    ptr->fa.ft.length = sizeof ptr->fa / 4;
+    strncpy(ptr->fa.vendor_id, "mellanox", sizeof ptr->fa.vendor_id);
+    ptr->fa.gwtype_qpn = cpu_to_be32(vnic->qp_base_num);
+    ptr->fa.sl_gwportid = cpu_to_be16(vnic->gw->info.gw_port_id);
+    /* sl will be taken from the data path record query */
+    ptr->fa.sl_gwportid |= cpu_to_be16(vnic->gw->data_prec.sl << FIP_ADVERTISE_SL_SHIFT);
+    ptr->fa.lid = cpu_to_be16(vnic->port->attr.lid);
+    memcpy(ptr->fa.guid, &vnic->port->gid.global.interface_id, sizeof ptr->fa.guid);
+    ptr->fl.ft.type = FIP_TYPE(LOGIN);
+    ptr->fl.ft.length = sizeof ptr->fl / 4;
+    strncpy(ptr->fl.vendor_id, "mellanox", sizeof ptr->fl.vendor_id);
+    ptr->fl.vnic_id = cpu_to_be16(vnic->vnic_id);
+
+    if (vnic->hadmined) {
+        int mac_valid = !IS_ZERO_MAC(vnic->login_data.mac);
+        u16 flags = (mac_valid ? FIP_LOGIN_M_FLAG : 0) |
+            FIP_LOGIN_H_FLAG |
+            (vnic->login_data.vp ? FIP_LOGIN_VP_FLAG | FIP_LOGIN_V_FLAG : 0);
+        ptr->fl.flags_vlan = cpu_to_be16(vnic->login_data.vlan | flags);
+        memcpy(ptr->fl.mac, vnic->login_data.mac, sizeof ptr->fl.mac);
+        memcpy(ptr->fl.vnic_name, vnic->login_data.vnic_name, sizeof ptr->fl.vnic_name);
+
+        // TODO remove this when BXM handles 0 addresses
+        if (!mac_valid)
+            ptr->fl.mac[ETH_ALEN-1] = 1;
+    }
+
+    /* all_vlan mode must be enforced between the host and GW side.
+     * For a host admin vnic with a VLAN we let the host choose the work
+     * mode. If the GW isn't working in that same mode, the login will fail
+     * and the host will enter a login-retry loop.
+     * For a net admin vnic, or a host admin vnic without a vlan, we work
+     * in the mode published by the GW */
+    if (vnic->gw->info.all_vlan_gw &&
+        (!vnic->hadmined ||
+         (vnic->hadmined && !vnic->login_data.vp)))
+        ptr->fl.vfields |= cpu_to_be16(FIP_LOGIN_ALL_VLAN_GW_FLAG);
+
+    ptr->fl.syndrom_ctrl_qpn = cpu_to_be32(vnic->gw->discover->qp->qp_num);
+    ptr->fl.vfields |= cpu_to_be16((vnic->qps_num > 1) << 12);
+
+    /* for child vNics, allow implicit logout */
+    if (vnic->parent_used) {
+        ptr->fl.vfields |= cpu_to_be16(1 << 14);
+        ptr->fl.vfields |= cpu_to_be16(1 << 13);
+    }
+
+    return ptr;
+}
+
+/*
+ * Send a unicast login packet. This function supports both host and
+ * network admined logins. The function returns 0 on success and an
+ * error code on failure.
+ */
function returns 0 on success and + * error code on failure +*/ +int fip_login_send(struct fip_vnic_data *vnic) +{ + int ret; + struct eoib_login *ptr; + + ASSERT(vnic); + ASSERT(vnic->port); + + /* don't send packet because GW does not support this */ + if (vnic->hadmined && !vnic->gw->hadmin_gw) + return 0; + + /* alloc packet to be sent */ + ptr = alloc_login_pkt(vnic); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + fip_dbg_dump_raw_pkt(0, ptr, sizeof *ptr, 1, "sending login packet"); + + ret = send_generic_ucast_pkt(vnic->port, NULL/*ah : create a new ah inside*/, + &vnic->gw->discover->tx_ring, + ptr, sizeof *ptr, vnic->gw->discover->qp, + vnic->gw->discover->pkey_index, + vnic->gw_address.gw_qpn, + vnic->gw_address.gw_lid, + vnic->gw_address.qkey, + vnic_gw_ctrl_sl(vnic->gw)); + if (ret) { + vnic_warn(vnic->port->name, + "fip_login_send: fip_ucast_send ret %d\n", ret); + goto error_free_mem; + } + + return 0; + +error_free_mem: + kfree(ptr); + return -ENOMEM; +} + +/* + * This function creates and sends a few types of packets (all ucast): + * vHub context request - new=1, logout=0 + * vHub context update / vnic keep alive - new=0, logout=0 + * vnic logout - new=0, logout=1 +*/ +int fip_update_send(struct fip_vnic_data *vnic, int request_new, int logout) +{ + struct eoib_host_update *pkt; + struct ib_qp *send_qp; + struct fip_ring *tx_ring; + int pkey_index; + int ret = 0; + + ASSERT(vnic); + ASSERT(vnic->port); + + /* alloc packet to be sent */ + pkt = kmalloc(sizeof *pkt, GFP_ATOMIC); + if (!pkt) { + vnic_warn(vnic->port->name, "fip_update_send malloc failed\n"); + return -EAGAIN; + } + + /* copy keep alive packet template */ + if (logout) + memcpy(pkt, &base_logout_pkt, sizeof(struct eoib_host_update)); + else + memcpy(pkt, &base_update_pkt, sizeof(struct eoib_host_update)); + + pkt->fip.opcode = cpu_to_be16(EOIB_FIP_OPCODE); + pkt->fip.list_length = + cpu_to_be16((sizeof(struct eoib_host_update) >> 2) - 3); + pkt->vnic_id = cpu_to_be16(vnic->vnic_id); + memcpy(pkt->mac, vnic->login_data.mac, sizeof(pkt->mac)); + memcpy(pkt->vnic_name, vnic->login_data.vnic_name, + sizeof(pkt->vnic_name)); + memcpy(pkt->port_guid, &vnic->port->gid.global.interface_id, + sizeof(pkt->port_guid)); + + pkt->vhub_id.vhub_id = cpu_to_be32(vnic->login_data.vhub_id); + + if (!logout) { + pkt->tusn = cpu_to_be32(vnic->vhub_table.main_list.tusn); + send_qp = vnic->qp; + tx_ring = &vnic->tx_ring; + pkey_index = vnic->pkey_index; + + if (vnic->login_data.vp) + pkt->vhub_id.flags.flags |= FIP_HOST_VP_FLAG; + + if (request_new) + pkt->vhub_id.flags.flags |= FIP_HOST_R_FLAG; + else + pkt->vhub_id.flags.flags |= FIP_HOST_U_FLAG; + } else { + send_qp = vnic->gw->discover->qp; + tx_ring = &vnic->gw->discover->tx_ring; + pkey_index = vnic->gw->discover->pkey_index; + } + + if (vnic->gw->info.gw_type == GW_TYPE_LAG && + !vnic->gw->info.ext_lag.ucast && !logout) { + struct vnic_mcast *mcaste; + unsigned long flags; + + spin_lock_irqsave(&vnic->mcast_tree.mcast_rb_lock, flags); + mcaste = vnic_mcast_search(&vnic->mcast_tree, &vnic->ka_mcast_gid); + if (!IS_ERR(mcaste)) { + if (mcaste->ah) { + ret = send_generic_mcast_pkt(vnic->port, &vnic->tx_ring, + pkt, sizeof *pkt, vnic->qp, + vnic->pkey_index, mcaste); + vnic_dbg_parse(vnic->name, "sent multicast keep alive\n"); + } + else { + vnic_dbg_parse(vnic->name, "mcaste %p: ah is null\n", mcaste); + kfree(pkt); + } + } else { + vnic_dbg_parse(vnic->name, "ka mcast not found\n"); + ret = -ENOMEM; + } + spin_unlock_irqrestore(&vnic->mcast_tree.mcast_rb_lock, flags); + + } else 
+ /* For LAG gateway the ah is not up to date and therefore + should not be used */ + ret = send_generic_ucast_pkt(vnic->port, NULL/*ah : create a new ah inside*/, + tx_ring, pkt, sizeof *pkt, + send_qp, + pkey_index, + vnic->gw_address.gw_qpn, + vnic->gw_address.gw_lid, + vnic->gw_address.qkey, + vnic_gw_ctrl_sl(vnic->gw)); + if (ret) { + vnic_warn(vnic->port->name, + "fip_update_send: ret %d\n", ret); + goto error_free_mem; + } + + return 0; + +error_free_mem: + kfree(pkt); + return -ENOMEM; +} + +static void dump_lag_member(struct lag_member *m) +{ + vnic_dbg_lag("", "QPN 0x%x, SL %d, gw_portid 0x%x, LID 0x%x, guid " GUID_FORMAT + ", eport_state %s, weight %d, link_utilization %d\n", + m->qpn, m->sl, m->gw_port_id, m->lid, GUID_ARG(m->guid), + eport_state_str(m->eport_state), m->weight, m->link_utilization); +} + +static inline int handle_lag_member(struct fip_vnic_data *vnic, + struct fip_ext_type_lag_members *ext_lag_membs, + int ext_length) +{ + struct lag_members lag_members; + + extract_memb_extended(ext_lag_membs, ext_length, &lag_members, vnic->name); + + /* propogate change in member state as needed */ + return handle_member_update(vnic, &lag_members); +} + +int extract_vhub_extended(struct fip_ext_desc_tlv *fed, + struct fip_vnic_data *vnic) +{ + struct fip_ext_type_ctrl_iport *ext_ctrl_iport; + struct fip_ext_type_lag_members *ext_lag_memb; + struct fip_extended_type *ext_hdr; + struct fip_vnic_send_info *gw_addr; + int length_to_go, ext_length; + + if (fed->ft.type != 254) + return -EINVAL; + + length_to_go = ((int)(fed->ft.length) << 2) - sizeof(*fed); + ext_hdr = (struct fip_extended_type *)(fed + 1); + + while (length_to_go > 0) { + ext_length = ((int)ext_hdr->len) << 2; + + vnic_dbg_parse(vnic->name, "Table Update parse, sub-tlv " + "type %d length %d address=%p\n", + ext_hdr->ext_type, ext_length, ext_hdr); + + if (ext_length < sizeof(*ext_hdr) || + ext_length > length_to_go) { + vnic_dbg_parse(vnic->name, "Extended length error." + " Length=%d\n", ext_length); + return -EINVAL; + } + + switch (ext_hdr->ext_type) { + case ADV_EXT_TYPE(MEMBER): + ext_lag_memb = (struct fip_ext_type_lag_members *)ext_hdr; + + if (handle_lag_member(vnic, ext_lag_memb, ext_length)) + vnic_dbg_parse(vnic->name, "handle_lag_member() failed"); + break; + case ADV_EXT_TYPE(CTRL_IPORT): + if (ext_length != sizeof(*ext_ctrl_iport)) { + vnic_dbg_parse(vnic->name, "Extended length %d is" + " different than expected\n", + ext_length); + return -EINVAL; + } + + gw_addr = &vnic->gw_address; + ext_ctrl_iport = (struct fip_ext_type_ctrl_iport *)ext_hdr; + gw_addr->gw_qpn = be32_to_cpu(ext_ctrl_iport->gwtype_qpn); + gw_addr->gw_lid = be16_to_cpu(ext_ctrl_iport->lid); + gw_addr->gw_sl = be16_to_cpu(ext_ctrl_iport->sl_gwportid) >> FIP_ADVERTISE_SL_SHIFT; + break; + default: + if (ext_hdr->mandatory & 0x01) { + vnic_dbg_parse(vnic->name, "Unknown mandatory extended type %d length %d\n", + ext_hdr->ext_type, ext_length); + return -EINVAL; + } else { + vnic_dbg_parse(vnic->name, "Unknown non-mandatory extended. 
Skipping, type %d length %d\n", + ext_hdr->ext_type, ext_length); + ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length); + length_to_go -= ext_length; + continue; + } + } + + ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length); + length_to_go -= ext_length; + } + + return 0; +} + +static int extract_login_extended(struct fip_ext_desc_tlv *fed, + struct lag_members *lagm, + char *name) +{ + struct fip_ext_type_lag_members *ext_lag_membs; + struct fip_extended_type *ext_hdr; + int length_to_go, ext_length; + + if (fed->ft.type != 254) + return -EINVAL; + + length_to_go = ((int)(fed->ft.length) << 2) - sizeof(*fed); + ext_hdr = (struct fip_extended_type *)(fed + 1); + + while (length_to_go > 0) { + ext_length = ((int)ext_hdr->len) << 2; + + vnic_dbg_parse(name, "Table Update parse, sub-tlv " + "type %d length %d address=%p\n", + ext_hdr->ext_type, ext_length, ext_hdr); + + if (ext_length < sizeof(*ext_hdr) || + ext_length > length_to_go) { + vnic_dbg_parse(name, "Extended length error." + " Length=%d\n", ext_length); + return -EINVAL; + } + + switch (ext_hdr->ext_type) { + case ADV_EXT_TYPE(MEMBER): + ext_lag_membs = (struct fip_ext_type_lag_members *)ext_hdr; + + extract_memb_extended(ext_lag_membs, ext_length, lagm, name); + + break; + default: + if (ext_hdr->mandatory & 0x01) { + vnic_dbg_parse(name, "Unknown mandatory extended type %d length %d\n", + ext_hdr->ext_type, ext_length); + return -EINVAL; + } else { + vnic_dbg_parse(name, "Unknown non-mandatory extended. Skipping, type %d length %d\n", + ext_hdr->ext_type, ext_length); + ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length); + length_to_go -= ext_length; + continue; + } + } + ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length); + length_to_go -= ext_length; + } + + return 0; +} + +void extract_memb_extended(struct fip_ext_type_lag_members *ext_lag_membs, + int ext_length, + struct lag_members *lagm, + char *name) +{ + struct lag_member *m; + struct fip_ext_type_lag_member *lm; + int nmemb = 0; + int i; + + nmemb = (ext_length - sizeof ext_lag_membs->et) / sizeof *lm; + if (nmemb > MAX_LAG_MEMBERS) { + vnic_dbg_parse(name, "received %d members but max supported is %d. " + "Using only %d\n", nmemb, MAX_LAG_MEMBERS, + MAX_LAG_MEMBERS); + nmemb = MAX_LAG_MEMBERS; + } + + m = lagm->memb; + lm = ext_lag_membs->lagm; + + for (i = 0; i < nmemb; ++i, ++lm, ++m) { + m->qpn = be32_to_cpu(lm->qpn) & 0xffffff; + m->sl = be16_to_cpu(lm->sl_gw_portid) >> 12; + m->gw_port_id = be16_to_cpu(lm->sl_gw_portid) & 0xfff; + m->lid = be16_to_cpu(lm->lid); + memcpy(m->guid, lm->guid, sizeof m->guid); + m->eport_state = lm->eport_state >> 6; + m->weight = lm->weight; + m->link_utilization = lm->link_utilization; + dump_lag_member(m); + } + lagm->num = nmemb; + + vnic_dbg_parse(name, "Table Update extended parse finished OK. Num members=%d\n", + lagm->num); + return; +} + +/* + * Parse a packet that is suspected of being a login ack packet. The function + * returns 0 for a valid login ack packet and an error code otherwise. The + * packet's "interesting" details are returned in data.
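[The two walkers above share the same sub-TLV discipline: lengths are carried in 4-byte words, a malformed length aborts the parse, and unknown types are skipped unless their mandatory bit is set. A self-contained userspace sketch of that loop; the struct layout and type values here are illustrative only.]

#include <stdint.h>
#include <stdio.h>

struct ext_hdr {
	uint8_t ext_type;
	uint8_t len;		/* length in 4-byte words, header included */
	uint8_t reserved;
	uint8_t mandatory;	/* bit 0: abort on unknown type */
};

static int walk_sub_tlvs(const uint8_t *buf, int bytes)
{
	while (bytes > 0) {
		const struct ext_hdr *h = (const void *)buf;
		int len = (int)h->len << 2;

		if (len < (int)sizeof(*h) || len > bytes)
			return -1;		/* malformed length: abort */
		switch (h->ext_type) {
		/* known types would be handled here */
		default:
			if (h->mandatory & 0x01)
				return -1;	/* unknown but mandatory: abort */
			break;			/* unknown, non-mandatory: skip */
		}
		buf += len;
		bytes -= len;
	}
	return 0;
}

int main(void)
{
	uint8_t pkt[8] = { 99, 2, 0, 0 };	/* one unknown, non-mandatory TLV */
	printf("walk: %d\n", walk_sub_tlvs(pkt, sizeof(pkt)));
	return 0;
}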
+ */ +int fip_login_parse(struct fip_discover *discover, struct fip_content *fc, + struct fip_login_data *data) +{ + u32 vfields; + int err = 0; + + data->syndrome = be32_to_cpu(fc->fl->syndrom_ctrl_qpn) >> 24; + data->vnic_id = be16_to_cpu(fc->fl->vnic_id); + data->lid = be16_to_cpu(fc->fa.fa[0]->lid); + data->port_id = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) & 0xfff; + data->sl = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) >> FIP_ADVERTISE_SL_SHIFT; + data->qpn = be32_to_cpu(fc->fa.fa[0]->gwtype_qpn) & 0xffffff; + memcpy(data->guid, fc->fa.fa[0]->guid, sizeof(data->guid)); + + if (be16_to_cpu(fc->fl->flags_vlan) & FIP_LOGIN_VP_FLAG) { + data->vp = 1; + data->vlan = be16_to_cpu(fc->fl->flags_vlan) & 0xfff; + } + data->all_vlan_gw = !!(be16_to_cpu(fc->fl->vfields) & FIP_LOGIN_ALL_VLAN_GW_FLAG); + + data->vhub_id = CREATE_VHUB_ID(cpu_to_be16(data->vlan), data->port_id); + + data->ctl_qpn = be32_to_cpu(fc->fl->syndrom_ctrl_qpn) & FIP_LOGIN_CTRL_QPN_MASK; + vfields = be16_to_cpu(fc->fl->vfields); + data->n_mac_mcgid = vfields & FIP_LOGIN_DMAC_MGID_MASK; + data->n_rss_mgid = vfields >> 8 & 0xf; + /* data->rss = pkt->rss & FIP_LOGIN_RSS_MASK; it's redundant in login ack */ + data->pkey = be16_to_cpu(fc->fp->pkey); + data->mtu = be16_to_cpu(fc->fl->mtu); + + memcpy(data->mac, fc->fl->mac, sizeof(data->mac)); + memcpy(data->mgid_prefix, fc->fl->eth_gid_prefix, sizeof(data->mgid_prefix)); + memcpy(data->vnic_name, fc->fl->vnic_name, sizeof(data->vnic_name)); + memcpy(data->vendor_id, fc->fl->vendor_id, sizeof(data->vendor_id)); + + if (fc->fed.num) + err = extract_login_extended(fc->fed.fed[0], &data->lagm, discover->name); + + return err; +} + +/* + * Check if a received packet is a FIP packet, And if so return its subtype. + * The FIP type is also returned in fip_type and can be either EOIB_FIP_OPCODE + * or FCOIB_FIP_OPCODE. If the packet is not a FIP packet -EINVAL is returned. +*/ +int fip_pkt_parse(char *buffer, int length, int *fip_type) +{ + struct fip_fip_header *fip_header; + u16 fip_opcode; + + fip_header = (struct fip_fip_header *) + (buffer + IB_GRH_BYTES + sizeof(struct fip_eoib_ver)); + + fip_opcode = be16_to_cpu(fip_header->opcode); + + if (fip_opcode != EOIB_FIP_OPCODE) { + *fip_type = 0; + return -EINVAL; + } + + *fip_type = fip_opcode; + + return fip_header->subcode; +} + +/* + * Already know that this is a FIP packet, return its subtype. +*/ +int fip_pkt_get_subtype_bh(char *buffer) +{ + struct fip_fip_header *fip_header; + + fip_header = (struct fip_fip_header *) + (buffer + sizeof(struct fip_eoib_ver)); + + return fip_header->subcode; +} + diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.h new file mode 100644 index 0000000000000..32e34fce15252 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FIP_DISCOVER_PKT_H +#define _FIP_DISCOVER_PKT_H + +#include + + + +#endif /* _FIP_DISCOVER_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_vhub.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_vhub.c new file mode 100644 index 0000000000000..8bcd6d0b69801 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_vhub.c @@ -0,0 +1,635 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_fip.h" +#include "vnic_fip_discover.h" +#include "vnic_fip_pkt.h" + +/* + * construct an mgid address based on vnic login information and the type + * variable (data mcast / vhub update / vhub table). The resulting mgid + * is returned in *mgid. + */ +void vhub_mgid_create(const char *mgid_prefix, + const char *mmac, /* mcast mac for bcast 0xFF.. 
*/ + u64 n_mac, /* bits to take from mmac */ + u32 vhub_id, + enum vhub_mgid_type type, + u8 rss_hash, + union vhub_mgid *mgid) +{ + u32 vhub_id_be; + u64 mac_mask; + u64 *mac_ptr; + u64 one = 1; /* must do that for shift bitwise operation */ + + memcpy(mgid->mgid.mgid_prefix, mgid_prefix, + sizeof(mgid->mgid.mgid_prefix)); + mgid->mgid.type = (u8)type; + memcpy(mgid->mgid.dmac, mmac, sizeof(mgid->mgid.dmac)); + mac_mask = cpu_to_le64(((one << n_mac) - one) | 0xFFFF000000000000ULL); + mac_ptr = (u64*)(mgid->mgid.dmac); + *mac_ptr &= mac_mask; + mgid->mgid.rss_hash = rss_hash; + vhub_id_be = cpu_to_be32(vhub_id); + memcpy(mgid->mgid.vhub_id, ((u8 *) &vhub_id_be) + 1, + sizeof(mgid->mgid.vhub_id)); +}; + +/* + * Init the vnic's vHub table data structures, before using them + */ +void vhub_ctx_init(struct fip_vnic_data *vnic) +{ + INIT_LIST_HEAD(&vnic->vhub_table.main_list.vnic_list); + vnic->vhub_table.main_list.tusn = 0; + vnic->vhub_table.main_list.count = 0; + vnic->vhub_table.main_list.total_count = 0; + + INIT_LIST_HEAD(&vnic->vhub_table.update_list.vnic_list); + vnic->vhub_table.update_list.tusn = 0; + vnic->vhub_table.update_list.count = 0; + vnic->vhub_table.update_list.total_count = 0; + + vnic->vhub_table.checksum = 0; + vnic->vhub_table.tusn = 0; + vnic->vhub_table.state = VHUB_TBL_INIT; +} + +/* print vhub context table */ +static void vhub_ctx_prnt(struct fip_vnic_data *vnic, + struct vhub_elist *vhub_list, int level) +{ + struct vnic_table_entry *vnic_entry; + + if (!(vnic_msglvl & VNIC_DEBUG_VHUB_V)) + return; + + vnic_dbg_vhub_v(vnic->name, "Dumping context table. Count %d tusn %d\n", + vhub_list->count, vhub_list->tusn); + + list_for_each_entry(vnic_entry, &vhub_list->vnic_list, list) { + vnic_dbg_vhub_v(vnic->name, "lid 0x%04x qpn 0x%06x, mac " + MAC_6_PRINT_FMT"\n", vnic_entry->lid, + vnic_entry->qpn, + MAC_6_PRINT_ARG(vnic_entry->mac)); + } +} + +void vhub_table_free(struct vhub_elist *elist) +{ + struct vnic_table_entry *del_vnic, *tmp_vnic; + + list_for_each_entry_safe(del_vnic, tmp_vnic, &elist->vnic_list, list) { + list_del(&del_vnic->list); + kfree(del_vnic); + } +} + +/* + * Clear and free the vnic's vHub context table data structures. + */ +void vhub_ctx_free(struct fip_vnic_data *vnic) +{ + vnic_dbg_fip_v(vnic->name, "vhub_ctx_free called\n"); + + vhub_table_free(&vnic->vhub_table.main_list); + vhub_table_free(&vnic->vhub_table.update_list); + + vhub_ctx_init(vnic); +} + +static struct vnic_table_entry *vhub_find_entry(struct vhub_elist *vnic_list, + u16 lid, u32 qpn) +{ + struct vnic_table_entry *tmp_vnic; + + list_for_each_entry(tmp_vnic, &vnic_list->vnic_list, list) { + if (tmp_vnic->lid == lid && tmp_vnic->qpn == qpn) + return tmp_vnic; + } + return NULL; +} + +/* + * Move vHub context entries from the update list to the main list. The update + * list is used during the wait for the main table to be updated. Once + * the table update is completed the entries need to be moved from the update + * table to the main table. This function does this. +*/ +static int vhub_update_main(struct fip_vnic_data *vnic, + struct vhub_elist *main_list, + struct vhub_elist *update_list) +{ + struct vnic_table_entry *new_entry, *tmp_vnic, *del_vnic; + int first_tusn = (u32) update_list->tusn - (update_list->count - 1); + int extra_tusn; + + /* update list is usually empty */ + if (likely(update_list->count == 0)) + return 0; + + if (first_tusn > main_list->tusn + 1) { + vnic_warn(vnic->name, "Info, vhub_to_main_tbl sync main to" + " update list failed. 
update tusn %d update " + "first %d main %d\n", + update_list->tusn, first_tusn, main_list->tusn); + return -1; + } + + extra_tusn = main_list->tusn + 1 - first_tusn; + + /* go over update list and move / remove entries in it */ + list_for_each_entry_safe(new_entry, tmp_vnic, + &update_list->vnic_list, list) { + if (extra_tusn > 0) { + list_del(&new_entry->list); + kfree(new_entry); + extra_tusn--; + } else { + /* remove from update list and apply to main list */ + list_del(&new_entry->list); + main_list->tusn++; + + /* Check valid bit, if set add to main list */ + if (new_entry->valid) { + list_add_tail(&new_entry->list, + &main_list->vnic_list); + main_list->count++; + } else { /* remove from main list */ + del_vnic = vhub_find_entry(main_list, + new_entry->lid, + new_entry->qpn); + if (del_vnic) { + list_del(&del_vnic->list); + kfree(del_vnic); + + main_list->count--; + } + vnic_dbg_fip_v(vnic->name, + "vhub_to_main_tbl removed " + "vnic lid %d qpn 0x%x found %d\n", + (int)new_entry->lid, + (int)new_entry->qpn, + (del_vnic != 0)); + kfree(new_entry); + } + } + update_list->count--; + } + return 0; +} + +int fip_vnic_search_mac(struct fip_vnic_data *vnic, struct vhub_elist *elist) +{ + struct vnic_table_entry *vlist_entry; + + list_for_each_entry(vlist_entry, &elist->vnic_list, list) + /* find matching entry based on mac */ + if(!memcmp(vnic->login_data.mac, vlist_entry->mac, ETH_ALEN)) { + /* verify lid/qpn match */ + if (vnic->port->attr.lid == vlist_entry->lid && + vnic->qp_base_num == vlist_entry->qpn) + return 1; + else { + vnic_dbg_vhub(vnic->name, + "vnic LID=0x%x or QPN=0x%x " + "in vhub tbl is different than " + "expected LID=0x%x, QPN=0x%x\n", + vlist_entry->lid, + vlist_entry->qpn, + vnic->port->attr.lid, + vnic->qp_base_num); + break; + } + } + + return 0; +} + +/* + * This function handles a vhub context table packet. The table will + * be processed only if we do not have an up to date local copy of + * our own. The table update supports multi-packet tables so care + * must be taken in building the complete table. + */ +int vhub_handle_tbl(struct fip_vnic_data *vnic, struct fip_content *fc, + u32 vhub_id, u32 tusn) +{ + struct context_table_entry *entry; + struct vnic_table_entry *new_entry; + struct vhub_elist *table; + int i, j, count_in_pkt; + int reason = 0; + int hdr_type; + + /* we already have a table. disregard this one */ + if (vnic->vhub_table.state != VHUB_TBL_INIT) { + vnic_dbg_vhub_v(vnic->name, + "vhub_handle_tbl context not in init\n"); + return 0; + } + + /* compute the number of vnic entries in the packet. + * don't forget the checksum + */ + count_in_pkt = fc->cte.num; + table = &vnic->vhub_table.main_list; + hdr_type = be16_to_cpu(fc->fvt->hdr) >> 14; + + /* first or only packet in sequence */ + if (hdr_type == FIP_TABLE_HDR_FIRST || hdr_type == FIP_TABLE_HDR_ONLY) { + table->total_count = be16_to_cpu(fc->fvt->table_size); + table->tusn = tusn; + } + if (table->tusn != tusn) { + vnic_warn(vnic->name, "Info, vhub_handle_tbl got unexpected " + "tusn. Expect=%d received=%d\n", table->tusn, tusn); + if (!table->tusn) + goto drop_silently; + reason = 1; + goto reset_table; + } + + if ((table->count + count_in_pkt > table->total_count) || + ((table->count + count_in_pkt < table->total_count) && + (hdr_type == FIP_TABLE_HDR_LAST || hdr_type == FIP_TABLE_HDR_ONLY))) { + vnic_dbg_vhub(vnic->name, + "vhub_handle_tbl got unexpected entry count. 
" + "count %d, in packet %d total expected %d\n", + table->count, count_in_pkt, table->total_count); + reason = 2; + goto reset_table; + } + + entry = fc->cte.cte; + for (i = 0; i < count_in_pkt; ++i, ++entry) { + new_entry = kzalloc(sizeof *new_entry, GFP_KERNEL); + if (!new_entry) + goto reset_table; + + for (j = 0; j < (sizeof *entry) >> 2; ++j) + vnic->vhub_table.checksum += ((u32 *) entry)[j]; + + new_entry->lid = be16_to_cpu(entry->lid); + new_entry->qpn = be32_to_cpu(entry->qpn) & 0xffffff; + new_entry->sl = entry->sl & 0xf; + new_entry->rss = !!(entry->v_rss_type & FIP_CONTEXT_RSS_FLAG); + new_entry->valid = !!(entry->v_rss_type & FIP_CONTEXT_V_FLAG); + memcpy(new_entry->mac, entry->mac, sizeof(new_entry->mac)); + + list_add_tail(&new_entry->list, &table->vnic_list); + table->count++; + } + + /* last packet */ + if (hdr_type == FIP_TABLE_HDR_LAST || hdr_type == FIP_TABLE_HDR_ONLY) { + ASSERT(table->count == table->total_count); + if (vnic->vhub_table.checksum != be32_to_cpu(*(u32 *) entry)) { + vnic_dbg_fip_v(vnic->name, + "vhub_handle_tbl checksum mismatch. " + "expected 0x%x, in packet 0x%x\n", + vnic->vhub_table.checksum, + be32_to_cpu(*(u32 *) entry)); + /* TODO: request checksum match in final code */ + /* goto reset_table; */ + } + + if (vhub_update_main(vnic, &vnic->vhub_table.main_list, + &vnic->vhub_table.update_list)) { + vnic_dbg_fip_v(vnic->name, + "vhub_handle_tbl moving update list to main " + "list failed\n"); + reason = 3; + goto reset_table; + } + + /* we are done receiving the context table */ + vnic_dbg_fip_v(vnic->name, + "vhub_handle_tbl updated with %d entries\n", + vnic->vhub_table.main_list.count); + vhub_ctx_prnt(vnic, &vnic->vhub_table.main_list, 0); + + /* we are not in the main vHub list close ourselves */ + if (!fip_vnic_search_mac(vnic, &vnic->vhub_table.main_list)) { + vnic_dbg_fip_p0(vnic->name, "We are not in the main table close our selves\n"); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + reason = 4; + goto reset_table; + } + + if (fip_vnic_tbl_done(vnic)) { + vnic_warn(vnic->name, "vhub_handle_tbl done failed, reseting table\n"); + reason = 5; + goto reset_table; + } + } + +drop_silently: + return 0; + +reset_table: + vnic_dbg_fip_p0(vnic->name, "We are not in the main table close our selves reason=%d\n", reason); + vhub_ctx_free(vnic); + /* TODO renable tx of update request, fip_update_send() */ + return -EINVAL; +} + +/* + * This function writes the main vhub table to the data (login) vnic. 
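[For reference, the checksum that vhub_handle_tbl() above accumulates is a plain 32-bit sum over every word of every context-table entry; the GW terminates the table with one extra 32-bit word holding the expected total. A minimal userspace sketch, with the entry layout reduced to raw words:]

#include <stdint.h>
#include <stdio.h>

static uint32_t table_checksum(const uint32_t *words, int nwords)
{
	uint32_t sum = 0;

	while (nwords-- > 0)
		sum += *words++;	/* driver: checksum += ((u32 *)entry)[j] */
	return sum;
}

int main(void)
{
	uint32_t entries[4] = { 0x11111111, 0x22222222, 0x33333333, 0x44444444 };
	uint32_t trailer = table_checksum(entries, 4);	/* what the GW would append */

	printf("checksum ok: %d\n", table_checksum(entries, 4) == trailer);
	return 0;
}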
+ * You should call it when the data vnic is ready for it and after the + * table is up to date (and the update list was applied to the main list) + */ +int fip_vnic_write_tbl(struct fip_vnic_data *vnic) +{ + struct vnic_table_entry *vlist_entry; + int rc; + + if (vnic->login) + sprintf(vnic->name, "%s", vnic->login->name); + + /* update table in neigh tree */ + list_for_each_entry(vlist_entry, + &vnic->vhub_table.main_list.vnic_list, list) { + rc = vnic_vhube_add(vnic, vlist_entry); + if (rc) { + vnic_warn(vnic->name, "vnic_vhube_add failed for mac " + MAC_6_PRINT_FMT" (rc %d)\n", + MAC_6_PRINT_ARG(vlist_entry->mac), rc); + vhub_ctx_free(vnic); + vnic_vhube_flush(vnic); + return -1; + } + } + + vnic_dbg_fip(vnic->name, "fip_vnic_tbl_done: creation of vnic done\n"); + + vnic->vhub_table.tusn = vnic->vhub_table.main_list.tusn; + vnic->vhub_table.state = VHUB_TBL_UPDATED; + + /* free table memory */ + vhub_table_free(&vnic->vhub_table.main_list); + return 0; +} + +/* + * This function handles a vhub context update packets received AFTER + * we have a valid vhub table. For update additions the code adds an + * entry to the neighbour tree. For update removals we either remove + * the entry from the neighbour list or if the removed entry is "this vnic" + * we remove the vnic. +*/ +static int vhub_update_updated(struct fip_vnic_data *vnic, + u32 vhub_id, u32 pkt_tusn, + struct vnic_table_entry *data) +{ + int curr_tusn; + + curr_tusn = vnic->vhub_table.tusn; + + /* if vnic is being flushed, return */ + if (vnic->flush) + return 0; + + /* we got a GW keep alive packet */ + if (pkt_tusn == curr_tusn) + return 0; + + /* if we got an out of order update clear list and request new table */ + if (pkt_tusn != curr_tusn + 1) { + vnic_warn(vnic->name, "Info, vhub_update_up2date received out" + " of order update. Recvd=%d Expect=%d\n", + pkt_tusn, curr_tusn); + goto error_in_update; + } + + /* new entry added */ + if (data->valid) { + if (vnic_vhube_add(vnic, data)) { + vnic_dbg_fip(vnic->name, "vnic_vhube_add " + "failed to update vnic neigh tree\n"); + goto error_in_update; + } + } else { /* remove entry */ + /* the remove request is for this vnic :-o */ + if (!memcmp(vnic->login_data.mac, data->mac, ETH_ALEN)) { + vnic_dbg_fip_p0(vnic->name, "remove this vnic "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(vnic->login_data.mac)); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + } else { + vnic_dbg_fip(vnic->name, "remove neigh vnic\n"); + vnic_vhube_del(vnic, data->mac); + } + } + + vnic->vhub_table.tusn = pkt_tusn; + + return 0; + +error_in_update: + vhub_ctx_free(vnic); + vnic_vhube_flush(vnic); + fip_update_send(vnic, 1 /* new */, 0 /* logout */); + return -1; +} + +/* + * This function handles a vhub context update packets received BEFORE + * we have a valid vhub table. The function adds the update request + * to an update list to be processed after the entire vhub table is received + * and processed. + */ +static int vhub_update_init(struct fip_vnic_data *vnic, + u32 vhub_id, u32 pkt_tusn, + struct vnic_table_entry *data) +{ + struct vnic_table_entry *new_vnic; + struct vhub_elist *vnic_list; + int curr_tusn; + + vnic_list = &vnic->vhub_table.update_list; + curr_tusn = vnic_list->tusn; + + /* if we got an out of order update clear list and request new table */ + if ((pkt_tusn < curr_tusn || pkt_tusn > curr_tusn + 1) + && curr_tusn != 0) { + vnic_warn(vnic->name, "Info, vhub_update_init received out of" + " order update. 
got %d my %d\n", pkt_tusn, curr_tusn); + goto error_in_update; + } + + /* we got a GW keep alive packet */ + if (pkt_tusn == curr_tusn) { + vnic_dbg_fip_v(vnic->name, "Received GW keep alive update." + " tusn %d\n", curr_tusn); + return 0; + } + + /* got remove request for this vnic don't wait */ + if (!(data->valid) && + !memcmp(vnic->login_data.mac, data->mac, ETH_ALEN)) { + vhub_ctx_free(vnic); + vnic_dbg_fip_p0(vnic->name, "got request to close vNic vhub_update_init\n"); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + goto err; + } + + new_vnic = kzalloc(sizeof *new_vnic, GFP_KERNEL); + if (!new_vnic) + goto error_in_update; + + memcpy(new_vnic, data, sizeof *data); + list_add_tail(&new_vnic->list, &vnic_list->vnic_list); + vnic_list->count++; + vnic_list->tusn = pkt_tusn; + vhub_ctx_prnt(vnic, vnic_list, 0); + return 0; + +error_in_update: + vhub_ctx_free(vnic); + fip_update_send(vnic, 1 /* new */, 0 /* logout */); +err: + return -1; +} + +/* + * This function handles vhub context update packets received after + * we have a valid vhub table but before it was passed to the data rbtree. + * The function applies the update request to the main vhub table. + */ +static int vhub_update_inter(struct fip_vnic_data *vnic, + u32 vhub_id, u32 pkt_tusn, + struct vnic_table_entry *data) +{ + struct vnic_table_entry *new_vnic, *del_vnic; + struct vhub_elist *vnic_list; + int curr_tusn; + + vnic_list = &vnic->vhub_table.main_list; + curr_tusn = vnic_list->tusn; + + /* if we got an out of order update clear list and request new table */ + if ((pkt_tusn < curr_tusn || pkt_tusn > curr_tusn + 1) + && curr_tusn != 0) { + vnic_warn(vnic->name, "Info, vhub_update_inter received out" + " of order update. got %d my %d\n", pkt_tusn, curr_tusn); + goto error_in_update; + } + + /* we got a GW keep alive packet */ + if (pkt_tusn == curr_tusn) { + vnic_dbg_fip_v(vnic->name, "Received GW keep alive update." + " tusn %d\n", curr_tusn); + return 0; + } + + /* we got an add request */ + if (data->valid) { + new_vnic = kzalloc(sizeof *new_vnic, GFP_KERNEL); + if (!new_vnic) + goto error_in_update; + + memcpy(new_vnic, data, sizeof *data); + list_add_tail(&new_vnic->list, &vnic_list->vnic_list); + vnic_list->count++; + vnic_list->tusn = pkt_tusn; + } else { /* we got a remove request */ + /* remove is for this vnic */ + if (!memcmp(vnic->login_data.mac, data->mac, ETH_ALEN)) { + vhub_ctx_free(vnic); + vnic_dbg_fip_p0(vnic->name, "got request to close vNic vhub_update_inter\n"); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + goto err; + } + + /* search and delete the vnic */ + del_vnic = vhub_find_entry(vnic_list, + data->lid, + data->qpn); + if (del_vnic) { + list_del(&del_vnic->list); + kfree(del_vnic); + vnic_list->count--; + } + vnic_dbg_fip_v(vnic->name, + "vhub_update_inter removed " + "vnic lid %d qpn 0x%x found %d\n", + (int)data->lid, (int)data->qpn, + (del_vnic != 0)); + } + + vhub_ctx_prnt(vnic, vnic_list, 0); + return 0; + +error_in_update: + vhub_ctx_free(vnic); + fip_update_send(vnic, 1 /* new */, 0 /* logout */); +err: + return -1; +} + +/* + * This function handles vhub context update packets. There are three flows + * in handling update packets. The first is before the main table is up + * to date, the second is after the table is up to date but before it was + * passed to the ownership of the data vnic (login struct) and the local + * lists are freed, and the last is when the table maintenance is done + * by the data vnic. This function handles all cases. +*/
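[The dispatcher below routes each update to one of the three flows by table state. A compact userspace sketch of that dispatch, with hypothetical handler names standing in for the vhub_update_*() functions:]

#include <stdio.h>

enum vhub_tbl_state { VHUB_TBL_INIT, VHUB_TBL_UP2DATE, VHUB_TBL_UPDATED };

/* hypothetical stand-ins for vhub_update_init/inter/updated */
static int queue_on_update_list(void) { puts("no full table yet: queued"); return 0; }
static int apply_to_main_list(void)   { puts("table complete: applied to main list"); return 0; }
static int apply_to_neigh_tree(void)  { puts("data side owns table: applied to neigh tree"); return 0; }

static int dispatch_update(enum vhub_tbl_state state)
{
	switch (state) {
	case VHUB_TBL_INIT:
		return queue_on_update_list();
	case VHUB_TBL_UP2DATE:
		return apply_to_main_list();
	case VHUB_TBL_UPDATED:
		return apply_to_neigh_tree();
	default:
		return -1;
	}
}

int main(void)
{
	return dispatch_update(VHUB_TBL_INIT);
}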
+*/ +int vhub_handle_update(struct fip_vnic_data *vnic, + u32 vhub_id, u32 tusn, + struct vnic_table_entry *data) +{ + int ret = 0; + + /* + * if we do not have an up to date table to use the update list. + * if we have an up to date table apply the updates to the + * main table list. + */ + switch (vnic->vhub_table.state) { + case VHUB_TBL_INIT: /* No full table yet, keep updates for later */ + ret = vhub_update_init(vnic, vhub_id, tusn, data); + break; + case VHUB_TBL_UP2DATE: /* full table available, not writen to data half */ + ret = vhub_update_inter(vnic, vhub_id, tusn, data); + break; + case VHUB_TBL_UPDATED: /* full table available and writen to data half */ + ret = vhub_update_updated(vnic, vhub_id, tusn, data); + break; + default: + break; + } + + return ret; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_main.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_main.c new file mode 100644 index 0000000000000..f07ee4e63fb7e --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_main.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_data.h" +#include "vnic_fip.h" + +MODULE_AUTHOR(DRV_AUTH); +MODULE_DESCRIPTION(DRV_DESC); +MODULE_LICENSE(DRV_LIC); +MODULE_VERSION(DRV_VER); + +static int __init mlx4_ib_init(void) +{ + vnic_dbg_func("module_init"); + + if (vnic_param_check()) + goto err; + if (vnic_mcast_init()) + goto err; + if (vnic_ports_init()) + goto free_mcast; + + return 0; + +free_mcast: + vnic_mcast_cleanup(); +err: + return -EINVAL; +} + +static void __exit mlx4_ib_cleanup(void) +{ + int ret; + + vnic_dbg_func("module_exit"); + vnic_ports_cleanup(); + vnic_dbg_mark(); + vnic_mcast_cleanup(); +} + +module_init(mlx4_ib_init); +module_exit(mlx4_ib_cleanup); diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_mcast.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_mcast.c new file mode 100644 index 0000000000000..c82190cd576be --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_mcast.c @@ -0,0 +1,1098 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" + +struct workqueue_struct *mcast_wq; +struct ib_sa_client vnic_sa_client; + +//static void vnic_mcast_detach_task(struct work_struct *work); +static void vnic_mcast_attach_task(struct work_struct *work); +static void vnic_port_mcast_leave_task(struct work_struct *work); +static void vnic_port_mcast_join_task(struct work_struct *work); + +static void vnic_port_mcast_release(struct vnic_port_mcast *mcaste); +static struct vnic_port_mcast *vnic_port_mcast_update(struct vnic_mcast + *_mcaste); + +/* + * A helper function to prevent code duplication. Fills vnic_mcast struct with + * common values. + * + * in: mcaste - mcaste to fill + * in: gw_id - to be used in creation MGID address + * in: mac - to be used in creation MGID address + * in: create - value of create field in mcaste + */ +void __vnic_mcaste_fill(struct vnic_login *login, struct vnic_mcast *mcaste, + u16 gw_id, const u8 *mac, u8 rss_hash, int create) +{ + union vhub_mgid mgid; + + memcpy(mcaste->mac, mac, ETH_ALEN); + vhub_mgid_create(login->mgid_prefix, mcaste->mac, + login->n_mac_mcgid, + CREATE_VHUB_ID(login->vid, gw_id), + VHUB_MGID_DATA, rss_hash, &mgid); + memcpy(&mcaste->gid, mgid.ib_gid.raw, GID_LEN); + memcpy(&mcaste->port_gid, &mcaste->gid, GID_LEN); + mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC); + mcaste->backoff_factor = 1; + mcaste->retry = VNIC_MCAST_MAX_RETRY; + mcaste->blocking = 0; + mcaste->qkey = login->qkey; + mcaste->pkey = login->pkey; + mcaste->create = create; + mcaste->qp = login->qp_res[0].qp; /* mcast/bcast is only on first QP */ + mcaste->join_state = 1; +} + +/* + * A helper function to prevent code duplication. Receives a multicast mac + * and a gw_id and attaches it (join + attach). The function also receives + * a default_mcaste (used for the MGID over default MLID hack and a user list. + * Returns 0 on success and non 0 on failure. + * + * in: mmac - to be used in creation MGID address + * in: default_mcaste - mcaste entry of the default MGID. Can be NULL + * in: user_list - A user list to hang the new mcaste on. 
Can be NULL + * in: gw_id - to be used in creation MGID address + */ +int _vnic_mcast_attach_mgid(struct vnic_login *login, + char *mmac, + struct vnic_mcast *default_mcaste, + void *private_data, + u16 gw_id) +{ + struct vnic_mcast *mcaste; + int rc = 0; + int rss_hash; + + mcaste = vnic_mcast_alloc(login->port, NULL, NULL); + if (IS_ERR(mcaste)) { + vnic_warn(login->name, "vnic_mcast_alloc for "MAC_6_PRINT_FMT" failed\n", + MAC_6_PRINT_ARG(mmac)); + vnic_dbg_mark(); + return -ENOMEM; + } + memcpy(mcaste->mac, mmac, ETH_ALEN); + + /* if mcast mac has mcast IP in it:*/ + rss_hash = 0; + if ((mcaste->mac[0] & 0xf0) == 0xe0 && + mcaste->mac[4] == 0x00 && + mcaste->mac[5] == 0x00) { + /* calculate mcas rss_hash on IP octets */ + rss_hash = mcaste->mac[0] ^ mcaste->mac[1] ^ + mcaste->mac[2] ^ mcaste->mac[3]; + /* and build the corresponding mcast MAC using the IEEE + * multicast OUI 01:00:5e + */ + mcaste->mac[5] = mcaste->mac[3]; + mcaste->mac[4] = mcaste->mac[2]; + mcaste->mac[3] = mcaste->mac[1] & 0x7f; + mcaste->mac[2] = 0x5e; + mcaste->mac[1] = 0x00; + mcaste->mac[0] = 0x01; + } + + __vnic_mcaste_fill(login, mcaste, gw_id, mcaste->mac, rss_hash, 0); + mcaste->priv_data = private_data; + + if (default_mcaste) + memcpy(&mcaste->port_gid, &default_mcaste->gid, GID_LEN); + + rc = vnic_mcast_add(&login->mcast_tree, mcaste); /* add holds mcast_rb_lock */ + if (!rc) { + rc = vnic_mcast_attach(&login->mcast_tree, mcaste); + ASSERT(!rc); + } else if (rc == -EEXIST){ + /* MGID may be already in the tree when n_mac_mcgid > 0 (ok)*/ + vnic_dbg_mcast(login->name, "vnic_mcast_add for " + MAC_6_PRINT_FMT" already exist, rc %d\n", + MAC_6_PRINT_ARG(mcaste->mac), rc); + vnic_mcast_dealloc(mcaste); + rc = 0; + } else { + vnic_warn(login->name, "vnic_mcast_add for " + MAC_6_PRINT_FMT" failed, rc %d\n", + MAC_6_PRINT_ARG(mcaste->mac), rc); + vnic_mcast_dealloc(mcaste); + } + return rc; +} + +struct vnic_mcast *vnic_mcast_alloc(struct vnic_port *port, + unsigned long *req_attach, + unsigned long *cur_attached) +{ + struct vnic_mcast *mcaste; + + mcaste = kzalloc(sizeof *mcaste, GFP_ATOMIC); + if (!mcaste) + return ERR_PTR(-ENOMEM); + /* set mcaste fields */ + init_completion(&mcaste->attach_complete); + INIT_DELAYED_WORK(&mcaste->attach_task, vnic_mcast_attach_task); + spin_lock_init(&mcaste->lock); + mcaste->port = port; + mcaste->req_attach = req_attach; + mcaste->cur_attached = cur_attached; + + return mcaste; +} + +void vnic_mcast_dealloc(struct vnic_mcast *mcaste) +{ + struct vnic_port *port; + + ASSERT(mcaste); + port = mcaste->port; + vnic_dbg_mcast_vv(port->name, "dealloc vnic_mcast: MAC "MAC_6_PRINT_FMT + " GID "VNIC_GID_FMT"\n", + MAC_6_PRINT_ARG(mcaste->mac), + VNIC_GID_ARG(mcaste->gid)); + kfree(mcaste); +} + +/* + * This function grabs the mcast_tree->mcast_rb_lock +*/ +int vnic_mcast_add(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste) +{ + struct rb_node **n = &mcast_tree->mcast_tree.rb_node, *pn = NULL; + struct vnic_mcast *mcaste_t; + unsigned long flags; + int rc; + + spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags); + while (*n) { + pn = *n; + mcaste_t = rb_entry(pn, struct vnic_mcast, rb_node); + rc = memcmp(mcaste->gid.raw, mcaste_t->gid.raw, GID_LEN); + if (rc < 0) + n = &pn->rb_left; + else if (rc > 0) + n = &pn->rb_right; + else { + rc = -EEXIST; + goto out; + } + } + + rb_link_node(&mcaste->rb_node, pn, n); + rb_insert_color(&mcaste->rb_node, &mcast_tree->mcast_tree); + + rc = 0; + +out: + vnic_dbg_mcast_v(mcaste->port->name, + "added (rc %d) vnic_mcast: MAC 
"MAC_6_PRINT_FMT + " GID "VNIC_GID_FMT"\n", rc, + MAC_6_PRINT_ARG(mcaste->mac), + VNIC_GID_ARG(mcaste->gid)); + + spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags); + return rc; +} + +/* + * The caller must hold the mcast_tree->mcast_rb_lock lock before calling + */ +void vnic_mcast_del(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste) +{ + rb_erase(&mcaste->rb_node, &mcast_tree->mcast_tree); +} + +/* + * The caller must hold the mcast_tree->mcast_rb_lock lock before calling +*/ +struct vnic_mcast *vnic_mcast_search(struct mcast_root *mcast_tree, + union ib_gid *gid) +{ + struct rb_node *n = mcast_tree->mcast_tree.rb_node; + struct vnic_mcast *mcaste_t; + int rc; + + while (n) { + mcaste_t = rb_entry(n, struct vnic_mcast, rb_node); + rc = memcmp(gid->raw, mcaste_t->gid.raw, GID_LEN); + if (rc < 0) + n = n->rb_left; + else if (rc > 0) + n = n->rb_right; + else { + vnic_dbg_mcast_v(mcaste_t->port->name, + "found: MAC "MAC_6_PRINT_FMT" GID " + VNIC_GID_FMT"\n", + MAC_6_PRINT_ARG(mcaste_t->mac), + VNIC_GID_ARG(mcaste_t->gid)); + goto out; + } + } + mcaste_t = ERR_PTR(-ENODATA); + +out: + return mcaste_t; +} + +static void vnic_mcast_detach_ll(struct vnic_mcast *mcaste, struct mcast_root *mcast_tree) +{ + struct vnic_port *port = mcaste->port; + struct ib_ah *tmp_ih; + unsigned long flags; + int rc; + + vnic_dbg_mcast_v(port->name, + "mcaste->attached %d for mac "MAC_6_PRINT_FMT"\n", + test_bit(MCAST_ATTACHED, &mcaste->state), + MAC_6_PRINT_ARG(mcaste->mac)); + + spin_lock_irqsave(&mcaste->lock, flags); + if (!test_and_clear_bit(MCAST_ATTACHED, &mcaste->state)) { + spin_unlock_irqrestore(&mcaste->lock, flags); + return; + } + + tmp_ih = mcaste->ah; + mcaste->ah = NULL; + spin_unlock_irqrestore(&mcaste->lock, flags); + + /* callback */ + if (mcaste->detach_cb) { + vnic_dbg_mcast(port->name, "calling detach_cb\n"); + mcaste->detach_cb(mcaste, mcaste->detach_cb_ctx); + } + + if (!mcaste->sender_only) + rc = ib_detach_mcast(mcaste->qp, &mcaste->gid, port->attr.lid); + else + rc = 0; + + ASSERT(tmp_ih); + if (ib_destroy_ah(tmp_ih)) + vnic_warn(port->name, + "ib_destroy_ah failed (rc %d) for mcaste mac " + MAC_6_PRINT_FMT"\n", rc, + MAC_6_PRINT_ARG(mcaste->mac)); + vnic_dbg_mcast(port->name, "GID "VNIC_GID_FMT" detached!\n", + VNIC_GID_ARG(mcaste->gid)); +} + +int vnic_mcast_detach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste) +{ + struct vnic_port *port = mcaste->port; + unsigned long flags; + + /* must be a task, to make sure no attach task is pending */ + vnic_dbg_mcast_v(port->name, "queue delayed task (%lu) " + "vnic_mcast_detach_task\n", mcaste->backoff); + + /* cancel any pending/queued tasks. We can not use sync + * under the spinlock because it might hang. 
we need the + * spinlock here to ensure the requeueing is atomic + */ + vnic_dbg_mcast_v(port->name, "cancel attach_task\n"); + spin_lock_irqsave(&mcaste->lock, flags); + clear_bit(MCAST_ATTACH_RUNNING, &mcaste->state); + spin_unlock_irqrestore(&mcaste->lock, flags); +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&mcaste->attach_task); +#else + cancel_delayed_work(&mcaste->attach_task); + flush_workqueue(mcast_wq); +#endif + vnic_mcast_detach_ll(mcaste, mcast_tree); + + if (mcaste->port_mcaste) + vnic_port_mcast_release(mcaste->port_mcaste); + + return 0; +} + +static void vnic_mcast_attach_task(struct work_struct *work) +{ + struct ib_ah_attr av; + struct vnic_mcast *mcaste = + container_of(work, struct vnic_mcast, attach_task.work); + struct vnic_port *port = mcaste->port; + unsigned long flags; + int rc; + u16 mlid; + + if ((++mcaste->attach_task_cnt > mcaste->retry && mcaste->retry) || + !test_bit(MCAST_ATTACH_RUNNING, &mcaste->state)) { + vnic_dbg_mcast_v(port->name, + "attach_task stopped, tried %ld times\n", + mcaste->retry); + goto out; + } + + /* update backoff time */ + mcaste->backoff = min(mcaste->backoff * mcaste->backoff_factor, + msecs_to_jiffies(VNIC_MCAST_BACKOFF_MAX_MSEC)); + + if (!test_bit(MCAST_JOINED, &mcaste->port_mcaste->state)) { + vnic_dbg_mcast_v(port->name, "joined %d, retry %ld from %ld\n", + test_bit(MCAST_JOINED, &mcaste->port_mcaste->state), + mcaste->attach_task_cnt, mcaste->retry); + goto retry; + } + + /* attach QP */ + ASSERT(mcaste); + ASSERT(mcaste->port_mcaste); + ASSERT(mcaste->port_mcaste->sa_mcast); + mlid = be16_to_cpu(mcaste->port_mcaste->rec.mlid); + vnic_dbg_mcast(port->name, "QPN 0x%06x attaching MGID "VNIC_GID_FMT + " LID 0x%04x\n", mcaste->qp->qp_num, + VNIC_GID_ARG(mcaste->gid), mlid); + if (!mcaste->sender_only) + rc = ib_attach_mcast(mcaste->qp, &mcaste->gid, mlid); + else + rc = 0; + + if (rc) { + int attach_count = atomic_read(&mcaste->port_mcaste->ref_cnt); + + vnic_err(port->name, "failed to attach (rc %d) to multicast " + "group, MGID "VNIC_GID_FMT"\n", + rc, VNIC_GID_ARG(mcaste->gid)); + + if (port->dev->attr.max_mcast_qp_attach <= attach_count) { + vnic_err(port->name, "Attach failed. Too many vnics are on the same" + " vhub on this port. 
vnics count=%d, max=%d\n", + attach_count, + port->dev->attr.max_mcast_qp_attach); + } + + goto retry; + } else { + /* create mcast ah */ + memset(&av, 0, sizeof(av)); + av.dlid = be16_to_cpu(mcaste->port_mcaste->rec.mlid); + av.port_num = mcaste->port->num; + av.ah_flags = IB_AH_GRH; + av.static_rate = mcaste->port_mcaste->rec.rate; + av.sl = mcaste->port_mcaste->rec.sl; + memcpy(&av.grh.dgid, mcaste->gid.raw, GID_LEN); + spin_lock_irqsave(&mcaste->lock, flags); + mcaste->ah = ib_create_ah(port->pd, &av); + if (IS_ERR(mcaste->ah)) { + mcaste->ah = NULL; + vnic_err(port->name, + "vnic_ib_create_ah failed (rc %d)\n", + (int)PTR_ERR(mcaste->ah)); + spin_unlock_irqrestore(&mcaste->lock, flags); + /* for such a failure, no need to retry */ + goto out; + } + vnic_dbg_mcast(mcaste->port->name, "created mcast ah for %p\n", mcaste); + + /* callback */ + set_bit(MCAST_ATTACHED, &mcaste->state); + spin_unlock_irqrestore(&mcaste->lock, flags); + + if (mcaste->cur_attached) + set_bit(mcaste->attach_bit_nr, mcaste->cur_attached); + vnic_dbg_mcast(mcaste->port->name, + "attached GID "VNIC_GID_FMT"\n", + VNIC_GID_ARG(mcaste->gid)); + if (mcaste->attach_cb) { + vnic_dbg_mcast(mcaste->port->name, + "calling attach_cb\n"); + mcaste->attach_cb(mcaste, mcaste->attach_cb_ctx); + } + } + +out: + mcaste->attach_task_cnt = 0; /* for next time */ + mcaste->backoff = mcaste->backoff_init; + clear_bit(MCAST_ATTACH_RUNNING, &mcaste->state); + complete(&mcaste->attach_complete); + return; + +retry: + spin_lock_irqsave(&mcaste->lock, flags); + if (test_bit(MCAST_ATTACH_RUNNING, &mcaste->state)) { + /* calls vnic_mcast_attach_task() */ + queue_delayed_work(mcast_wq, &mcaste->attach_task, mcaste->backoff); + } + spin_unlock_irqrestore(&mcaste->lock, flags); +} + +int vnic_mcast_attach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste) +{ + struct vnic_port_mcast *pmcaste; + struct vnic_port *port = mcaste->port; + int rc = 0; + ASSERT(mcaste); + + mcaste->backoff_init = mcaste->backoff; + + pmcaste = vnic_port_mcast_update(mcaste); + if (IS_ERR(pmcaste)) { + vnic_err(port->name, "vnic_port_mcast_update failed GID " + VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid)); + rc = PTR_ERR(pmcaste); + goto out; + } + + mcaste->port_mcaste = pmcaste; + + set_bit(MCAST_ATTACH_RUNNING, &mcaste->state); + + /* must be a task, to sample the joined flag */ + vnic_dbg_mcast_v(port->name, "queue delayed task (%lu) " + "vnic_mcast_join_task\n", mcaste->backoff); + init_completion(&mcaste->attach_complete); + /* calls vnic_mcast_attach_task() */ + queue_delayed_work(mcast_wq, &mcaste->attach_task, 0); + if (mcaste->blocking) { + wait_for_completion(&mcaste->attach_complete); + if (test_bit(MCAST_ATTACHED, &mcaste->state)) + goto out; + vnic_mcast_detach(mcast_tree, mcaste); + rc = 1; + } + +out: + return rc; +} + +#if 0 +static int vnic_mcast_attach_all(struct mcast_root *mcast_tree) +{ + int fails = 0; + struct vnic_mcast *mcaste; + struct rb_node *n; + + n = rb_first(&mcast_tree->mcast_tree); + while (n) { + mcaste = rb_entry(n, struct vnic_mcast, rb_node); + n = rb_next(n); + /* async call */ + if (vnic_mcast_attach(mcast_tree, mcaste)) + fails++; + } + + return fails; +} +#endif + +int vnic_mcast_del_all(struct mcast_root *mcast_tree) +{ + struct rb_node *n; + struct vnic_mcast *mcaste, *mcaste_t; + unsigned long flags; + int fails = 0; + LIST_HEAD(local_list); + + spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags); + n = rb_first(&mcast_tree->mcast_tree); + while (n) { + mcaste = rb_entry(n, struct vnic_mcast, rb_node); + 
vnic_mcast_del(mcast_tree, mcaste); + list_add_tail(&mcaste->list, &local_list); + n = rb_first(&mcast_tree->mcast_tree); + } + spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags); + + list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) { + list_del(&mcaste->list); + vnic_mcast_detach(mcast_tree, mcaste); + vnic_mcast_dealloc(mcaste); + } + + return fails; +} + +int vnic_mcast_del_user(struct mcast_root *mcast_tree, void *owner) +{ + struct rb_node *n; + struct vnic_mcast *mcaste, *mcaste_t; + unsigned long flags; + int fails = 0; + LIST_HEAD(local_list); + + spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags); + n = rb_first(&mcast_tree->mcast_tree); + while (n) { + mcaste = rb_entry(n, struct vnic_mcast, rb_node); + n = rb_next(&mcaste->rb_node); + if (mcaste->priv_data == owner) { + list_add_tail(&mcaste->list, &local_list); + vnic_mcast_del(mcast_tree, mcaste); + } + } + spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags); + + list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) { + list_del(&mcaste->list); + vnic_mcast_detach(mcast_tree, mcaste); + vnic_mcast_dealloc(mcaste); + } + + return fails; +} + +/* PORT MCAST FUNCTIONS */ +static struct vnic_port_mcast *vnic_port_mcast_alloc(struct vnic_port *port, + union ib_gid *gid) +{ + struct vnic_port_mcast *mcaste; + + mcaste = kzalloc(sizeof *mcaste, GFP_ATOMIC); + if (!mcaste) + return ERR_PTR(-ENOMEM); + + mcaste->gid = *gid; + mcaste->port = port; + init_completion(&mcaste->leave_complete); + atomic_set(&mcaste->ref_cnt, 1); + INIT_DELAYED_WORK(&mcaste->join_task, vnic_port_mcast_join_task); + INIT_WORK(&mcaste->leave_task, vnic_port_mcast_leave_task); + mcaste->sa_mcast = ERR_PTR(-EINVAL); + memset(&mcaste->rec,0,sizeof(mcaste->rec)); + vnic_dbg_mcast_v(mcaste->port->name, "allocated port_mcast GID " + VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid)); + spin_lock_init(&mcaste->lock); + set_bit(MCAST_JOIN_RUNNING, &mcaste->state); + + return mcaste; +} + +static void vnic_port_mcast_dealloc(struct vnic_port_mcast *mcaste) +{ + ASSERT(mcaste); + vnic_dbg_mcast_v(NULL, "dealloc port_mcast GID " + VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid)); + kfree(mcaste); +} + +/* + * This function accesses the port mcast tree. Please make sure + * to call it only while holding the port mcast_rb_lock +*/ +static int vnic_port_mcast_add(struct vnic_port_mcast *mcaste) +{ + struct rb_node **n = &mcaste->port->mcast_tree.mcast_tree.rb_node; + struct rb_node *pn = NULL; + struct vnic_port_mcast *mcaste_t; + int rc; + + while (*n) { + pn = *n; + mcaste_t = rb_entry(pn, struct vnic_port_mcast, rb_node); + rc = memcmp(mcaste->gid.raw, mcaste_t->gid.raw, GID_LEN); + if (rc < 0) + n = &pn->rb_left; + else if (rc > 0) + n = &pn->rb_right; + else { + rc = -EEXIST; + goto out; + } + } + + rb_link_node(&mcaste->rb_node, pn, n); + rb_insert_color(&mcaste->rb_node, &mcaste->port->mcast_tree.mcast_tree); + rc = 0; + +out: + vnic_dbg_mcast_v(mcaste->port->name, "added (rc %d) port_mcast GID " + VNIC_GID_FMT"\n", rc, VNIC_GID_ARG(mcaste->gid)); + return rc; +} + +/* + * This function accesses the port mcast tree. Please make sure + * to call it only while holding the port mcast_rb_lock +*/ +static void vnic_port_mcast_del(struct vnic_port_mcast *mcaste) +{ + ASSERT(mcaste); + vnic_dbg_mcast_v(mcaste->port->name, "del port_mcast GID " + VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid)); + rb_erase(&mcaste->rb_node, &mcaste->port->mcast_tree.mcast_tree); +} + +/* + * This function accesses the port mcast tree. 
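[The port-level entries allocated above are reference counted: vnic_port_mcast_release() further below drops the count under the same lock that protects the lookup tree, so an entry whose count reaches zero is unlinked before a concurrent lookup can take a new reference. A hedged userspace sketch of that pattern, using C11 atomics and a pthread mutex in place of the kernel primitives:]

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	atomic_int ref_cnt;
	int linked;		/* stands in for rb-tree membership */
};

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

static void entry_put(struct entry *e)
{
	pthread_mutex_lock(&tree_lock);
	if (atomic_fetch_sub(&e->ref_cnt, 1) == 1) {
		e->linked = 0;	/* driver: vnic_port_mcast_del() */
		pthread_mutex_unlock(&tree_lock);
		free(e);	/* driver: deferred leave task + dealloc */
		return;
	}
	pthread_mutex_unlock(&tree_lock);
}

int main(void)
{
	struct entry *e = malloc(sizeof(*e));

	atomic_init(&e->ref_cnt, 2);
	e->linked = 1;
	entry_put(e);		/* drops to 1, entry stays linked */
	entry_put(e);		/* last reference: unlink and free */
	puts("done");
	return 0;
}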
Please make sure + * to call it only while holding the port mcast_rb_lock +*/ +struct vnic_port_mcast *vnic_port_mcast_search(struct vnic_port *port, + union ib_gid *gid) +{ + struct rb_node *n = port->mcast_tree.mcast_tree.rb_node; + struct vnic_port_mcast *mcaste_t; + int rc; + + while (n) { + mcaste_t = rb_entry(n, struct vnic_port_mcast, rb_node); + rc = memcmp(gid->raw, mcaste_t->gid.raw, GID_LEN); + if (rc < 0) + n = n->rb_left; + else if (rc > 0) + n = n->rb_right; + else { + vnic_dbg_mcast_v(mcaste_t->port->name, + "found: GID "VNIC_GID_FMT"\n", + VNIC_GID_ARG(mcaste_t->gid)); + goto out; + } + } + mcaste_t = ERR_PTR(-ENODATA); + +out: + return mcaste_t; +} +/* +static void vnic_port_mcast_leave_task(struct work_struct *work) +{ + struct vnic_port_mcast *mcaste = + container_of(work, struct vnic_port_mcast, leave_task.work); + + vnic_dbg_mcast_v(mcaste->port->name, "leave GID "VNIC_GID_FMT"\n", + VNIC_GID_ARG(mcaste->gid)); + + if (!IS_ERR(mcaste->sa_mcast) && test_bit(MCAST_JOINED, &mcaste->port_mcaste->state)) + vnic_dbg_mcast(mcaste->port->name, + "mcast left: GID "VNIC_GID_FMT"\n", + VNIC_GID_ARG(mcaste->gid)); + if (!IS_ERR(mcaste->sa_mcast)) + ib_sa_free_multicast(mcaste->sa_mcast); + mcaste->sa_mcast = ERR_PTR(-EINVAL); + clear_bit(MCAST_JOINED, &mcaste->port_mcaste->state); +} +*/ + +static int vnic_port_mcast_leave(struct vnic_port_mcast *mcaste, + unsigned long backoff) +{ + unsigned long flags; + + ASSERT(mcaste); + vnic_dbg_mcast(NULL, "queue delayed task (%lu) " + "vnic_mcast_leave_task\n", backoff); + + /* cancel any pending/queued tasks. We can not use sync + * under the spinlock because it might hang. we need the + * spinlock here to ensure the requeueing is atomic + */ + spin_lock_irqsave(&mcaste->lock, flags); + clear_bit(MCAST_JOIN_RUNNING, &mcaste->state); + spin_unlock_irqrestore(&mcaste->lock, flags); +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&mcaste->join_task); +#else + cancel_delayed_work(&mcaste->join_task); + if (delayed_work_pending(&mcaste->join_task)) { + return -EBUSY; + } +#endif + + if (test_and_clear_bit(MCAST_JOIN_STARTED, &mcaste->state) + && !IS_ERR(mcaste->sa_mcast)) { + ib_sa_free_multicast(mcaste->sa_mcast); + mcaste->sa_mcast = ERR_PTR(-EINVAL); + } + + return 0; +} + +static int vnic_port_mcast_join_comp(int status, struct ib_sa_multicast *sa_mcast) +{ + struct vnic_port_mcast *mcaste = sa_mcast->context; + unsigned long flags; + + vnic_dbg_mcast(mcaste->port->name, "join completion for GID " + VNIC_GID_FMT" (status %d)\n", + VNIC_GID_ARG(mcaste->gid), status); + + if (status == -ENETRESET) + return 0; + + if (status) + goto retry; + + /* same as mcaste->rec = mcaste->sa_mcast->rec; */ + mcaste->rec = sa_mcast->rec; + + set_bit(MCAST_JOINED, &mcaste->state); + vnic_dbg_mcast(mcaste->port->name, "joined GID "VNIC_GID_FMT"\n", + VNIC_GID_ARG(mcaste->gid)); +#if 0 + vnic_dbg_mcast_v(mcaste->port->name, "mcast record dump:\n"); + vnic_dbg_mcast_v(mcaste->port->name, "mgid "VNIC_GID_FMT"\n", + VNIC_GID_ARG(rec->mgid)); + vnic_dbg_mcast_v(mcaste->port->name, "port_gid "VNIC_GID_FMT"\n", + VNIC_GID_ARG(rec->port_gid)); + vnic_dbg_mcast_v(mcaste->port->name, "pkey 0x%x\n", rec->pkey); + vnic_dbg_mcast_v(mcaste->port->name, "qkey 0x%x\n", rec->qkey); + vnic_dbg_mcast_v(mcaste->port->name, "mtu_slct 0x%x\n", + rec->mtu_selector); + vnic_dbg_mcast_v(mcaste->port->name, "mtu 0x%x\n", rec->mtu); + vnic_dbg_mcast_v(mcaste->port->name, "rate_slct 0x%x\n", + rec->rate_selector); + vnic_dbg_mcast_v(mcaste->port->name, "rate 0x%x\n", rec->rate); + 
vnic_dbg_mcast_v(mcaste->port->name, "sl 0x%x\n", rec->sl); + vnic_dbg_mcast_v(mcaste->port->name, "flow_label 0x%x\n", + rec->flow_label); + vnic_dbg_mcast_v(mcaste->port->name, "hop_limit 0x%x\n", + rec->hop_limit); +#endif + + goto out; +retry: + /* calls vnic_port_mcast_join_task() */ + spin_lock_irqsave(&mcaste->lock, flags); + if (test_bit(MCAST_JOIN_RUNNING, &mcaste->state)) + queue_delayed_work(mcast_wq, &mcaste->join_task, mcaste->backoff); + spin_unlock_irqrestore(&mcaste->lock, flags); + +out: + /* rc is always zero so we handle ib_sa_free_multicast ourselves */ + return 0; +} + +static void vnic_port_mcast_join_task(struct work_struct *work) +{ + struct vnic_port_mcast *mcaste = + container_of(work, struct vnic_port_mcast, join_task.work); + struct ib_sa_mcmember_rec rec = { + .join_state = mcaste->join_state + }; + int rc; + ib_sa_comp_mask comp_mask; + unsigned long flags; + + if (++mcaste->join_task_cnt > mcaste->retry && mcaste->retry) { + vnic_dbg_mcast(mcaste->port->name, + "join_task stopped, tried %ld times\n", + mcaste->retry); + goto out; + } + + /* update backoff time */ + mcaste->backoff = min(mcaste->backoff * mcaste->backoff_factor, + msecs_to_jiffies(VNIC_MCAST_BACKOFF_MAX_MSEC)); + + rec.mgid.global = mcaste->gid.global; + rec.port_gid.global = mcaste->port->gid.global; + rec.pkey = cpu_to_be16(mcaste->pkey); + + comp_mask = + IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | + /*IB_SA_MCMEMBER_REC_PKEY | */ + IB_SA_MCMEMBER_REC_JOIN_STATE; + + if (mcaste->create) { + comp_mask |= + IB_SA_MCMEMBER_REC_QKEY | + IB_SA_MCMEMBER_REC_MTU_SELECTOR | + IB_SA_MCMEMBER_REC_MTU | + IB_SA_MCMEMBER_REC_TRAFFIC_CLASS | + IB_SA_MCMEMBER_REC_RATE_SELECTOR | + IB_SA_MCMEMBER_REC_RATE | + IB_SA_MCMEMBER_REC_SL | + IB_SA_MCMEMBER_REC_FLOW_LABEL | + IB_SA_MCMEMBER_REC_HOP_LIMIT | + IB_SA_MCMEMBER_REC_PKEY; + + rec.qkey = cpu_to_be32(mcaste->qkey); + rec.mtu_selector = IB_SA_EQ; + rec.rate_selector = IB_SA_EQ; + /* when no_bxm is set, use min values to let everybody in */ + rec.mtu = no_bxm ? IB_MTU_2048 : mcaste->port->attr.max_mtu; + rec.rate = no_bxm ? 
IB_RATE_10_GBPS : mcaste->port->rate_enum; + rec.sl = 0; + rec.flow_label = 0; + rec.hop_limit = 0; + } + + vnic_dbg_mcast(mcaste->port->name, "joining MGID "VNIC_GID_FMT + " create %d, comp_mask %lu\n", + VNIC_GID_ARG(mcaste->gid), mcaste->create, (unsigned long)comp_mask); + + if (!IS_ERR(mcaste->sa_mcast)) + ib_sa_free_multicast(mcaste->sa_mcast); + + mcaste->sa_mcast = + ib_sa_join_multicast(&vnic_sa_client, mcaste->port->dev->ca, + mcaste->port->num, &rec, comp_mask, + GFP_KERNEL, vnic_port_mcast_join_comp, mcaste); + set_bit(MCAST_JOIN_STARTED, &mcaste->state); + + if (IS_ERR(mcaste->sa_mcast)) { + rc = PTR_ERR(mcaste->sa_mcast); + vnic_warn(mcaste->port->name, + "ib_sa_join_multicast failed, status %d\n", rc); + /* calls vnic_port_mcast_join_task() */ + spin_lock_irqsave(&mcaste->lock, flags); + if (test_bit(MCAST_JOIN_RUNNING, &mcaste->state)) + queue_delayed_work(mcast_wq, &mcaste->join_task, mcaste->backoff); + spin_unlock_irqrestore(&mcaste->lock, flags); + } + + return; + +out: + mcaste->join_task_cnt = 0; /* for next time */ + mcaste->backoff = mcaste->backoff_init; + return; +} + +static int vnic_port_mcast_join(struct vnic_port_mcast *mcaste) +{ + unsigned long flags; + + ASSERT(mcaste); + vnic_dbg_mcast_v(mcaste->port->name, "queue delayed task (%lu) " + "vnic_port_mcast_join_task\n", mcaste->backoff); + + /* calls vnic_port_mcast_join_task() */ + spin_lock_irqsave(&mcaste->lock, flags); + if (test_bit(MCAST_JOIN_RUNNING, &mcaste->state)) + queue_delayed_work(mcast_wq, &mcaste->join_task, 0); + spin_unlock_irqrestore(&mcaste->lock, flags); + + return 0; +} + +#if 0 +static int vnic_port_mcast_join_all(struct vnic_port *port) +{ + int fails = 0; + struct vnic_port_mcast *mcaste; + struct rb_node *n; + + n = rb_first(&port->mcast_tree.mcast_tree); + while (n) { + mcaste = rb_entry(n, struct vnic_port_mcast, rb_node); + n = rb_next(n); + if (vnic_port_mcast_join(mcaste)) + fails++; + } + + return fails; +} +#endif + +static void vnic_port_mcast_leave_task(struct work_struct *work) +{ + struct vnic_port_mcast *mcaste = + container_of(work, struct vnic_port_mcast, leave_task); + +#ifndef _BP_WORK_SYNC + vnic_port_mcast_leave(mcaste, 0); +#else + if (vnic_port_mcast_leave(mcaste, 0)) { + queue_work(mcast_wq, &mcaste->leave_task); + return; + } +#endif + vnic_port_mcast_dealloc(mcaste); +} + +static void vnic_port_mcast_release(struct vnic_port_mcast *mcaste) +{ + unsigned long flags; + + struct vnic_port *port = mcaste->port; + + vnic_dbg_mcast(port->name, "update mcaste->ref_cnt %d -> %d\n", + atomic_read(&mcaste->ref_cnt), + atomic_read(&mcaste->ref_cnt) - 1); + + spin_lock_irqsave(&port->mcast_tree.mcast_rb_lock, flags); + if (atomic_dec_and_test(&mcaste->ref_cnt)) { + vnic_port_mcast_del(mcaste); + spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags); + + /* we are not going to wait for the leave to terminate. + * We will just go on. 
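+ * The leave path blocks (cancel_delayed_work_sync(), then possibly
+ * ib_sa_free_multicast()), so it is deferred to mcast_wq rather than
+ * run under the mcast_rb_lock that was just dropped.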
+ * calls vnic_port_mcast_leave_task() + */ + queue_work(mcast_wq, &mcaste->leave_task); + } else + spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags); +} + +static struct vnic_port_mcast *vnic_port_mcast_update(struct vnic_mcast *_mcaste) +{ + union ib_gid *gid = &_mcaste->port_gid; + u32 qkey = _mcaste->qkey; + u16 pkey = _mcaste->pkey; + struct vnic_port *port = _mcaste->port; + struct vnic_port_mcast *mcaste; + unsigned long flags; + + spin_lock_irqsave(&port->mcast_tree.mcast_rb_lock, flags); + mcaste = vnic_port_mcast_search(port, gid); + /* entry found */ + if (PTR_ERR(mcaste) != -ENODATA) { + ASSERT(!IS_ERR(mcaste)); + atomic_inc(&mcaste->ref_cnt); + spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags); + vnic_dbg_mcast(mcaste->port->name, + "found, add GID "VNIC_GID_FMT" \n", + VNIC_GID_ARG(*gid)); + vnic_dbg_mcast(mcaste->port->name, + "update mcaste->ref_cnt %d -> %d\n", + atomic_read(&mcaste->ref_cnt), + atomic_read(&mcaste->ref_cnt) + 1); + } else { /* not found, add it */ + mcaste = vnic_port_mcast_alloc(port, gid); + if (IS_ERR(mcaste)) { + spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags); + return mcaste; + } + vnic_dbg_mcast(mcaste->port->name, + "not found, add GID "VNIC_GID_FMT" \n", + VNIC_GID_ARG(*gid)); + vnic_dbg_mcast(mcaste->port->name, + "update mcaste->ref_cnt %d -> %d\n", + atomic_read(&mcaste->ref_cnt), + atomic_read(&mcaste->ref_cnt) + 1); + mcaste->qkey = qkey; + mcaste->pkey = pkey; + mcaste->backoff_init = _mcaste->backoff_init; + mcaste->backoff = _mcaste->backoff; + mcaste->backoff_factor = _mcaste->backoff_factor; + mcaste->retry = _mcaste->retry; + mcaste->create = _mcaste->create; + mcaste->join_state = _mcaste->join_state; + vnic_port_mcast_add(mcaste); + spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags); + + vnic_port_mcast_join(mcaste); + vnic_dbg_mcast(mcaste->port->name, "added\n"); + } + + return mcaste; +} + +#if 0 +void vnic_port_mcast_del_all(struct vnic_port *port) +{ + + struct rb_node *n; + struct vnic_port_mcast *mcaste, *mcaste_t; + LIST_HEAD(local_list); + + ASSERT(port); + + n = rb_first(&port->mcast_tree.mcast_tree); + while (n) { + mcaste = rb_entry(n, struct vnic_port_mcast, rb_node); + list_add_tail(&mcaste->list, &local_list); + n = rb_next(&mcaste->rb_node); + } + + list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) { + list_del(&mcaste->list); + vnic_warn(port->name, "shouldn't find gid "VNIC_GID_FMT"\n", + VNIC_GID_ARG(mcaste->gid)); + vnic_port_mcast_release(mcaste); + } + + return; +} +#endif + +void vnic_tree_mcast_detach(struct mcast_root *mcast_tree) +{ + struct vnic_mcast *mcaste, *mcaste_t; + struct rb_node *n; + unsigned long flags; + INIT_LIST_HEAD(&mcast_tree->reattach_list); + + spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags); + n = rb_first(&mcast_tree->mcast_tree); + while (n) { + mcaste = rb_entry(n, struct vnic_mcast, rb_node); + list_add_tail(&mcaste->list, &mcast_tree->reattach_list); + n = rb_next(&mcaste->rb_node); + vnic_mcast_del(mcast_tree, mcaste); + mcaste->attach_task_cnt = 0; + } + spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags); + + list_for_each_entry_safe(mcaste, mcaste_t, &mcast_tree->reattach_list, list) { + vnic_mcast_detach(mcast_tree, mcaste); + } + + return; +} + +void vnic_tree_mcast_attach(struct mcast_root *mcast_tree) +{ + struct vnic_mcast *mcaste, *mcaste_t; + int rc; + + /* The add function grabs the mcast_rb_lock no need to take it */ + list_for_each_entry_safe(mcaste, mcaste_t, &mcast_tree->reattach_list, list) 
{ + rc = vnic_mcast_add(mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(mcast_tree, mcaste); + ASSERT(!rc); + list_del(&mcaste->list); + } + + return; +} + +int vnic_mcast_init(void) +{ + ib_sa_register_client(&vnic_sa_client); + + mcast_wq = create_singlethread_workqueue("mcast_wq"); + if (!mcast_wq) { + /* don't leave the SA client registered on failure */ + ib_sa_unregister_client(&vnic_sa_client); + return -ENOMEM; + } + + return 0; +} + +void vnic_mcast_cleanup(void) +{ + ASSERT(mcast_wq); + vnic_dbg_mark(); + flush_workqueue(mcast_wq); + vnic_dbg_mark(); + destroy_workqueue(mcast_wq); + vnic_dbg_mark(); + ib_sa_unregister_client(&vnic_sa_client); + + return; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_mcast.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_mcast.h new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_param.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_param.c new file mode 100644 index 0000000000000..56751aa752740 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_param.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "vnic.h" +#include "vnic_fip.h" + +u32 vnic_lro_num = VNIC_MAX_LRO_DESCS; +u32 vnic_net_admin = 1; +u32 vnic_child_max = VNIC_CHILD_MAX; +u32 vnic_tx_rings_num = 0; +u32 vnic_rx_rings_num = 0; +u32 vnic_tx_rings_len = VNIC_TX_QUEUE_LEN; +u32 vnic_rx_rings_len = VNIC_RX_QUEUE_LEN; +u32 vnic_mgid_data_type = 0; +u32 vnic_encap_headroom = 1; +u32 vnic_tx_polling = 1; +u32 vnic_rx_linear = 0; +u32 vnic_change_mac = 0; +u32 vnic_learn_mac_enabled = 1; +u32 vnic_synd_backlog = 4; +u32 vnic_eport_state_enforce = 0; +u32 vnic_src_mac_enforce = 0; +u32 vnic_inline_tshold = 0; +u32 vnic_discovery_pkeys[MAX_NUM_PKEYS_DISCOVERY]; +u32 vnic_discovery_pkeys_count = MAX_NUM_PKEYS_DISCOVERY; +u32 vnic_sa_query = 0; + +/* these params are exposed as module parameters only in debug mode (CONFIG_MLX4_VNIC_DEBUG) */ +u32 no_bxm = 0; +u32 vnic_msglvl = 0x80000000; +u32 vnic_max_tx_outs = VNIC_MAX_TX_OUTS; +u32 vnic_linear_small_pkt = 1; +u32 vnic_mcast_create = 0; +u32 vnic_napi_weight = VNIC_MAX_RX_CQE; + +module_param_named(tx_rings_num, vnic_tx_rings_num, int, 0444); +MODULE_PARM_DESC(tx_rings_num, "Number of TX rings, use 0 for #cpus [default 0, max 32]"); + +module_param_named(tx_rings_len, vnic_tx_rings_len, int, 0444); +MODULE_PARM_DESC(tx_rings_len, "Length of TX rings, must be power of two [default 1024, max 8K]"); + +module_param_named(rx_rings_num, vnic_rx_rings_num, int, 0444); +MODULE_PARM_DESC(rx_rings_num, "Number of RX rings, use 0 for #cpus [default 0, max 32]"); + +module_param_named(rx_rings_len, vnic_rx_rings_len, int, 0444); +MODULE_PARM_DESC(rx_rings_len, "Length of RX rings, must be power of two [default 2048, max 8K]"); + +module_param_named(eport_state_enforce, vnic_eport_state_enforce, int, 0644); +MODULE_PARM_DESC(eport_state_enforce, "Bring interface up only when corresponding EPort is up [default 0]"); + +module_param_named(src_mac_enforce, vnic_src_mac_enforce, int, 0644); +MODULE_PARM_DESC(src_mac_enforce, "Enforce source MAC address [default 0]"); + +module_param_named(vnic_net_admin, vnic_net_admin, int, 0644); +MODULE_PARM_DESC(vnic_net_admin, "Enable Network Administration mode [default 1]"); + +module_param_named(vnic_child_max, vnic_child_max, int, 0644); +MODULE_PARM_DESC(vnic_child_max, "Max child vNics (per interface), use 0 to disable [default 128]"); + +module_param_named(mgid_data_type, vnic_mgid_data_type, int, 0444); +MODULE_PARM_DESC(mgid_data_type, "Set MGID data type for multicast traffic [default 0, max 1]"); + +module_param_named(encap_headroom, vnic_encap_headroom, int, 0444); +MODULE_PARM_DESC(encap_headroom, "Use SKB headroom for protocol encapsulation [default 1]"); + +module_param_named(inline_tshold, vnic_inline_tshold, int, 0444); +MODULE_PARM_DESC(inline_tshold, "Packets smaller than this threshold (in bytes) use inline & blue flame [default 0, max 512]"); + +module_param_named(tx_polling, vnic_tx_polling, int, 0444); +MODULE_PARM_DESC(tx_polling, "Enable TX polling mode [default 1]"); + +module_param_named(rx_linear, vnic_rx_linear, int, 0444); +MODULE_PARM_DESC(rx_linear, "Enable linear RX buffers [default 0]"); + +module_param_named(change_mac, vnic_change_mac, int, 0444); +MODULE_PARM_DESC(change_mac, "Enable MAC change using child vNics [default 0]"); + +module_param_named(learn_tx_mac, vnic_learn_mac_enabled, int, 0644); +MODULE_PARM_DESC(learn_tx_mac, "Enable TX MAC learning in promisc mode [default 1]"); + +module_param_named(synd_backlog, vnic_synd_backlog, int, 0644); +MODULE_PARM_DESC(synd_backlog, "Syndrome error reporting backlog limit [default 4]"); + 
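+/*
+ * Usage sketch (illustrative, not part of the driver): the parameters
+ * above are read once at module load time, e.g.
+ *
+ *   modprobe mlx4_vnic tx_rings_num=4 rx_rings_len=4096 learn_tx_mac=0
+ *
+ * The values shown are hypothetical. Parameters registered with mode
+ * 0644 stay writable via /sys/module/mlx4_vnic/parameters/, while the
+ * 0444 ones can only be set at load time; vnic_param_check() below
+ * then clamps each value into its legal range.
+ */
+
+#if 0
+/* Hypothetical helper equivalent to the max_t/min_t clamp pattern
+ * used by vnic_param_check(); kept out of the build on purpose. */
+static inline u32 vnic_clamp_u32(u32 val, u32 lo, u32 hi)
+{
+	return min_t(u32, max_t(u32, val, lo), hi);
+}
+#endif
+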
+module_param_array_named(discovery_pkeys, vnic_discovery_pkeys, int, &vnic_discovery_pkeys_count, 0444); +MODULE_PARM_DESC(discovery_pkeys, "Vector of PKeys to be used for discovery [default 0xffff, max vector length 24]"); + +module_param_named(sa_query, vnic_sa_query, int, 0644); +MODULE_PARM_DESC(sa_query, "Query SA for each IB address and ignore gateway assigned SLs [default 0]"); + + +#if !(defined(NETIF_F_GRO) && !defined(_BP_NO_GRO)) +module_param_named(lro_num, vnic_lro_num, int, 0444); +MODULE_PARM_DESC(lro_num, "Number of LRO sessions per ring, use 0 to disable [default 32, max 32]"); +#endif + +#ifdef CONFIG_MLX4_VNIC_DEBUG +module_param_named(no_bxm, no_bxm, int, 0444); +MODULE_PARM_DESC(no_bxm, "Enable NO BXM mode [default 0]"); + +module_param_named(msglvl, vnic_msglvl, uint, 0644); +MODULE_PARM_DESC(msglvl, "Debug message level [default 0]"); + +module_param_named(max_tx_outs, vnic_max_tx_outs, int, 0644); +MODULE_PARM_DESC(max_tx_outs, "Max outstanding TX packets [default 16]"); + +module_param_named(linear_small_pkt, vnic_linear_small_pkt, int, 0644); +MODULE_PARM_DESC(linear_small_pkt, "Use linear buffer for small packets [default 1]"); + +module_param_named(mcast_create, vnic_mcast_create, int, 0444); +MODULE_PARM_DESC(mcast_create, "Create multicast group during join request [default 0]"); + +module_param_named(napi_weight, vnic_napi_weight, int, 0444); +MODULE_PARM_DESC(napi_weight, "NAPI weight [default 32]"); +#endif /* CONFIG_MLX4_VNIC_DEBUG */ + +int vnic_param_check(void) { +#ifdef CONFIG_MLX4_VNIC_DEBUG + vnic_info("VNIC_DEBUG flag is set\n"); +#endif + + vnic_mcast_create = vnic_mcast_create ? 1 : 0; + vnic_mcast_create = no_bxm ? 1 : vnic_mcast_create; + no_bxm = no_bxm ? 1 : 0; + vnic_sa_query = vnic_sa_query ? 1 : 0; + + vnic_mgid_data_type = max_t(u32, vnic_mgid_data_type, 0); + vnic_mgid_data_type = min_t(u32, vnic_mgid_data_type, 1); + + vnic_rx_rings_num = max_t(u32, vnic_rx_rings_num, 0); + vnic_rx_rings_num = min_t(u32, vnic_rx_rings_num, VNIC_MAX_NUM_CPUS); + + vnic_tx_rings_num = max_t(u32, vnic_tx_rings_num, 0); + vnic_tx_rings_num = min_t(u32, vnic_tx_rings_num, VNIC_MAX_NUM_CPUS); + + vnic_tx_rings_len = rounddown_pow_of_two(vnic_tx_rings_len); + vnic_tx_rings_len = max_t(u32, vnic_tx_rings_len, VNIC_TX_QUEUE_LEN_MIN); + vnic_tx_rings_len = min_t(u32, vnic_tx_rings_len, VNIC_TX_QUEUE_LEN_MAX); + + vnic_rx_rings_len = rounddown_pow_of_two(vnic_rx_rings_len); + vnic_rx_rings_len = max_t(u32, vnic_rx_rings_len, VNIC_RX_QUEUE_LEN_MIN); + vnic_rx_rings_len = min_t(u32, vnic_rx_rings_len, VNIC_RX_QUEUE_LEN_MAX); + + vnic_max_tx_outs = min_t(u32, vnic_tx_rings_len, vnic_max_tx_outs); + + vnic_napi_weight = min_t(u32, vnic_napi_weight, VNIC_MAX_NUM_CPUS); + + vnic_lro_num = max_t(u32, vnic_lro_num, 0); + vnic_lro_num = min_t(u32, vnic_lro_num, VNIC_MAX_LRO_DESCS); + + vnic_inline_tshold = max_t(u32, vnic_inline_tshold, 0); + vnic_inline_tshold = min_t(u32, vnic_inline_tshold, VNIC_MAX_INLINE_TSHOLD); + + return 0; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_port.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_port.c new file mode 100644 index 0000000000000..a973deb7aecdf --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_port.c @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_data.h" + +/* globals */ +struct workqueue_struct *port_wq; +struct workqueue_struct *login_wq; + +/* functions */ +static void vnic_port_event(struct ib_event_handler *handler, + struct ib_event *record) +{ + struct vnic_port *port = + container_of(handler, struct vnic_port, event_handler); + + if (record->element.port_num != port->num) + return; + + vnic_info("Received event 0x%x (device %s port %d)\n", + record->event, record->device->name, + record->element.port_num); + + switch (record->event) { + case IB_EVENT_SM_CHANGE: + case IB_EVENT_CLIENT_REREGISTER: + /* calls vnic_port_event_task_light() */ + queue_delayed_work(fip_wq, &port->event_task_light, msecs_to_jiffies(VNIC_SM_HEADSTART)); + break; + case IB_EVENT_PORT_ERR: + case IB_EVENT_PORT_ACTIVE: + /* calls vnic_port_event_task() */ + queue_delayed_work(fip_wq, &port->event_task, msecs_to_jiffies(VNIC_SM_HEADSTART)); + break; + case IB_EVENT_PKEY_CHANGE: + case IB_EVENT_LID_CHANGE: + /* calls port_fip_discover_restart() */ + if (no_bxm) + queue_delayed_work(fip_wq, &port->event_task, 0); + else + queue_delayed_work(port_wq, &port->discover_restart_task, msecs_to_jiffies(VNIC_SM_HEADSTART)); + break; + case IB_EVENT_SRQ_ERR: + case IB_EVENT_SRQ_LIMIT_REACHED: + case IB_EVENT_QP_LAST_WQE_REACHED: + case IB_EVENT_DEVICE_FATAL: + default: + vnic_warn(port->name, "event 0x%x unhandled\n", record->event); + break; + } + +} + +static inline u8 vnic_mcast_rate_enum(struct vnic_port *port, int rate) +{ + u8 ret; + + switch (rate) { + case 10: + ret = IB_RATE_10_GBPS; + break; + case 20: + ret = IB_RATE_20_GBPS; + break; + case 40: + ret = IB_RATE_40_GBPS; + break; + case 80: + ret = IB_RATE_80_GBPS; + break; + default: + ret = IB_RATE_10_GBPS; + } + return ret; +} + +int vnic_port_query(struct vnic_port *port) +{ + if (ib_query_gid(port->dev->ca, port->num, 0, &port->gid)) { + vnic_err(port->name, "ib_query_gid failed\n"); + return -EINVAL; + } + + if (ib_query_port(port->dev->ca, port->num, &port->attr)) { + vnic_err(port->name, "ib_query_port failed\n"); + return -EINVAL; + } + + port->max_mtu_enum = ib_mtu_enum_to_int(port->attr.max_mtu); + port->rate = ((int)port->attr.active_speed * + ib_width_enum_to_int(port->attr.active_width) * 25) / 10; + port->rate_enum = 
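+ /* map the computed Gb/s figure onto an ib_rate enum (falls back to 10 Gb/s) */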
vnic_mcast_rate_enum(port, port->rate); + + if (ib_query_pkey(port->dev->ca, port->num, port->pkey_index, + &port->pkey)) { + vnic_err(port->name, "ib_query_pkey failed for index %d\n", + port->pkey_index); + return -EINVAL; + } + port->pkey |= 0x8000; + + return 0; +} + +void vnic_port_event_task(struct work_struct *work) +{ + struct vnic_port *port = + container_of(work, struct vnic_port, event_task.work); + struct fip_discover *discover; + + /* refresh port attr, TODO: check what else need to be refreshed */ + vnic_dbg_mark(); + mutex_lock(&port->mlock); + if (vnic_port_query(port)) + vnic_warn(port->name, "vnic_port_query failed\n"); + mutex_unlock(&port->mlock); + + /* refresh login mcasts */ + vnic_login_refresh_mcasts(port); + + list_for_each_entry(discover, &port->fip.discover_list, discover_list) { + /* refresh FIP mcasts */ + if ((!no_bxm) && (discover->state != FIP_DISCOVER_OFF)) + fip_refresh_mcasts(discover); + } +} + +void vnic_port_event_task_light(struct work_struct *work) +{ + struct vnic_port *port = + container_of(work, struct vnic_port, event_task_light.work); + unsigned long flags,mc_flags; + struct fip_discover *discover; + struct rb_node *node; + struct vnic_port_mcast *mcaste; + struct mcast_root *mcast_tree = &port->mcast_tree; + struct vnic_login *login; + vnic_dbg_mark(); + mutex_lock(&port->mlock); + + if (vnic_port_query(port)) + vnic_warn(port->name, "vnic_port_query failed\n"); + + spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags); + for (node = rb_first(&mcast_tree->mcast_tree); node; node = rb_next(node)){ + mcaste = rb_entry(node, struct vnic_port_mcast , rb_node); + clear_bit(MCAST_JOINED, &mcaste->state); + set_bit(MCAST_JOIN_RUNNING, &mcaste->state); + vnic_dbg_mcast(mcaste->port->name,"Rejoin GID="VNIC_GID_FMT"\n",VNIC_GID_ARG(mcaste->gid)); + spin_lock_irqsave(&mcaste->lock, mc_flags); + queue_delayed_work(mcast_wq, &mcaste->join_task, 0); + spin_unlock_irqrestore(&mcaste->lock, mc_flags); + } + + spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags); + + vnic_dbg_mark(); + if (vnic_sa_query) + list_for_each_entry(login, &port->login_list, list) + { + /* take the tx lock to make sure no delete function is called at the time */ + netif_tx_lock_bh(login->dev); + vnic_neigh_invalidate(login); + netif_tx_unlock_bh(login->dev); + } + + mutex_unlock(&port->mlock); + + list_for_each_entry(discover, &port->fip.discover_list, discover_list) { + if ((!no_bxm) && (discover->state != FIP_DISCOVER_OFF)) + fip_refresh_mcasts(discover); + } +} + +struct vnic_port *vnic_port_alloc(struct vnic_ib_dev *vnic_dev, u8 num) +{ + struct vnic_port *port; + int def_rings_num; + int max_num_cpus; + + port = kzalloc(sizeof *port, GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + /* pre-init fields */ + port->num = num; + port->dev = vnic_dev; + + max_num_cpus = min((int)num_online_cpus(), VNIC_MAX_NUM_CPUS); + def_rings_num = min(vnic_dev->ca->num_comp_vectors, max_num_cpus); + port->rx_rings_num = vnic_rx_rings_num ? vnic_rx_rings_num : def_rings_num; + port->tx_rings_num = vnic_tx_rings_num ? 
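+ /* a nonzero module param wins; 0 means one ring per CPU, capped by the HCA's completion vectors */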
vnic_tx_rings_num : def_rings_num; + + sprintf(port->name, "%s:%d", port->dev->ca->name, port->num); + INIT_LIST_HEAD(&port->login_list); + INIT_LIST_HEAD(&port->fip.discover_list); + INIT_DELAYED_WORK(&port->event_task, vnic_port_event_task); + INIT_DELAYED_WORK(&port->event_task_light, vnic_port_event_task_light); + INIT_DELAYED_WORK(&port->discover_restart_task, port_fip_discover_restart); + INIT_IB_EVENT_HANDLER(&port->event_handler, vnic_dev->ca, + vnic_port_event); + mutex_init(&port->mlock); + mutex_init(&port->start_stop_lock); + vnic_mcast_root_init(&port->mcast_tree); + atomic_set(&port->vnic_child_ids, 0); + + port->pkey_index = 0; /* used by fip qps, TBD */ + + if (ib_register_event_handler(&port->event_handler)) { + vnic_err(port->name, "ib_register_event_handler failed\n"); + goto err; + } + + vnic_dbg_mark(); + mutex_lock(&port->mlock); + if (vnic_port_query(port)) { + vnic_err(port->name, "vnic_port_query failed\n"); + mutex_unlock(&port->mlock); + if (ib_unregister_event_handler(&port->event_handler)) + vnic_err(port->name, "ib_unregister_event_handler failed!\n"); + goto err; + } + mutex_unlock(&port->mlock); + + return port; +err: + kfree(port); + return ERR_PTR(-EINVAL); +} + +int vnic_port_init(struct vnic_port *port) +{ + return vnic_port_ib_init(port); +} + +void vnic_port_cleanup(struct vnic_port *port) +{ + /* should be empty list */ + vnic_port_ib_cleanup(port); + return; +} + +static void vnic_ib_dev_add_one(struct ib_device *device); +static void vnic_ib_dev_remove_one(struct ib_device *device); +static struct ib_client vnic_init_client = { + .name = DRV_NAME, + .add = vnic_ib_dev_add_one, + .remove = vnic_ib_dev_remove_one, +}; + +static void vnic_ib_dev_add_one(struct ib_device *device) +{ + struct vnic_port *ib_port; + struct vnic_ib_dev *ib_dev; + int s, e, p, rc; + + vnic_dbg(NULL, "ib_dev %s\n", device->name); + + if (memcmp(device->name, "mlx4", 4)) + return; + + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + + s = 1; + e = device->phys_port_cnt; + + /* alloc ib device */ + ib_dev = kzalloc(sizeof *ib_dev, GFP_KERNEL); + if (!ib_dev) + return; + + /* init ib dev */ + mutex_init(&ib_dev->mlock); + ib_dev->ca = device; + mutex_lock(&ib_dev->mlock); + /* TODO: remove mdev once all mlx4 caps are standard */ + ib_dev->mdev = to_mdev(device); + ASSERT(ib_dev->ca); + sprintf(ib_dev->name, "%s", device->name); + if (ib_query_device(device, &ib_dev->attr)) { + vnic_err(ib_dev->name, "ib_query_device failed on %s\n", + device->name); + goto abort; + } + + VNIC_FW_STR(ib_dev->attr.fw_ver, ib_dev->fw_ver_str); + INIT_LIST_HEAD(&ib_dev->port_list); + vnic_dbg_mark(); + for (p = s; p <= e; ++p) { + /* skip non IB link layers */ + if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) + continue; + + /* alloc IB port */ + ib_port = vnic_port_alloc(ib_dev, p); + if (IS_ERR(ib_port)) { + vnic_err(ib_dev->name, + "vnic_port_alloc failed %d from %d\n", p, e); + continue; + } + /* init IB port */ + rc = vnic_port_init(ib_port); + if (rc) { + vnic_err(ib_port->name, + "vnic_port_init failed, rc %d\n", rc); + if (ib_unregister_event_handler(&ib_port->event_handler)) + vnic_err(ib_port->name, + "ib_unregister_event_handler failed!\n"); + kfree(ib_port); + continue; + } + if (no_bxm) { + rc = vnic_port_data_init(ib_port); + if (rc) + vnic_err(ib_port->name, + "vnic_port_data_init failed, rc %d\n", rc); + } else { + rc = vnic_port_fip_init(ib_port); + if (rc) + vnic_err(ib_port->name, + "vnic_port_fip_init failed, rc %d\n", rc); + 
else { + rc = port_fs_init(ib_port); + if (rc) + vnic_warn(ib_port->name, "port_fs_init sysfs:" + "entry creation failed, %d\n", rc); + } + } + if (rc) { + if (ib_unregister_event_handler(&ib_port->event_handler)) + vnic_err(ib_port->name, + "ib_unregister_event_handler failed!\n"); + vnic_port_cleanup(ib_port); + kfree(ib_port); + continue; + + } + vnic_dbg_mark(); + mutex_lock(&ib_port->start_stop_lock); + list_add_tail(&ib_port->list, &ib_dev->port_list); + mutex_unlock(&ib_port->start_stop_lock); + } + + /* set device ctx */ + ib_set_client_data(device, &vnic_init_client, ib_dev); + mutex_unlock(&ib_dev->mlock); + return; + +abort: + mutex_unlock(&ib_dev->mlock); + kfree(ib_dev); +} + +static void vnic_ib_dev_remove_one(struct ib_device *device) +{ + struct vnic_port *port, *port_t; + struct vnic_ib_dev *ib_dev = + ib_get_client_data(device, &vnic_init_client); + + vnic_dbg(NULL, "ib_dev %s\n", device->name); + + if (!ib_dev) + return; + + vnic_dbg_mark(); + mutex_lock(&ib_dev->mlock); + list_for_each_entry_safe(port, port_t, &ib_dev->port_list, list) { + vnic_dbg(port->name, "port %d\n", port->num); + if (ib_unregister_event_handler(&port->event_handler)) + vnic_err(port->name, "ib_unregister_event_handler failed!\n"); + /* make sure we don't have any more pending events */ +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&port->event_task_light); + cancel_delayed_work_sync(&port->event_task); + cancel_delayed_work_sync(&port->discover_restart_task); +#else + cancel_delayed_work(&port->event_task_light); + cancel_delayed_work(&port->event_task); + cancel_delayed_work(&port->discover_restart_task); + flush_workqueue(port_wq); + flush_workqueue(fip_wq); +#endif + /* remove sysfs entries related to FIP + * we want to do this outside the lock + */ + port_fs_exit(port); + + /* cleanup any pending vnics */ + vnic_dbg_mark(); + mutex_lock(&port->start_stop_lock); + list_del(&port->list); + if (no_bxm) + vnic_port_data_cleanup(port); + else { + vnic_port_fip_cleanup(port, 0); + } + mutex_unlock(&port->start_stop_lock); + vnic_port_cleanup(port); + kfree(port); + } + mutex_unlock(&ib_dev->mlock); + + kfree(ib_dev); +} + +int vnic_ports_init(void) +{ + int rc; + + /* create global wq */ + port_wq = create_singlethread_workqueue("port_wq"); + if (!port_wq) { + vnic_err(NULL, "create_singlethread_workqueue failed for %s\n", + "port_wq"); + return -EINVAL; + } + + login_wq = create_singlethread_workqueue("login_wq"); + if (!login_wq) { + vnic_err(NULL, "create_singlethread_workqueue failed for %s\n", + "login_wq"); + goto free_wq0; + } + + fip_wq = create_singlethread_workqueue("fip"); + if (!fip_wq) { + vnic_err(NULL, "create_singlethread_workqueue failed for %s\n", + "fip"); + goto free_wq1; + } + + /* calls vnic_ib_dev_add_one() */ + rc = ib_register_client(&vnic_init_client); + if (rc) { + vnic_err(NULL, "ib_register_client failed %d\n", rc); + goto free_wq2; + } + + return 0; + +free_wq2: + destroy_workqueue(fip_wq); +free_wq1: + destroy_workqueue(login_wq); +free_wq0: + destroy_workqueue(port_wq); + + return -EINVAL; +} + +void vnic_ports_cleanup(void) +{ + vnic_dbg(NULL, "calling ib_unregister_client\n"); + /* calls vnic_ib_dev_remove_one() */ + ib_unregister_client(&vnic_init_client); + vnic_dbg(NULL, "calling destroy_workqueue\n"); + destroy_workqueue(fip_wq); + destroy_workqueue(login_wq); + destroy_workqueue(port_wq); + vnic_dbg(NULL, "vnic_data_cleanup done\n"); +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_qp.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_qp.c new 
file mode 100644 index 0000000000000..c8fb317a0cd43 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_qp.c @@ -0,0 +1,1636 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include +#include +#include + +#include +#include + +#include "vnic.h" + +/* compare with drivers/infiniband/hw/mlx4/qp.c */ +#define mlx4_ib_dbg(format, arg...) vnic_dbg(NULL, format, ## arg) + +enum { + MLX4_IB_ACK_REQ_FREQ = 8, +}; + +enum { + MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83, + MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, + MLX4_IB_LINK_TYPE_IB = 0, + MLX4_IB_LINK_TYPE_ETH = 1, +}; + +enum { + /* + * Largest possible UD header: send with GRH and immediate data. 
+ * 4 bytes added to accommodate for eth header instead of lrh + */ + MLX4_IB_UD_HEADER_SIZE = 76, + MLX4_IB_MAX_RAW_ETY_HDR_SIZE = 12 +}; + +enum { + MLX4_IBOE_ETHERTYPE = 0x8915 +}; + +struct mlx4_ib_sqp { + struct mlx4_ib_qp qp; + int pkey_index; + u32 qkey; + u32 send_psn; + struct ib_ud_header ud_header; + u8 header_buf[MLX4_IB_UD_HEADER_SIZE]; +}; + +enum { + MLX4_IB_MIN_SQ_STRIDE = 6 +}; + +static const __be32 mlx4_ib_opcode[] = { + [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND), + [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO), + [IB_WR_SEND_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_SEND_IMM), + [IB_WR_RDMA_WRITE] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE), + [IB_WR_RDMA_WRITE_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM), + [IB_WR_RDMA_READ] = cpu_to_be32(MLX4_OPCODE_RDMA_READ), + [IB_WR_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS), + [IB_WR_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA), + [IB_WR_SEND_WITH_INV] = cpu_to_be32(MLX4_OPCODE_SEND_INVAL), + [IB_WR_LOCAL_INV] = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL), + [IB_WR_FAST_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR), + [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS), + [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), +}; + +#ifndef wc_wmb + #if defined(__i386__) + #define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") + #elif defined(__x86_64__) + #define wc_wmb() asm volatile("sfence" ::: "memory") + #elif defined(__ia64__) + #define wc_wmb() asm volatile("fwb" ::: "memory") + #else + #define wc_wmb() wmb() + #endif +#endif + +#if 0 +static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) +{ + return container_of(mqp, struct mlx4_ib_sqp, qp); +} +#endif + +static void *get_wqe(struct mlx4_ib_qp *qp, int offset) +{ + return mlx4_buf_offset(&qp->buf, offset); +} + +static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); +} + +static void *get_send_wqe(struct mlx4_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift)); +} + +/* + * Stamp a SQ WQE so that it is invalid if prefetched by marking the + * first four bytes of every 64 byte chunk with + * 0x7FFFFFF | (invalid_ownership_value << 31). + * + * When the max work request size is less than or equal to the WQE + * basic block size, as an optimization, we can stamp all WQEs with + * 0xffffffff, and skip the very first chunk of each WQE. + */ +static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size) +{ + __be32 *wqe; + int i; + int s; + int ind; + void *buf; + __be32 stamp; + struct mlx4_wqe_ctrl_seg *ctrl; + + if (qp->sq_max_wqes_per_wr > 1) { + s = roundup(size, 1U << qp->sq.wqe_shift); + for (i = 0; i < s; i += 64) { + ind = (i >> qp->sq.wqe_shift) + n; + stamp = ind & qp->sq.wqe_cnt ? 
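+ /* bit 31 of the stamp carries the wrong owner bit for this generation, so a prefetched WQE reads as invalid */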
cpu_to_be32(0x7fffffff) : + cpu_to_be32(0xffffffff); + buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1)); + *wqe = stamp; + } + } else { + ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); + s = (ctrl->fence_size & 0x3f) << 4; + for (i = 64; i < s; i += 64) { + wqe = buf + i; + *wqe = cpu_to_be32(0xffffffff); + } + } +} + +static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size) +{ + struct mlx4_wqe_ctrl_seg *ctrl; + struct mlx4_wqe_inline_seg *inl; + void *wqe; + int s; + + ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); + s = sizeof(struct mlx4_wqe_ctrl_seg); + + if (qp->ibqp.qp_type == IB_QPT_UD) { + struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl; + struct mlx4_av *av = (struct mlx4_av *)dgram->av; + memset(dgram, 0, sizeof *dgram); + av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn); + s += sizeof(struct mlx4_wqe_datagram_seg); + } + + /* Pad the remainder of the WQE with an inline data segment. */ + if (size > s) { + inl = wqe + s; + inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl)); + } + ctrl->srcrb_flags = 0; + ctrl->fence_size = size / 16; + /* + * Make sure descriptor is fully written before setting ownership bit + * (because HW can start executing as soon as we do). + */ + wmb(); + + ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) | + (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); + + stamp_send_wqe(qp, n + qp->sq_spare_wqes, size); +} + +/* Post NOP WQE to prevent wrap-around in the middle of WR */ +static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind) +{ + unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1)); + if (unlikely(s < qp->sq_max_wqes_per_wr)) { + post_nop_wqe(qp, ind, s << qp->sq.wqe_shift); + ind += s; + } + return ind; +} + +static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) +{ + struct ib_event event; + struct mlx4_ib_qp *mqp = to_mibqp(qp); + struct ib_qp *ibqp = &mqp->ibqp; + + if (type == MLX4_EVENT_TYPE_PATH_MIG) + to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + + if (ibqp->event_handler) { + event.device = ibqp->device; + switch (type) { + case MLX4_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX4_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX4_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX4_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX4_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + printk(KERN_WARNING "mlx4_ib: Unexpected event type %d " + "on QP %06x\n", type, qp->qpn); + return; + } + + event.element.qp = ibqp; + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static int send_wqe_overhead(enum ib_qp_type type, u32 flags) +{ + /* + * UD WQEs must have a datagram segment. + * RC and UC WQEs might have a remote address segment. + * MLX WQEs need two extra inline data segments (for the UD + * header and space for the ICRC). + */ + switch (type) { + case IB_QPT_UD: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg) + + ((flags & MLX4_IB_QP_LSO) ? 
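+ /* extra room for the inline LSO header segment */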
128 : 0); + case IB_QPT_UC: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_raddr_seg); + case IB_QPT_XRC_TGT: + case IB_QPT_RC: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_atomic_seg) + + sizeof (struct mlx4_wqe_raddr_seg); + case IB_QPT_SMI: + case IB_QPT_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + ALIGN(MLX4_IB_UD_HEADER_SIZE + + DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE, + MLX4_INLINE_ALIGN) * + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)) + + ALIGN(4 + + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)); + case IB_QPT_RAW_ETHERTYPE: + return sizeof(struct mlx4_wqe_ctrl_seg) + + ALIGN(MLX4_IB_MAX_RAW_ETY_HDR_SIZE + + sizeof(struct mlx4_wqe_inline_seg), + sizeof(struct mlx4_wqe_data_seg)); + + default: + return sizeof (struct mlx4_wqe_ctrl_seg); + } +} + +static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, + int is_user, int has_rq, struct mlx4_ib_qp *qp) +{ + /* Sanity check RQ size before proceeding */ + if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE || + cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)) + return -EINVAL; + + if (!has_rq) { + if (cap->max_recv_wr) + return -EINVAL; + + qp->rq.wqe_cnt = qp->rq.max_gs = 0; + } else { + /* HW requires >= 1 RQ entry with >= 1 gather entry */ + if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) + return -EINVAL; + + qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr)); + qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge)); + qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg)); + } + + /* leave userspace return values as they were, so as not to break ABI */ + if (is_user) { + cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt; + cap->max_recv_sge = qp->rq.max_gs; + } else { + cap->max_recv_wr = qp->rq.max_post = + min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt); + cap->max_recv_sge = min(qp->rq.max_gs, + min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg)); + } + + return 0; +} + +static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, + enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp) +{ + int s; + + /* Sanity check SQ size before proceeding */ + if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) || + cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) || + cap->max_inline_data + send_wqe_overhead(type, qp->flags) + + sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) + return -EINVAL; + + /* + * For MLX transport we need 2 extra S/G entries: + * one for the header and one for the checksum at the end + */ + if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI || + type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) && + cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) + return -EINVAL; + + s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg), + cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) + + send_wqe_overhead(type, qp->flags); + + if (s > dev->dev->caps.max_sq_desc_sz) + return -EINVAL; + + /* + * Hermon supports shrinking WQEs, such that a single work + * request can include multiple units of 1 << wqe_shift. This + * way, work requests can differ in size, and do not have to + * be a power of 2 in size, saving memory and speeding up send + * WR posting. 
Unfortunately, if we do this then the + * wqe_index field in CQEs can't be used to look up the WR ID + * anymore, so we do this only if selective signaling is off. + * + * Further, on 32-bit platforms, we can't use vmap() to make + * the QP buffer virtually contiguous. Thus we have to use + * constant-sized WRs to make sure a WR is always fully within + * a single page-sized chunk. + * + * Finally, we use NOP work requests to pad the end of the + * work queue, to avoid wrap-around in the middle of WR. We + * set NEC bit to avoid getting completions with error for + * these NOP WRs, but since NEC is only supported starting + * with firmware 2.2.232, we use constant-sized WRs for older + * firmware. + * + * And, since MLX QPs only support SEND, we use constant-sized + * WRs in this case. + * + * We look for the smallest value of wqe_shift such that the + * resulting number of wqes does not exceed device + * capabilities. + * + * We set WQE size to at least 64 bytes, this way stamping + * invalidates each WQE. + */ + if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC && + qp->sq_signal_bits && BITS_PER_LONG == 64 && + type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI && + !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | + MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) + qp->sq.wqe_shift = ilog2(64); + else + qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s)); + + for (;;) { + qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift); + + /* + * We need to leave 2 KB + 1 WR of headroom in the SQ to + * allow HW to prefetch. + */ + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr; + qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr * + qp->sq_max_wqes_per_wr + + qp->sq_spare_wqes); + + if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes) + break; + + if (qp->sq_max_wqes_per_wr <= 1) + return -EINVAL; + + ++qp->sq.wqe_shift; + } + + qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz, + (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) - + send_wqe_overhead(type, qp->flags)) / + sizeof (struct mlx4_wqe_data_seg); + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + if (qp->rq.wqe_shift > qp->sq.wqe_shift) { + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + } else { + qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift; + qp->sq.offset = 0; + } + + cap->max_send_wr = qp->sq.max_post = + (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr; + cap->max_send_sge = min(qp->sq.max_gs, + min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg)); + qp->max_inline_data = cap->max_inline_data; + + return 0; +} + + + +static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: return MLX4_QP_STATE_RST; + case IB_QPS_INIT: return MLX4_QP_STATE_INIT; + case IB_QPS_RTR: return MLX4_QP_STATE_RTR; + case IB_QPS_RTS: return MLX4_QP_STATE_RTS; + case IB_QPS_SQD: return MLX4_QP_STATE_SQD; + case IB_QPS_SQE: return MLX4_QP_STATE_SQER; + case IB_QPS_ERR: return MLX4_QP_STATE_ERR; + default: return -1; + } +} + +static void del_gid_entries(struct mlx4_ib_qp *qp) +{ + struct mlx4_ib_gid_entry *ge, *tmp; + + list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { + list_del(&ge->list); + kfree(ge); + } +} + +static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, + struct ib_qp_init_attr *init_attr) +{ + if (qp->state != IB_QPS_RESET) + if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), + MLX4_QP_STATE_RST, NULL, 0, 
0, &qp->mqp)) + printk(KERN_WARNING "mlx4_ib: modify QP %06x to RESET failed.\n", + qp->mqp.qpn); + + mlx4_qp_remove(dev->dev, &qp->mqp); + mlx4_qp_free(dev->dev, &qp->mqp); + mlx4_mtt_cleanup(dev->dev, &qp->mtt); + mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + kfree(qp->sq.wrid); + kfree(qp->rq.wrid); + mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + if (qp->max_inline_data) + mlx4_bf_free(dev->dev, &qp->bf); + if (!init_attr->srq) + mlx4_db_free(dev->dev, &qp->db); + + del_gid_entries(qp); +} + +static int qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT) + return 0; + + return !attr->srq; +} + + +static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp) +{ + int qpn; + int err; + enum mlx4_ib_qp_type qp_type = + (enum mlx4_ib_qp_type) init_attr->qp_type; + qp->mlx4_ib_qp_type = qp_type; + qp->pri.vid = qp->alt.vid = 0xFFFF; + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + INIT_LIST_HEAD(&qp->gid_list); + INIT_LIST_HEAD(&qp->steering_rules); + + qp->state = IB_QPS_RESET; + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, + qp_has_rq(init_attr), qp); + if (err) + goto err; + + if (pd->uobject) { + } else { + qp->sq_no_prefetch = 0; + + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; + + if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) + qp->flags |= MLX4_IB_QP_LSO; + + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP && + dev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED && + !mlx4_is_mfunc(dev->dev)) + qp->flags |= MLX4_IB_QP_NETIF; + + err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp); + if (err) + goto err; + + if (qp_has_rq(init_attr)) { + err = mlx4_db_alloc(dev->dev, &qp->db, 0, GFP_KERNEL); + if (err) + goto err; + + *qp->db.db = 0; + } + + if (qp->max_inline_data) { + err = mlx4_bf_alloc(dev->dev, &qp->bf, 0); + if (err) { + mlx4_ib_dbg("failed to allocate blue flame register (%d)", err); + qp->bf.uar = &dev->priv_uar; + } + } else + qp->bf.uar = &dev->priv_uar; + + if (mlx4_buf_alloc(dev->dev, qp->buf_size, + PAGE_SIZE * 2, &qp->buf, GFP_KERNEL)) { + err = -ENOMEM; + goto err_db; + } + + err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift, + &qp->mtt); + if (err) { + mlx4_ib_dbg("kernel qp mlx4_mtt_init error (%d)", err); + goto err_buf; + } + + err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, GFP_KERNEL); + if (err) { + mlx4_ib_dbg("mlx4_buf_write_mtt error (%d)", err); + goto err_mtt; + } + + /* these are big chunks that may fail, added __GFP_NOWARN */ + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), + GFP_KERNEL | __GFP_NOWARN); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), + GFP_KERNEL | __GFP_NOWARN); + + if (!qp->sq.wrid || !qp->rq.wrid) { + printk(KERN_WARNING "%s:%d: not enough memory\n", + __func__, __LINE__); + err = -ENOMEM; + goto err_wrid; + } + } + + qpn = sqpn; + + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, GFP_KERNEL); + if (err) + goto err_qpn; + + if (init_attr->qp_type == IB_QPT_XRC_TGT) + qp->mqp.qpn |= (1 << 23); + + /* + * Hardware wants QPN written in big-endian order (after + * shifting) for send doorbell. Precompute this value to save + * a little bit when posting sends. 
+ */ + qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); + + qp->mqp.event = mlx4_ib_qp_event; + + return 0; + +err_qpn: +err_wrid: + if (pd->uobject) { + } else { + kfree(qp->sq.wrid); + kfree(qp->rq.wrid); + } + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &qp->mtt); + +err_buf: + if (pd->uobject) + ib_umem_release(qp->umem); + else + mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + +err_db: + if (!pd->uobject && !init_attr->srq + && init_attr->qp_type != IB_QPT_XRC_TGT) + mlx4_db_free(dev->dev, &qp->db); + + if (qp->max_inline_data) + mlx4_bf_free(dev->dev, &qp->bf); + +err: + return err; +} + +#if 0 +static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, + void *wqe, unsigned *mlx_seg_len) +{ + struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev; + struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; + struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); + u16 pkey; + int send_size; + int header_size; + int spc; + int i; + union ib_gid sgid; + int is_eth; + int is_grh; + int is_vlan = 0; + int err; + u16 vlan; + + send_size = 0; + for (i = 0; i < wr->num_sge; ++i) + send_size += wr->sg_list[i].length; + + is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; + is_grh = mlx4_ib_ah_grh_present(ah); + err = ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid); + if (err) + return err; + + if (is_eth) { + is_vlan = rdma_get_vlan_id(&sgid) < 0x1000; + vlan = rdma_get_vlan_id(&sgid); + } + + ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); + if (!is_eth) { + sqp->ud_header.lrh.service_level = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid; + sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); + } + + if (is_grh) { + sqp->ud_header.grh.traffic_class = + (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; + sqp->ud_header.grh.flow_label = + ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff); + sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit; + ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid); + memcpy(sqp->ud_header.grh.destination_gid.raw, + ah->av.ib.dgid, 16); + } + + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + if (!is_eth) { + mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) | + (sqp->ud_header.lrh.destination_lid == + IB_LID_PERMISSIVE ? 
MLX4_WQE_MLX_SLR : 0) | + (sqp->ud_header.lrh.service_level << 8)); + mlx->rlid = sqp->ud_header.lrh.destination_lid; + } + + switch (wr->opcode) { + case IB_WR_SEND: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + break; + case IB_WR_SEND_WITH_IMM: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + sqp->ud_header.immediate_present = 1; + sqp->ud_header.immediate_data = wr->ex.imm_data; + break; + default: + return -EINVAL; + } + + if (is_eth) { + u8 *smac; + + memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); + smac = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]->dev_addr; /* fixme: cache this value */ + memcpy(sqp->ud_header.eth.smac_h, smac, 6); + if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) + mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); + if (!is_vlan) + sqp->ud_header.eth.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE); + else { + u16 pcp; + + sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE); + pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 27 & 3) << 13; + sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp); + } + } else { + sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0; + if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) + sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; + } + sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); + if (!sqp->qp.ibqp.qp_num) + ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey); + else + ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey); + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ? + sqp->qkey : wr->wr.ud.remote_qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num); + + header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); + + if (0) { + printk(KERN_ERR "built UD header of size %d:\n", header_size); + for (i = 0; i < header_size / 4; ++i) { + if (i % 8 == 0) + printk(" [%02x] ", i * 4); + printk(" %08x", + be32_to_cpu(((__be32 *) sqp->header_buf)[i])); + if ((i + 1) % 8 == 0) + printk("\n"); + } + printk("\n"); + } + + /* + * Inline data segments may not cross a 64 byte boundary. If + * our UD header is bigger than the space available up to the + * next 64 byte boundary in the WQE, use two inline data + * segments to hold the UD header. + */ + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (header_size <= spc) { + inl->byte_count = cpu_to_be32(1 << 31 | header_size); + memcpy(inl + 1, sqp->header_buf, header_size); + i = 1; + } else { + inl->byte_count = cpu_to_be32(1 << 31 | spc); + memcpy(inl + 1, sqp->header_buf, spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); + /* + * Need a barrier here to make sure all the data is + * visible before the byte_count field is set. + * Otherwise the HCA prefetcher could grab the 64-byte + * chunk with this inline segment and get a valid (!= + * 0xffffffff) byte count but stale data, and end up + * generating a packet with bad headers. + * + * The first inline segment's byte_count field doesn't + * need a barrier, because it comes after a + * control/MLX segment and therefore is at an offset + * of 16 mod 64. 
+ */ + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); + return 0; +} +#endif + +static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq) +{ + unsigned cur; + struct mlx4_ib_cq *cq; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +#if 0 +static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey) +{ + iseg->flags = 0; + iseg->mem_key = cpu_to_be32(rkey); + iseg->guest_id = 0; + iseg->pa = 0; +} +#endif + +static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +#if 0 +static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ib_send_wr *wr) +{ + if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap); + aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add); + } else if (wr->opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); + aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add_mask); + } else { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); + aseg->compare = 0; + } + +} +#endif + +static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, + struct ib_send_wr *wr, __be16 *vlan) +{ + memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av)); + dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn); + dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey); + dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan; + memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6); + *vlan = dseg->vlan; +} + +#if 0 +static void set_mlx_icrc_seg(void *dseg) +{ + u32 *t = dseg; + struct mlx4_wqe_inline_seg *iseg = dseg; + + t[1] = 0; + + /* + * Need a barrier here before writing the byte_count field to + * make sure that all the data is visible before the + * byte_count field is set. Otherwise, if the segment begins + * a new cacheline, the HCA prefetcher could grab the 64-byte + * chunk and get a valid (!= * 0xffffffff) byte count but + * stale data, and end up sending the wrong data. + */ + wmb(); + + iseg->byte_count = cpu_to_be32((1 << 31) | 4); +} +#endif + +static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); + + /* + * Need a barrier here before writing the byte_count field to + * make sure that all the data is visible before the + * byte_count field is set. Otherwise, if the segment begins + * a new cacheline, the HCA prefetcher could grab the 64-byte + * chunk and get a valid (!= * 0xffffffff) byte count but + * stale data, and end up sending the wrong data. + */ + wmb(); + + dseg->byte_count = cpu_to_be32(sg->length); +} + +static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, + struct mlx4_ib_qp *qp, unsigned *lso_seg_len, + __be32 *lso_hdr_sz, int *blh) +{ + unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16); + + *blh = unlikely(halign > 64) ? 
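+ /* set when the 16-byte-aligned LSO header does not fit in 64 bytes */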
1 : 0; + + if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) && + wr->num_sge > qp->sq.max_gs - (halign >> 4))) + return -EINVAL; + + memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen); + + *lso_hdr_sz = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 | + wr->wr.ud.hlen); + *lso_seg_len = halign; + return 0; +} + +static __be32 send_ieth(struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + +static int lay_inline_data(struct mlx4_ib_qp *qp, struct ib_send_wr *wr, + void *wqe, int *sz) +{ + struct mlx4_wqe_inline_seg *seg; + void *addr; + int len, seg_len; + int num_seg; + int off, to_copy; + int i; + int inl = 0; + + seg = wqe; // current segment + wqe += sizeof *seg; // wqe pointer + off = ((unsigned long)wqe) & (unsigned long)(MLX4_INLINE_ALIGN - 1); + num_seg = 0; + seg_len = 0; + + for (i = 0; i < wr->num_sge; ++i) { + addr = (void *) (unsigned long)(wr->sg_list[i].addr); + len = wr->sg_list[i].length; + inl += len; + + if (inl > qp->max_inline_data) { + inl = 0; + return -1; + } + + while (len >= MLX4_INLINE_ALIGN - off) { + to_copy = MLX4_INLINE_ALIGN - off; + memcpy(wqe, addr, to_copy); + len -= to_copy; + wqe += to_copy; + addr += to_copy; + seg_len += to_copy; + wmb(); /* see comment below */ + seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len); + seg_len = 0; + seg = wqe; + wqe += sizeof *seg; + off = sizeof *seg; + ++num_seg; + } + + memcpy(wqe, addr, len); + wqe += len; + seg_len += len; + off += len; + } + + if (seg_len) { + ++num_seg; + /* + * Need a barrier here to make sure + * all the data is visible before the + * byte_count field is set. Otherwise + * the HCA prefetcher could grab the + * 64-byte chunk with this inline + * segment and get a valid (!= + * 0xffffffff) byte count but stale + * data, and end up sending the wrong + * data. + */ + wmb(); + seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len); + } + + *sz = (inl + num_seg * sizeof * seg + 15) / 16; + + return 0; +} + +/* + * Avoid using memcpy() to copy to BlueFlame page, since memcpy() + * implementations may use move-string-buffer assembler instructions, + * which do not guarantee order of copying. 
+ */ +static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt) +{ + __iowrite64_copy(dst, src, bytecnt / 8); +} + +int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_wqe_data_seg *scat; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int i; + + spin_lock_irqsave(&qp->rq.lock, flags); + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)", + ibqp->qp_num, wr->num_sge); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + + for (i = 0; i < wr->num_sge; ++i) + __set_data_seg(scat + i, wr->sg_list + i); + + if (i < qp->rq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} + +static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state) +{ + switch (mlx4_state) { + case MLX4_QP_STATE_RST: return IB_QPS_RESET; + case MLX4_QP_STATE_INIT: return IB_QPS_INIT; + case MLX4_QP_STATE_RTR: return IB_QPS_RTR; + case MLX4_QP_STATE_RTS: return IB_QPS_RTS; + case MLX4_QP_STATE_SQ_DRAINING: + case MLX4_QP_STATE_SQD: return IB_QPS_SQD; + case MLX4_QP_STATE_SQER: return IB_QPS_SQE; + case MLX4_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state) +{ + switch (mlx4_mig_state) { + case MLX4_QP_PM_ARMED: return IB_MIG_ARMED; + case MLX4_QP_PM_REARM: return IB_MIG_REARM; + case MLX4_QP_PM_MIGRATED: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static int to_ib_qp_access_flags(int mlx4_flags) +{ + int ib_flags = 0; + + if (mlx4_flags & MLX4_QP_BIT_RRE) + ib_flags |= IB_ACCESS_REMOTE_READ; + if (mlx4_flags & MLX4_QP_BIT_RWE) + ib_flags |= IB_ACCESS_REMOTE_WRITE; + if (mlx4_flags & MLX4_QP_BIT_RAE) + ib_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return ib_flags; +} + +static void to_ib_ah_attr(struct mlx4_ib_dev *ib_dev, struct ib_ah_attr *ib_ah_attr, + struct mlx4_qp_path *path) +{ + struct mlx4_dev *dev = ib_dev->dev; + int is_eth; + + memset(ib_ah_attr, 0, sizeof *ib_ah_attr); + ib_ah_attr->port_num = path->sched_queue & 0x40 ? 2 : 1; + + if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports) + return; + + is_eth = rdma_port_get_link_layer(&ib_dev->ib_dev, ib_ah_attr->port_num) == + IB_LINK_LAYER_ETHERNET; + if (is_eth) + ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) | + ((path->sched_queue & 4) << 1); + else + ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf; + + ib_ah_attr->dlid = be16_to_cpu(path->rlid); + + ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f; + ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0; + ib_ah_attr->ah_flags = (path->grh_mylmc & (1 << 7)) ? 
IB_AH_GRH : 0; + if (ib_ah_attr->ah_flags) { + ib_ah_attr->grh.sgid_index = path->mgid_index; + ib_ah_attr->grh.hop_limit = path->hop_limit; + ib_ah_attr->grh.traffic_class = + (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff; + ib_ah_attr->grh.flow_label = + be32_to_cpu(path->tclass_flowlabel) & 0xfffff; + memcpy(ib_ah_attr->grh.dgid.raw, + path->rgid, sizeof ib_ah_attr->grh.dgid.raw); + } +} + +int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx4_ib_dev *dev = to_mdev(ibqp->device); + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_qp_context context; + int mlx4_state; + int err = 0; + + mutex_lock(&qp->mutex); + + if (qp->state == IB_QPS_RESET) { + qp_attr->qp_state = IB_QPS_RESET; + goto done; + } + + err = mlx4_qp_query(dev->dev, &qp->mqp, &context); + if (err) { + err = -EINVAL; + goto out; + } + + mlx4_state = be32_to_cpu(context.flags) >> 28; + + qp->state = to_ib_qp_state(mlx4_state); + qp_attr->qp_state = qp->state; + qp_attr->path_mtu = context.mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context.qkey); + qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context.params2)); + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_XRC_TGT) { + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path); + qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f; + qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; + } + + qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f; + if (qp_attr->qp_state == IB_QPS_INIT) + qp_attr->port_num = qp->port; + else + qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context.pri_path.ackto >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7; + qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7; + qp_attr->alt_timeout = context.alt_path.ackto >> 3; + +done: + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = qp->sq.wqe_cnt; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + /* + * We don't support inline sends for kernel QPs (yet), and we + * don't know what userspace's value should be. 
+ */
+	qp_attr->cap.max_inline_data = 0;
+
+	qp_init_attr->cap = qp_attr->cap;
+
+	qp_init_attr->create_flags = 0;
+	if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+		qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+	if (qp->flags & MLX4_IB_QP_LSO)
+		qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
+out:
+	mutex_unlock(&qp->mutex);
+	return err;
+}
+
+
+int mlx4_ib_create_xrc_rcv_qp(struct ib_qp_init_attr *init_attr,
+			      u32 *qp_num)
+{
+	return -ENOSYS;
+}
+
+int mlx4_ib_modify_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num,
+			      struct ib_qp_attr *attr, int attr_mask)
+{
+	return -ENOSYS;
+}
+
+int mlx4_ib_query_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num,
+			     struct ib_qp_attr *qp_attr, int qp_attr_mask,
+			     struct ib_qp_init_attr *qp_init_attr)
+{
+	return -ENOSYS;
+}
+
+int mlx4_ib_reg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num)
+{
+	return -ENOSYS;
+}
+
+int mlx4_ib_unreg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num)
+{
+	return -ENOSYS;
+}
+
+/**** VNIC IB VERBS ****/
+int vnic_ib_post_send(struct ib_qp *ibqp,
+		      struct ib_send_wr *wr,
+		      struct ib_send_wr **bad_wr,
+		      u8 ip_off, u8 ip6_off,
+		      u8 tcp_off, u8 udp_off)
+{
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	void *wqe;
+	struct mlx4_wqe_ctrl_seg *ctrl;
+	struct mlx4_wqe_data_seg *dseg;
+	__be32 owner_opcode = 0;
+	int nreq;
+	int err = 0;
+	unsigned ind;
+	int uninitialized_var(stamp);
+	int uninitialized_var(size);
+	unsigned uninitialized_var(seglen);
+	__be32 dummy;
+	__be32 *lso_wqe;
+	__be32 uninitialized_var(lso_hdr_sz);
+	int i;
+	int blh = 0;
+	__be16 vlan = 0;
+	int inl = 0;
+
+	ind = qp->sq_next_wqe;
+
+	nreq = 0;
+	lso_wqe = &dummy;
+
+	if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+		mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num);
+		err = -ENOMEM;
+		*bad_wr = wr;
+		goto out;
+	}
+
+	if (unlikely(wr->num_sge > qp->sq.max_gs)) {
+		mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)",
+			    ibqp->qp_num, wr->num_sge);
+		err = -EINVAL;
+		*bad_wr = wr;
+		goto out;
+	}
+
+	ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+	*((u32 *) (&ctrl->vlan_tag)) = 0;
+	qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+
+	ctrl->srcrb_flags =
+		(wr->send_flags & IB_SEND_SIGNALED ?
+		 cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
+		(wr->send_flags & IB_SEND_SOLICITED ?
+		 cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) |
+		qp->sq_signal_bits;
+
+	ctrl->imm = send_ieth(wr);
+
+	wqe += sizeof *ctrl;
+	size = sizeof *ctrl / 16;
+
+	set_datagram_seg(wqe, wr, &vlan);
+	wqe += sizeof (struct mlx4_wqe_datagram_seg);
+	size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+
+	if (wr->opcode == IB_WR_LSO) {
+		err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh);
+		if (unlikely(err)) {
+			*bad_wr = wr;
+			goto out;
+		}
+		lso_wqe = (__be32 *) wqe;
+		wqe += seglen;
+		size += seglen / 16;
+	}
+	dseg = wqe;
+	dseg += wr->num_sge - 1;
+
+	if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) {
+		int sz;
+
+		err = lay_inline_data(qp, wr, wqe, &sz);
+		if (!err) {
+			inl = 1;
+			size += sz;
+		}
+	} else {
+		size += wr->num_sge * (sizeof(struct mlx4_wqe_data_seg) / 16);
+		for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
+			set_data_seg(dseg, wr->sg_list + i);
+	}
+
+	wmb();
+	*lso_wqe = lso_hdr_sz;
+
+	ctrl->fence_size = size;
+
+	/* set SWP bits based on ip/ip6/tcp/udp offsets */
+	if (wr->send_flags & IB_SEND_IP_CSUM) {
+		/* SWP bit */
+		owner_opcode |= cpu_to_be32(1 << 24);
+
+		/* IP offset starts from the beginning of IB packet
+		 * (and not ETH packet) in 2 bytes.
+		 * In control segment, we use c & d:
+		 * (a) tcp=0, ip=0 => calc TCP/UDP csum over IPv4
+		 * (b) tcp=0, ip=1 => calc IP csum only over IPv4
+		 * (c) tcp=1, ip=0 => calc TCP/UDP csum over IPv6
+		 * (d) tcp=1, ip=1 => calc TCP/UDP and IP csum over IPv4
+		 */
+		if (ip_off) {
+			ip_off += (IB_LRH_BYTES + IB_BTH_BYTES +
+				   IB_DETH_BYTES) >> 1;
+			ip_off += (to_mah(wr->wr.ud.ah)->av.ib.g_slid
+				   & 0x80) ? (IB_GRH_BYTES >> 1) : 0;
+			owner_opcode |= cpu_to_be32((ip_off) << 8);
+			ctrl->srcrb_flags |=
+				cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM);
+		} else if (ip6_off) {
+			ip6_off += (IB_LRH_BYTES + IB_BTH_BYTES +
+				    IB_DETH_BYTES) >> 1;
+			ip6_off += (to_mah(wr->wr.ud.ah)->av.ib.g_slid
+				    & 0x80) ? (IB_GRH_BYTES >> 1) : 0;
+			owner_opcode |= cpu_to_be32((ip6_off) << 8);
+		}
+
+		if (udp_off) { /* UDP offset and bit */
+			owner_opcode |= cpu_to_be32(udp_off << 16);
+			owner_opcode |= cpu_to_be32(1 << 25);
+			ctrl->srcrb_flags |=
+				cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM);
+		} else if (tcp_off) { /* TCP offset */
+			owner_opcode |= cpu_to_be32(tcp_off << 16);
+			ctrl->srcrb_flags |=
+				cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM);
+		}
+	}
+
+	/* set opcode, use 0x4e for BIG_LSO */
+	if (!blh)
+		owner_opcode |= mlx4_ib_opcode[wr->opcode];
+	else
+		owner_opcode |= cpu_to_be32(0x4e);
+
+	/* set ownership bit */
+	owner_opcode |= (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+
+	/* Make sure descriptor is fully written */
+	wmb();
+	ctrl->owner_opcode = owner_opcode;
+
+	stamp = ind + qp->sq_spare_wqes;
+	ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
+
+	/* simulate the for loop */
+	nreq++;
+
+out:
+	if (nreq == 1 && inl && size > 1 && size < qp->bf.buf_size / 16) {
+		ctrl->owner_opcode |= htonl((qp->sq_next_wqe & 0xffff) << 8);
+		*(u32 *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
+		/*
+		 * Make sure that descriptor is written to memory
+		 * before writing to BlueFlame page.
+		 */
+		wmb();
+
+		++qp->sq.head;
+
+		mlx4_bf_copy(qp->bf.reg + qp->bf.offset, (unsigned long *) ctrl,
+			     ALIGN(size * 16, 64));
+		wc_wmb();
+
+		qp->bf.offset ^= qp->bf.buf_size;
+
+	} else if (nreq) {
+		qp->sq.head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		writel(qp->doorbell_qpn, qp->bf.uar->map + MLX4_SEND_DOORBELL);
+
+		/*
+		 * Make sure doorbells don't leak out of SQ spinlock
+		 * and reach the HCA out of order.
+		 */
+		mmiowb();
+
+	}
+
+	stamp_send_wqe(qp, stamp, size * 16);
+
+	ind = pad_wraparound(qp, ind);
+	qp->sq_next_wqe = ind;
+	return err;
+}
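+
+#if 0
+/*
+ * Illustrative caller, compiled out: the ip/ip6/tcp/udp offsets taken
+ * by vnic_ib_post_send() are in 2-byte words, measured from the start
+ * of the UD payload; the LRH/BTH/DETH (and GRH, when present) words
+ * are added above.  The 4-byte EoIB header and the option-less IPv4
+ * header used here are assumptions for the example only.
+ */
+static int example_post_tcp4_csum(struct ib_qp *ibqp, struct ib_send_wr *wr,
+				  struct ib_send_wr **bad_wr)
+{
+	u8 ip_off  = (4 + ETH_HLEN) / 2;	/* EoIB + Ethernet headers */
+	u8 tcp_off = (4 + ETH_HLEN + 20) / 2;	/* + IPv4 header, no options */
+
+	wr->send_flags |= IB_SEND_IP_CSUM;
+	return vnic_ib_post_send(ibqp, wr, bad_wr, ip_off, 0, tcp_off, 0);
+}
+#endif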
+
+int __vnic_ib_create_qp_range(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
+			      struct ib_udata *udata, int nqps,
+			      int align, struct ib_qp *list[])
+{
+	struct mlx4_ib_dev *dev = to_mdev(pd->device);
+	struct mlx4_ib_qp *qp;
+	int err;
+	int base_qpn, qpn;
+	int i;
+
+	for (i = 0; i < nqps; ++i) {
+		if (init_attr[i].create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO |
+		    IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK))
+			return -EINVAL;
+		if (init_attr[i].create_flags & (IB_QP_CREATE_IPOIB_UD_LSO |
+		    IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) &&
+		    (pd->uobject || init_attr[i].qp_type != IB_QPT_UD))
+			return -EINVAL;
+
+		/* Userspace is not allowed to create special QPs: */
+		if (pd->uobject && (init_attr[i].qp_type == IB_QPT_SMI ||
+		    init_attr[i].qp_type == IB_QPT_GSI))
+			return -EINVAL;
+
+		if (nqps > 1 && (init_attr[i].qp_type == IB_QPT_SMI ||
+		    init_attr[i].qp_type == IB_QPT_GSI))
+			return -EINVAL;
+	}
+
+	err = mlx4_qp_reserve_range(dev->dev, nqps, align, &base_qpn, 0);
+	if (err)
+		return err;
+
+	for (i = 0, qpn = base_qpn; i < nqps; ++i, ++qpn) {
+		qp = kzalloc(sizeof *qp, GFP_KERNEL);
+		if (!qp) {
+			err = -ENOMEM;
+			goto exit_fail;
+		}
+
+		err = create_qp_common(dev, pd, init_attr + i, udata, qpn, qp);
+		if (err) {
+			kfree(qp);
+			goto exit_fail;
+		}
+		qp->xrcdn = 0;
+		qp->ibqp.qp_num = qp->mqp.qpn;
+		list[i] = &qp->ibqp;
+	}
+	return 0;
+
+exit_fail:
+	for (--i; i >= 0; --i) {
+		destroy_qp_common(dev, to_mqp(list[i]), init_attr + i);
+		kfree(to_mqp(list[i]));
+	}
+
+	mlx4_qp_release_range(dev->dev, base_qpn, nqps);
+	return err;
+}
+
+/* compare with ib_create_qp() in infiniband/core/verbs.c */
+int vnic_ib_create_qp_range(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
+			    struct ib_udata *udata, int nqps,
+			    int align, struct ib_qp *list[])
+{
+	struct ib_qp *qp;
+	struct ib_qp_init_attr *qp_init_attr;
+	int rc, i;
+
+	rc = __vnic_ib_create_qp_range(pd, init_attr, udata, nqps, align, list);
+
+	if (rc)
+		return rc;
+
+	for (i = 0; i < nqps; ++i) {
+		qp = list[i];
+		qp_init_attr = &init_attr[i];
+		qp->device = pd->device;
+		qp->real_qp = qp;
+		qp->pd = pd;
+		qp->send_cq = qp_init_attr->send_cq;
+		qp->recv_cq = qp_init_attr->recv_cq;
+		qp->srq = qp_init_attr->srq;
+		qp->uobject = NULL;
+		qp->event_handler = qp_init_attr->event_handler;
+		qp->qp_context = qp_init_attr->qp_context;
+		qp->qp_type = qp_init_attr->qp_type;
+		qp->xrcd = qp->qp_type == IB_QPT_XRC_TGT ?
+			qp_init_attr->xrcd : NULL;
+		atomic_inc(&pd->usecnt);
+		atomic_inc(&qp_init_attr->send_cq->usecnt);
+		atomic_inc(&qp_init_attr->recv_cq->usecnt);
+		if (qp_init_attr->srq)
+			atomic_inc(&qp_init_attr->srq->usecnt);
+		if (qp->qp_type == IB_QPT_XRC_TGT)
+			atomic_inc(&qp->xrcd->usecnt);
+	}
+	return 0;
+}
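+
+#if 0
+/*
+ * Illustrative caller, compiled out: reserve a block of nqps UD QPs
+ * with consecutive QPNs, aligned to nqps.  On any failure
+ * __vnic_ib_create_qp_range() has already destroyed the QPs it
+ * created and released the reserved QPN range, so the caller only
+ * frees its attribute array.  Ring sizes are arbitrary example values.
+ */
+static int example_create_qp_block(struct ib_pd *pd, struct ib_cq *scq,
+				   struct ib_cq *rcq, int nqps,
+				   struct ib_qp *qps[])
+{
+	struct ib_qp_init_attr *attr;
+	int i, rc;
+
+	attr = kcalloc(nqps, sizeof(*attr), GFP_KERNEL);
+	if (!attr)
+		return -ENOMEM;
+
+	for (i = 0; i < nqps; ++i) {
+		attr[i].qp_type		 = IB_QPT_UD;
+		attr[i].send_cq		 = scq;
+		attr[i].recv_cq		 = rcq;
+		attr[i].cap.max_send_wr	 = 1024;
+		attr[i].cap.max_recv_wr	 = 1024;
+		attr[i].cap.max_send_sge = 1;
+		attr[i].cap.max_recv_sge = 1;
+	}
+
+	rc = vnic_ib_create_qp_range(pd, attr, NULL, nqps, nqps, qps);
+	kfree(attr);
+	return rc;
+}
+#endif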
+
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_utils.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_utils.h
new file mode 100644
index 0000000000000..56ee8cff18e12
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_utils.h
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _VNIC_UTILS_H
+#define _VNIC_UTILS_H
+
+/*#define CONFIG_MLX4_VNIC_DEBUG */ /* comment out in RELEASE and PERFORMANCE modes */
+/* #define VNIC_PROFILLNG */ /* comment out in RELEASE and PERFORMANCE modes */
+#define VNIC_EXTRA_STATS /* comment out in PERFORMANCE mode */
+
+enum {
+	VNIC_DEBUG_GENERAL	= 1 << 0,	/* 0x1 */
+	VNIC_DEBUG_MCAST	= 1 << 1,	/* 0x2 */
+	VNIC_DEBUG_MCAST_V	= 1 << 2,	/* 0x4 */
+	VNIC_DEBUG_DATA		= 1 << 3,	/* 0x8 */
+	VNIC_DEBUG_DATA_V	= 1 << 4,	/* 0x10 */
+	VNIC_DEBUG_FIP		= 1 << 5,	/* 0x20 */
+	VNIC_DEBUG_FIP_V	= 1 << 6,	/* 0x40 */
+	VNIC_DEBUG_SKB		= 1 << 7,	/* 0x80 */
+	VNIC_DEBUG_SKB_V	= 1 << 8,	/* 0x100 */
+	VNIC_DEBUG_VHUB		= 1 << 9,	/* 0x200 */
+	VNIC_DEBUG_VHUB_V	= 1 << 10,	/* 0x400 */
+	VNIC_DEBUG_ETHTOOL	= 1 << 11,	/* 0x800 */
+	VNIC_DEBUG_ETHTOOL_V	= 1 << 12,	/* 0x1000 */
+	VNIC_DEBUG_FUNC		= 1 << 13,	/* 0x2000 */
+	VNIC_DEBUG_MARK		= 1 << 14,	/* 0x4000 */
+	VNIC_DEBUG_MODER	= 1 << 15,	/* 0x8000 */
+	VNIC_DEBUG_MODER_V	= 1 << 16,	/* 0x10000 */
+	VNIC_DEBUG_PKT_DUMP	= 1 << 17,	/* 0x20000 */
+	VNIC_DEBUG_FIP_P0	= 1 << 18,	/* 0x40000 */
+	VNIC_DEBUG_SYSFS	= 1 << 19,	/* 0x80000 */
+	VNIC_DEBUG_MAC		= 1 << 20,	/* 0x100000 */
+	VNIC_DEBUG_TSTAMP	= 1 << 21,	/* 0x200000 */
+	VNIC_DEBUG_PARSER	= 1 << 22,	/* 0x400000 */
+	VNIC_DEBUG_LAG		= 1 << 23,	/* 0x800000 */
+	VNIC_DEBUG_LAG_V	= 1 << 24,	/* 0x1000000 */
+	VNIC_DEBUG_MCAST_VV	= 1 << 25,	/* 0x2000000 */
+	VNIC_DEBUG_DEBUG	= 1 << 31,	/* 0x80000000 */
+};
+
+/* always defined */
+#define vnic_printk(level, prefix, format, arg...) \
+	do { printk(level "T%.4ld [%s] %s:%s:%d: " format, \
+		    jiffies * 1000 / HZ, \
+		    DRV_NAME, prefix ? prefix : "", __func__, __LINE__, \
+		    ## arg); \
+} while(0)
+
+#define vnic_info(format, arg...) \
+do { printk(KERN_INFO "[%s] " format, DRV_NAME, ## arg); } \
+while (0)
+
+#define vnic_warn(prefix, format, arg...) \
+do { vnic_printk(KERN_WARNING, prefix, format, ## arg); } \
+while (0)
+
+#define vnic_err(prefix, format, arg...) \
+do { vnic_printk(KERN_ERR, prefix, format, ## arg); } \
+while (0)
+
+#define _sprintf(p, buf, format, arg...) \
+	(PAGE_SIZE - (int)(p - buf)) <= 0 ?
0 : \ + scnprintf(p, PAGE_SIZE - (int)(p - buf), format, ## arg) + +/* debug functions */ +#ifndef CONFIG_MLX4_VNIC_DEBUG +#define ASSERT(x) do { (void)(x); } while (0) +#define vnic_dbg_mark(void) do { } while (0) +#define vnic_dbg_func(prefix) do { } while (0) +#define vnic_dbg(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_mcast(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_mcast_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_mcast_vv(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_debug(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_ethtool(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_ethtool_v(prefix, format, arg...) \ + do { (void)(prefix); } while (0) +#define vnic_dbg_data(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_data_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_fip(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_parse(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_lag(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_lag_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_fip_p0(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_sysfs(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_mac(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_fip_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_vhub(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_vhub_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_moder(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_moder_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_printk_skb(prefix, skb, o1, o2) do { (void)(prefix); } while (0) +#define vnic_dbg_skb(prefix, skb, o1, o2) do { (void)(prefix); } while (0) +#else +#define ASSERT(x) \ +do { if (x) break; \ + printk(KERN_EMERG "### ASSERTION FAILED %s: %s: %d: %s\n", \ + __FILE__, __func__, __LINE__, #x); dump_stack(); BUG(); \ +} while (0) + +#define vnic_dbg(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_GENERAL)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_mcast(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MCAST)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_mcast_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MCAST_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_mcast_vv(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MCAST_VV)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_debug(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_DEBUG)) break; \ + vnic_printk(KERN_WARNING, prefix, format, ## arg); \ +} while (0) + + +#define vnic_dbg_data(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_DATA)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_data_v(prefix, format, arg...) 
\ +do { if (!(vnic_msglvl & VNIC_DEBUG_DATA_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_fip_p0(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_FIP_P0)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_sysfs(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_SYSFS)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_mac(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MAC)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_fip(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_FIP)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_parse(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_PARSER)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_lag(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_LAG)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_lag_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_LAG_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_fip_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_FIP_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_vhub(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_VHUB)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_vhub_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_VHUB_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_moder(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MODER)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_moder_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MODER_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_ethtool(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_ETHTOOL)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_ethtool_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_ETHTOOL_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_mark(void) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MARK)) break; \ + vnic_printk(KERN_DEBUG, NULL, "###\n"); \ +} while (0) + +#define vnic_dbg_func(prefix) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_FUNC)) break; \ + vnic_printk(KERN_DEBUG, prefix, "function called\n"); \ +} while (0) + +#define ethp2str(p, str) \ +do { \ + switch (ntohs(p)) { \ + case ETH_P_RARP: sprintf(str, "%s", "ETH_P_RARP"); break; \ + case ETH_P_ARP: sprintf(str, "%s", "ETH_P_ARP"); break; \ + case ETH_P_IP: sprintf(str, "%s", "ETH_P_IP"); break; \ + case ETH_P_IPV6: sprintf(str, "%s", "ETH_P_IPV6"); break; \ + case ETH_P_8021Q:sprintf(str, "%s", "ETH_P_8021Q");break; \ + default: sprintf(str, "0x%x", p); break; \ + } \ +} while (0) + +#define skb_printk(prefix, format, arg...) 
\ + printk(KERN_DEBUG "[%s] " format, prefix, ## arg) + +#define vnic_dbg_skb(_prefix, skb, eoib_off, eth_off) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_SKB)) break; \ + vnic_printk_skb(_prefix, skb, eoib_off, eth_off); \ +} while (0) + +#define VNIC_SYSLOG_LLEN 64 +#define vnic_printk_skb(_prefix, skb, eoib_off, eth_off) \ +do { \ + char pr[VNIC_SYSLOG_LLEN]; \ + char h_proto_str[VNIC_SYSLOG_LLEN]; \ + struct eoibhdr *eoib_hdr = (struct eoibhdr *) \ + (skb->data + eoib_off); \ + struct ethhdr *ethh = (struct ethhdr *) \ + (skb->data + eth_off); \ + struct net_device *dev = skb->dev; \ + ASSERT(dev); \ + snprintf(pr, VNIC_SYSLOG_LLEN, "%s:skb-%s", dev->name, _prefix);\ + skb_printk(pr, "\n"); \ + skb_printk(pr, "--- skb dump ---\n"); \ + skb_printk(pr, "len : %d\n", skb->len); \ + skb_printk(pr, "data_len : %d\n", skb->data_len); \ + skb_printk(pr, "frags : %d\n", \ + skb_shinfo(skb)->nr_frags); \ + skb_printk(pr, "gso : %d\n", skb_is_gso(skb)); \ + skb_printk(pr, "head_len : %d\n", (int)skb_headlen(skb)); \ + skb_printk(pr, "data : %p\n", skb->data); \ + skb_printk(pr, "head : %p\n", skb->head); \ + skb_printk(pr, "tail : %lu\n", \ + (unsigned long)(skb->tail)); \ + skb_printk(pr, "end : %lu\n", \ + (unsigned long)(skb->end)); \ + skb_printk(pr, "eoib_off : %lu\n", eoib_off); \ + skb_printk(pr, "eth_off : %lu\n", eth_off); \ + if (eth_off < 0 || !skb_headlen(skb)) \ + break; \ + ethp2str(ethh->h_proto, h_proto_str); \ + skb_printk(pr, "eth_proto : %s\n", h_proto_str); \ + skb_printk(pr, "eth_dest : "MAC_6_PRINT_FMT"\n", \ + MAC_6_PRINT_ARG(ethh->h_dest)); \ + skb_printk(pr, "eth_source : "MAC_6_PRINT_FMT"\n", \ + MAC_6_PRINT_ARG(ethh->h_source)); \ + if (eoib_off < 0) \ + break; \ + skb_printk(pr, "eoib_seg_id : 0x%04x\n", eoib_hdr->seg_id); \ + skb_printk(pr, "eoib_seg_off : 0x%02x\n", eoib_hdr->seg_off); \ + skb_printk(pr, "eoib_ip_chk : 0x%02x\n", \ + VNIC_EOIB_HDR_GET_IP_CHK(eoib_hdr)); \ + skb_printk(pr, "eoib_tcp_chk : 0x%02x\n", \ + VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr)); \ + skb_printk(pr, "eoib_ver : 0x%02x\n", \ + VNIC_EOIB_HDR_GET_VER(eoib_hdr)); \ + skb_printk(pr, "eoib_sig : 0x%02x\n", \ + VNIC_EOIB_HDR_GET_SIG(eoib_hdr)); \ +} while (0) + +#endif /* CONFIG_MLX4_VNIC_DEBUG */ +#endif /* _VNIC_UTILS_H */
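For reference, the 64-byte inline rule that build_mlx_header() and lay_inline_data() enforce in vnic_qp.c can be modelled in isolation. The sketch below is a user-space approximation, not driver code: it reproduces only the segmentation arithmetic (a 4-byte byte_count header per inline segment, data chopped at every MLX4_INLINE_ALIGN boundary, segment starts assumed 16-byte aligned as they are after the control and datagram segments) and counts how many segments a payload needs.

#include <assert.h>
#include <stdio.h>

#define MLX4_INLINE_ALIGN 64
#define SEG_HDR 4	/* 4-byte byte_count header per inline segment */

/* Number of inline segments needed for len payload bytes when the
 * first segment header lands at WQE offset off. */
static int inline_segs(int off, int len)
{
	int nseg = 0;

	assert(off % 16 == 0);	/* segments follow 16-byte aligned segments */
	while (len > 0) {
		int room = MLX4_INLINE_ALIGN - (off % MLX4_INLINE_ALIGN);
		int chunk = room - SEG_HDR;	/* data fitting before the boundary */

		if (chunk > len)
			chunk = len;
		len -= chunk;
		off += SEG_HDR + chunk;	/* next header here or at the boundary */
		++nseg;
	}
	return nseg;
}

int main(void)
{
	printf("%d\n", inline_segs(48, 10));	/* 1: fits below the boundary */
	printf("%d\n", inline_segs(48, 100));	/* 3: chopped at each 64-byte line */
	return 0;
}

This also shows why the UD header path in build_mlx_header() needs at most two segments: the inline segment sits at offset 16 mod 64, so for headers of UD size the remainder after the first chop always fits before the next boundary.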
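mlx4_wq_overflow() in vnic_qp.c depends on wq->head and wq->tail being free-running unsigned counters: head - tail wraps around correctly and always yields the number of posted-but-uncompleted WQEs. A minimal user-space model of that check:

#include <assert.h>

struct wq { unsigned head, tail, max_post; };

static int wq_overflow(const struct wq *wq, int nreq)
{
	unsigned cur = wq->head - wq->tail;	/* wraps to the in-flight count */

	return cur + nreq >= wq->max_post;
}

int main(void)
{
	/* counters about to wrap: 14 WQEs in flight */
	struct wq wq = { .head = 0xfffffffeu, .tail = 0xfffffff0u, .max_post = 32 };

	assert(!wq_overflow(&wq, 8));	/* 14 + 8  < 32 */
	assert(wq_overflow(&wq, 20));	/* 14 + 20 >= 32 */
	return 0;
}

The driver's version additionally re-reads the counters under the CQ lock before reporting overflow, because a completion may have advanced tail between the lock-free check and the lock.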
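The vnic_dbg_*() macros in vnic_utils.h all follow one pattern: each subsystem owns a bit in the runtime mask vnic_msglvl (a second bit where there is a verbose variant), and each message compiles down to a single test-and-branch when debugging is built in. A self-contained user-space model of the same pattern; the names below are illustrative, not the driver's:

#include <stdio.h>

enum {
	DBG_FIP   = 1 << 5,	/* mirrors VNIC_DEBUG_FIP */
	DBG_FIP_V = 1 << 6,	/* mirrors VNIC_DEBUG_FIP_V */
};

static unsigned msglvl = DBG_FIP;	/* e.g. set from a module parameter */

#define dbg_fip(fmt, ...) \
	do { if (msglvl & DBG_FIP) printf("[fip] " fmt, ##__VA_ARGS__); } while (0)
#define dbg_fip_v(fmt, ...) \
	do { if (msglvl & DBG_FIP_V) printf("[fip_v] " fmt, ##__VA_ARGS__); } while (0)

int main(void)
{
	dbg_fip("gateway discovered, vlan %d\n", 7);	/* printed */
	dbg_fip_v("raw descriptor dump\n");		/* suppressed */
	return 0;
}

Distinct bit positions per category are essential here: if two categories share a shift value, enabling one silently enables the other, which is why every VNIC_DEBUG_* flag must match the hex value noted beside it.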